In [None]:
!python3 -m venv .venv
!source .venv/bin/activate
# !pip install transformers
!pip install -r requirements.txt

In [None]:
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image

In [None]:
# The model, image processor and tokenizer are all loaded from the same checkpoint.
VIT_GPT2_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"

model = VisionEncoderDecoderModel.from_pretrained(VIT_GPT2_CHECKPOINT)
feature_extractor = ViTImageProcessor.from_pretrained(VIT_GPT2_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(VIT_GPT2_CHECKPOINT)

In [None]:
# Use CUDA when PyTorch reports it as available; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model.to(device)

In [None]:
# Keyword arguments forwarded to `model.generate` by `predict_step`.
max_length = 16
num_beams = 4
gen_kwargs = dict(max_length=max_length, num_beams=num_beams)

In [None]:
def predict_step(image_paths):
    """Generate one caption per image with the ViT-GPT2 captioning model.

    Args:
        image_paths: Iterable of image locations accepted by ``PIL.Image.open``.

    Returns:
        list[str]: One whitespace-stripped caption per input image, in input
        order. An empty input yields an empty list.
    """
    images = []
    for image_path in image_paths:
        # The context manager closes the file deterministically. `convert`
        # returns a converted copy (also when the mode is already RGB), so the
        # appended image no longer depends on the open file.
        with Image.open(image_path) as i_image:
            images.append(i_image.convert(mode="RGB"))

    # Nothing to caption: avoid calling the processor and model on an empty batch.
    if not images:
        return []

    pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    output_ids = model.generate(pixel_values, **gen_kwargs)

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return [pred.strip() for pred in preds]

In [None]:
# Test the model with a sample image.
# The image must be uploaded to Google Colab first.
image_paths = ["/content/family.jpeg"]
# Preview the same file that gets captioned so the two paths cannot drift apart.
img = Image.open(image_paths[0])
display(img)
predict_step(image_paths)

In [None]:
# Alternative captioning model from Salesforce (BLIP).
# Model page: https://huggingface.co/Salesforce/blip-image-captioning-base
# The pipeline API is used here as a high-level helper.
from transformers import pipeline

pipe = pipeline(
    task="image-to-text",
    model="Salesforce/blip-image-captioning-base",
)

In [None]:
# Load model directly
from transformers import AutoProcessor, AutoModelForVision2Seq
import requests
processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = AutoModelForVision2Seq.from_pretrained("Salesforce/blip-image-captioning-base")

In [None]:
img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg' 
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')

# conditional image captioning
text = "a photography of"
inputs = processor(raw_image, text, return_tensors="pt")

out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))
# >>> a photography of a woman and her dog

# unconditional image captioning
inputs = processor(raw_image, return_tensors="pt")

out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))
# >>> a woman sitting on the beach with her dog

In [None]:
from transformers import AutoModelForCausalLM, AutoModelForVision2Seq, AutoTokenizer, AutoProcessor

In [None]:
# Checkpoint shared by the Qwen model, tokenizer and processor.
qwen_model_id = "Qwen/Qwen2.5-VL-7B-Instruct"

# Preprocessing components used to build the model inputs and decode its output.
qwen_processor = AutoProcessor.from_pretrained(qwen_model_id)
qwen_tokenizer = AutoTokenizer.from_pretrained(qwen_model_id)

# Load the weights as float16; `device_map="auto"` leaves device placement to the loader.
qwen_model = AutoModelForVision2Seq.from_pretrained(
    qwen_model_id, dtype=torch.float16, device_map="auto"
)

In [None]:
# Detailed story prompt: sets a storyteller persona, then lists guidelines, the
# expected output format, optional narrative voices and optional style knobs.
long_prompt = """
You are a compassionate storyteller. Using the attached photo, craft a 120–180 word micro‑story intended for an older adult and their family. The story should gently evoke memories, spark warm conversation, and support emotional well‑being.

Guidelines:
- Focus on mood, place, season, and relationships; avoid listing objects.
- Weave in 2–3 sensory details (sounds, scents, textures, light).
- Use warm, respectful language and short, vivid sentences.
- Avoid definitive claims about names, ages, or locations. Use gentle, tentative phrasing (perhaps, it seems, maybe).
- If people appear, emphasize connection and small rituals rather than appearance.
- Be inclusive and avoid stereotypes; balance nostalgia with quiet hope.
- If text is clearly legible in the image, you may thoughtfully incorporate it.
- If the scene is ambiguous, lean into universal themes (family, gatherings, journeys, everyday moments).

Output:
- 1–2 paragraphs of story.

Variants (pick one voice if you want to steer style):
- Voice A (third‑person close): Tell the story from a gentle narrator’s view.
- Voice B (first‑person elder): Write as if an older adult is recalling the moment in the photo.
- Voice C (second person): Address a loved one directly, with tenderness and gratitude.

Examples of style knobs you can add:
- Tone: warm and hopeful; lightly bittersweet; playful nostalgia.
- Era cues: hint at a decade only if strongly suggested by the image.
- Cultural touch: include respectful, non‑stereotyped details only if clearly present.

"""

# Compact single-paragraph prompt; this is the default prompt of `generate_story_qwen`.
short_prompt = """Write a 120–180 word micro‑story inspired by this photo for an older adult and their family. Describe the photo honestly, write the story to evoke gentle reminiscence and well‑being. Use warm, simple language, 2–3 sensory details, and avoid object lists. Use tentative phrasing for uncertain facts. Emphasize connection and small rituals."""

# Both the long and the short prompt work well. Use the long one to generate a story with variants (different perspectives); use the short one to generate a clean, short story.
# I also tried a two-part output: (1) 1–2 paragraphs of story, then (2) "Conversation starters:" followed by two open-ended, gentle questions that invite sharing (e.g., "What songs did you hear at gatherings like this?").
# However, generating the two open-ended questions (meant to echo our proposal of sparking meaningful family conversation) did not work well: they were sometimes off topic or too general, so I removed that part.


In [None]:
def generate_story_qwen(image_path, prompt=short_prompt, max_new_tokens=300):
    """Generate a short story about an image with the Qwen vision-language model.

    Args:
        image_path: Location of the image, as accepted by ``PIL.Image.open``.
        prompt: Instruction text sent together with the image. Defaults to
            ``short_prompt``.
        max_new_tokens: Value forwarded to ``qwen_model.generate`` as
            ``max_new_tokens``. Defaults to 300.

    Returns:
        str: The decoded model response with the prompt tokens removed and
        surrounding whitespace stripped.
    """
    # `convert` returns a converted copy, so the file can be closed right away.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    # Single user turn: an image placeholder followed by the text instruction.
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": prompt}
        ]}
    ]

    text_prompt = qwen_processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Follow the model's own device instead of assuming CUDA: the model is loaded
    # with device_map="auto", so a hard-coded "cuda" fails on CPU-only machines.
    inputs = qwen_processor(
        text=[text_prompt], images=[image], return_tensors="pt"
    ).to(qwen_model.device)

    output_ids = qwen_model.generate(**inputs, max_new_tokens=max_new_tokens)

    # The generated sequence starts with the prompt tokens; decode only what follows.
    prompt_length = inputs["input_ids"].shape[1]
    generated_text = qwen_tokenizer.decode(
        output_ids[0][prompt_length:],
        skip_special_tokens=True
    )

    return generated_text.strip()


In [None]:
# Photo to turn into a story; this should later be replaced with user input.
# Three examples for testing: ["img_1.jpg", "img_2.jpg", "img_3.png"]
image_path = "/content/family.jpeg"

# Generation usually takes about 2 minutes on a T4 GPU.
image_story = generate_story_qwen(image_path=image_path)

print(image_story)

In [None]:
# Sample output generated with the short prompt and image_path = "/content/family.jpeg":
# The sun was warm on their backs as they sat together on the soft sand, 
# the ocean's gentle lapping behind them. 
# The sky was a clear blue, and the air was filled with the salty scent of the sea. 
# They were a family, four of them, all smiling at the camera. 
# The older adult, perhaps their grandmother, had her arm around her daughter, who was holding a young boy close. 
# The boy, with his bright eyes and curious smile, looked up at his parents, who were also beaming. 
# It felt like a perfect day, a moment frozen in time, a reminder of simpler times when life was less rushed and more about enjoying each other’s company.
# The sound of laughter mingled with the waves, creating a symphony of happiness that filled the air.
