In [1]:
# Run once per runtime.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install -q "transformers>=4.57.0" sentencepiece safetensors accelerate einops ftfy regex pillow requests streamlit pyngrok streamlit-chat


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m44.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import os

from google.colab import drive

# Folder on Google Drive where the chat history is kept (edit to store it elsewhere).
CHAT_STORAGE_DIR = "/content/drive/MyDrive/kosmos2_streamlit_vqa"

# Mounting is interactive: follow the authorisation prompt.
drive.mount('/content/drive')

# Create the storage folder on first use; a no-op when it already exists.
os.makedirs(CHAT_STORAGE_DIR, exist_ok=True)
print("Chat storage dir:", CHAT_STORAGE_DIR)


Mounted at /content/drive
Chat storage dir: /content/drive/MyDrive/kosmos2_streamlit_vqa


In [4]:
from google.colab import userdata
import os


def _read_secret(name):
    """Return the Colab secret `name`, or None when it is unavailable.

    `userdata.get` raises instead of returning None when the secret does not
    exist or the notebook has not been granted access to it, so both cases are
    mapped to None here to let the fallback messages below actually run.
    """
    try:
        return userdata.get(name)
    except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
        return None


HF_TOKEN = _read_secret('HF_TOKEN')
NGROK_AUTH_TOKEN = _read_secret('NGROK_AUTH_TOKEN')  # optional

# Export the tokens through the environment so later cells and the Streamlit
# subprocess can read them.
if HF_TOKEN:
    os.environ['HF_TOKEN'] = HF_TOKEN
    print("HF_TOKEN loaded.")
else:
    print("⚠️ HF_TOKEN not found in userdata. Public model loading may fail for private models or be rate-limited.")

if NGROK_AUTH_TOKEN:
    os.environ['NGROK_AUTH_TOKEN'] = NGROK_AUTH_TOKEN
    print("NGROK token loaded.")
else:
    print("NGROK token not found. Tunnel may still work but is less stable.")


HF_TOKEN loaded.
NGROK token loaded.


In [5]:
%%bash
cat > app.py <<'PY'
import os
import json
import time
from pathlib import Path

import streamlit as st
from PIL import Image
import torch
from transformers import AutoProcessor, Kosmos2ForConditionalGeneration

# Optional nice chat UI component
try:
    from streamlit_chat import message as st_message
except Exception:
    st_message = None

# Keep the history on the mounted Drive folder when it is available;
# otherwise fall back to the local Colab filesystem.
DRIVE_PATH = "/content/drive/MyDrive/kosmos2_streamlit_vqa"
STORAGE_DIR = Path(DRIVE_PATH) if Path(DRIVE_PATH).exists() else Path("/content")
HISTORY_FILE = STORAGE_DIR / "kosmos2_vqa_chat_history.json"

# Seed the history file with an empty JSON list the first time the app runs.
if not HISTORY_FILE.exists():
    HISTORY_FILE.write_text("[]")

# Hugging Face token exported by the notebook before launch (None when unset).
HF_TOKEN = os.environ.get("HF_TOKEN")

# The spinner text is only shown while the resource is actually being built.
# An st.info() call inside the cached function would instead be replayed on
# every call, including cache hits.
@st.cache_resource(show_spinner="Loading model (this may take ~1-2 minutes)...")
def load_model_and_processor(model_id="microsoft/kosmos-2-patch14-224"):
    """Load the KOSMOS-2 processor and model once and cache them.

    Args:
        model_id: Hugging Face model identifier to load.

    Returns:
        Tuple ``(processor, model, device)`` where ``device`` is ``"cuda"`` when
        a GPU is available and ``"cpu"`` otherwise; the model has been moved to
        that device and put in eval mode.
    """
    kwargs = {}
    if HF_TOKEN:
        # `token` is the supported name; `use_auth_token` is deprecated.
        kwargs["token"] = HF_TOKEN
    processor = AutoProcessor.from_pretrained(model_id, **kwargs)
    model = Kosmos2ForConditionalGeneration.from_pretrained(model_id, **kwargs)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()
    return processor, model, device

def load_history(path=None):
    """Read the saved chat history.

    Args:
        path: JSON file to read. Defaults to HISTORY_FILE.

    Returns:
        The stored list of message entries, or ``[]`` when the file is missing,
        unreadable, not valid JSON, or does not contain a JSON list.
    """
    source = HISTORY_FILE if path is None else path
    try:
        with open(source, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: malformed JSON or bad encoding.
        return []
    # Callers append to and iterate over the result, so anything but a list
    # (e.g. a stray `{}` or `null`) is treated as an empty history.
    return data if isinstance(data, list) else []

def save_history(history, path=None):
    """Write the chat history as indented JSON.

    Args:
        history: JSON-serializable list of message entries.
        path: Destination file. Defaults to HISTORY_FILE.

    Raises:
        TypeError: If ``history`` contains a value that cannot be serialized.
            The existing file is left untouched in that case.
    """
    target = HISTORY_FILE if path is None else path
    # Serialize before opening: mode "w" truncates the file immediately, so a
    # serialization error raised after open() would wipe the stored history.
    payload = json.dumps(history, indent=2)
    with open(target, "w", encoding="utf-8") as f:
        f.write(payload)

def append_to_history(entry):
    """Add one entry to the end of the persisted history.

    The stored history is re-read, the entry is appended, and the whole list
    is written back.
    """
    entries = load_history()
    entries.append(entry)
    save_history(entries)

# --- Streamlit UI ---
SAMPLE_IMAGE_URL = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"


def _render_message(text, is_user, key):
    """Show one chat message, using streamlit-chat bubbles when available.

    Args:
        text: Message text to display.
        is_user: True for the user's message, False for the model's reply.
        key: Unique key for the streamlit-chat component, so that repeated
            identical messages do not collide on an auto-generated element ID.
    """
    if st_message:
        avatar = "micah" if is_user else "bottts"
        st_message(text, is_user=is_user, avatar_style=avatar, key=key)
    else:
        speaker = "You" if is_user else "KOSMOS-2"
        st.markdown(f"**{speaker}:** {text}")


st.set_page_config(page_title="KOSMOS-2 VQA", layout="centered")
st.title("KOSMOS-2 VQA — Upload image & ask questions")

col1, col2 = st.columns([1,2])

with col1:
    st.header("Image")
    uploaded = st.file_uploader("Upload an image", type=["jpg","jpeg","png"])
    if uploaded:
        image = Image.open(uploaded).convert("RGB")
    else:
        if st.button("Use sample snowman"):
            import io
            import requests
            try:
                response = requests.get(SAMPLE_IMAGE_URL, timeout=30)
                response.raise_for_status()
                # st.button() is True only on the run right after the click.
                # Keep the image in session state so it is still there on the
                # rerun triggered by pressing "Ask".
                st.session_state["sample_image"] = Image.open(
                    io.BytesIO(response.content)
                ).convert("RGB")
            except (requests.RequestException, OSError) as exc:
                st.error(f"Could not load the sample image: {exc}")
        image = st.session_state.get("sample_image")

    if image is not None:
        st.image(image, use_column_width=True)

with col2:
    st.header("Chat / VQA")
    history = load_history()
    # Replay earlier turns; any role other than "user" is shown as the model.
    for idx, item in enumerate(history):
        _render_message(item.get("text"), item.get("role") == "user", key=f"history_{idx}")

    user_q = st.text_input("Ask a question about the image (VQA):", key="vqa_input")
    if st.button("Ask", key="ask_btn"):
        if image is None:
            st.warning("Please upload or choose a sample image first.")
        elif not user_q:
            st.warning("Type a question before hitting Ask.")
        else:
            # Cached after the first call.
            processor, model, device = load_model_and_processor()
            # The question is prefixed with the grounding token.
            prompt = "<grounding> " + user_q
            inputs = processor(text=prompt, images=image, return_tensors="pt")
            # Move every tensor to the model's device.
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

            with st.spinner("Generating answer from KOSMOS-2..."):
                generated_ids = model.generate(
                    pixel_values=inputs.get("pixel_values"),
                    input_ids=inputs.get("input_ids"),
                    attention_mask=inputs.get("attention_mask"),
                    image_embeds=None,
                    image_embeds_position_mask=inputs.get("image_embeds_position_mask"),
                    use_cache=True,
                    max_new_tokens=128,
                )
                generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
                # Best effort: use the post-processed caption when available,
                # otherwise fall back to the raw decoded text.
                try:
                    caption, entities = processor.post_process_generation(generated_text)
                    answer_text = caption if caption else generated_text
                except Exception:
                    answer_text = generated_text

            # Show the new turn (it is not part of `history` loaded above).
            _render_message(user_q, True, key="latest_user")
            _render_message(answer_text, False, key="latest_assistant")

            # Persist both sides of the turn with the same timestamp.
            timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
            append_to_history({
                "role": "user",
                "text": user_q,
                "time": timestamp
            })
            append_to_history({
                "role": "assistant",
                "text": answer_text,
                "time": timestamp
            })
            st.success("Saved to chat history.")

st.markdown("---")
st.write("Chat history stored at:", str(HISTORY_FILE))
PY


In [6]:
# Start Streamlit in the background and expose it through an ngrok tunnel (Colab).
import os, signal, subprocess, time
from pyngrok import ngrok

STREAMLIT_PORT = 8501
STREAMLIT_LOG = "streamlit.log"  # server output; inspect it if the app does not load

# set ngrok auth token if present
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN")
if NGROK_AUTH_TOKEN:
    ngrok.set_auth_token(NGROK_AUTH_TOKEN)


def kill_process_by_name(name):
    """Best-effort ``pkill -f <name>``.

    A non-zero pkill exit status (for example, nothing matched) is ignored.
    Failing to launch pkill at all is reported rather than silently swallowed.
    """
    try:
        subprocess.run(["pkill", "-f", name], check=False)
    except OSError as exc:
        print(f"Could not run pkill for {name!r}: {exc}")


# Clean start. Let pyngrok stop the agent it manages first so its own process
# tracking stays consistent, then sweep up any strays from earlier sessions.
ngrok.kill()
kill_process_by_name("streamlit")
kill_process_by_name("ngrok")

# Run streamlit in background. Output goes to a log file: pipes that are never
# read would eventually fill up and block the server.
cmd = ["streamlit", "run", "app.py", "--server.port", str(STREAMLIT_PORT), "--server.runOnSave", "false"]
print("Launching Streamlit...")
with open(STREAMLIT_LOG, "w") as log_file:
    proc = subprocess.Popen(
        cmd,
        stdout=log_file,
        stderr=subprocess.STDOUT,
        start_new_session=True,  # same effect as preexec_fn=os.setsid
    )

# give app a moment to start, then make sure it did not exit immediately
time.sleep(4)
if proc.poll() is not None:
    raise RuntimeError(
        f"Streamlit exited early with code {proc.returncode}; see {STREAMLIT_LOG}"
    )

# open ngrok tunnel for the streamlit port
public_url = ngrok.connect(STREAMLIT_PORT, "http").public_url
print("ngrok tunnel URL:", public_url)
print("Streamlit should be starting — open the ngrok URL above (it may take a few seconds).")
print("To stop the app, run: kill -9", proc.pid)


ngrok tunnel URL: https://daniele-ritziest-saundra.ngrok-free.dev
Launching Streamlit...
Streamlit should be starting — open the ngrok URL above (it may take a few seconds).
To stop the app, run: kill -9 1363


In [8]:
# Stop the Streamlit server and the ngrok agent.
# Only the first `!` is consumed by IPython; a second one would reach the shell as the literal word `!pkill`.
# The bracketed first letter keeps each pattern from matching the shell that runs this very line.
!pkill -f "[s]treamlit"; pkill -f "[n]grok"


^C
