In [None]:

# 1) Install dependencies
!pip -q install streamlit pyngrok PyPDF2 sentence_transformers faiss-cpu transformers accelerate


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m51.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m57.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
pip -q install pyngrok

In [None]:
!pip install -q --upgrade torch
!pip install -q transformers triton==3.4 kernels
!pip uninstall -q torchvision torchaudio -y


In [None]:
# Log in to the Hugging Face Hub.
import os
from getpass import getpass

from huggingface_hub import login

# Never paste the token into the notebook: read it from the HF_TOKEN environment
# variable, or prompt for it without echoing so it is not saved in cell source/outputs.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)


In [None]:
# Colab cell 2 – clone GPT‑OSS (20 B) with Git‑LFS
!git clone https://huggingface.co/openai/gpt-oss-20b
# If you want the smaller 7 B version, replace the URL above with gpt-oss-7b
# (or whichever GGUF you have).
# The clone will download ~70 GB; use a Pro+ runtime or a persistent drive!

Cloning into 'gpt-oss-20b'...
remote: Enumerating objects: 67, done.[K
remote: Counting objects: 100% (63/63), done.[K
remote: Compressing objects: 100% (62/62), done.[K
remote: Total 67 (delta 27), reused 0 (delta 0), pack-reused 4 (from 1)[K
Unpacking objects: 100% (67/67), 35.88 KiB | 1.49 MiB/s, done.
You can inspect what was checked out with 'git status'
and retry with 'git restore --source=HEAD :/'


Exiting because of "interrupt" signal.
^C


In [None]:
%%writefile app.py
import streamlit as st
import pandas as pd
import torch
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import AutoModelForCausalLM, AutoTokenizer
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
import hashlib


# --- Load model + tokenizer ---

@st.cache_resource
def load_model():
    """Load the tokenizer and causal language model from the local checkout.

    The function is wrapped in ``st.cache_resource`` so the loaded objects are
    reused instead of being rebuilt on every script rerun.

    Returns:
        tuple: ``(tokenizer, model)`` as produced by the transformers loaders.
    """
    checkpoint_dir = "/content/gpt-oss-20b"
    load_options = {
        "torch_dtype": "auto",
        "device_map": "auto",  # auto = GPU if available, fallback CPU
    }

    tok = AutoTokenizer.from_pretrained(checkpoint_dir)
    lm = AutoModelForCausalLM.from_pretrained(checkpoint_dir, **load_options)
    return tok, lm

tokenizer, model = load_model()

# --------------------------
# Load embedding model
# --------------------------
@st.cache_resource
def load_embedder():
    """Create the sentence-embedding model used for PDF retrieval.

    Returns:
        SentenceTransformer: the ``all-MiniLM-L6-v2`` encoder.
    """
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    return embedding_model

embedder = load_embedder()

# --------------------------
# PDF extraction
# --------------------------
def extract_text_from_pdf(pdf_file):
    """Concatenate the extractable text of every page of a PDF.

    Args:
        pdf_file: the PDF source, handed unchanged to ``PdfReader``.

    Returns:
        str: the text of each page in order, every page followed by a newline;
        pages for which no text is extracted are skipped.
    """
    per_page = (page.extract_text() for page in PdfReader(pdf_file).pages)
    return "".join(content + "\n" for content in per_page if content)

def chunk_text(text, chunk_size=500):
    """Split text into consecutive chunks of at most ``chunk_size`` words.

    Words are whitespace-delimited; each chunk is re-joined with single spaces.

    Args:
        text: the string to split.
        chunk_size: maximum number of words per chunk.

    Returns:
        list[str]: the chunks in order (empty list for text without words).
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(" ".join(window))
    return pieces

# --------------------------
# Cached PDF embeddings
# --------------------------
# @st.cache_resource(show_spinner=False)
# def get_pdf_index(file_bytes):
#     """Cache embeddings per unique PDF"""
#     file_hash = hashlib.md5(file_bytes).hexdigest()
#     text = extract_text_from_pdf(file_bytes)
#     pdf_chunks = chunk_text(text)
#     vectors = embedder.encode(pdf_chunks, convert_to_numpy=True)
#     index = faiss.IndexFlatL2(vectors.shape[1])
#     index.add(vectors)
#     return pdf_chunks, index

@st.cache_resource(show_spinner=False)
def get_pdf_index(file_bytes):
    """Build a FAISS index over the text chunks of one PDF.

    The function is wrapped in ``st.cache_resource`` and takes the raw bytes as
    its only argument, so the result is cached per distinct PDF content.

    Args:
        file_bytes: raw bytes of the PDF file.

    Returns:
        tuple: ``(pdf_chunks, index)`` where ``pdf_chunks`` is the list of text
        chunks and ``index`` is a ``faiss.IndexFlatL2`` over their embeddings.
        When the PDF has no extractable text the result is ``([], None)``.
    """
    import io

    # Read the PDF from bytes, reusing the shared page-extraction helper.
    text = extract_text_from_pdf(io.BytesIO(file_bytes))

    pdf_chunks = chunk_text(text)
    if not pdf_chunks:
        # No text (e.g. a scanned PDF): there is nothing to embed, and indexing
        # the empty embedding array would fail on vectors.shape[1].
        return [], None

    vectors = embedder.encode(pdf_chunks, convert_to_numpy=True)
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return pdf_chunks, index

# --------------------------
# Retrieval + prompt
# --------------------------
def make_prompt(question, df_context, pdf_chunks, pdf_index, top_k=3):
    """Assemble the Eco-GPT prompt from the question and optional context.

    Args:
        question: the user's question.
        df_context: DataFrame of log rows, or None for no log context.
        pdf_chunks: list of PDF text chunks, or None/empty for no PDF context.
        pdf_index: index over ``pdf_chunks`` exposing ``search(vectors, k)``,
            or None.
        top_k: maximum number of PDF chunks to include (default 3).

    Returns:
        str: the prompt text containing the logs, the retrieved notes and the
        question.
    """
    pdf_context = ""
    # Compare against None explicitly instead of relying on the index's truthiness.
    if pdf_chunks and pdf_index is not None:
        q_vec = embedder.encode([question], convert_to_numpy=True)
        # Never ask for more neighbours than there are chunks: the index pads
        # missing results with -1, which would silently select pdf_chunks[-1].
        k = min(top_k, len(pdf_chunks))
        _, neighbours = pdf_index.search(q_vec, k=k)
        pdf_context = "\n".join(
            pdf_chunks[i] for i in neighbours[0] if 0 <= i < len(pdf_chunks)
        )

    csv_context = ""
    if df_context is not None:
        log_columns = ("timestamp", "sensor_id", "event_type", "payload")
        available = list(df_context.columns)
        # Fall back to every column when the CSV does not follow the expected
        # log schema, instead of failing with a KeyError.
        if all(name in available for name in log_columns):
            columns = list(log_columns)
        else:
            columns = available
        csv_context = "\n".join(
            " | ".join(str(row[name]) for name in columns)
            for _, row in df_context.iterrows()
        )

    return f"""
You are Eco-GPT, a scientific assistant.

Logs:
{csv_context}

Scientific Notes:
{pdf_context}

Question: {question}
Answer (be accurate, concise, and eco friendly):
"""

# --------------------------
# Streamlit UI
# --------------------------
st.set_page_config(page_title="Eco-GPT", page_icon="🌱")
st.title("🌱 Eco-GPT: Scientific Q&A")
st.caption("Upload CSV logs + scientific PDFs, then ask natural questions.")

# Upper bound on CSV rows copied into the prompt, so a large log file cannot
# blow up the prompt length.
MAX_LOG_ROWS = 200

df = None
pdf_chunks, pdf_index = None, None

# File uploaders
uploaded_csv = st.file_uploader("Upload CSV logs", type=["csv"])
uploaded_pdf = st.file_uploader("Upload scientific PDF", type=["pdf"])

# Process CSV
if uploaded_csv:
    df = pd.read_csv(uploaded_csv)
    st.subheader("📜 CSV Preview")
    st.dataframe(df.head(20))

# Process PDF
if uploaded_pdf:
    with st.spinner("Extracting & indexing PDF... (cached after first run)"):
        file_bytes = uploaded_pdf.getvalue()  # ✅ convert to bytes
        pdf_chunks, pdf_index = get_pdf_index(file_bytes)
    if pdf_chunks:
        st.success("PDF indexed successfully ✅")
    else:
        st.warning("No extractable text was found in this PDF.")

# Question input
user_input = st.text_area("Ask a question:", "What animals were near stream #3?")
if st.button("Ask Eco-GPT"):
    if user_input.strip():
        # Build the retrieval-augmented prompt so the uploaded logs and PDF notes
        # actually reach the model (previously only the bare question was sent).
        log_rows = df.head(MAX_LOG_ROWS) if df is not None else None
        try:
            prompt = make_prompt(user_input, log_rows, pdf_chunks, pdf_index)
        except KeyError as missing:
            st.error(f"The CSV is missing an expected column: {missing}")
            st.stop()

        with st.spinner("Thinking..."):
            # The Eco-GPT instructions are part of the prompt text itself, so a
            # single user message is sufficient.
            messages = [
                {"role": "user", "content": prompt},
            ]

            inputs = tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True,
                return_tensors="pt",
                return_dict=True,
            ).to(model.device)

            generated = model.generate(**inputs, max_new_tokens=200)
            # Decode only the newly generated tokens, not the echoed prompt.
            output_text = tokenizer.decode(
                generated[0][inputs["input_ids"].shape[-1]:],
                skip_special_tokens=True
            )

        st.subheader("🌍 Eco-GPT Answer")
        st.write(output_text)
    else:
        st.warning("Please type a message first.")


Overwriting app.py


In [None]:
import os, subprocess, time, threading
from pyngrok import ngrok

# Set your ngrok auth token (get a free token from ngrok.com).
# (Optional) It is read from the NGROK_AUTH_TOKEN environment variable so the
# secret is never stored in the notebook; an authenticated tunnel is more stable.
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN", "")
if NGROK_AUTH_TOKEN:
    ngrok.set_auth_token(NGROK_AUTH_TOKEN)

PORT = 8501

# Close any existing tunnels to avoid duplicates (best effort, but report failures)
for t in ngrok.get_tunnels():
    try:
        ngrok.disconnect(t.public_url)
    except Exception as exc:
        print(f"Could not close tunnel {t.public_url}: {exc}")

# Start Streamlit in the background
cmd = [
    "streamlit", "run", "app.py",
    "--server.port", str(PORT),
    "--server.address", "0.0.0.0",
    "--server.headless", "true",
]
log_path = "/content/streamlit.log"
log_file = open(log_path, "w")
proc = subprocess.Popen(cmd, stdout=log_file, stderr=log_file, text=True)

# Set on shutdown so the log-tailing thread exits.
stop_tailing = threading.Event()


def tail_logs(path):
    """Print lines appended to the log file at `path` until stop_tailing is set."""
    # Read from the start of the file: it was truncated just above, and seeking
    # to the end would skip Streamlit's startup messages.
    with open(path, "r") as f:
        while not stop_tailing.is_set():
            line = f.readline()
            if line:
                print(line, end="")
            else:
                time.sleep(0.5)


try:
    # Live-tail the Streamlit logs so you can see when it's ready
    threading.Thread(target=tail_logs, args=(log_path,), daemon=True).start()

    # Give Streamlit a moment to boot
    time.sleep(3)
    if proc.poll() is not None:
        raise RuntimeError(
            f"Streamlit exited early with code {proc.returncode}; see {log_path}"
        )

    # Start ngrok tunnel
    tunnel = ngrok.connect(addr=PORT, proto="http")
    public_url = tunnel.public_url
    print("Your app is live at:", public_url)

    # Keep the cell alive while Streamlit is running so the tunnel stays open
    while proc.poll() is None:
        time.sleep(1)
except KeyboardInterrupt:
    pass
finally:
    # Release everything so the cell can be re-run without a port clash or
    # a leaked tunnel/file handle.
    stop_tailing.set()
    ngrok.kill()
    if proc.poll() is None:
        proc.terminate()
        try:
            proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            proc.kill()
    log_file.close()


In [None]:
%%writefile sample_logs.csv
timestamp,sensor_id,event_type,payload,metadata
2024-08-01T06:14:00,cam_trap_01,image,Jaguar observed near stream #3,"{""lat"": -3.45, ""lon"": -62.78}"
2024-08-01T06:15:00,mic_01,audio,Cicada chorus intensity: high,"{""temp_c"": 27.3}"
2024-08-01T06:20:00,env_01,temperature,Air temperature 28°C,"{""humidity"":71}"


Writing sample_logs.csv
