In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import requests

# --- Config ---
# Input CSV; it is read with pandas and must provide "file", "text" and "start" columns.
CSV_PATH = "../clustering/intermediate_data/embeddings_added.csv"
# Destination CSV for the generated topic titles.
OUTPUT_CSV = "incremental_topic_titles.csv"
# Only rows whose "file" column equals this value are processed.
TARGET_FILE = "210.json"
# Model name sent in the JSON payload of the title-generation request.
OLLAMA_MODEL = "phi3:mini"  # or "phi:latest" if using that
# Endpoint that receives the title-generation POST request.
OLLAMA_URL = "http://localhost:11434/api/generate"

# Default number of keywords requested from KeyBERT per text.
TOP_N_KEYWORDS = 5
# Minimum keyword similarity for a chunk to be merged into the current topic.
SIMILARITY_THRESHOLD = 0.35
# Transcript text is truncated to this many characters before it goes into the prompt.
MAX_INPUT_CHARS = 3000

# --- Setup ---
# Both models are created once at module level and reused by every keyword extraction.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
kw_model = KeyBERT(model=embedding_model)

def extract_keywords(text, top_n=TOP_N_KEYWORDS):
    """Return the keyword phrases KeyBERT extracts from ``text``.

    Args:
        text: Text to extract keywords from.
        top_n: Maximum number of keywords requested from KeyBERT.

    Returns:
        A list with only the keyword phrases, in the order KeyBERT returned
        them; the score paired with each phrase is discarded.
    """
    scored_phrases = kw_model.extract_keywords(
        text,
        top_n=top_n,
        stop_words="english",
    )
    # Each result is a (phrase, score) pair; callers only need the phrase.
    return [phrase for phrase, _score in scored_phrases]

def compute_similarity(keywords1, keywords2):
    """Return the TF-IDF cosine similarity between two keyword lists.

    Each list is joined into a single space-separated document, both documents
    are vectorized together with a fresh ``TfidfVectorizer``, and the cosine
    similarity of the two resulting rows is returned.

    Args:
        keywords1: Iterable of keyword strings for the first side.
        keywords2: Iterable of keyword strings for the second side.

    Returns:
        The similarity as a plain ``float``. ``0.0`` is returned when the
        vectorizer cannot build a vocabulary from the two documents (for
        example when both keyword lists are empty), since there is then no
        lexical overlap to measure.
    """
    texts = [" ".join(keywords1), " ".join(keywords2)]
    try:
        tfidf = TfidfVectorizer().fit_transform(texts)
    except ValueError:
        # fit_transform raises ValueError on an empty vocabulary; treat that
        # as "nothing in common" instead of aborting the whole run.
        return 0.0
    return float(cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0])

def generate_title_with_requests(text, keywords, timeout=300):
    """Ask the Ollama endpoint for a short section title.

    The transcript is truncated to ``MAX_INPUT_CHARS`` characters and sent,
    together with the keywords, in a single non-streaming POST request to
    ``OLLAMA_URL`` using the model named by ``OLLAMA_MODEL``.

    Args:
        text: Transcript text of the section to title.
        keywords: Keyword strings describing the section. Repeated keywords
            are listed only once in the prompt (first occurrence wins) so the
            prompt does not grow with duplicates.
        timeout: Seconds passed to ``requests.post`` as its timeout, so a
            stalled server cannot block the caller forever.

    Returns:
        The stripped title string from the ``"response"`` field of the JSON
        reply, ``"Untitled (Empty response)"`` when that field is missing,
        empty, or not a string, or ``"Error - <ExceptionName>"`` when the
        request fails, returns an HTTP error status, or the body is not
        valid JSON. This function never raises for those failures; it prints
        the error instead so a single bad call does not stop a batch run.
    """
    text = text[:MAX_INPUT_CHARS]
    # dict.fromkeys drops repeated keywords while keeping their original order.
    unique_keywords = list(dict.fromkeys(keywords))
    prompt = f"""
You are an expert teacher helping generate section titles for lecture videos.

Suggest a short, human-friendly TITLE for this section of transcript.

Keywords: {', '.join(unique_keywords)}

Transcript:
\"\"\"{text}\"\"\"

Only return the title. No explanation.
"""

    try:
        response = requests.post(
            OLLAMA_URL,
            json={"model": OLLAMA_MODEL, "prompt": prompt, "stream": False},
            timeout=timeout,
        )
        # Surface 4xx/5xx replies as errors instead of reading them as an
        # empty title.
        response.raise_for_status()
        payload = response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection, timeout and HTTP status errors;
        # ValueError covers a body that is not valid JSON.
        print(f"[Error] while generating title: {e}")
        return f"Error - {type(e).__name__}"

    raw_title = payload.get("response") if isinstance(payload, dict) else None
    title = raw_title.strip() if isinstance(raw_title, str) else ""
    return title or "Untitled (Empty response)"

# --- Load & Filter ---
df = pd.read_csv(CSV_PATH)
df = df[df["file"] == TARGET_FILE]
# Keep only rows whose transcript text is present and not just whitespace.
df = df[df["text"].notnull() & df["text"].str.strip().astype(bool)]
# Chunks must be visited in "start" order so each topic is a contiguous run.
df = df.sort_values("start").reset_index(drop=True)


def _flush_topic(results, start, chunks, keywords):
    """Generate a title for a finished topic and append its record to results."""
    title = generate_title_with_requests(" ".join(chunks), keywords)
    results.append({
        "file": TARGET_FILE,
        "start": start,
        "title": title
    })


# --- Track topics ---
results = []

current_topic_chunks = []
current_topic_keywords = []
current_topic_start = None  # "start" value of the first chunk in the open topic

for text, start in zip(df["text"], df["start"]):
    # Warm-up: while the open topic has no keywords yet (for at most 5 chunks)
    # there is nothing to compare against, so the chunk is absorbed as-is.
    if len(current_topic_chunks) < 5 and not current_topic_keywords:
        if not current_topic_chunks:
            current_topic_start = start
        current_topic_chunks.append(text)
        current_topic_keywords.extend(extract_keywords(text))
        continue

    current_keywords = extract_keywords(text)
    if current_topic_keywords and current_keywords:
        similarity = compute_similarity(current_topic_keywords, current_keywords)
    else:
        # With no keywords on one side there is no overlap to measure; skip
        # the vectorizer, which cannot build a vocabulary from empty input.
        similarity = 0.0

    if similarity >= SIMILARITY_THRESHOLD:
        # Same topic: grow it with this chunk and its keywords.
        current_topic_chunks.append(text)
        current_topic_keywords.extend(current_keywords)
    else:
        # Topic boundary: title the finished topic, then open a new one
        # seeded with the current chunk.
        _flush_topic(
            results,
            current_topic_start,
            current_topic_chunks,
            current_topic_keywords,
        )
        current_topic_chunks = [text]
        current_topic_keywords = current_keywords
        current_topic_start = start

# Handle final topic
if current_topic_chunks:
    _flush_topic(
        results,
        current_topic_start,
        current_topic_chunks,
        current_topic_keywords,
    )

# --- Save ---
# Explicit columns keep the CSV header stable even when no topics were found.
output_df = pd.DataFrame(results, columns=["file", "start", "title"])
output_df.to_csv(OUTPUT_CSV, index=False)
print(f"\nSaved to {OUTPUT_CSV}")
print(output_df.head())


  from .autonotebook import tqdm as notebook_tqdm



