In [6]:
!pip install ollama
!pip install tqdm ipywidgets 




[notice] A new release of pip is available: 23.1.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.1.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
import pandas as pd
import json
from tqdm import tqdm
import ollama


# --- Configuration -----------------------------------------------------------
CSV_PATH = "../topic_identification/BERTopic_clustered_topics.csv"
OUTPUT_JSON = "titles_qwen(BERTopic).json"
OLLAMA_MODEL_NAME = "qwen2.5:3b"
# NOTE: despite the name, this limit is applied below as a *character* slice
# on the representative text, not as a token count.
MAX_INPUT_TOKENS = 5000
MAX_OUTPUT_TOKENS = 256
# Fixed sampling seed: temperature is > 0, so without a seed every run of this
# cell would sample different titles.
SEED = 42

SYSTEM_PROMPT = "You are an AI assistant. Generate a short, clear, and human-friendly title for a lecture topic based on representative text and topic keywords."


def build_messages(top_words, representative_text):
    """Build the chat messages (system + user) for a single topic.

    Args:
        top_words: Keyword text for the topic, interpolated into the prompt.
        representative_text: Already-truncated representative text.

    Returns:
        A list of two ``{"role", "content"}`` dicts to pass to ``ollama.chat``.
    """
    return [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Topic keywords: {top_words}\n\nTranscript:\n\"\"\"{representative_text}\"\"\"\n\nTitle:"},
    ]


def extract_title(generated):
    """Clean the raw model text into a bare title.

    Keeps whatever follows the last ``"Title:"`` marker (the model may echo
    the prompt suffix), then strips surrounding whitespace and quotes.
    """
    return generated.split("Title:")[-1].strip().strip('"').strip("'")


def generate_title(topic_id, messages):
    """Ask the Ollama model for a title and return it as a string.

    This is deliberately best-effort: any exception or malformed response is
    reported on stdout and turned into an ``"Error: ..."`` string so that one
    failing topic does not abort the whole (long) run.

    Args:
        topic_id: Topic identifier, used only in warning/error messages.
        messages: Chat messages as produced by ``build_messages``.

    Returns:
        The cleaned title, or a string starting with ``"Error:"`` on failure.
    """
    try:
        response = ollama.chat(
            model=OLLAMA_MODEL_NAME,
            messages=messages,
            options={
                "temperature": 0.7,
                "top_p": 0.9,
                "num_predict": MAX_OUTPUT_TOKENS,
                "seed": SEED,
            },
        )

        if response and 'message' in response and 'content' in response['message']:
            return extract_title(response['message']['content'])

        print(f"\nWarning: No valid title generated for topic {topic_id}. Response: {response}")
        return "Error: Unexpected Ollama response"

    except Exception as e:
        print(f"\nError generating title for topic {topic_id}: {e}")
        return f"Error: {type(e).__name__} - {e}"


# --- Load and validate input -------------------------------------------------
print(f"Loading data from: {CSV_PATH}")
df = pd.read_csv(CSV_PATH)

# Check columns exist
expected_cols = {'topic_num', 'top_words', 'representative_docs'}
if not expected_cols.issubset(df.columns):
    raise ValueError(f"CSV is missing required columns. Expected at least: {expected_cols}")

print(f"Loaded {len(df)} topics.")

# --- Generate one title per topic --------------------------------------------
output = {}
skipped_topics = []  # topics without usable representative text

print(f"\nStarting title generation with Ollama model: {OLLAMA_MODEL_NAME}")
for i, row in tqdm(df.iterrows(), total=len(df), desc="Generating Titles"):
    topic_id = str(row["topic_num"])
    top_words = row["top_words"]
    representative_text = row["representative_docs"]

    # An empty CSV cell is read as NaN; json.dump would write it as a bare
    # `NaN` token (not valid JSON) and the prompt would contain "nan".
    if not isinstance(top_words, str) and pd.isna(top_words):
        top_words = ""

    if not isinstance(representative_text, str) or representative_text.strip() == "":
        # Nothing to summarise: leave the topic out, but remember it so the
        # omission is reported instead of passing silently.
        skipped_topics.append(topic_id)
        continue

    # Truncate (by characters) to avoid overloading input
    representative_text = representative_text[:MAX_INPUT_TOKENS]

    title = generate_title(topic_id, build_messages(top_words, representative_text))

    output[f"topic_{topic_id}"] = {
        "title": title,
        "top_words": top_words
    }

if skipped_topics:
    print(f"\nSkipped {len(skipped_topics)} topic(s) with no representative text: {', '.join(skipped_topics)}")

# --- Save to JSON ------------------------------------------------------------
print(f"\nSaving titles to {OUTPUT_JSON}")
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
    json.dump(output, f, indent=2, ensure_ascii=False)

print(f"\n✅ Process completed! Titles saved to {OUTPUT_JSON}")


Loading data from: ../topic_identification/BERTopic_clustered_topics.csv
Loaded 181 topics.

Starting title generation with Ollama model: qwen2.5:3b


Generating Titles: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 181/181 [23:57<00:00,  7.94s/it]


Saving titles to titles_qwen(BERTopic).json

✅ Process completed! Titles saved to titles_qwen(BERTopic).json



