In [1]:
# Install notebook dependencies into the *kernel's* environment.
# `%pip` targets the interpreter running this notebook, whereas `!pip` runs
# whichever `pip` is first on the shell PATH (possibly another environment).
%pip install ollama
%pip install tqdm ipywidgets




[notice] A new release of pip is available: 23.1.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.1.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import pandas as pd
import json
from tqdm.notebook import tqdm 
import ollama 
import re 


def convert_timestamp_to_seconds(timestamp_str):
    """Convert an ``MM:SS`` or ``HH:MM:SS`` timestamp into seconds.

    Args:
        timestamp_str: Timestamp value. It is passed through ``str()`` and
            stripped, so non-string cell values are accepted. The last
            (seconds) field may be fractional, e.g. ``"00:01:02.5"``;
            the hour/minute fields must be integers.

    Returns:
        Total seconds as a ``float``, or ``None`` when the value is missing
        (``pd.isna``) or is not a well-formed two- or three-field timestamp.
        Malformed values print a warning and yield ``None`` instead of
        raising, so one bad cell cannot abort a whole ``Series.apply``.
    """
    import math  # local import: only needed for the finiteness check below

    if pd.isna(timestamp_str):
        return None

    timestamp_str = str(timestamp_str).strip()
    fields = timestamp_str.split(':')

    try:
        # Hours/minutes must be whole numbers; only seconds may be fractional.
        leading = [int(field) for field in fields[:-1]]
        seconds = float(fields[-1])
    except ValueError:
        leading, seconds = None, None

    # float() also accepts "nan"/"inf"; treat those as malformed too.
    malformed = (
        leading is None
        or len(fields) not in (2, 3)
        or not math.isfinite(seconds)
    )
    if malformed:
        print(f"Warning: Unexpected timestamp format '{timestamp_str}'. Returning None.")
        return None

    if len(leading) == 1:
        (minutes,) = leading
        return float(minutes * 60 + seconds)

    hours, minutes = leading
    return float(hours * 3600 + minutes * 60 + seconds)

# Input CSV of clustered transcript rows, output JSON path, and Ollama model tag.
CSV_PATH = "../clustering/intermediate_data/clustered_embeddings.csv"
OUTPUT_JSON = "titles_tinyllama.json"
OLLAMA_MODEL_NAME = "tinyllama:latest"

print(f"Loading data from: {CSV_PATH}")
df = pd.read_csv(CSV_PATH)

# Replace the timestamp strings with seconds; rows for which the helper
# returns None are dropped by the null filter below.
df['start'] = df['start'].apply(convert_timestamp_to_seconds)
df['end'] = df['end'].apply(convert_timestamp_to_seconds)

# Keep only rows that have text, a cluster id and both converted timestamps.
df = df.dropna(subset=["text", "cluster", "start", "end"])
print(f"Loaded {len(df)} rows after cleaning and timestamp conversion.")

# One row per (file, cluster): earliest start, latest end, and all text joined
# with single spaces. Values are cast with str() so a non-string text cell
# cannot make str.join raise TypeError.
grouped = df.groupby(["file", "cluster"]).agg({
    "start": "min",
    "end": "max",
    "text": lambda texts: " ".join(map(str, texts))
}).reset_index()
print(f"Grouped into {len(grouped)} unique file-cluster segments.")


# Upper bound on generated tokens per request (sent to Ollama as num_predict).
max_length = 256

# Result mapping: {file: {"cluster<id>": {"title": ..., "start": ..., "end": ...}}}
output = {}

print(f"\nStarting title generation with Ollama model: {OLLAMA_MODEL_NAME}")
for _, row in tqdm(grouped.iterrows(), total=len(grouped), desc="Generating Titles"):
    file = row["file"]

    # If the cluster column was read as float, str() would give "3.0" and the
    # key would become "cluster3.0"; collapse integral floats to ints first.
    cluster_value = row["cluster"]
    if isinstance(cluster_value, float) and cluster_value.is_integer():
        cluster_value = int(cluster_value)
    cluster = str(cluster_value)

    start = row["start"]
    end = row["end"]
    # Only the first 5000 characters of the segment text go into the prompt.
    combined_text = row["text"][:5000]

    messages = [
        {"role": "system", "content": "You are an AI assistant. Please generate a short, human-friendly title for the following transcript segment of an educational lecture. Keep it within 10 words maximum."},
        {"role": "user", "content": f"Transcript:\n\"\"\"{combined_text}\"\"\"\n\nTitle:"}
    ]

    title = ""

    try:
        response = ollama.chat(
            model=OLLAMA_MODEL_NAME,
            messages=messages,
            options={
                "temperature": 0.7,
                "top_p": 0.9,
                "num_predict": max_length
            }
        )

        if response and 'message' in response and 'content' in response['message']:
            generated_content = response['message']['content']
            # If the model echoed the "Title:" cue, keep only what follows the
            # last occurrence; otherwise use the whole reply. Newlines become
            # spaces so the title is a single line.
            title_parts = generated_content.split("Title:")
            if len(title_parts) > 1:
                title = title_parts[-1].strip().replace("\n", " ")
            else:
                title = generated_content.strip().replace("\n", " ")

            # Remove one layer of matching wrapping quotes (double, then single).
            if title.startswith('"') and title.endswith('"'):
                title = title[1:-1]
            if title.startswith("'") and title.endswith("'"):
                title = title[1:-1]
        else:
            title = "Error: Could not parse Ollama response"
            print(f"\nWarning: Unexpected Ollama response structure for file {file}, cluster {cluster}. Response: {response}")

    except Exception as e:
        # Best effort: record the failure as the title and continue with the
        # next segment instead of aborting the whole run.
        title = f"Error: {type(e).__name__} - {e}"
        print(f"\nError generating title for file {file}, cluster {cluster}: {e}")

    if file not in output:
        output[file] = {}

    output[file][f"cluster{cluster}"] = {
        "title": title,
        "start": start,
        "end": end
    }


# Write the collected titles as pretty-printed UTF-8 JSON; non-ASCII
# characters are written as-is rather than \u-escaped.
print(f"\nSaving titles to {OUTPUT_JSON}")
with open(OUTPUT_JSON, mode="w", encoding="utf-8") as json_file:
    serialized_titles = json.dumps(output, indent=2, ensure_ascii=False)
    json_file.write(serialized_titles)

print(f"\nProcess completed! Titles saved to {OUTPUT_JSON}")

Loading data from: ../clustering/intermediate_data/clustered_embeddings.csv
Loaded 3474 rows after cleaning and timestamp conversion.
Grouped into 368 unique file-cluster segments.

Starting title generation with Ollama model: tinyllama:latest


Generating Titles:   0%|          | 0/368 [00:00<?, ?it/s]