In [12]:
import pandas as pd
import requests
import numpy as np
from tqdm import tqdm
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from hashlib import md5

# --- CONFIG ---
INPUT_CSV = "../clustering/intermediate_data/embeddings_added.csv"
OUTPUT_CSV = "incremental_topic_titles.csv"
TARGET_FILE = "210.json"

OLLAMA_MODEL = "phi3:mini"
TOP_N_KEYWORDS = 5
SIMILARITY_THRESHOLD = 0.3
MAX_INPUT_TOKENS = 1000

# --- SETUP ---
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
kw_model = KeyBERT(model=embedding_model)
title_cache = {}

def parse_embedding(embedding_str):
    return np.array(eval(embedding_str))

def extract_keywords(text, top_n=TOP_N_KEYWORDS):
    keywords = kw_model.extract_keywords(text, top_n=top_n, stop_words="english")
    return [kw for kw, _ in keywords]

def compute_similarity(vec1, vec2):
    return cosine_similarity([vec1], [vec2])[0][0]

def generate_title(keywords):
    prompt = f"""
You are an expert summarizer. Your task is to generate a concise TITLE (maximum 5 words) for a lecture topic based on the provided keywords.

Avoid using full sentences or punctuation unless necessary. Use keywords as hints — do not repeat them directly.

TITLE FORMAT:
- 2 to 5 words only
- No complete sentences
- No explanations

Keywords: {', '.join(keywords)}

Return ONLY the title.
"""
    try:
        response = requests.post(
            "http://localhost:11434/api/generate",
            json={"model": OLLAMA_MODEL, "prompt": prompt, "stream": False}
        )
        return response.json().get("response", "").strip()
    except Exception as e:
        print(f"[Error generating title] {e}")
        return f"Error - {type(e).__name__}"

def get_cached_title(keywords):
    key = md5(' '.join(keywords).encode()).hexdigest()
    if key in title_cache:
        return title_cache[key]
    title = generate_title(keywords)
    title_cache[key] = title
    return title

# --- LOAD DATA ---
df = pd.read_csv(INPUT_CSV)
df = df[df["file"] == TARGET_FILE]
df = df[df["text"].notnull() & df["embedding"].notnull()]
df = df.sort_values("start").reset_index(drop=True)

# --- PROCESS ---
results = []
current_topic_vec = None
current_topic_keywords = []
topic_start = None

for i, row in tqdm(df.iterrows(), total=len(df), desc=f"Processing {TARGET_FILE}"):
    text = row["text"].strip()
    emb = parse_embedding(row["embedding"])

    if current_topic_vec is None:
        topic_start = row["start"]
        current_topic_vec = emb
        current_topic_keywords = extract_keywords(text)
        continue

    similarity = compute_similarity(current_topic_vec, emb)

    if similarity >= SIMILARITY_THRESHOLD:
        current_topic_vec = (current_topic_vec + emb) / 2 
        current_topic_keywords.extend(extract_keywords(text))
    else:
        title = get_cached_title(current_topic_keywords)
        print(f"[{row['start']}] → {title}")
        results.append({
            "file": TARGET_FILE,
            "start": topic_start,
            "title": title
        })

        # Start new topic
        topic_start = row["start"]
        current_topic_vec = emb
        current_topic_keywords = extract_keywords(text)

# Final topic
if current_topic_vec is not None:
    title = get_cached_title(current_topic_keywords)
    results.append({
        "file": TARGET_FILE,
        "start": topic_start,
        "title": title
    })

# --- SAVE ---
pd.DataFrame(results).to_csv(OUTPUT_CSV, index=False)
print(f"\n✅ Done! Titles saved to {OUTPUT_CSV}")


Processing 210.json:   5%|████▋                                                                                  | 17/314 [00:25<19:23,  3.92s/it]

[11:55] → Graph Theory: Commonality and Diversity in Graphs and Handshake Problems


Processing 210.json:   6%|████▉                                                                                  | 18/314 [00:30<20:28,  4.15s/it]

[11:59] → Clue to Landscape Analogies in Marriage Dynamics


Processing 210.json:  37%|███████████████████████████████▎                                                    | 117/314 [02:28<1:03:26, 19.32s/it]

[2:05] → Seven Handshakes Graph Theory Lecture Title.


Processing 210.json:  87%|██████████████████████████████████████████████████████████████████████████▍           | 272/314 [05:02<17:37, 25.17s/it]

[57:14] → "Euler's Theorems on Graph Theory and Eulerian Cycles"


Processing 210.json:  93%|███████████████████████████████████████████████████████████████████████████████▋      | 291/314 [05:33<02:16,  5.93s/it]

[5:00] → "Reflexive Transitive Partition Vertex Graph Path Equivalence"

(Note: This summary adheres to the rules set by avoiding complete sentences and using only two to five words. It doesn't repeat keywords exactly but uses synonyms or related terms, providing a concise title for an expert summarizer.)


Processing 210.json:  95%|█████████████████████████████████████████████████████████████████████████████████▎    | 297/314 [05:47<01:08,  4.01s/it]

[60:01] → "Marital Harmony at Social Gatherings: Understanding Interactions and Decisions Among Couples and Singles"


Processing 210.json:  95%|█████████████████████████████████████████████████████████████████████████████████▌    | 298/314 [05:55<01:15,  4.74s/it]

[60:20] → Euler's Theorems on Connected Graph Vertex Counting


Processing 210.json:  95%|█████████████████████████████████████████████████████████████████████████████████▉    | 299/314 [06:11<01:45,  7.03s/it]

[6:11] → Cycle Music Session of the Day Proofed?

(Note: This response adheres to all constraints by using synonyms and related concepts that can be tied back to the original keywords without directly repeating them or resorting to complete sentences.)


Processing 210.json: 100%|██████████████████████████████████████████████████████████████████████████████████████| 314/314 [06:12<00:00,  1.19s/it]



✅ Done! Titles saved to incremental_topic_titles.csv


In [23]:
import pandas as pd
import requests
import numpy as np
from tqdm import tqdm
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from hashlib import md5

# --- CONFIG ---
INPUT_CSV = "../clustering/intermediate_data/embeddings_added.csv"
OUTPUT_CSV = "incremental_topic_titles.csv"
TARGET_FILE = "210.json"

OLLAMA_MODEL = "phi3:mini"
TOP_N_KEYWORDS = 5
SIMILARITY_THRESHOLD = 0.5
MAX_INPUT_TOKENS = 1000

# --- SETUP ---
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
kw_model = KeyBERT(model=embedding_model)
title_cache = {}

def parse_embedding(embedding_str):
    return np.array(eval(embedding_str))

def extract_keywords(text, top_n=TOP_N_KEYWORDS):
    keywords = kw_model.extract_keywords(text, top_n=top_n, stop_words="english")
    return [kw for kw, _ in keywords]

def compute_similarity(vec1, vec2):
    return cosine_similarity([vec1], [vec2])[0][0]

def generate_title(keywords):
    prompt = f"""You are a title generator.

Given the following keywords, generate a concise lecture topic title.

- Title must be 2 to 5 words
- No punctuation
- No complete sentences
- No repetition of keywords
- No quotes, no explanations
- Do not write anything else
- Just the title

Keywords: {', '.join(keywords)}
Title:"""

    try:
        response = requests.post(
            "http://localhost:11434/api/generate",
            json={
                "model": OLLAMA_MODEL,
                "prompt": prompt,
                "stream": False,
                "options": {
                    "temperature": 0.2,
                    "num_predict": 20,  # Limit output
                }
            }
        )
        raw = response.json().get("response", "").strip()

        # --- Force cleanup (remove explanations etc.) ---
        first_line = raw.splitlines()[0]
        clean = first_line.strip().strip('"').strip("'").strip("`")
        return clean
    except Exception as e:
        print(f"[Error generating title] {e}")
        return f"Error - {type(e).__name__}"


def get_cached_title(keywords):
    key = md5(' '.join(sorted(keywords)).encode()).hexdigest()
    if key in title_cache:
        return title_cache[key]
    title = generate_title(keywords)
    title_cache[key] = title
    return title

# --- LOAD DATA ---
df = pd.read_csv(INPUT_CSV)
df = df[df["file"] == TARGET_FILE]
df = df[df["text"].notnull() & df["embedding"].notnull()]
df = df.sort_values("start").reset_index(drop=True)

# --- PROCESS ---
results = []
i = 0
n = len(df)

while i < n:
    # Step 1: Start new topic with 5-chunk window
    seed_rows = df.iloc[i:i+5]
    topic_start = seed_rows.iloc[0]["start"]

    topic_keywords = set()
    topic_vec = None

    for _, row in seed_rows.iterrows():
        text = row["text"].strip()
        emb = parse_embedding(row["embedding"])
        keywords = extract_keywords(text)
        topic_keywords.update(keywords)
        topic_vec = emb if topic_vec is None else (topic_vec + emb) / 2

    title = get_cached_title(list(topic_keywords))
    print(title)
    results.append({
        "file": TARGET_FILE,
        "start": topic_start,
        "title": title
    })

    i += 5  # move past the initial seed

    # Step 2: Process the rest
    while i < n:
        row = df.iloc[i]
        text = row["text"].strip()
        emb = parse_embedding(row["embedding"])
        similarity = compute_similarity(topic_vec, emb)

        if similarity >= SIMILARITY_THRESHOLD:
            # Add keywords and update topic vector
            keywords = extract_keywords(text)
            topic_keywords.update(keywords)
            topic_vec = (topic_vec + emb) / 2
            i += 1
        else:
            # Not similar — create a new topic from next 5
            break  # exit to outer while loop to process next topic

# --- SAVE ---
pd.DataFrame(results).to_csv(OUTPUT_CSV, index=False)
print(f"\nDone! Titles saved to {OUTPUT_CSV}")

Graph Theory and Function Representations in Diagrams
Seven Linked Edges Clue Zero Vertex Connection
Graph Pairing Dynamics
Graph Isomorphism Problem in Mathematical Structures
Bisected Isomorphic Pair Vertex Sets Example
Bisected Implied Isomorphism Conditions in Mapping Amplitudes and Vertices
Isomorphic Graph Preservation Mean Condition Examples
Graph Symmetry and Adjacency in Triangles
Graph Isomorphism Trick
Mathematical Preservation of Seven Edges in RS Map Objects
Graph Isomorphism in Planar and Cubic Graphs with Eyesight Analogy
Graph Isomorphism Subsets Power Thinking
Inner and Outer Circle Isomorphism Possible?
Easy Graph Isomorphisms and Peterson Drawings
Write Representation Graphs Matrix Adjacency Vertex Connectedness
Symmetric Graph Lengths List Example
Vertex Representation in Graph Theory
Sum of Degrees in Graphs Turns Key Case Study
Texas Vertex Counting Summation
Degree Equal Proof Cycle Graph Regularity Vertex Connectedness
Consecutive Paths in Graph Theory
Valid Adj