In [None]:
import sys
import os

# add project root to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from src.db.snowflake_client import SnowflakeORM
from src.db.tables import DiscoveredTopics

client_snowflake = SnowflakeORM()

# Fetch every discovered topic as a (topic_name, description) row.
# `all_topics` is the input consumed by the later analysis cells.
with client_snowflake.session_scope() as session:
    query = session.query(DiscoveredTopics.topic_name, DiscoveredTopics.description)
    all_topics = query.all()

In [None]:
import sys
import os

# add project root to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from src.db.snowflake_client import SnowflakeORM
from src.db.tables import ChromeHistory

client_snowflake = SnowflakeORM()

with client_snowflake.session_scope() as session:
        query = session.query(
            ChromeHistory.title,
            ChromeHistory.url,
        )
        all_topics = query.all()

In [None]:
import pandas as pd

df = pd.DataFrame(all_topics).groupby('url').count()
df.sort_values('title', ascending=False).head(20)

In [None]:
from src.topic_modeling.gemini import call_llm 
from src.topic_modeling.prompts import TOPIC_REFINMENT_PROMPT
from src.topic_modeling.topic_discovery import extract_json
import json

def format_topics(topics):
    """Render (title, description) pairs as one numbered text block.

    Each pair becomes a "Topic N:" entry (N counts from 1) listing its name
    and description; entries are joined with a newline, which leaves a blank
    line between consecutive topics.
    """
    return "\n".join(
        f"Topic {position}:\n- Name: {name}\n- Description: {details}\n"
        for position, (name, details) in enumerate(topics, start=1)
    )

# Fill the refinement prompt with the formatted topic list and query the LLM.
prompt = TOPIC_REFINMENT_PROMPT.format(all_topics=format_topics(all_topics))
response = call_llm(prompt)

# Print whatever extract_json returns; if it raises JSONDecodeError,
# fall back to printing the raw response so it can be inspected.
try:
    topics_dict = extract_json(response)
except json.JSONDecodeError:
    print(response)
else:
    print(topics_dict)


In [None]:
import re
import string
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK resources required by the stopword list and the lemmatizer
for _nltk_resource in ("stopwords", "wordnet", "omw-1.4"):
    nltk.download(_nltk_resource)

# Shared by preprocess_text: lemmatizer instance and English stopword set
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))


def preprocess_text(text: str) -> str:
    """Normalize text for vectorizing.

    Lowercases the input, replaces every character that is neither a-z nor
    whitespace with a space, drops tokens found in the module-level
    `stop_words` set and lemmatizes the rest with the module-level
    `lemmatizer`. Returns the surviving tokens joined by single spaces.
    """
    lowered = text.lower()
    letters_only = re.sub(r"[^a-z\s]", " ", lowered)  # digits/punctuation -> space

    kept = []
    for word in letters_only.split():
        if word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return " ".join(kept)


def split_topics(data):
    """Expand compound topic names into one row per '&'-separated part.

    Takes an iterable of (topic, description) pairs and returns a list of
    (part, description) tuples, where each part is a whitespace-stripped
    piece of the topic name and keeps the description of its source row.
    """
    return [
        (piece.strip(), desc)
        for topic, desc in data
        for piece in topic.split("&")
    ]


def analyze_ngrams(all_topics, ngram_range=(1, 3), top_k=20):
    """
    Rank the n-grams found in topic names by their mean TF-IDF weight.

    Args:
        all_topics: iterable of (topic_name, description) pairs; only the
            names are vectorized (after split_topics and preprocess_text).
        ngram_range: (min_n, max_n) passed to TfidfVectorizer.
        top_k: number of highest-scoring terms to return.

    Returns:
        (top_terms, tfidf_scores): `top_terms` is a list of (term, score)
        tuples sorted by descending score and cut to `top_k`;
        `tfidf_scores` maps every extracted term to its mean score.
    """
    # One document per '&'-separated topic name, cleaned for vectorizing
    documents = [preprocess_text(name) for name, _ in split_topics(all_topics)]

    tfidf = TfidfVectorizer(
        ngram_range=ngram_range,
        stop_words="english",
        min_df=1,
        max_df=0.9,
    )
    weights = tfidf.fit_transform(documents)

    # Column-wise mean = average weight of each term over all documents;
    # .A1 flattens the (1, n_terms) matrix result into a 1-D array
    mean_weights = weights.mean(axis=0).A1
    tfidf_scores = dict(zip(tfidf.get_feature_names_out(), mean_weights))

    ranked = sorted(tfidf_scores.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_k], tfidf_scores


def plot_wordcloud(tfidf_scores):
    """Draw a word cloud built from a term -> score mapping and show it."""
    cloud_builder = WordCloud(width=900, height=450, background_color="white")
    rendered = cloud_builder.generate_from_frequencies(tfidf_scores)

    plt.figure(figsize=(12, 6))
    plt.imshow(rendered, interpolation="bilinear")
    plt.axis("off")  # the axes carry no meaning for a word cloud
    plt.show()


# Score uni-, bi- and trigrams of the topic names; keep the 15 best for printing
top_terms, tfidf_scores = analyze_ngrams(
    all_topics,
    ngram_range=(1, 3),
    top_k=15,
)

print("🔝 Top n-grams ranked by TF-IDF:")
for term, score in top_terms:
    print(f"{term}: {score:.4f}")

# The word cloud uses the full score mapping, not only the printed top terms
plot_wordcloud(tfidf_scores)


In [None]:
pip install umap-learn

In [None]:
from sentence_transformers import SentenceTransformer
import umap
import random
import matplotlib.pyplot as plt

# One seed for both UMAP and the label sampling, so the figure is identical
# on every run.
RANDOM_SEED = 42

# Split the (title, description) rows of `all_topics` into parallel lists
titles = [title for title, description in all_topics]
descriptions = [description for title, description in all_topics]

# 1) Embed the descriptions with a lightweight sentence-transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(descriptions, show_progress_bar=True)

# 2) Reduce the embeddings to 2D with UMAP for visualization
umap_reducer = umap.UMAP(n_neighbors=5, min_dist=0.3, random_state=RANDOM_SEED)
embeddings_2d = umap_reducer.fit_transform(embeddings)

# 3) Scatter plot of the projected points
fig, ax = plt.subplots(figsize=(14, 10), dpi=120)
ax.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], s=50, c='skyblue', alpha=0.7)

# Label a random 50% of the points to limit overlap. A dedicated seeded
# generator is used (instead of the unseeded global `random`) so the same
# points are labelled on every run.
label_rng = random.Random(RANDOM_SEED)
indices_to_label = label_rng.sample(range(len(titles)), k=len(titles) // 2)

for i in indices_to_label:
    ax.text(
        embeddings_2d[i, 0] + 0.01,  # small offset so text does not cover the marker
        embeddings_2d[i, 1] + 0.01,
        titles[i][:25],  # first 25 chars of title
        fontsize=8,
    )

ax.set_title("UMAP projection of generated topic descriptions (titles shown)", fontsize=16)
ax.set_xlabel("UMAP 1")
ax.set_ylabel("UMAP 2")
ax.grid(True, alpha=0.3)
plt.show()