In [1]:
import pandas as pd
import re
import nltk
import os

# Set up the NLTK data directory.
# The location is resolved from the current user's home directory instead of a
# hard-coded absolute path, so the notebook also runs on other machines.
custom_nltk_path = os.path.join(os.path.expanduser("~"), "nltk_data")
os.makedirs(custom_nltk_path, exist_ok=True)
nltk.data.path.clear()
nltk.data.path.append(custom_nltk_path)

# Download the tokenizer data: punkt_tab for newer NLTK versions, punkt as the
# fallback. nltk.download() signals failure through its boolean return value,
# so the fallback is driven by that value; exceptions are still caught (but no
# longer with a bare `except:`, which would also swallow KeyboardInterrupt).
for tokenizer_package in ('punkt_tab', 'punkt'):
    try:
        if nltk.download(tokenizer_package, download_dir=custom_nltk_path, force=True):
            break
    except Exception as download_error:
        print(f"Could not download {tokenizer_package}: {download_error}")

# Load the raw dataset
df = pd.read_csv("../data/raw/Speech_Completion_Prediction_Dataset.csv")

# Keep only the needed columns and drop rows with missing values.
# Chaining returns a fresh frame rather than mutating a column-subset slice
# in place.
df = df[["VideoTitle", "Transcript"]].dropna()

# Text cleaning function
def clean_transcript(text):
    """Normalise a raw transcript into plain, single-spaced text.

    Processing order:
      1. collapse every run of whitespace (including newlines) to one space,
      2. remove bracketed annotations such as ``[music]``,
      3. remove every character that is not a word character, whitespace or
         one of ``. , ! ?``,
      4. remove tokens that start with ``http``,
      5. collapse whitespace again, because steps 2-4 can leave two adjacent
         spaces where something was removed mid-sentence.

    Args:
        text: Raw transcript. Any value is accepted and coerced with ``str()``.

    Returns:
        The cleaned text with no leading/trailing whitespace and no runs of
        consecutive spaces.
    """
    text = str(text)
    # Collapse first so a bracketed annotation spanning a line break is still
    # matched by the '.'-based pattern below.
    text = re.sub(r'\s+', ' ', text)                  # normalise whitespace
    text = re.sub(r'\[.*?\]', '', text)               # remove content in brackets
    text = re.sub(r'[^\w\s.,!?]', '', text)           # remove special characters
    text = re.sub(r'http\S+', '', text)               # remove URLs
    # The removals above leave gaps ("a [x] b" -> "a  b"); close them.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Run every transcript through clean_transcript and store the result back in
# the same column.
cleaned_transcripts = df["Transcript"].apply(clean_transcript)
df["Transcript"] = cleaned_transcripts

# Show the first rows as the cell output
df.head()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/ananyathakur/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Unnamed: 0,VideoTitle,Transcript
0,Muniba_Mazari,"Whoa. Im running short of words right now, but..."
1,RIHANNA,So I made it to Harvard. Never thought Id be a...
2,Mr_Bean,My starting point when it comes to the conside...
3,Nita_Ambani,"In India, women are discriminated against even..."
4,Joe_Biden,"Hello, my fellow Americans and the people who ..."


In [2]:
import nltk
from nltk.tokenize import sent_tokenize
import os

# Resolve the NLTK data directory from the current user's home directory
# instead of a hard-coded absolute path, so the cell works on other machines.
custom_nltk_path = os.path.join(os.path.expanduser("~"), "nltk_data")
os.makedirs(custom_nltk_path, exist_ok=True)

# Download the tokenizer data, preferring punkt_tab and falling back to punkt.
# nltk.download() reports failure through its boolean return value, so success
# is only announced when it returns True. Exceptions are still caught so the
# fallback package gets its turn either way.
for tokenizer_package in ('punkt_tab', 'punkt'):
    try:
        downloaded = nltk.download(tokenizer_package, download_dir=custom_nltk_path, force=True)
    except Exception as e:
        print(f"Error downloading tokenizers: {e}")
        downloaded = False
    if downloaded:
        print(f"Downloaded {tokenizer_package} successfully")
        break
else:
    # Neither package could be fetched; chunk_text's fallback splitter will
    # be used if the tokenizer data is missing.
    print("Error downloading tokenizers: neither punkt_tab nor punkt could be downloaded")

nltk.data.path.clear()  # Clear default paths to avoid conflict
nltk.data.path.append(custom_nltk_path)  # Add your user nltk_data path

# Define chunking function with error handling
def chunk_text(text, chunk_size=3):
    """Split ``text`` into chunks of ``chunk_size`` consecutive sentences.

    Sentences come from NLTK's ``sent_tokenize``. If tokenization fails for
    any reason (for example the tokenizer data is not installed), the error
    is printed and a regex splitter is used instead. The fallback splits
    after ``.``, ``!`` or ``?`` followed by whitespace and keeps the
    terminating punctuation.

    Args:
        text: Transcript text to split.
        chunk_size: Number of sentences per chunk (the last chunk may be
            shorter).

    Returns:
        A list of strings, each the space-joined sentences of one chunk.
        Empty text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is 0 (raised by ``range``).
    """
    try:
        # Only the tokenizer call is guarded, so chunking errors are not
        # misreported as tokenization failures.
        sentences = sent_tokenize(text)
    except Exception as e:
        print(f"Error tokenizing text: {e}")
        # Local import: this cell does not import `re` itself.
        import re
        # Fallback: split after sentence-ending punctuation, keeping it.
        sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
    return [" ".join(sentences[i:i + chunk_size]) for i in range(0, len(sentences), chunk_size)]

# Build one record per chunk: the owning video, the chunk's position within
# that video's transcript (starting at 0), and the chunk text.
chunks = [
    {
        "VideoTitle": video_title,
        "Chunk_ID": chunk_position,
        "Chunk_Text": chunk_body
    }
    for video_title, transcript in zip(df["VideoTitle"], df["Transcript"])
    for chunk_position, chunk_body in enumerate(chunk_text(transcript))
]

# Persist the chunk table, report its size, and preview it
df_chunks = pd.DataFrame(chunks)
df_chunks.to_csv("../data/processed/chunked_transcript.csv", index=False)
print(f"Created {len(df_chunks)} chunks from {len(df)} transcripts")
df_chunks.head()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/ananyathakur/nltk_data...


Downloaded punkt_tab successfully
Created 3969 chunks from 103 transcripts


[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Unnamed: 0,VideoTitle,Chunk_ID,Chunk_Text
0,Muniba_Mazari,0,"Whoa. Im running short of words right now, but..."
1,Muniba_Mazari,1,Thank you for accepting me. Thank you very muc...
2,Muniba_Mazari,2,And that disclaimer is that Ive never claimed ...
3,Muniba_Mazari,3,"Its the story of a woman who, in pursuit of he..."
4,Muniba_Mazari,4,Many people speak before they think that I kno...


In [3]:
# Install packages directly in the notebook
import sys
import subprocess

# Ensures we install to the same Python environment as the notebook
def install_package(package):
    """Install ``package`` with pip, using the interpreter running this notebook.

    Runs ``<sys.executable> -m pip install <package>`` and raises
    ``subprocess.CalledProcessError`` if pip exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

# Packages needed by the topic-modelling cells
packages = ['bertopic', 'umap-learn', 'hdbscan', 'scikit-learn']

# Install them one at a time; a failure is reported and the loop moves on to
# the next package.
for package in packages:
    try:
        print(f"Installing {package}...")
        install_package(package)
    except Exception as install_error:
        print(f"Error installing {package}: {install_error}")
    else:
        print(f"{package} installed successfully")

print("\nInstallation complete. Please restart your kernel now")
print("Go to: Kernel → Restart Kernel")

Installing bertopic...
bertopic installed successfully
Installing umap-learn...
umap-learn installed successfully
Installing hdbscan...
hdbscan installed successfully
Installing scikit-learn...
scikit-learn installed successfully

Installation complete. Please restart your kernel now
Go to: Kernel → Restart Kernel


In [4]:
from bertopic import BERTopic
from umap import UMAP
import hdbscan
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Load the chunk table written by the chunking cell and collect the chunk
# texts as the documents to model.
# Note: this rebinds `df`, replacing the transcript-level frame from earlier cells.
df = pd.read_csv("../data/processed/chunked_transcript.csv")  # Update filename if needed
docs = df["Chunk_Text"].tolist()

# UMAP for dimensionality reduction
# random_state is fixed so repeated runs use the same seed.
umap_model = UMAP(
    n_neighbors=10,      # Controls local vs global structure
    n_components=5,      # Dimensionality (lower = more general clusters)
    min_dist=0.1,        # Lower means tighter clusters
    metric='cosine',
    random_state=42
)

# HDBSCAN for clustering
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=5,            # Minimum size for a topic
    metric='euclidean',
    cluster_selection_method='eom', # Better separation
    prediction_data=True
)

# CountVectorizer for topic representations:
# unigrams and bigrams, with English stop words removed.
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    stop_words="english"
)

# BERTopic Initialization, wiring in the three components configured above.
# NOTE(review): min_topic_size is set here although a custom hdbscan_model
# (with its own min_cluster_size) is also passed — confirm against the
# BERTopic docs which of the two actually takes effect.
topic_model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,
    verbose=True,
    top_n_words=10,
    min_topic_size=5,
    low_memory=True
)

# Fit the model on the chunk texts.
# `topics` holds one label per document (used below); `probs` is not used
# again in this cell.
topics, probs = topic_model.fit_transform(docs)

# Assign topics back, one label per chunk row
df["Topic_Label"] = topics

# Save the labelled chunks
df.to_csv("../data/processed/chunked_with_topics_refined.csv", index=False)

# Explore topics: print the per-topic summary table
topic_info = topic_model.get_topic_info()
print(topic_info)

# Optional visualization (last expression, so it renders as the cell output)
topic_model.visualize_topics()


  from .autonotebook import tqdm as notebook_tqdm
2025-07-13 16:16:50,282 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|████████████████████████████████| 125/125 [00:24<00:00,  5.01it/s]
2025-07-13 16:17:22,009 - BERTopic - Embedding - Completed ✓
2025-07-13 16:17:22,010 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-13 16:17:36,510 - BERTopic - Dimensionality - Completed ✓
2025-07-13 16:17:36,511 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-13 16:17:38,471 - BERTopic - Cluster - Completed ✓
2025-07-13 16:17:38,476 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-13 16:17:38,711 - BERTopic - Representation - Completed ✓


     Topic  Count                                               Name  \
0       -1   1555                          -1_people_like_know_world   
1        0    147                           0_movie_film_movies_star   
2        1     76                       1_fear_scared_airplane_fears   
3        2     60              2_road_road safety_safety_enforcement   
4        3     52             3_thank_grateful_thank thank_gratitude   
..     ...    ...                                                ...   
137    136      5                       136_school_cool_want love_50   
138    137      5  137_imagination_acts_thinks terms_acts imagina...   
139    138      5        138_color_yeah gotta_morning bus_bus school   
140    139      5                    139_sana_mountain_syria_stories   
141    140      5           140_vajaben_village_netrang_malnourished   

                                        Representation  \
0    [people, like, know, world, just, im, want, do...   
1    [movie, film, 

In [5]:
import pandas as pd

# Read the topic-labelled chunk table (adjust the path as needed)
topics_csv_path = "../data/processed/chunked_with_topics_refined.csv"
df = pd.read_csv(topics_csv_path)
print(df.columns)

# Count the chunks labelled -1 (unassigned)
outlier_count = df["Topic_Label"].eq(-1).sum()
print("Number of unassigned (-1) topics:", outlier_count)

# Make sure the topic labels are integers
df = df.assign(Topic_Label=df["Topic_Label"].astype(int))

# Reassign -1s by majority topic within the same video
def replace_outliers_with_majority(group):
    """Return a copy of ``group`` with a ``Reassigned_Topic`` column added.

    ``Reassigned_Topic`` equals ``Topic_Label`` except that every ``-1`` is
    replaced by the most frequent non-``-1`` label in the group. When several
    labels tie, the smallest is used (``Series.mode`` returns sorted values).
    When the group has no label other than ``-1``, topic ``0`` is used.

    The input frame is not modified; the original implementation added the
    column to ``group`` in place.

    Args:
        group: DataFrame with a ``Topic_Label`` column (the rows of one video).

    Returns:
        A new DataFrame with the same rows and index plus ``Reassigned_Topic``.
    """
    assigned_labels = group.loc[group["Topic_Label"] != -1, "Topic_Label"]
    most_common = assigned_labels.mode()
    replacement = most_common.iloc[0] if not most_common.empty else 0
    # assign() builds a new frame, leaving the caller's frame untouched.
    return group.assign(Reassigned_Topic=group["Topic_Label"].replace(-1, replacement))

# Reassign the -1 labels video by video
chunks_by_video = df.groupby("VideoTitle", group_keys=False)
df = chunks_by_video.apply(replace_outliers_with_majority)

# Check result
reassigned_topics = df["Reassigned_Topic"].unique()
print("Unique topics after reassignment:", reassigned_topics)
preview_columns = ["VideoTitle", "Chunk_ID", "Reassigned_Topic"]
print(df[preview_columns].head())

# Save for next steps (overwrites the file this cell loaded)
df.to_csv("../data/processed/chunked_with_topics_refined.csv", index=False)

Index(['VideoTitle', 'Chunk_ID', 'Chunk_Text', 'Topic_Label'], dtype='object')
Number of unassigned (-1) topics: 1555
Unique topics after reassignment: [  7   3  10   1  63  36  73 112  45   9  41  76   8  30  62  75  17  34
  68   6 119  24 140  12  82  15  91  22 133  65  21   0  52 117  64 136
  67   5  39  72  96  47  14  87  69  20   4  32  88  29  81 102  86  23
 100  42  74  85  35  95  60  79  48  19  31  26  80  59  56  46  66  18
  54 131  83  70 129  55 111 114  25 132  84  97  43 134  78 126  50  98
 118 125  33 123  13 110 127  11  38  37  57  16  90 120 121 107 130 137
  94 122  58  53 138  89 124  40  51 105 116  99 101 115 139  71  28 113
  49  27 103 104 106 128  77  93 108 109  44   2  92 135  61]
      VideoTitle  Chunk_ID  Reassigned_Topic
0  Muniba_Mazari         0                 7
1  Muniba_Mazari         1                 3
2  Muniba_Mazari         2                 7
3  Muniba_Mazari         3                 7
4  Muniba_Mazari         4                 7


In [6]:

import pandas as pd
import numpy as np

# Reload the chunk table that carries the Reassigned_Topic column
df = pd.read_csv("../data/processed/chunked_with_topics_refined.csv")  # or the latest file

# Check Topic Labels
print("Unique Topic Labels:", df["Reassigned_Topic"].unique())


def _distinct_count(values_so_far):
    """Number of distinct values in the expanding window."""
    return len(set(values_so_far))


def _running_distinct_topics(video_topics):
    """For each row, count the distinct topics seen up to and including it."""
    return video_topics.expanding().apply(_distinct_count, raw=False)


# Recompute Cumulative Unique Topics, separately for each video
topics_by_video = df.groupby("VideoTitle")["Reassigned_Topic"]
df["Cumulative_Unique_Topics"] = topics_by_video.transform(_running_distinct_topics)

# Check for correctness
check_columns = ["VideoTitle", "Chunk_ID", "Reassigned_Topic", "Cumulative_Unique_Topics"]
print(df[check_columns].head())

# Adaptive Saturation Detection
def detect_saturation(group, window=3):
    """Find the chunk at which the cumulative topic count stops growing.

    Walks the consecutive differences of ``Cumulative_Unique_Topics`` and
    looks for the first run of ``window`` zero differences.

    Args:
        group: DataFrame with ``Chunk_ID`` and ``Cumulative_Unique_Topics``
            columns, in chunk order.
        window: Number of consecutive zero differences that counts as
            saturation.

    Returns:
        The ``Chunk_ID`` at the end of the first such run, or the last
        ``Chunk_ID`` of the group when no run is found.
    """
    chunk_ids = group["Chunk_ID"]
    topic_growth = np.diff(group["Cumulative_Unique_Topics"].values)

    start = 0
    last_start = len(topic_growth) - window
    while start < last_start:
        stretch = topic_growth[start:start + window]
        if not np.any(stretch != 0):
            return chunk_ids.iloc[start + window]
        start += 1

    # No flat stretch found: report the final chunk
    return chunk_ids.iloc[-1]

def _summarise_saturation(video_chunks):
    """Build the per-video saturation summary row.

    detect_saturation is evaluated once and reused for both the position and
    the percentage (the original expression called it twice per video).
    """
    saturation_chunk = detect_saturation(video_chunks)
    total_chunks = len(video_chunks)
    # NOTE(review): saturation_chunk is a Chunk_ID, and the chunking cell
    # numbers chunks from 0, so this percentage stays below 100 even when
    # saturation is at the last chunk — confirm that is intended.
    completion_pct = round((saturation_chunk / total_chunks) * 100, 2)
    return pd.Series({
        "Saturation_Chunk_Position": saturation_chunk,
        "Total_Chunks": total_chunks,
        "Estimated_Completion_%": completion_pct
    })


# One summary row per video
saturation_info = (
    df.groupby("VideoTitle")
    .apply(_summarise_saturation)
    .reset_index()
)

# Merge saturation info back onto every chunk row of the matching video
df_final = df.merge(saturation_info, on="VideoTitle")

# Save final
df_final.to_csv("../data/processed/final_topic_modeling.csv", index=False)

print("Completed pipeline.")

Unique Topic Labels: [  7   3  10   1  63  36  73 112  45   9  41  76   8  30  62  75  17  34
  68   6 119  24 140  12  82  15  91  22 133  65  21   0  52 117  64 136
  67   5  39  72  96  47  14  87  69  20   4  32  88  29  81 102  86  23
 100  42  74  85  35  95  60  79  48  19  31  26  80  59  56  46  66  18
  54 131  83  70 129  55 111 114  25 132  84  97  43 134  78 126  50  98
 118 125  33 123  13 110 127  11  38  37  57  16  90 120 121 107 130 137
  94 122  58  53 138  89 124  40  51 105 116  99 101 115 139  71  28 113
  49  27 103 104 106 128  77  93 108 109  44   2  92 135  61]
      VideoTitle  Chunk_ID  Reassigned_Topic  Cumulative_Unique_Topics
0  Muniba_Mazari         0                 7                       1.0
1  Muniba_Mazari         1                 3                       2.0
2  Muniba_Mazari         2                 7                       2.0
3  Muniba_Mazari         3                 7                       2.0
4  Muniba_Mazari         4                 7       

In [7]:
# Debug check: show the columns currently present on df (as an Index object)
print(df.columns)

Index(['VideoTitle', 'Chunk_ID', 'Chunk_Text', 'Topic_Label',
       'Reassigned_Topic', 'Cumulative_Unique_Topics'],
      dtype='object')


In [8]:
# Debug check: show the same column names as a plain Python list
print(df.columns.tolist())


['VideoTitle', 'Chunk_ID', 'Chunk_Text', 'Topic_Label', 'Reassigned_Topic', 'Cumulative_Unique_Topics']


In [9]:
import matplotlib.pyplot as plt
import os

# Make sure the destination folder for the plots exists
output_dir = "../results/plots/topic_progression/reassigned"
os.makedirs(output_dir, exist_ok=True)

# Characters (besides letters and digits) that are kept in file names
filename_extras = (' ', '-', '_')

# One line chart per video: cumulative unique topics against chunk index
for title, group in df.groupby("VideoTitle"):
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(group["Chunk_ID"], group["Cumulative_Unique_Topics"], marker='o', color='teal')
    ax.set_title(f"Topic Progression: {title}", fontsize=12)
    ax.set_xlabel("Chunk Index")
    ax.set_ylabel("Cumulative Unique Topics")
    ax.grid(True)
    fig.tight_layout()

    # Drop characters that are not safe in a file name
    kept_chars = [c for c in title if c.isalnum() or c in filename_extras]
    safe_title = "".join(kept_chars).rstrip()
    filename = os.path.join(output_dir, f"{safe_title}.png")

    # Write the figure to disk and release it
    fig.savefig(filename)
    plt.close(fig)



In [10]:
import matplotlib.pyplot as plt
import os
import re
# Load final processed data
import pandas as pd

# Reload the final per-chunk table, which includes the saturation columns
df_final = pd.read_csv("../data/processed/final_topic_modeling.csv")

output_dir = "../results/plots/topic_progression/reassigned"
os.makedirs(output_dir, exist_ok=True)

# One saturation plot per video; videos whose saturation chunk is not among
# their Chunk_IDs are reported and skipped.
for video_title, group in df_final.groupby("VideoTitle"):
    chunk_ids = group["Chunk_ID"].values
    cumulative_topics = group["Cumulative_Unique_Topics"].values
    saturation_chunk = group["Saturation_Chunk_Position"].iloc[0]

    if saturation_chunk not in chunk_ids:
        print(f"Skipping plot: Chunk ID {saturation_chunk} not found in '{video_title}'")
        continue

    # Cumulative topic count at the saturation chunk (for the horizontal guide)
    at_saturation = group["Chunk_ID"] == saturation_chunk
    saturation_topic = group.loc[at_saturation, "Cumulative_Unique_Topics"].values[0]

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(chunk_ids, cumulative_topics, marker="o", label="Cumulative Unique Topics")
    ax.axvline(x=saturation_chunk, color="red", linestyle="--", label="Saturation Point")
    ax.axhline(y=saturation_topic, color="gray", linestyle="--", alpha=0.5)
    ax.set_title(f"Topic Saturation – {video_title}")
    ax.set_xlabel("Chunk ID")
    ax.set_ylabel("Cumulative Unique Topics")
    ax.legend()
    fig.tight_layout()

    # Replace characters that are invalid in file names
    safe_title = re.sub(r'[\\/*?:"<>|]', "_", video_title)
    fig.savefig(f"{output_dir}/{safe_title}_saturation_plot.png")
    plt.close(fig)


In [11]:
# Load the final per-chunk table (chunk rows merged with the per-video
# saturation summary) that the saturation cell saved; `df2` feeds the
# speech-level modelling below.
df2 = pd.read_csv("../data/processed/final_topic_modeling.csv")

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np

# Column roles for the speech-level regression
feature_columns = ["Saturation_Chunk_Position", "Total_Chunks"]
target_column = "Estimated_Completion_%"

# The saturation columns repeat on every chunk row of a video, so keep a
# single row per speech.
one_row_per_speech = df2.drop_duplicates(subset=["VideoTitle"])
df_speech_level = one_row_per_speech[feature_columns + [target_column]]

# NOTE(review): Estimated_Completion_% was computed earlier as
# Saturation_Chunk_Position / Total_Chunks * 100, i.e. directly from the two
# features used here. The model can reproduce the target from its inputs, so
# the evaluation scores mostly reflect that relationship — confirm this
# feature/target choice is intended.
X = df_speech_level[feature_columns]
y = df_speech_level[target_column]

# Hold out 20% of the speeches for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [13]:
# Build the random forest and fit it on the training split
rf = RandomForestRegressor(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)

# Score the held-out speeches
y_pred = rf.predict(X_test)
mae, r2 = (metric(y_test, y_pred) for metric in (mean_absolute_error, r2_score))

# Report both metrics to two decimals
print(f"Mean Absolute Error: {mae:.2f}")
print(f"R² Score: {r2:.2f}")


Mean Absolute Error: 0.76
R² Score: 0.98


In [14]:
import joblib

# Make sure the model folder exists before writing into it
model_dir = "../model"
os.makedirs(model_dir, exist_ok=True)

# Persist the fitted BERTopic model
topic_model.save("../model/final_bertopic_model")

# Persist the completion predictor (the fitted random forest); as the last
# expression, joblib.dump's return value is shown as the cell output.
joblib.dump(rf, "../model/completion_predictor.pkl")



['../model/completion_predictor.pkl']