In [1]:
import numpy as np
import pandas as pd
import joblib
import json

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer

import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation

import umap
import hdbscan
from hdbscan import approximate_predict

import ruptures as rpt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv("../data/processed/chunked_with_changepoints.csv")

def parse_embedding(s):
    s_clean = s.strip("[]")
    return np.fromstring(s_clean, sep=" ")

df["embedding"] = df["embedding"].apply(parse_embedding)
print(f"Loaded {len(df)} rows across {df['VideoTitle'].nunique()} videos")

Loaded 4031 rows across 101 videos


In [3]:
def extract_enhanced_features(embeddings):
    novelties = []
    for i in range(1, len(embeddings)):
        sim = cosine_similarity([embeddings[i]], [embeddings[i-1]])[0][0]
        novelties.append(1 - sim)

    if not novelties:
        return [0] * 10

    mean_novelty = np.mean(novelties)
    var_novelty = np.var(novelties)
    max_novelty = np.max(novelties)
    min_novelty = np.min(novelties)

    try:
        trend_novelty = np.polyfit(range(len(novelties)), novelties, 1)[0]
    except:
        trend_novelty = 0

    recent_window = min(5, len(novelties))
    recent_novelty = np.mean(novelties[-recent_window:]) if recent_window > 0 else 0

    num_cps = 0
    cp_density = 0
    recent_cp_activity = 0
    cp_recency = 0

    if len(embeddings) >= 5:
        try:
            novelty_array = np.array(novelties).reshape(-1, 1)
            change_points = []

            model_cp = rpt.Pelt(model="l2").fit(novelty_array)
            cps = model_cp.predict(pen=3)
            if len(cps) > 1:
                change_points.extend(cps[:-1])

            novelty_threshold = np.mean(novelties) + 1.5 * np.std(novelties)
            threshold_cps = [i for i, nov in enumerate(novelties) if nov > novelty_threshold]
            change_points.extend(threshold_cps)

            change_points = sorted(list(set(change_points)))
            change_points = [cp for cp in change_points if 0 < cp < len(embeddings)]

            num_cps = len(change_points)
            if num_cps > 0:
                cp_density = num_cps / len(embeddings)
                recent_threshold = int(0.7 * len(embeddings))
                recent_cps = sum(1 for cp in change_points if cp > recent_threshold)
                chunks_in_recent = len(embeddings) - recent_threshold
                recent_cp_activity = recent_cps / max(1, chunks_in_recent)
                last_cp = max(change_points)
                cp_recency = (len(embeddings) - last_cp) / len(embeddings)

        except Exception as e:
            pass

    return [
        mean_novelty, var_novelty, max_novelty, min_novelty,
        trend_novelty, recent_novelty, num_cps, cp_density,
        recent_cp_activity, cp_recency
    ]


In [4]:
feature_rows = []
target_rows = []
raw_chunks = []

for video_title, group in df.groupby("VideoTitle"):
    group = group.reset_index(drop=True)
    embeddings = list(group["embedding"])
    total_chunks = len(group)

    for i in range(5, total_chunks):
        feats = extract_enhanced_features(embeddings[:i])
        feature_rows.append(feats)
        target_rows.append((i / total_chunks) * 100)

        # Store both Chunk and Speech_ID
        raw_chunks.append({
            "Chunk": group["Chunk"].iloc[i - 1],
            "Speech_ID": video_title
        })

X = np.array(feature_rows)
y = np.array(target_rows)
X_df = pd.DataFrame(raw_chunks)

print(f"Features: {X.shape}, Targets: {y.shape}, Text Chunks: {X_df.shape}")

Features: (3526, 10), Targets: (3526,), Text Chunks: (3526, 2)


In [5]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_val, y_train, y_val, X_df_train, X_df_val = train_test_split(
    X_scaled, y, X_df, test_size=0.2, random_state=42
)

In [6]:
from hdbscan import approximate_predict 

class TextFeatureExtractor(BaseEstimator, TransformerMixin):
    """Extract structural features from text"""
    
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        features = []
        for text in X:
            words = text.split()
            num_words = len(words)
            num_chars = len(text)
            num_commas = text.count(',')
            num_periods = text.count('.')
            num_exclaims = text.count('!')
            num_questions = text.count('?')
            unique_words = len(set(words))
            fraction_unique_words = unique_words / (num_words + 1e-5)
            
            features.append([
                num_words, num_chars, num_commas, num_periods, 
                num_exclaims, num_questions, unique_words, fraction_unique_words
            ])
        
        return np.array(features)
    
    def get_feature_names_out(self, input_features=None):
        return ['num_words', 'num_chars', 'num_commas', 'num_periods', 
                'num_exclaims', 'num_questions', 'unique_words', 'fraction_unique_words']


class EmbeddingTransformer(BaseEstimator, TransformerMixin):
    """Generate sentence embeddings"""
    
    def __init__(self, model_name="all-mpnet-base-v2"):
        self.model_name = model_name
        self.model = None
    
    def fit(self, X, y=None):
        self.model = SentenceTransformer(self.model_name)
        return self
    
    def transform(self, X):
        if self.model is None:
            self.model = SentenceTransformer(self.model_name)
        return self.model.encode(X.tolist(), show_progress_bar=True)


class ClusteringTransformer(BaseEstimator, TransformerMixin):
    """Apply PCA, UMAP, and clustering to embeddings"""
    
    def __init__(self, n_pca_components=5, n_umap_components=2):
        self.n_pca_components = n_pca_components
        self.n_umap_components = n_umap_components
        self.pca = PCA(n_components=n_pca_components, random_state=42)
        self.umap_model = umap.UMAP(n_neighbors=15, min_dist=0.0, 
                                    n_components=n_umap_components, random_state=42)
        self.clusterer = hdbscan.HDBSCAN(min_cluster_size=5, prediction_data=True)
        
    def fit(self, X, y=None):
        pca_features = self.pca.fit_transform(X)
        umap_features = self.umap_model.fit_transform(X)
        self.clusterer.fit(umap_features)
        return self
    
    def transform(self, X):
        pca_features = self.pca.transform(X)
        umap_features = self.umap_model.transform(X)

        cluster_labels, _ = approximate_predict(self.clusterer, umap_features)
        is_noise = (cluster_labels == -1).astype(int)

        features = np.column_stack([pca_features, cluster_labels, is_noise])
        return features, cluster_labels

    def get_feature_names_out(self, input_features=None):
        pca_names = [f'pca_{i+1}' for i in range(self.n_pca_components)]
        return pca_names + ['cluster', 'is_noise']

print("Custom transformers created successfully.")

Custom transformers created successfully.


In [7]:
class SpeechCompletionPredictor(BaseEstimator):
    """Single unified model for speech completion prediction"""

    def __init__(self):
        self.text_extractor = TextFeatureExtractor()
        self.embedding_transformer = EmbeddingTransformer()
        self.clustering_transformer = ClusteringTransformer()
        self.lgb_model = lgb.LGBMRegressor(n_estimators=1000, learning_rate=0.01, random_state=42)
        self.feature_names = None
        self.is_fitted = False

    def _compute_temporal_features(self, df_temp, cluster_labels):
        """Compute temporal cluster features"""
        cluster_progress = []
        fraction_unique_clusters = []

        for _, group in df_temp.groupby("Speech_ID"):
            seen = set()
            total_clusters = set(group['cluster'].unique())
            cluster_seen = []
            frac_unique = []

            for c in group['cluster']:
                cluster_seen.append(int(c in seen))
                seen.add(c)
                frac_unique.append(len(seen) / len(total_clusters))

            cluster_progress.extend(cluster_seen)
            fraction_unique_clusters.extend(frac_unique)

        return np.array(cluster_progress), np.array(fraction_unique_clusters)

    def _prepare_features(self, X_df):
        """Prepare all features for training/prediction"""
        text_features = self.text_extractor.transform(X_df['Chunk'])
        embeddings = self.embedding_transformer.transform(X_df['Chunk'])
        clustering_features, cluster_labels = self.clustering_transformer.transform(embeddings)

        df_temp = X_df.copy()
        df_temp['cluster'] = cluster_labels
        cluster_seen, frac_unique = self._compute_temporal_features(df_temp, cluster_labels)

        all_features = np.column_stack([
            text_features,
            clustering_features,
            cluster_seen.reshape(-1, 1),
            frac_unique.reshape(-1, 1)
        ])

        return all_features

    def fit(self, X_df, y, eval_set=None):
        """Fit the complete model"""
        text_features = self.text_extractor.fit_transform(X_df['Chunk'])
        embeddings = self.embedding_transformer.fit_transform(X_df['Chunk'])
        clustering_features, cluster_labels = self.clustering_transformer.fit_transform(embeddings)

        df_temp = X_df.copy()
        df_temp['cluster'] = cluster_labels
        cluster_seen, frac_unique = self._compute_temporal_features(df_temp, cluster_labels)

        all_features = np.column_stack([
            text_features,
            clustering_features,
            cluster_seen.reshape(-1, 1),
            frac_unique.reshape(-1, 1)
        ])

        self.feature_names = (
            self.text_extractor.get_feature_names_out() +
            self.clustering_transformer.get_feature_names_out() +
            ['cluster_seen_before', 'fraction_unique_clusters']
        )


        if eval_set is not None:
            X_val_df, y_val = eval_set
            val_features = self._prepare_features(X_val_df)
            eval_set_processed = [(val_features, y_val)]

            self.lgb_model.fit(
                all_features, y,
                eval_set=eval_set_processed,
                eval_metric='mae',
                callbacks=[early_stopping(stopping_rounds=50), log_evaluation(50)]
            )
        else:
            self.lgb_model.fit(all_features, y)

        self.is_fitted = True
        return self

    def predict(self, X_df):
        """Make predictions"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before making predictions")

        features = self._prepare_features(X_df)
        return self.lgb_model.predict(features)

    def get_feature_importance(self):
        """Get feature importance"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before getting feature importance")

        importance_df = pd.DataFrame({
            'feature': self.feature_names,
            'importance': self.lgb_model.feature_importances_ / self.lgb_model.feature_importances_.sum()
        }).sort_values(by='importance', ascending=False)

        return importance_df

print("Single unified model class created successfully.")

Single unified model class created successfully.


In [8]:
import os

model_file = "../model/speech_completion_clustering_model.pkl"

print("File exists:", os.path.exists(model_file))
print("File size:", os.path.getsize(model_file), "bytes")

File exists: True
File size: 445766314 bytes


In [9]:
import joblib
from bertopic import BERTopic

# Load Random Forest model (semantic + structural features)
rf_model = joblib.load("../model/tuned_random_forest_model.pkl")

# Load clustering-based model (encapsulates its own feature pipeline)
cluster_model = joblib.load("../model/speech_completion_clustering_model.pkl")

# Load BERTopic model and completion regressor trained on topic-based saturation features
topic_model = BERTopic.load("../model/final_bertopic_model")
completion_model = joblib.load("../model/completion_predictor.pkl")

print("All models loaded successfully")

All models loaded successfully


In [10]:
# Random Forest (semantic + structural features)
pred1 = rf_model.predict(X_val)

# Clustering-based Model
pred2 = cluster_model.predict(X_df_val)

Batches: 100%|██████████████████████████████████| 23/23 [00:10<00:00,  2.16it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Compute Total Chunks per video
video_lengths = X_df_val.groupby("Speech_ID").agg({
    "Chunk": "count"
}).rename(columns={"Chunk": "Total_Chunks"}).reset_index()

# Simulate Saturation Chunk (e.g., midpoint, unless you stored actual)
video_lengths["Saturation_Chunk_Position"] = video_lengths["Total_Chunks"] // 2

# Predict estimated completion percentage
X_topic = video_lengths[["Saturation_Chunk_Position", "Total_Chunks"]]
video_lengths["Predicted_Saturation"] = completion_model.predict(X_topic)

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error

df_topic = pd.read_csv("../data/processed/final_topic_modeling.csv")

# Dropping any potential duplicates 
df_topic = df_topic.drop_duplicates(subset=["VideoTitle", "Chunk_ID"])

# Preparation of X_df_val with Chunk_ID
X_df_val_temp = X_df_val.copy()
X_df_val_temp["Chunk_ID"] = X_df_val_temp.groupby("Speech_ID").cumcount()
X_df_val_temp = X_df_val_temp.rename(columns={"Speech_ID": "VideoTitle"})

# Ensure types match
X_df_val_temp["Chunk_ID"] = X_df_val_temp["Chunk_ID"].astype(int)
df_topic["Chunk_ID"] = df_topic["Chunk_ID"].astype(int)

# Merge cleanly
X_val_topic = X_df_val_temp.merge(
    df_topic[["VideoTitle", "Chunk_ID", "Saturation_Chunk_Position", "Total_Chunks", "Estimated_Completion_%"]],
    on=["VideoTitle", "Chunk_ID"],
    how="inner"  # ensures only valid matches
)

# Predict
topic_features = X_val_topic[["Saturation_Chunk_Position", "Total_Chunks"]].fillna(0)
pred3 = completion_model.predict(topic_features)

# True values
y_true = X_val_topic["Estimated_Completion_%"].values

# Evaluate
print(f"Topic Model MAE: {mean_absolute_error(y_true, pred3):.4f}%")
print(f"pred3 shape: {pred3.shape}")
print(f"y_true shape: {y_true.shape}")

Topic Model MAE: 0.7585%
pred3 shape: (706,)
y_true shape: (706,)


In [None]:
from sklearn.metrics import mean_absolute_error

# Evaluation of Random Forest and Clustering using y_val (original target)
mae_rf = mean_absolute_error(y_val, pred1)
mae_cluster = mean_absolute_error(y_val, pred2)

# Evaluation of Topic Model using y_true (from df_topic)
mae_topic = mean_absolute_error(y_true, pred3)

# Print MAEs
print(f"Random Forest MAE: {mae_rf:.4f}")
print(f"Clustering Model MAE: {mae_cluster:.4f}")
print(f"Topic Model MAE: {mae_topic:.4f}")

Random Forest MAE: 5.9796
Clustering Model MAE: 22.8508
Topic Model MAE: 0.7585


In [None]:
import numpy as np
import json
from sklearn.metrics import mean_absolute_error

best_mae = float("inf")
best_weights = (0.1, 0.02, 0.88)  # starting guess

step = 0.01
w1_min, w2_min = 0.05, 0.01

for w1 in np.arange(w1_min, 1.0, step):
    for w2 in np.arange(w2_min, 1.0 - w1 + step, step):
        w3 = 1.0 - w1 - w2
        if w3 < 0:
            continue

        ensemble_pred = w1 * pred1 + w2 * pred2 + w3 * pred3
        mae = mean_absolute_error(y_true, ensemble_pred)

        if mae < best_mae:
            best_mae = mae
            best_weights = (w1, w2, w3)

# Extract final weights
w1, w2, w3 = best_weights

print("Constrained Best Weights:")
print(f"Random Forest (w_rf): {w1:.4f}")
print(f"Clustering     (w_cluster): {w2:.4f}")
print(f"Topic Model    (w_topic): {w3:.4f}")
print(f"\nFinal Blended MAE: {best_mae:.4f}")

with open("../model/ensemble_weights.json", "w") as f:
    json.dump({
        "w_rf": w1,
        "w_cluster": w2,
        "w_topic": w3
    }, f)

print("Weights saved to ../model/ensemble_weights.json")

Constrained Best Weights:
Random Forest (w_rf): 0.0500
Clustering     (w_cluster): 0.0100
Topic Model    (w_topic): 0.9400

Final Blended MAE: 2.9020
Weights saved to ../model/ensemble_weights.json


## Final Model Summary: Evaluation, Ensemble, and Feature Design

### Individual Model Performance (Validation Set)

| Model             | MAE (%) |
|------------------|---------|
| Topic Model       | 0.76    |
| Random Forest     | 5.98    |
| Clustering Model  | 22.85   |

---

### Ensemble Weight Optimization

To combine the strengths of all three models, a **constrained weighted ensemble** was constructed using the formulation:

final_prediction = w₁ * RF_prediction + w₂ * Clustering_prediction + w₃ * Topic_prediction


After experimenting with unconstrained and constrained optimization, the best validation performance was achieved with the following smart-blending weights:

- **w₁ = 0.0500** (Random Forest)  
- **w₂ = 0.0100** (Clustering Model)  
- **w₃ = 0.9400** (Topic Model)

**Final Blended MAE:** **2.9020%**

This formulation ensured that all models contributed meaningfully, while allowing the topic model to dominate due to its superior accuracy.

Weights saved to: `../model/ensemble_weights.json`

---

### Feature Design Philosophy

#### Random Forest

The Random Forest model used **explicit, structured features**, including:

- Mean and variance of semantic novelty  
- Textual complexity indicators  
- Number and density of semantic change points  
- Relative recency of the last detected change  

These hand-engineered features made the model interpretable and structured, but its accuracy plateaued when handling nuanced conceptual shifts.

#### Clustering-Based Model

The clustering model used an **end-to-end unsupervised pipeline** involving:

- Sentence embeddings  
- PCA + UMAP for semantic compression  
- HDBSCAN clustering  
- Change dynamics over cluster labels  

While this helped track latent structure evolution, it was sensitive to noise and had limited predictive reliability.

#### Topic Model-Based Completion Estimator

The BERTopic-based model inferred progress using **topic saturation**:

- Assigned topic labels to each chunk  
- Counted cumulative unique topics over time  
- Detected saturation (no new topics)  
- Inferred completion % from saturation point  

This method aligned naturally with how concepts evolve in a speech, leading to both stability and high accuracy.

---

### Final Decision

The final system uses a **blended ensemble combining structured, clustering-based, and topic modeling predictions**.  
This ensemble balances generalization and robustness, achieving a **final MAE of 2.90%**.
