In [1]:
import pandas as pd
import numpy as np

# Load chunked transcripts
df = pd.read_csv('../data/processed/chunked_transcripts.csv')

# Load embeddings and assign back
embeddings = np.load('../data/processed/chunk_embeddings.npy')
df['embedding'] = list(embeddings)

In [None]:
import ruptures as rpt
from tqdm import tqdm

change_points_dict = {}

# group by video (speech) and run change point detection
for video_title, group in tqdm(df.groupby("VideoTitle")):
    emb = np.vstack(group["embedding"].values)
    model = rpt.Pelt(model="rbf").fit(emb)
    cp = model.predict(pen=0.6)  # can be tuned
    change_points_dict[video_title] = cp

100%|███████████████████████████████████████| 101/101 [00:00<00:00, 3156.55it/s]


In [3]:
def assign_section_labels(df_group, change_points):
    labels = []
    last_cp = 0
    for idx, cp in enumerate(change_points):
        labels += [idx] * (cp - last_cp)
        last_cp = cp
    return labels

section_labels = []

for video_title, group in df.groupby("VideoTitle"):
    cps = change_points_dict[video_title]
    labels = assign_section_labels(group, cps)
    section_labels += labels

df["section_label"] = section_labels

In [4]:
df.to_csv('../data/processed/chunked_with_changepoints.csv', index=False)

In [5]:
df.columns.tolist()

['VideoTitle', 'Chunk_ID', 'Chunk', 'embedding', 'section_label']

In [6]:
import json
print(json.dumps(change_points_dict, indent=2))

{
  "3 Forms of Listening": [
    5,
    10,
    15,
    20,
    25,
    30,
    40,
    45,
    51
  ],
  "5 Rules for Success": [
    5,
    10,
    15,
    20,
    25,
    30,
    33
  ],
  "7 Steps to Success": [
    5,
    10,
    15,
    20,
    25,
    30,
    32
  ],
  "75th UN General Assembly Speech": [
    5,
    10,
    20,
    25,
    30,
    35,
    43
  ],
  "9 Life Lessons": [
    5,
    10,
    15,
    20,
    25,
    30,
    35,
    40,
    45,
    51
  ],
  "Abuse of Power": [
    5,
    10,
    15,
    20,
    25,
    30,
    35,
    45,
    50,
    55,
    60,
    65,
    68
  ],
  "America Loves India": [
    5,
    15,
    20,
    25,
    30,
    35,
    40,
    42
  ],
  "Angelina_Jolie": [
    5,
    10,
    15,
    20,
    25,
    30,
    35,
    40,
    45,
    50,
    53
  ],
  "Anne_Hathaway": [
    5,
    10,
    15,
    20,
    23
  ],
  "Arnold_Schwarzenegger": [
    5,
    10,
    15,
    20,
    25,
    35,
    40,
    45,
    50,
    55,
    60,
    6

In [16]:
last_cp_progress = []

for video_title, group in df.groupby("VideoTitle"):
    cps = change_points_dict.get(video_title, [])
    last_cp = max(cps) if cps else len(group)
    total_chunks = len(group)
    
    for i in range(total_chunks):
        # Progress capped at 100%
        pct = (i / last_cp) * 100 if i <= last_cp else 100
        last_cp_progress.append(min(pct, 100))

df["speech_progress_percent"] = last_cp_progress

In [8]:
df.columns.tolist()

['VideoTitle',
 'Chunk_ID',
 'Chunk',
 'embedding',
 'section_label',
 'speech_progress_percent']

In [9]:
sample_title = "75th UN General Assembly Speech"
sample_df = df[df["VideoTitle"] == sample_title]

sample_df[["Chunk", "section_label", "speech_progress_percent"]].head(10)

Unnamed: 0,Chunk,section_label,speech_progress_percent
3480,"Thank you, Representatives of the member state...",5,82.5
3481,"And two years ago here, I asked your name. I u...",5,85.0
3482,As a boy from the small city of Ilsan in Korea...,6,87.5
3483,All of our plans went away and I became alone....,6,90.0
3484,I remember the words I spoke here two years ag...,6,92.5
3485,We must try to love ourselves and imagine the ...,6,95.0
3486,But the stars shine brightest when the night i...,6,97.5
3487,Let’s reimagine our world. We’re huddled toget...,0,0.0
3488,Let’s dream about a future when our words can ...,0,2.222222
3489,Life goes on. Life goes on. Life goes.,0,4.444444


In [11]:
sample_title = "How to Stop A Bully"
sample_df = df[df["VideoTitle"] == sample_title]

sample_df[["Chunk", "section_label", "speech_progress_percent"]].head(10)

Unnamed: 0,Chunk,section_label,speech_progress_percent
2823,"How you guys doing? Good, good good good good....",6,90.697674
2824,My brother and my sister were born with arms a...,7,93.023256
2825,"I’m from Australia, anybody want to one day go...",7,95.348837
2826,So I only live about four hours from here. And...,7,97.674419
2827,Can you imagine if I’m driving a car? They rec...,0,0.0
2828,Imagine if I get pulled over by the cops? Can ...,0,5.0
2829,Imagine if I’m in big trouble! Put your hands ...,0,10.0
2830,"Uhhhhh… So I’m in the front passenger’s seat, ...",0,15.0
2831,She has no idea that I have no arms and no leg...,0,20.0
2832,"So I get the seatbelt in my mouth, and I loose...",1,25.0


In [13]:
def get_completion_percent(video_title, current_chunk_index):
    cps = change_points_dict.get(video_title, [])
    if not cps:
        return 0.0
    last_cp = max(cps)
    if current_chunk_index >= last_cp:
        return 100.0
    return round((current_chunk_index / last_cp) * 100, 2)

In [14]:
print(get_completion_percent("75th UN General Assembly Speech", 5))

11.63


In [15]:
df.columns.tolist()

['VideoTitle',
 'Chunk_ID',
 'Chunk',
 'embedding',
 'section_label',
 'speech_progress_percent']