In [13]:
import pandas as pd
import json
import glob
from tqdm import tqdm

In [5]:
# Shell escape: create the human-transcripts/ directory; -p makes this a no-op
# (instead of an error) when the directory already exists.
!mkdir -p human-transcripts/

In [11]:
# glob returns paths in arbitrary, filesystem-dependent order. Sorting keeps the
# row order of the concatenated frames -- and therefore the seeded sample drawn
# from them -- identical across runs and machines.
fs = sorted(glob.glob('matched_transcripts/*'))

In [23]:
def pencil_transcript_to_df(path):
    """Flatten the ``'transcript'`` entries of one JSON file into a DataFrame.

    Parameters
    ----------
    path : str
        Path to a JSON file whose top-level ``'transcript'`` key holds a list
        of entry dicts.

    Returns
    -------
    pandas.DataFrame
        One row per entry with the columns ``file_name``, ``speaker``,
        ``start``, ``end`` and ``text``. ``file_name`` is the last
        ``'/'``-separated component of ``path``. ``speaker`` comes from the
        entry itself, else from the first element of the entry's ``'words'``
        list, else is ``"UNKNOWN"``. ``start``/``end`` are ``None`` when the
        entry lacks them; ``text`` is whitespace-stripped and ``''`` when
        missing or null.

    Raises
    ------
    KeyError
        If the JSON document has no ``'transcript'`` key.
    """
    with open(path, 'r') as file:
        data = json.load(file)

    # Use the function's own argument. The previous code read a global named
    # `f`, which only worked when the caller's loop variable had that name.
    file_name = path.split('/')[-1]

    entries = data['transcript']
    rows = []

    for entry in entries:
        # `or ''` also covers an explicit JSON null, which .get's default
        # would not replace and which would crash on .strip().
        text = entry.get('text') or ''
        start = entry.get('start')
        end = entry.get('end')

        # Try to get speaker from entry
        speaker = entry.get('speaker')

        # If not found, try to infer from first word's speaker
        if speaker is None:
            words = entry.get('words', [])
            if words and 'speaker' in words[0]:
                speaker = words[0]['speaker']
            else:
                speaker = "UNKNOWN"

        rows.append({
            'file_name': file_name,
            'speaker': speaker,
            'start': start,
            'end': end,
            'text': text.strip()
        })

    return pd.DataFrame(rows)


In [24]:
# Parse every path in `fs` with pencil_transcript_to_df and collect one
# DataFrame per file; tqdm renders the progress bar.
dfs = []
for f in tqdm(fs):
    df = pencil_transcript_to_df(f)
    dfs.append(df)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3075/3075 [00:11<00:00, 263.92it/s]


In [25]:
# Each per-file frame carries its own 0..k index; ignore_index renumbers the
# combined frame so every row has a unique label.
df_pencil = pd.concat(dfs, ignore_index=True)

In [28]:
# Write the combined pencil transcripts to CSV; index=False omits the
# DataFrame index column from the file.
df_pencil.to_csv('exports/2025-05-08-pencil-transcripts.csv', index=False)

In [29]:
def whisperx_transcript_to_df(path):
    """Flatten the aligned whisperX segments of one JSON file into a DataFrame.

    The segments are read from
    ``data['whisperxTranscript']['whisper']['align']['segments']``. Each
    segment becomes one row with the columns ``file_name`` (last
    ``'/'``-separated component of ``path``), ``speaker`` (``'UNKNOWN'`` when
    the segment has no ``'speaker'`` key), ``start``, ``end`` and the
    whitespace-stripped ``text``.

    Raises ``KeyError`` if the nested segment path, or a segment's ``start``,
    ``end`` or ``text`` key, is missing.
    """
    with open(path, 'r') as handle:
        payload = json.load(handle)

    base_name = path.split('/')[-1]
    aligned_segments = payload['whisperxTranscript']['whisper']['align']['segments']

    records = [
        {
            'file_name': base_name,
            'speaker': segment.get('speaker', 'UNKNOWN'),
            'start': segment['start'],
            'end': segment['end'],
            'text': segment['text'].strip(),
        }
        for segment in aligned_segments
    ]

    return pd.DataFrame(records)

In [30]:
# Second pass over the same paths in `fs`, this time extracting the whisperX
# segments. Note that `dfs` is reset here, replacing the list built earlier.
dfs = []
for f in tqdm(fs):
    df = whisperx_transcript_to_df(f)
    dfs.append(df)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3075/3075 [00:11<00:00, 257.07it/s]


In [32]:
# Each per-file frame carries its own 0..k index, so a plain concat repeats
# labels. ignore_index gives every segment a unique label, which keeps
# label-based lookups on the combined frame unambiguous.
df_whisperx = pd.concat(dfs, ignore_index=True)

In [33]:
# Write the combined whisperX segments to CSV; index=False omits the
# DataFrame index column from the file.
df_whisperx.to_csv('exports/2025-05-08-whisperX-transcripts.csv', index=False)

In [46]:
def find_best_match(row, df_whisperx):
    """Return the whisperX segment that best matches one transcript row in time.

    Parameters
    ----------
    row : pandas.Series or mapping
        Must provide ``'start'`` and ``'end'``. If it also provides
        ``'file_name'`` (and ``df_whisperx`` has that column), only segments
        from the same file are considered.
    df_whisperx : pandas.DataFrame
        Candidate segments with at least ``'start'`` and ``'end'`` columns.

    Returns
    -------
    pandas.Series
        * If any candidate overlaps ``[start, end]`` (touching counts): the
          candidate with the largest overlap, with an extra ``'overlap'``
          field holding that duration.
        * Otherwise: the candidate whose ``start`` is closest to the row's
          ``start`` (no ``'overlap'`` field).
        * If there is no usable candidate (no segments for the file, or the
          row's ``start`` is missing): an all-NaN Series indexed by
          ``df_whisperx.columns``.
    """
    start, end = row['start'], row['end']

    # Timestamps are only comparable within one recording, so never match a
    # row against segments that belong to a different file.
    candidates = df_whisperx
    if 'file_name' in row and 'file_name' in df_whisperx.columns:
        candidates = df_whisperx[df_whisperx['file_name'] == row['file_name']]

    # Find overlaps
    overlapping = candidates[
        (candidates['start'] <= end) & (candidates['end'] >= start)
    ].copy()

    if not overlapping.empty:
        # Vectorised min(end, seg_end) - max(start, seg_start).
        overlapping['overlap'] = (
            overlapping['end'].clip(upper=end)
            - overlapping['start'].clip(lower=start)
        )
        # Select by position: index labels may repeat when the frame was
        # built by concatenating per-file frames.
        best_pos = overlapping['overlap'].reset_index(drop=True).idxmax()
        return overlapping.iloc[best_pos]

    # Fallback: choose the closest in start time (again by position).
    distance = (candidates['start'] - start).abs().reset_index(drop=True)
    if distance.isna().all():
        # Covers both "no candidates at all" and "row start is NaN".
        return pd.Series(float('nan'), index=df_whisperx.columns, dtype=object)
    return candidates.iloc[distance.idxmin()]

def sample_and_fuzzy_match(df_pencil, df_whisperx, n=300, random_state=42):
    """Sample pencil rows and pair each with its best-matching whisperX row.

    Parameters
    ----------
    df_pencil : pandas.DataFrame
        Rows to sample from; must include a ``file_name`` column plus whatever
        ``find_best_match`` reads from each row.
    df_whisperx : pandas.DataFrame
        Candidate segments handed to ``find_best_match``; must include a
        ``file_name`` column.
    n : int, default 300
        Number of rows to sample (without replacement, so it must not exceed
        ``len(df_pencil)``).
    random_state : int, default 42
        Seed passed to ``DataFrame.sample``; the default reproduces the
        previously hard-coded value.

    Returns
    -------
    pandas.DataFrame
        One row per sampled pencil row: the pencil columns suffixed
        ``_pencil`` next to the matched columns suffixed ``_whisperx``, with
        ``file_name_whisperx`` dropped and ``file_name_pencil`` renamed to
        ``file_name``.
    """
    # Step 1: Uniform sample from df_pencil
    sampled_pencil = (
        df_pencil.sample(n=n, random_state=random_state).reset_index(drop=True)
    )

    # Step 2: Find best match in df_whisperx
    matched_whisperx = sampled_pencil.apply(
        lambda row: find_best_match(row, df_whisperx), axis=1
    )

    # Step 3: Combine side-by-side
    comparison_df = pd.concat(
        [sampled_pencil.add_suffix('_pencil').reset_index(drop=True),
         matched_whisperx.add_suffix('_whisperx').reset_index(drop=True)],
        axis=1
    )
    # drop/rename already return new frames, so no trailing copy is needed.
    return (
        comparison_df
        .drop(columns=['file_name_whisperx'])
        .rename(columns={'file_name_pencil': 'file_name'})
    )

In [53]:
# Sample 1000 pencil rows and pair each with its matched whisperX row.
comparison_df = sample_and_fuzzy_match(df_pencil, df_whisperx, n=1000)

In [56]:
# Write the side-by-side comparison table to CSV; index=False omits the
# DataFrame index column from the file.
comparison_df.to_csv('exports/2025-05-08-matched-snippets-1000-sampled.csv', index=False)

In [55]:
# Sanity check: (rows, columns) of the comparison table.
comparison_df.shape

(1000, 10)