In [1]:
import pandas as pd
import json
import glob
from tqdm import tqdm

In [2]:
# Shell out to create the human-transcripts/ directory; -p makes this a no-op if it already exists.
!mkdir -p human-transcripts/

In [3]:
# Collect every path directly under matched_transcripts/ (glob does not guarantee any ordering).
fs = glob.glob('matched_transcripts/*')

In [4]:
def pencil_transcript_to_df(path):
    """Load one transcript JSON file and flatten it to one row per entry.

    Parameters
    ----------
    path : str
        Path to a JSON file whose top-level ``'transcript'`` key holds a list
        of entry dicts. Each entry may carry ``text``, ``start``, ``end``,
        ``speaker`` and ``words`` keys.

    Returns
    -------
    pandas.DataFrame
        One row per entry with columns ``file_name`` (last ``/``-separated
        component of ``path``), ``speaker``, ``start``, ``end`` and ``text``
        (whitespace-stripped). The frame has no columns when the transcript
        list is empty.

    Raises
    ------
    KeyError
        If the JSON document has no ``'transcript'`` key.
    """
    with open(path, 'r') as file:
        data = json.load(file)

    # Derive the name from the argument itself rather than from whatever loop
    # variable happens to exist at notebook level.
    file_name = path.split('/')[-1]

    rows = []
    for entry in data['transcript']:
        # Prefer the entry-level speaker label.
        speaker = entry.get('speaker')

        # Otherwise fall back to the speaker of the entry's first word, if any.
        if speaker is None:
            words = entry.get('words', [])
            if words and 'speaker' in words[0]:
                speaker = words[0]['speaker']
            else:
                speaker = "UNKNOWN"

        rows.append({
            'file_name': file_name,
            'speaker': speaker,
            'start': entry.get('start'),
            'end': entry.get('end'),
            # `or ''` also covers an explicit null text, which would otherwise
            # fail on .strip().
            'text': (entry.get('text') or '').strip()
        })

    return pd.DataFrame(rows)


In [5]:
# Parse every path in `fs` into its own DataFrame (tqdm shows progress) and
# collect the frames in `dfs` for concatenation.
dfs = []
for f in tqdm(fs):
    df = pencil_transcript_to_df(f)
    dfs.append(df)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3075/3075 [00:10<00:00, 290.54it/s]


In [6]:
# Stack the per-file frames. Each frame keeps its own row index, so index labels repeat across files.
df_pencil = pd.concat(dfs)

In [63]:
# Write the combined pencil transcripts to CSV without the index column.
# NOTE(review): assumes the exports/ directory already exists - confirm.
df_pencil.to_csv('exports/2025-05-08-pencil-transcripts.csv', index=False)

In [26]:
import pandas as pd
import re

# Phrases treated as signs of disengagement. They are compiled once here
# instead of being rebuilt on every call. All apostrophes are written as the
# straight ASCII form; the input text is normalized to match.
_DISENGAGEMENT_PATTERNS = [re.compile(pattern) for pattern in (
    r"\bi (don't|do not|dont|wanna|won't|cant|can't) (want|feel like|care)\b",
    r"\b(this|math|it) (is )?(so )?(boring|stupid|dumb|hard|pointless|annoying)\b",
    r"\bi (hate|dislike) (this|math|school|work)\b",
    r"\b(i'm|im|i am) (tired|bored|confused|lost|annoyed|mad)\b",
    r"\b(who cares|no idea|not sure)\b",
    r"\b(ugh+|grr+|sigh+|yawn+)\b",
    r"\b(not doing this|don't make me|leave me alone|stop it)\b",
    r"\bwhy (do|should) i (even )?(have to|need to)\b",
    r"\b(so dumb|makes no sense|hate this part)\b",
    r"\b(can we stop|can we be done|is it over)\b",
)]


def detect_disengagement(text):
    """Return 1 if ``text`` contains a disengagement phrase, otherwise 0.

    Parameters
    ----------
    text : object
        The utterance to inspect. Anything that is not a ``str`` (for example
        ``None`` or ``NaN``) is treated as not disengaged.

    Returns
    -------
    int
        ``1`` when any pattern matches the lower-cased, stripped text,
        ``0`` otherwise.
    """
    if not isinstance(text, str):
        return 0

    # Typographic apostrophes (U+2018 / U+2019) are folded to "'" so that
    # "don't" is recognized however the transcript spelled it.
    normalized = (
        text.lower().strip()
        .replace("\u2019", "'")
        .replace("\u2018", "'")
    )

    return int(any(pattern.search(normalized) for pattern in _DISENGAGEMENT_PATTERNS))

# Label every pencil utterance: adds a 0/1 `disengaged` column to df_pencil in place.
df_pencil['disengaged'] = df_pencil['text'].apply(detect_disengagement)

In [40]:
def whisperx_transcript_to_df(path):
    """Flatten the aligned WhisperX segments of one JSON file into a DataFrame.

    Parameters
    ----------
    path : str
        Path to a JSON file containing
        ``data['whisperxTranscript']['whisper']['align']['segments']``.

    Returns
    -------
    pandas.DataFrame
        One row per segment with columns ``file_name`` (last ``/``-separated
        component of ``path``), ``speaker`` (``'UNKNOWN'`` when the segment
        has no ``speaker`` key), ``start``, ``end`` and stripped ``text``.
    """
    with open(path, 'r') as handle:
        payload = json.load(handle)

    source_name = path.split('/')[-1]
    aligned_segments = payload['whisperxTranscript']['whisper']['align']['segments']

    # One record per aligned segment; start/end/text are required keys.
    records = [
        {
            'file_name': source_name,
            'speaker': segment.get('speaker', 'UNKNOWN'),
            'start': segment['start'],
            'end': segment['end'],
            'text': segment['text'].strip(),
        }
        for segment in aligned_segments
    ]

    return pd.DataFrame(records)

In [41]:
# Re-read the same paths in `fs`, this time extracting the WhisperX segments;
# `dfs` is rebuilt from scratch for this pass.
dfs = []
for f in tqdm(fs):
    df = whisperx_transcript_to_df(f)
    dfs.append(df)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3075/3075 [00:09<00:00, 318.51it/s]


In [42]:
# Stack the per-file frames. Each frame keeps its own row index, so index labels repeat across files.
df_whisperx = pd.concat(dfs)

In [43]:
# Write the combined WhisperX transcripts to CSV without the index column.
# NOTE(review): assumes the exports/ directory already exists - confirm.
df_whisperx.to_csv('exports/2025-05-08-whisperX-transcripts.csv', index=False)

In [44]:
import pandas as pd
import re

# Phrases treated as signs of disengagement. They are compiled once here
# instead of being rebuilt on every call. All apostrophes are written as the
# straight ASCII form; the input text is normalized to match.
_DISENGAGEMENT_PATTERNS = [re.compile(pattern) for pattern in (
    r"\bi (don't|do not|dont|wanna|won't|cant|can't) (want|feel like|care)\b",
    r"\b(this|math|it) (is )?(so )?(boring|stupid|dumb|hard|pointless|annoying)\b",
    r"\bi (hate|dislike) (this|math|school|work)\b",
    r"\b(i'm|im|i am) (tired|bored|confused|lost|annoyed|mad)\b",
    r"\b(who cares|no idea|not sure)\b",
    r"\b(ugh+|grr+|sigh+|yawn+)\b",
    r"\b(not doing this|don't make me|leave me alone|stop it)\b",
    r"\bwhy (do|should) i (even )?(have to|need to)\b",
    r"\b(so dumb|makes no sense|hate this part)\b",
    r"\b(can we stop|can we be done|is it over)\b",
)]


def detect_disengagement(text):
    """Return 1 if ``text`` contains a disengagement phrase, otherwise 0.

    Parameters
    ----------
    text : object
        The utterance to inspect. Anything that is not a ``str`` (for example
        ``None`` or ``NaN``) is treated as not disengaged.

    Returns
    -------
    int
        ``1`` when any pattern matches the lower-cased, stripped text,
        ``0`` otherwise.
    """
    if not isinstance(text, str):
        return 0

    # Typographic apostrophes (U+2018 / U+2019) are folded to "'" so that
    # "don't" is recognized however the transcript spelled it.
    normalized = (
        text.lower().strip()
        .replace("\u2019", "'")
        .replace("\u2018", "'")
    )

    return int(any(pattern.search(normalized) for pattern in _DISENGAGEMENT_PATTERNS))

# Label every WhisperX utterance: adds a 0/1 `disengaged` column to df_whisperx in place.
df_whisperx['disengaged'] = df_whisperx['text'].apply(detect_disengagement)

In [48]:
# Export only the text of the utterances flagged as disengaged (one `text` column, no index).
df_whisperx[df_whisperx['disengaged']==1][['text']].to_csv('whisperX-disengaged-student-moves.csv', index=False)

In [53]:
def get_first_response_after_disengaged(df):
    """Pair each disengaged utterance with the first reply from another speaker.

    For every row with ``disengaged == 1`` the following rows are scanned in
    order. The scan stops at the next disengaged row or, when a ``file_name``
    column is present, at the end of the current file, so a reply is never
    taken from a different recording. The first row whose ``speaker`` differs
    from the disengaged speaker supplies the response.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``speaker``, ``text`` and ``disengaged`` columns; rows are
        expected in conversational order. ``file_name`` is optional.

    Returns
    -------
    pandas.DataFrame
        Columns ``disengaged_text`` and ``response_text`` with one row per
        disengaged utterance; ``response_text`` is ``""`` when no reply was
        found. The columns are present even when there are no rows.
    """
    # Positional access below requires a clean 0..n-1 index.
    df = df.reset_index(drop=True)

    # Plain lists avoid the cost of building a row Series on every lookup.
    speakers = df['speaker'].tolist()
    texts = df['text'].tolist()
    flagged = (df['disengaged'] == 1).tolist()
    if 'file_name' in df.columns:
        # Integer codes per file; missing names all share the code -1.
        file_ids = df['file_name'].factorize()[0].tolist()
    else:
        file_ids = [0] * len(df)

    result = []
    for i, is_flagged in enumerate(flagged):
        if not is_flagged:
            continue

        response_text = ""
        for j in range(i + 1, len(df)):
            # Stop at a recording boundary or at the next disengaged point.
            if file_ids[j] != file_ids[i] or flagged[j]:
                break
            # First utterance by somebody else is the response.
            if speakers[j] != speakers[i]:
                response_text = texts[j]
                break

        result.append({
            'disengaged_text': texts[i],
            'response_text': response_text
        })

    return pd.DataFrame(result, columns=['disengaged_text', 'response_text'])

# Build (disengaged utterance, first reply by another speaker) pairs from the WhisperX frame
df_response_pairs = get_first_response_after_disengaged(df_whisperx)

# Write the pairs to CSV in the working directory, without the index column
df_response_pairs.to_csv('whisperX-disengaged-response-pairs.csv', index=False)


In [55]:
def get_immediate_responses_after_disengaged(df):
    """Pair each disengaged utterance with the very next utterance, if it is a reply.

    The row directly after a disengaged row counts as a response only when it
    comes from a different ``speaker`` and, when a ``file_name`` column is
    present, from the same file. Otherwise the response is ``""``. Every
    disengaged row produces an output row, including the last row of the
    frame (which has no successor and therefore an empty response).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``speaker``, ``text`` and ``disengaged`` columns; rows are
        expected in conversational order. ``file_name`` is optional.

    Returns
    -------
    pandas.DataFrame
        Columns ``disengaged_text`` and ``response_text``, one row per
        disengaged utterance. The columns are present even when there are no
        rows.
    """
    # Positional access below requires a clean 0..n-1 index.
    df = df.reset_index(drop=True)

    # Plain lists avoid the cost of building a row Series on every lookup.
    speakers = df['speaker'].tolist()
    texts = df['text'].tolist()
    flagged = (df['disengaged'] == 1).tolist()
    if 'file_name' in df.columns:
        # Integer codes per file; missing names all share the code -1.
        file_ids = df['file_name'].factorize()[0].tolist()
    else:
        file_ids = [0] * len(df)

    total = len(df)
    result = []
    for i, is_flagged in enumerate(flagged):
        if not is_flagged:
            continue

        nxt = i + 1
        is_reply = (
            nxt < total
            and file_ids[nxt] == file_ids[i]   # never reach into the next recording
            and speakers[nxt] != speakers[i]   # same speaker continuing is not a reply
        )

        result.append({
            'disengaged_text': texts[i],
            'response_text': texts[nxt] if is_reply else ""
        })

    return pd.DataFrame(result, columns=['disengaged_text', 'response_text'])


In [56]:
# Pair each disengaged WhisperX utterance with the utterance that directly follows it,
# then write the pairs to CSV in the working directory without the index column.
df_immediate_responses = get_immediate_responses_after_disengaged(df_whisperx)
df_immediate_responses.to_csv('whisperX-disengagement-immediate-responses.csv', index=False)


In [61]:
#df_immediate_responses[df_immediate_responses['response_text']!=''].response_text.values

In [70]:
def find_best_match(row, df_whisperx):
    """Find the WhisperX segment that best lines up in time with one pencil row.

    Only segments with the same ``file_name`` as ``row`` are considered. Among
    those, the segment with the largest temporal overlap with
    ``[row['start'], row['end']]`` wins. If nothing overlaps (or ``end`` is
    missing), the segment whose ``start`` is closest to ``row['start']`` is
    returned instead.

    Parameters
    ----------
    row : pandas.Series or mapping
        Must provide ``start``, ``end`` and ``file_name``.
    df_whisperx : pandas.DataFrame
        Candidate segments with ``file_name``, ``start`` and ``end`` columns.
        Its index may contain repeated labels; rows are selected by position.

    Returns
    -------
    pandas.Series
        The matching segment row. An overlap match carries an extra
        ``overlap`` entry with the overlap length. An empty object Series is
        returned when the file has no segments, when ``row['start']`` is
        missing, or when no candidate has a usable ``start``.
    """
    start, end = row['start'], row['end']
    file_name = row['file_name']
    no_match = pd.Series(dtype='object')

    # Restrict df_whisperx to the same file name
    df_same_file = df_whisperx[df_whisperx['file_name'] == file_name]
    if df_same_file.empty or pd.isna(start):
        return no_match

    if not pd.isna(end):
        # Closed-interval intersection test: touching segments count as overlapping.
        overlap = df_same_file[
            (df_same_file['start'] <= end) & (df_same_file['end'] >= start)
        ].copy()

        if not overlap.empty:
            # min(end, seg_end) - max(start, seg_start), computed column-wise.
            overlap['overlap'] = (
                overlap['end'].clip(upper=end) - overlap['start'].clip(lower=start)
            )
            return overlap.iloc[overlap['overlap'].to_numpy().argmax()]

    # Fallback: closest start time. Selection is positional so repeated index
    # labels cannot turn the result into a multi-row frame.
    distances = (df_same_file['start'] - start).abs()
    if distances.isna().all():
        return no_match
    closest_pos = distances.fillna(float('inf')).to_numpy().argmin()
    return df_same_file.iloc[closest_pos]

def sample_and_fuzzy_match(df_pencil, df_whisperx, n=300):
    """Sample pencil rows and put each next to its best-matching WhisperX segment.

    Parameters
    ----------
    df_pencil : pandas.DataFrame
        Pencil transcript rows; each needs ``file_name``, ``start`` and ``end``.
    df_whisperx : pandas.DataFrame
        WhisperX segments to match against.
    n : int, default 300
        Number of pencil rows to sample. Capped at ``len(df_pencil)`` so that
        asking for more rows than exist returns all of them instead of raising.

    Returns
    -------
    pandas.DataFrame
        One row per sampled pencil row. Pencil columns carry a ``_pencil``
        suffix and matched columns a ``_whisperx`` suffix, except that
        ``file_name_pencil`` is renamed to ``file_name`` and
        ``file_name_whisperx`` is dropped.
    """
    # Fixed seed keeps the sample reproducible between runs.
    sample_size = min(n, len(df_pencil))
    sampled_pencil = df_pencil.sample(n=sample_size, random_state=42).reset_index(drop=True)

    matched_whisperx = sampled_pencil.apply(lambda row: find_best_match(row, df_whisperx), axis=1)

    # Both sides share the same 0..n-1 index, so the column-wise concat aligns row by row.
    comparison_df = pd.concat(
        [sampled_pencil.add_suffix('_pencil').reset_index(drop=True),
         matched_whisperx.add_suffix('_whisperx').reset_index(drop=True)],
        axis=1
    )

    # The match is restricted to the same file, so one file_name column is enough.
    comparison_df = comparison_df.drop(columns=['file_name_whisperx'], errors='ignore')\
                                 .rename(columns={'file_name_pencil': 'file_name'}).copy()
    return comparison_df

In [77]:
def sample_and_fuzzy_match_20_files(df_pencil, df_whisperx, n_files=20):
    """Match every pencil row of a random sample of files to its best WhisperX segment.

    Parameters
    ----------
    df_pencil : pandas.DataFrame
        Pencil transcript rows; each needs ``file_name``, ``start`` and ``end``.
    df_whisperx : pandas.DataFrame
        WhisperX segments to match against.
    n_files : int, default 20
        Number of distinct files to sample. Capped at the number of distinct
        non-null file names so that asking for more files than exist uses all
        of them instead of raising.

    Returns
    -------
    pandas.DataFrame
        One row per pencil row of the sampled files. Pencil columns carry a
        ``_pencil`` suffix and matched columns a ``_whisperx`` suffix, except
        that ``file_name_pencil`` is renamed to ``file_name`` and
        ``file_name_whisperx`` is dropped.
    """
    # Step 1: Sample n_files distinct file_names (fixed seed for reproducibility)
    unique_files = pd.Series(df_pencil['file_name'].dropna().unique())
    sampled_files = unique_files.sample(n=min(n_files, len(unique_files)), random_state=42)

    # Step 2: Subset df_pencil to those files
    subset_pencil = df_pencil[df_pencil['file_name'].isin(sampled_files)]
    sampled_pencil = subset_pencil.reset_index(drop=True)

    # Step 3: Match
    matched_whisperx = sampled_pencil.apply(lambda row: find_best_match(row, df_whisperx), axis=1)

    # Step 4: Combine (both sides share the same 0..n-1 index, so rows align)
    comparison_df = pd.concat(
        [sampled_pencil.add_suffix('_pencil').reset_index(drop=True),
         matched_whisperx.add_suffix('_whisperx').reset_index(drop=True)],
        axis=1
    )

    # The match is restricted to the same file, so one file_name column is enough.
    comparison_df = comparison_df.drop(columns=['file_name_whisperx'], errors='ignore')\
                                 .rename(columns={'file_name_pencil': 'file_name'}).copy()
    return comparison_df


In [71]:
# Match a reproducible random sample of 1000 pencil rows against the WhisperX segments.
comparison_df = sample_and_fuzzy_match(df_pencil, df_whisperx, n=1000)

In [72]:
# Write the 1000-row comparison to CSV without the index column.
# NOTE(review): assumes the exports/ directory already exists - confirm.
comparison_df.to_csv('exports/2025-05-08-matched-snippets-1000-sampled.csv', index=False)

In [81]:
# Rebind comparison_df: match all pencil rows from 10 randomly chosen files (overrides the default of 20).
comparison_df = sample_and_fuzzy_match_20_files(df_pencil, df_whisperx, n_files=10)

In [82]:
# Write the per-file comparison to CSV without the index column.
# NOTE(review): assumes the exports/ directory already exists - confirm.
comparison_df.to_csv('exports/2025-05-08-matched-snippets-10-files.csv', index=False)

In [86]:
# Show how many matched rows each sampled file contributed, largest first.
comparison_df.file_name.value_counts()

file_name
March_12_2024_El_Capitan_Period_2_1647_1406814b-1ce9-47ae-972e-191a002c7840_matched_transcripts.json                     415
October_22_2024_MACS_Tutoring_1417_9640958f-f383-4cf7-b45a-f74595b59583_matched_transcripts.json                         294
October_25_2023_Life_Male_STEAM_Academy_1407_8c8299c3-91f8-4075-ba73-41714333adde_matched_transcripts.json               112
February_06_2024_El_Capitan_Period_2_1751_e8b64be2-a991-4fa3-9791-2471d3d73009_matched_transcripts.json                   69
December_13_2023_Life_Male_STEAM_Academy_1704_MainRoom_matched_transcripts.json                                           45
October_29_2024_MACS_Tutoring_1841_ab689e6b-ff38-497e-91f2-a5a862f84ace_matched_transcripts.json                          34
November_14_2023_El_Capitan_Period_2_1745_418df51e-931f-472c-861b-020dfd80b02f_matched_transcripts.json                   16
February_08_2024_El_Capitan_Period_2_1748_1a8782b2-117b-4643-a9d1-b6899d24b64e_matched_transcripts.json            