In [1]:
%matplotlib inline

In [2]:
import glob, re
import string
import json
import math

import numpy as np
import pandas as pd

from fuzzywuzzy import fuzz
from typing import Tuple
from pathlib import Path

In [86]:
#!uv pip install python-Levenshtein
#!uv pip install fuzzywuzzy

# Code assigns speaker labels to AA Transcripts

Uses the speaker labels from the fan-made Uncut Friends Scripts (UFS) transcripts to identify the speakers in the time-stamped, speaker-less AssemblyAI (AA) speech-to-text transcripts.

In [3]:

def prep_ufs(s_num, e_num, ufs_root=None):
    """
    Extracts cleaned utterances from
    Uncut Friends Scripts (UFS) fan-made transcripts.

    Args:
        s_num: season number as used in the file name, e.g. "7".
        e_num: episode number as used in the file name, e.g. "01".
        ufs_root: optional directory that contains the per-season
            ``s{s_num}`` folders. Defaults to the original hard-coded
            location so existing calls keep working.

    Returns:
        A list of ``(speaker, sentence)`` tuples in script order, both
        lower-cased. Sentences keep any leading whitespace left over from
        the "speaker: speech" split.
    """
    if ufs_root is None:
        ufs_root = (
            "/home/mstlaure/Documents/Marie/neuromod/"
            "friends_annotations/annotation_results/community_based"
        )
    ufs_filepath = (
        Path(ufs_root) / f"s{s_num}" / f"friends_s0{s_num}e{e_num}_ufs.txt"
    )
    with open(ufs_filepath, 'r', encoding='utf-8') as f:
        ufs_transcript = f.read()

    ufs_processed = []
    for line in ufs_transcript.splitlines():
        # Remove all text inside parentheses and squared brackets (transcriber notes)
        line = re.sub(r'\s*\(.*?\)\s*', ' ', line)
        line = re.sub(r'\s*\[.*?\]\s*', ' ', line)
        line = line.strip()
        # Only lines containing a colon are treated as "speaker: speech";
        # everything else (scene descriptions, blank lines) is skipped.
        if ":" not in line:
            continue
        # Split on the FIRST colon only, so later colons (e.g. clock times
        # such as "5:30") stay inside the speech.
        speaker, speech = line.lower().split(":", 1)

        # Split the speech into sentences on whitespace that follows ., ? or !
        # NOTE(review): the text is lower-cased before this split, so the
        # `[A-Z][a-z]\.` look-behind (abbreviations like "Mr.") never fires.
        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', speech)
        for sentence in sentences:
            if sentence.strip():
                ufs_processed.append((speaker, sentence))

    return ufs_processed


def prep_aa(s_num, e_num, aa_root=None):
    """
    Load cleaned up AssemblyAI speech2text transcripts
    (theme song removed)

    Args:
        s_num: season number as used in the file name, e.g. "7".
        e_num: episode segment as used in the file name, e.g. "01b".
        aa_root: optional directory that contains the per-season
            ``s{s_num}`` folders. Defaults to the original hard-coded
            location so existing calls keep working.

    Returns:
        The parsed JSON transcript.
    """
    if aa_root is None:
        aa_root = (
            "/home/mstlaure/Documents/Marie/neuromod"
            "/friends_annotations/annotation_results/Speech2Text"
        )
    # The season folder is built from the `s_num` argument; the previous
    # version read a notebook-level global `s` here by mistake.
    json_file_path = (
        Path(aa_root)
        / f"s{s_num}"
        / f"friends_s0{s_num}e{e_num}_model-AA_desc-wUtter_transcript.json"
    )
    with open(json_file_path, 'r', encoding='utf-8') as file:
        aa_transcript = json.load(file)

    return aa_transcript


def save_aa(s_num, e_num, aa_transcript, aa_root=None):
    """
    Save a speaker-labelled AssemblyAI speech2text transcript as JSON.

    Args:
        s_num: season number as used in the file name, e.g. "7".
        e_num: episode segment as used in the file name, e.g. "01b".
        aa_transcript: JSON-serialisable transcript to write.
        aa_root: optional directory that contains the per-season
            ``s{s_num}`` folders. Defaults to the original hard-coded
            location so existing calls keep working. The season folder
            must already exist.
    """
    if aa_root is None:
        aa_root = (
            "/home/mstlaure/Documents/Marie/neuromod"
            "/friends_annotations/annotation_results/Speech2Text"
        )
    # The season folder is built from the `s_num` argument; the previous
    # version read a notebook-level global `s` here by mistake.
    json_out_path = (
        Path(aa_root)
        / f"s{s_num}"
        / f"friends_s0{s_num}e{e_num}_model-AA_desc-wSpeaker_transcript.json"
    )
    with open(json_out_path, "w", encoding='utf-8') as outfile:
        json.dump(aa_transcript, outfile)
    

def string_match(
    utter_1: str,
    utter_2: str,
) -> Tuple[int, int]:
    """
    Score how closely two utterances resemble each other.

    Both strings are lower-cased and stripped of ASCII punctuation and
    whitespace before being compared, so only their letters and digits
    (and any non-ASCII symbols) contribute to the score.

    Args:
        utter_1: The first string.
        utter_2: The second string.

    Returns:
        A tuple ``(ratio, length)`` where ``ratio`` is the fuzzywuzzy
        ratio of the normalised strings and ``length`` is the character
        count of the ORIGINAL (un-normalised) first string.
    """
    drop_table = str.maketrans('', '', string.punctuation + string.whitespace)
    normalised = [
        utterance.lower().translate(drop_table)
        for utterance in (utter_1, utter_2)
    ]
    ratio = fuzz.ratio(normalised[0], normalised[1])

    return ratio, len(utter_1)

    
def find_bestmatch(
    utter: str,
    candidates: list[tuple],
    i_0 = None,
    i_n = None,
    verbose = False,
) -> list:
    """
    Find the candidate utterance that best matches a string.

    Only candidates with index in ``[i_0, i_n)`` are searched (the whole
    list when the bounds are omitted). Ties go to the earliest candidate.

    Returns ``[index, score, length]``: the index of the closest match,
    its matching score, and the length of ``utter`` (used to evaluate
    confidence when filtering). If the search range is empty the result
    is ``[-1, -1, None]``.
    """
    first = 0 if i_0 is None else i_0
    stop = len(candidates) if i_n is None else i_n

    best_idx, best_ratio, best_len = -1, -1, None
    best_text = None
    for idx in range(first, stop):
        candidate_text = candidates[idx][1]
        ratio, n_chars = string_match(utter, candidate_text)
        if ratio <= best_ratio:
            # not strictly better than what we already have
            continue
        best_idx, best_ratio, best_len = idx, ratio, n_chars
        best_text = candidate_text

    if verbose:
        print(f"{utter} \n {best_text}\n\n")

    return [best_idx, best_ratio, best_len]


def find_anchors(
    aa_transcript,
    processed_utterances,
    verbose=False,
    min_score=93,
    min_chars=35,
):
    """
    For each speech2text sentence, find highest match in fan-made transcript.
    Identify high-confidence matches to structure the matching process
    (done in chunks solved recursively)

    Args:
        aa_transcript: transcript dict with a 'sentences' list whose items
            have a 'text' entry.
        processed_utterances: list of (speaker, utterance) tuples.
        verbose: forwarded to `find_bestmatch`.
        min_score: a sentence is an anchor only if its best matching score
            is strictly greater than this value.
        min_chars: ... and its text is strictly longer than this many
            characters (short sentences match too easily to be trusted).

    Returns:
        (anchors, top_scores): `anchors` is a boolean array with one entry
        per sentence; `top_scores` is a list of
        [sentence idx, best match idx, score, sentence length] rows.
    """
    top_scores = [
        [i] + find_bestmatch(
            x['text'], processed_utterances, verbose=verbose,
        ) for i, x in enumerate(aa_transcript['sentences'])
    ]
    # Anchors are pairings with a matching score above `min_score`
    # on utterances long enough to be meaningful
    anchors = np.logical_and(
        np.array([x[2] for x in top_scores]) > min_score,
        np.array([x[3] for x in top_scores]) > min_chars,
    )
    # Discard anchors whose match idx is out of order with respect to the
    # ADJACENT sentences (not the adjacent anchors): it must be >= the match
    # of the previous sentence and <= the match of the next one.
    match_idx = [x[1] for x in top_scores]
    not_before_prev = np.array(
        [True] + [match_idx[i] >= match_idx[i-1] for i in range(1, len(match_idx))]
    )
    not_after_next = np.array(
        [match_idx[i] <= match_idx[i+1] for i in range(0, len(match_idx)-1)] + [True]
    )
    anchors = anchors * np.logical_and(not_before_prev, not_after_next)

    return anchors, top_scores


def assign_chunk(
    aa_i0,
    aa_in,
    ufs_i0,
    ufs_in,
    aa_transcript,
    processed_utterances,
    score_assignments,
    verbose=False,
):
    """
    Recursively match a chunk of speech2text sentences to utterances.

    Every sentence in ``[aa_i0, aa_in)`` is matched against the utterances
    in ``[ufs_i0, ufs_in)``. The first sentence of the chunk is treated as
    an anchor; the best-scoring remaining sentence becomes a new anchor
    that splits the chunk in two, and each half is solved the same way
    within a narrowed utterance range. Chunks of a single sentence are
    written to ``score_assignments``.

    Args:
        aa_i0, aa_in: sentence range (start included, end excluded).
        ufs_i0, ufs_in: utterance search range (start included, end excluded).
        aa_transcript: transcript dict with a 'sentences' list.
        processed_utterances: list of (speaker, utterance) tuples.
        score_assignments: array filled IN PLACE with
            [sentence idx, match idx, score, sentence length] rows.
        verbose: forwarded to `find_bestmatch`, including in recursive calls.
    """
    chunk_scores = np.array([
        [aa_i0 + j] + find_bestmatch(
            x['text'],
            processed_utterances,
            ufs_i0,
            ufs_in,
            verbose=verbose,
        ) for j, x in enumerate(aa_transcript['sentences'][aa_i0:aa_in])
    ])

    if chunk_scores.shape[0] == 0:
        # Empty chunk: nothing to assign (and the 2-D indexing below
        # would fail on an empty array)
        return

    if chunk_scores.shape[0] == 1:
        score_assignments[aa_i0:aa_in] = chunk_scores
    else:
        # identify next highest match as new "anchor", excluding the top row (anchor)
        split_idx = int(np.argmax(chunk_scores[1:, 2]) + 1)

        assert chunk_scores[0][0] == aa_i0
        assert aa_i0 + split_idx == chunk_scores[split_idx][0]
        # launch recursively on top and bottom chunks
        assign_chunk(
            chunk_scores[0][0],              # includes chunk's top sentence
            chunk_scores[split_idx][0],      # excludes chunk's split sentence
            chunk_scores[0][1],              # includes chunk's top sentence's match
            chunk_scores[split_idx][1] + 1,  # include chunk's split sentence match, in case precedent also matches that utterance
            aa_transcript,
            processed_utterances,
            score_assignments,
            verbose=verbose,
        )
        assign_chunk(
            chunk_scores[split_idx][0],      # includes split sentence
            chunk_scores[-1][0] + 1,         # includes chunk's last sentence
            chunk_scores[split_idx][1],      # includes split sentence's match
            ufs_in,                          # includes lower edge of match search space
            aa_transcript,
            processed_utterances,
            score_assignments,
            verbose=verbose,
        )
            

In [4]:

def process_chunks(
    ancs,
    tops,
    aa_transcript,
    processed_utterances,
    verbose=False,
):
    """
    Assign every speech2text sentence to an utterance, chunk by chunk.

    The anchors (rows of ``tops`` selected by the boolean mask ``ancs``)
    cut the transcript into chunks that are each solved by `assign_chunk`:
    first the chunks between consecutive anchors, then the tail from the
    last anchor to the end, and finally the head before the first anchor.

    Returns:
        (anchor_idx, score_assignments): the anchor rows, and an int array
        with one [sentence idx, match idx, score, length] row per sentence
        (initialised to -1 and filled by `assign_chunk`).
    """
    score_assignments = np.full((len(tops), 4), -1, dtype=int)
    anchor_idx = np.array(tops)[ancs]
    n_anchors = anchor_idx.shape[0]

    # Chunks delimited by two consecutive anchors
    for current, following in zip(anchor_idx[:-1], anchor_idx[1:]):
        assign_chunk(
            current[0],          # current anchor's sentence is part of the chunk
            following[0],        # next anchor's sentence is not
            current[1],          # search starts at current anchor's match
            following[1] + 1,    # ... and reaches the next anchor's match, which earlier text may share
            aa_transcript,
            processed_utterances,
            score_assignments,
            verbose=verbose,
        )

    # Tail: from the last anchor down to the final sentence
    if n_anchors > 0:
        last_anchor = anchor_idx[-1]
        j_n = forward2btm(
            last_anchor,
            aa_transcript,
            processed_utterances
        )
        assign_chunk(
            last_anchor[0],                     # last anchor's sentence is part of the chunk
            len(aa_transcript['sentences']),    # all remaining sentences
            last_anchor[1],                     # search starts at last anchor's match
            j_n + 1,                            # ... and reaches the last sentence's match
            aa_transcript,
            processed_utterances,
            score_assignments,
            verbose=verbose,
        )

    # Head: sentences that come before the first anchor, if any
    first_anchor = anchor_idx[0]
    if first_anchor[0] > 0:
        j_0 = backtrack2top(
            first_anchor,
            aa_transcript,
            processed_utterances,
        )
        assign_chunk(
            0,                        # from the very first sentence
            first_anchor[0],          # up to, but excluding, the first anchor's sentence
            j_0,                      # search starts at the first sentence's match
            first_anchor[1] + 1,      # ... and reaches the first anchor's match, which earlier text may share
            aa_transcript,
            processed_utterances,
            score_assignments,
            verbose=verbose,
        )

    return anchor_idx, score_assignments


def backtrack2top(
    start_vals,   # anchor_idx[0]
    aa_transcript,
    processed_utterances,
):
    """
    Starting from the first anchor, work backward to find the starting point
    of the (half / third / quarter episode) AA transcript in the
    community transcript (full episode).

    Args:
        start_vals: the first anchor's row; its first two entries are the
            sentence idx ``i`` and the matched utterance idx ``j``.
        aa_transcript: transcript dict with a 'sentences' list.
        processed_utterances: list of (speaker, utterance) tuples.

    Returns:
        The index ``k <= j`` of the utterance at which the transcript is
        estimated to start. Falls back to ``j`` when there is nothing to
        backtrack over.
    """
    i, j, _, _ = start_vals
    if i == 0:
        # Edge case for which the first anchor is also the first speech2text sentence
        # No unassigned sentences remain at the top, nothing to do
        return j

    # Concatenate all text from top sentences (including first anchor)
    aa_toptext = " ".join([s['text'] for s in aa_transcript['sentences'][:i+1]])

    # Grow the community text backward one utterance at a time and keep the
    # start index giving the best match. The fallback is `j` itself: when
    # j == 0 there is no earlier utterance, and returning -1 (as before)
    # would be used downstream as a wrap-around list index.
    top_score = (j, -1)
    target_text = processed_utterances[j][1]

    for k in range(j-1, -1, -1):
        target_text = " ".join([processed_utterances[k][1], target_text])
        score = (k, string_match(aa_toptext, target_text)[0])
        if score[1] > top_score[1]:
            top_score = score

    return top_score[0]


def forward2btm(
    start_vals,   # anchor_idx[-1]
    aa_transcript,
    processed_utterances,
):
    """
    Starting from the last anchor, work forward to find the end point
    of the (half / third / quarter episode) AA transcript in the
    community transcript (full episode).

    Args:
        start_vals: the last anchor's row; its first two entries are the
            sentence idx ``i`` and the matched utterance idx ``j``.
        aa_transcript: transcript dict with a 'sentences' list.
        processed_utterances: list of (speaker, utterance) tuples.

    Returns:
        The index ``k >= j`` of the utterance at which the transcript is
        estimated to end. Falls back to ``j`` when there is nothing to
        move forward over.
    """
    i, j, _, _ = start_vals
    if i == len(aa_transcript['sentences']) - 1:
        # Edge case for which the last anchor is also the last speech2text sentence
        # No unassigned sentences remain at the bottom, nothing to do
        return j

    # Concatenate all text from last anchor to last sentence (both included)
    aa_btmtext = " ".join([s['text'] for s in aa_transcript['sentences'][i:]])

    # Grow the community text forward one utterance at a time and keep the
    # end index giving the best match. The fallback is `j` itself: when j is
    # the final utterance there is nothing to append, and returning -1 (as
    # before) would give the caller an empty search range.
    top_score = (j, -1)
    target_text = processed_utterances[j][1]

    for k in range(j+1, len(processed_utterances), 1):
        target_text = " ".join([target_text, processed_utterances[k][1]])
        score = (k, string_match(aa_btmtext, target_text)[0])
        if score[1] > top_score[1]:
            top_score = score

    return top_score[0]
                

In [5]:

def merge_segs(aa_t, i):
    """
    Fuse sentence ``i`` and sentence ``i + 1`` into one sentence.

    The merged sentence spans from the start of the first to the end of
    the second, concatenates their texts (space-separated) and word lists,
    and has its speaker and confidence reset to None. The transcript's
    'sentences' entry is replaced by a new, shorter list.

    If ``i`` is negative or has no right-hand neighbour, the transcript is
    returned untouched.
    """
    sentences = aa_t['sentences']
    if i < 0 or i + 1 >= len(sentences):
        return aa_t

    left, right = sentences[i], sentences[i + 1]
    merged = {
        'text': " ".join((left['text'], right['text'])),
        'start': left['start'],
        'end': right['end'],
        'speaker': None,
        'confidence': None,
        'words': left['words'] + right['words'],
    }
    aa_t['sentences'] = sentences[:i] + [merged] + sentences[i + 2:]

    return aa_t


def jitter_segs(aa_t, i, split_idx):
    """
    Move the boundary between sentences ``i`` and ``i + 1`` (in place).

    The words of both sentences are pooled; the first ``split_idx`` words
    go to sentence ``i`` and the rest to sentence ``i + 1``. Texts are
    rebuilt from the words, and the end of the first / start of the second
    sentence are taken from the words now sitting at the boundary.
    """
    left = aa_t['sentences'][i]
    right = aa_t['sentences'][i + 1]
    pooled = left['words'] + right['words']
    head, tail = pooled[:split_idx], pooled[split_idx:]

    left['words'] = head
    left['text'] = " ".join(w['word'] for w in head)
    left['end'] = head[-1]['end']

    right['words'] = tail
    right['text'] = " ".join(w['word'] for w in tail)
    right['start'] = tail[0]['start']

    return aa_t
    

def split_seg(aa_t, i, w2drop):
    """
    Cut sentence ``i`` in two after its first ``w2drop`` words.

    A new sentence holding those first words (speaker and confidence set
    to None) is inserted at position ``i``; the original sentence keeps
    the remaining words, gets its text and start updated, and moves to
    position ``i + 1``.
    """
    remainder = aa_t['sentences'][i]
    words = remainder["words"]
    head, tail = words[:w2drop], words[w2drop:]

    leading = {
        'text': " ".join(w['word'] for w in head),
        'start': words[0]['start'],
        'end': words[w2drop - 1]['end'],
        'speaker': None,
        'confidence': None,
        'words': head,
    }

    # shrink the original sentence to what is left after the cut
    remainder['text'] = " ".join(w['word'] for w in tail)
    remainder['start'] = words[w2drop]['start']
    remainder['words'] = tail

    before, after = aa_t['sentences'][:i], aa_t['sentences'][i:]
    aa_t['sentences'] = before + [leading] + after

    return aa_t
    

def find_best_split(wlist, target1, target2, i_start=None, i_stop=None):
    """
    Find where to cut a concatenated list of words so that the left part
    best matches ``target1`` and the right part best matches ``target2``.

    Cut positions in ``[i_start, i_stop)`` are tried (all positions from 0
    to ``len(wlist)`` when omitted). A cut at 0 scores the whole text
    against ``target2`` only; a cut at ``len(wlist)`` scores it against
    ``target1`` only; any other cut is scored as the sum of both matches.

    Returns the cut position with the highest score (the first one on
    ties) and that score.
    """
    first = 0 if i_start is None else i_start
    stop = len(wlist) + 1 if i_stop is None else i_stop

    def _as_text(words):
        return " ".join([w['word'] for w in words])

    def _score(cut):
        if cut == 0:
            return string_match(_as_text(wlist), target2)[0]
        if cut == len(wlist):
            return string_match(_as_text(wlist), target1)[0]
        left_score = string_match(_as_text(wlist[:cut]), target1)[0]
        right_score = string_match(_as_text(wlist[cut:]), target2)[0]
        return left_score + right_score

    scores = [(cut, _score(cut)) for cut in range(first, stop)]
    split_idx, best_score = scores[np.argmax([entry[1] for entry in scores])]

    return split_idx, best_score


def find_words2drop(wlist, target, fromleft=True):
    """
    Determine the number of words to drop from sentence edge
    to maximize target match

    Args:
        wlist: list of word dicts, each with a 'word' entry.
        target: the utterance text to match.
        fromleft: drop words from the start of the sentence when True,
            from its end when False.

    Returns:
        (words2drop, best_score): the number of dropped words (between 0
        and ``len(wlist) - 1``, the smallest one on ties) that gives the
        best match, and that match's score.

    Raises:
        ValueError: if ``wlist`` is empty (nothing to score).
    """
    n_words = len(wlist)
    scores = []
    for n_drop in range(n_words):
        # words that remain once `n_drop` words are removed from the chosen edge
        kept = wlist[n_drop:] if fromleft else wlist[:n_words - n_drop]
        scores.append((n_drop, string_match(
            " ".join([w['word'] for w in kept]),
            target,
        )[0]))

    words2drop, best_score = scores[np.argmax([x[1] for x in scores])]

    return words2drop, best_score
    
    
def adjacent_segs(aa_t, p_utter, i, j):
    """
    Re-cut sentences ``i`` and ``i + 1`` so they best match utterances
    ``j`` and ``j + 1``.

    Returns the transcript and a tuple with the utterance index of each
    resulting sentence: ``(j + 1,)`` or ``(j,)`` when both sentences were
    merged because all their words belong to a single utterance, and
    ``(j, j + 1)`` otherwise (boundary moved, kept as is, or no right-hand
    neighbour to work with).
    """
    sentences = aa_t['sentences']
    if i + 1 >= len(sentences):
        # no neighbouring sentence: nothing to adjust
        return aa_t, (j, j+1)

    # best cut through the pooled words of the two sentences
    pooled = sentences[i]['words'] + sentences[i+1]['words']
    split_idx, best_score = find_best_split(
        pooled,
        p_utter[j][1],
        p_utter[j+1][1],
    )

    if split_idx == 0:
        # every word goes to utterance j+1
        return merge_segs(aa_t, i), (j+1,)

    if split_idx == len(pooled):
        # every word goes to utterance j
        return merge_segs(aa_t, i), (j,)

    if split_idx != len(sentences[i]['words']):
        # best cut is not the current boundary: shift words across it
        aa_t = jitter_segs(aa_t, i, split_idx)

    return aa_t, (j, j+1)


def gap_segs(aa_t, p_utter, i, j1, j2):
    """
    Re-segment sentences ``i`` and ``i + 1`` to match utterances ``j1`` to ``j2``.

    Sentences ``i`` and ``i + 1`` are merged, then the merged sentence is
    cut again at the word positions that best separate consecutive
    utterances between ``j1`` and ``j2``.

    Args:
        aa_t: transcript dict with a 'sentences' list; modified through
            `merge_segs` / `split_seg`.
        p_utter: list of (speaker, utterance) tuples.
        i: index of the first of the two sentences to re-segment.
        j1, j2: utterance indices currently matched to sentences
            ``i`` and ``i + 1``.

    Returns:
        (aa_t, j_vals): the transcript and the list of utterance indices
        assigned to the resulting sentences, starting at sentence ``i``
        (always ends with ``j2``).
    """
    # concat four consecutive sentences for more context
    # (i-1 to i+2, clipped to the transcript's bounds)
    i_0 = max(0, i - 1)
    i_n = min(i+3, len(aa_t['sentences']))
    temp_wlist = [w for s in aa_t['sentences'][i_0:i_n] for w in s["words"]]

    # Set search space boundaries: limit split placements to two middle sentences i and i+1
    # i_start: number of context words before sentence i (0 when i is the first sentence)
    # i_stop: exclusive upper bound, one past the last word of sentence i+1
    i_start = 0 if i == 0 else len(aa_t['sentences'][i_0]['words'])
    i_stop = len([w for s in aa_t['sentences'][i_0:i+2] for w in s["words"]]) + 1
    
    """
    Find best split for each target between j1 and j2
    """
    # utterance context: one utterance before j1 and one after j2, clipped to bounds
    j_0 = max(0, j1-1)
    j_n = min(j2+2, len(p_utter))
    split_indices = []
    for k in range(1, j2-j1+1):
        # targ1: utterances up to (excluding) j1+k; targ2: utterances from j1+k on
        targ1 = " ".join([u[1] for u in p_utter[j_0:j1+k]])
        targ2 = " ".join([u[1] for u in p_utter[j1+k:j_n]])

        split_idx, best_score = find_best_split(
            temp_wlist, 
            targ1, 
            targ2,
            i_start=i_start, 
            i_stop=i_stop,
        )    
        split_indices.append((k, split_idx, best_score))

    # merge two middle sentences
    aa_t = merge_segs(aa_t, i)

    # Perform splits and save their target (j_vals)
    # Cut positions equal to either edge of the merged sentence, or already
    # used, are skipped. Positions in `split_indices` are relative to
    # `temp_wlist`, hence the `i_start + cutwords` offset when cutting.
    # NOTE(review): this assumes the split positions do not decrease as k
    # grows; a smaller position than a previous cut would give a negative
    # word count — confirm.
    prev_cuts = {i_start, i_stop-1}
    j_vals = []
    cutwords = 0
    n_cuts = 0
    for k, s, b in split_indices:  # b (the split's score) is not used
        if s not in prev_cuts:
            w2cut = s-(i_start+cutwords)
            aa_t = split_seg(aa_t, i+n_cuts, s-(i_start+cutwords))
            j_vals.append(j1+k-1)
            prev_cuts.add(s)
            cutwords += w2cut
            n_cuts += 1
    # the last (or only) resulting sentence is assigned to j2
    j_vals.append(j2)    
        
    return aa_t, j_vals
    

In [6]:

def print_gapinfo(
    i1,
    i2,
    j1,
    j2,
    aa_t,
    p_utter,
):
    """
    Print debugging context for a large jump in matching indices.

    Shows the text of sentences ``i1`` and ``i2``, then the text of their
    matched utterances ``j1`` and ``j2``, then every utterance from ``j1``
    to ``j2`` (both included), with a blank line between the three groups.
    """
    for sentence_idx in (i1, i2):
        print(aa_t['sentences'][sentence_idx]['text'])
    print()
    for utterance_idx in (j1, j2):
        print(p_utter[utterance_idx][1])
    print()
    for utterance in p_utter[j1:j2 + 1] if j1 >= 0 and j2 + 1 >= 0 else [p_utter[m] for m in range(j1, j2 + 1)]:
        print(utterance[1])
        

def _drop_score_row(score_assignments, row):
    """
    Remove row ``row`` from the score table after its sentence was merged
    away, shifting down by one the sentence index of every later row.
    """
    score_assignments[row+1:, 0] = score_assignments[row+1:, 0] - 1
    return np.concatenate(
        (score_assignments[:row], score_assignments[row+1:])
    )


def finetune_segments(
    anchor_idx,
    aa_transcript,
    processed_utterances,
    score_assignments,
):
    """
    Adjust sentence boundaries to increase match with target utterances

    Changes are made in-place in the transcript dictionary (sentences and
    their word lists are split and merged to optimize the fit)

    Walking through consecutive sentence pairs (i, i+1):
    - both matched to the same utterance: the two sentences are merged;
    - matched to different utterances: the pair is re-segmented by
      `gap_segs`; when the utterance indices are more than 3 apart, the
      pair is first widened by merging in the next and previous sentences.

    Args:
        anchor_idx: not used by this function (kept for the call signature).
        aa_transcript: transcript dict with a 'sentences' list.
        processed_utterances: list of (speaker, utterance) tuples.
        score_assignments: array of [sentence idx, match idx, score, length]
            rows, one per sentence. Rows are also edited in place before
            the array is rebuilt, so use the returned array afterwards.

    Returns:
        (aa_transcript, score_assignments) with one score row per
        remaining sentence.
    """
    # The score table is always updated with the general "remove / replace
    # rows and shift the following indices" path. The previous special cases
    # for "last sentence" tested the sentence count AFTER it had changed,
    # so they only fired when exactly one sentence followed the pair, and
    # then dropped that sentence's row instead of the merged-away one.
    i = 0
    while i < len(aa_transcript['sentences']):
        if i + 1 == len(aa_transcript['sentences']):
            # Last segment, just exit loop
            break

        j1 = score_assignments[i][1]
        j2 = score_assignments[i+1][1]
        assert j1 <= j2

        # two subsequent sentences w same match: merge sentences
        if j1 == j2:
            aa_transcript = merge_segs(
                aa_transcript, i,
            )
            s, l = string_match(
                aa_transcript['sentences'][i]['text'],
                processed_utterances[j1][1],
            )
            score_assignments[i] = np.array([i, j1, s, l])
            score_assignments = _drop_score_row(score_assignments, i + 1)
            # Do NOT update i, sentence list shortened by one

        # Normally covers all other scenarios
        elif j2 > j1:

            if j2 - j1 > 3:
                # Handle large gaps... merge two next consecutive segments, assign second seg's match to scores, then process
                print(f"Warning! Large jump of {j2 - j1} in matching indices between lines {i} and {i+1}")

                print_gapinfo(
                    i, i+1, j1, j2,
                    aa_transcript,
                    processed_utterances,
                )
                # Merge i+1 with next sentence to broaden context
                if i + 2 < len(aa_transcript['sentences']):
                    j2 = score_assignments[i+2][1]
                    aa_transcript = merge_segs(
                        aa_transcript, i+1,    # merge i+1 and i+2
                    )
                    s, l = string_match(
                        aa_transcript['sentences'][i+1]['text'],
                        processed_utterances[j2][1],
                    )
                    score_assignments[i+1] = np.array([i+1, j2, s, l])
                    score_assignments = _drop_score_row(score_assignments, i + 2)

                # Merge i with previous sentence to broaden context
                if i > 0:
                    i -= 1
                    j1 = score_assignments[i][1]
                    aa_transcript = merge_segs(
                        aa_transcript, i,   # merge i-1 and i
                    )
                    s, l = string_match(
                        aa_transcript['sentences'][i]['text'],
                        processed_utterances[j1][1],
                    )
                    score_assignments[i] = np.array([i, j1, s, l])
                    score_assignments = _drop_score_row(score_assignments, i + 1)

            # Re-segment the pair: sentences i and i+1 become len(j_vals) sentences
            aa_transcript, j_vals = gap_segs(
                aa_transcript, processed_utterances, i, j1, j2,
            )
            temp_scores = []
            for k in range(len(j_vals)):
                s, l = string_match(
                    aa_transcript['sentences'][i+k]['text'],
                    processed_utterances[j_vals[k]][1],
                )
                temp_scores.append([i+k, j_vals[k], s, l])
            # Two rows are replaced by len(j_vals) rows: shift what follows
            score_assignments[i+2:, 0] = score_assignments[i+2:, 0] + len(j_vals) - 2
            score_assignments = np.concatenate(
                (score_assignments[:i], np.array(temp_scores), score_assignments[i+2:])
            )
            i += len(j_vals) - 1

        else:
            # Only reachable when assertions are disabled and j2 < j1:
            # move on rather than loop forever
            i += 1

    return aa_transcript, score_assignments



In [7]:

def give_speaker_labels(
    aa_t,
    p_utter,
    score_assignments,
):
    """
    Copy speaker names from matched utterances onto the transcript.

    For every sentence, the speaker of its assigned utterance (second
    column of ``score_assignments``) is written to the sentence, to each
    of the sentence's words, and to the corresponding entries of the
    transcript-level word list. The transcript is modified in place and
    returned.

    Consistency between sentences, words and score rows is asserted along
    the way; an AssertionError means the inputs are out of sync.
    """
    word_list = aa_t['words']

    # Sanity checks: sentence words must tile the global word list exactly,
    # and there must be one score row per sentence
    flattened = [w for s in aa_t['sentences'] for w in s["words"]]
    assert word_list == flattened
    assert len(word_list) == len(flattened)
    assert len(aa_t['sentences']) == score_assignments.shape[0]

    cursor = 0
    for i, sen in enumerate(aa_t['sentences']):
        i_sa, j_sa, _, _ = score_assignments[i]
        assert i_sa == i

        speaker_name = p_utter[j_sa][0]

        tokens = [w['word'] for w in sen['words']]
        assert sen['text'].split(" ") == tokens
        assert sen['text'] == " ".join(tokens)
        stop = cursor + len(sen['words'])
        global_words = word_list[cursor:stop]
        assert sen['words'] == global_words

        sen['speaker'] = speaker_name
        for w in (*sen['words'], *global_words):
            w['speaker'] = speaker_name

        cursor = stop

    return aa_t



In [8]:
"""
Main function to be called in a loop on each transcript
"""

def assign_speakers(s, e):
    """
    Run the full speaker-labelling pipeline on one transcript and save it.

    s (str) season, e.g., "1", "3"
    e (str) episode, e.g., "02a", "23c"

    The fan-made transcript is looked up by the first two characters of
    `e` (the episode number), the speech2text transcript by the full `e`.
    """
    processed_ufs = prep_ufs(s, e[:2])
    transcript_aa = prep_aa(s, e)

    # Identify long segments w high confidence matches
    ancs, tops = find_anchors(
        transcript_aa,
        processed_ufs,
        verbose=True,
    )

    # Assign transcribed segments to utterances
    anchor_idx, score_assignments = process_chunks(
        ancs,
        tops,
        transcript_aa,
        processed_ufs,
        verbose=True,
    )

    # TODO: Launch iteratively until no more jumps, then do sanity check
    # finetune_segments takes the anchors as its first argument; the call
    # previously omitted it and failed with a TypeError.
    transcript_aa, score_assignments = finetune_segments(
        anchor_idx,
        transcript_aa,
        processed_ufs,
        score_assignments,
    )

    # Assign UFS speaker labels to AA transcript words & sentences
    transcript_aa = give_speaker_labels(
        transcript_aa,
        processed_ufs,
        score_assignments,
    )

    # Save output
    save_aa(s, e, transcript_aa)


# Tests & dev

In [9]:
# Dev run on a single transcript: season 7, episode 1, segment "b"
# NOTE: `prep_aa` and `save_aa` read the notebook-level name `s` to build
# their season folder, so `s` must keep this name.
s = "7"
e = "01b"

# Fan-made transcript is per episode (first two characters of `e`);
# speech2text transcript is per segment (full `e`)
processed_utterances = prep_ufs(s, e[:2])
aa_transcript = prep_aa(s, e)

# High-confidence sentence/utterance pairings used to structure the matching
ancs, tops = find_anchors(
    aa_transcript,
    processed_utterances,
    verbose=False,
)

# One [sentence idx, match idx, score, length] row per sentence
anchor_idx, score_assignments = process_chunks(
    ancs,
    tops,
    aa_transcript,
    processed_utterances,   
    verbose=False,
)


In [10]:
# QC before fine-tuning: each sentence next to the utterance it was matched to
print(len(aa_transcript['sentences']), score_assignments.shape)
for i, row in enumerate(score_assignments):
    sentence_idx, utterance_idx = row[0], row[1]
    print(i, sentence_idx, aa_transcript['sentences'][sentence_idx]['text'])
    print(utterance_idx, processed_utterances[utterance_idx][1])
    print()


295 (295, 4)
0 0 Anyway, Phoebe, come on, let's go.
196  phoebe!

1 1 Come on.
197 come on!

2 2 Why aren't you dressed yet?
200 why aren’t you dressed yet?!

3 3 I'm sorry, but I just wrote the best dance song for your wedding.
201  i’m sorry, but i just wrote the best dance song for your wedding.

4 4 Check this out.
202 check this out.

5 5 You know what, Phoebe?
202 check this out.

6 6 I'll tell you what.
203  no, phoebe, i’ll tell you what, if you get ready now i’ll let you play it at the wedding.

7 7 If you get ready now, I'll let you play it at the wedding.
203  no, phoebe, i’ll tell you what, if you get ready now i’ll let you play it at the wedding.

8 8 Really?
204  really?!

9 9 Yes.
204  really?!

10 10 Oh, that's so exciting.
205 oh that’s so exciting!

11 11 Thank you.
206 thank you!

12 12 Thanks, Mon.
207 thanks mon!

13 13 Oh, but Mon, if you touch my guitar again, I'm gonna have to pound on you a little bit.
208 oh but mon, if you touch my guitar again i’ll have to p

In [11]:
# Flag consecutive sentences whose matches go backward or skip ahead by
# more than two utterances: prints the sentence idx and the jump size
for previous_row, current_row in zip(score_assignments[:-1], score_assignments[1:]):
    jump = current_row[1] - previous_row[1]
    if jump < 0 or jump > 2:
        print(current_row[0], jump)
            

2 3
35 11
70 3
120 4
139 3
203 3


In [18]:

# Merge / split sentences to better fit the utterances.
# NOTE: `aa_transcript` is modified in place, so this cell is not
# idempotent: reload the transcript and re-run the matching cells above
# before running it again.
aa_transcript, score_assignments = finetune_segments(
    anchor_idx, 
    aa_transcript,
    processed_utterances,   
    score_assignments,
)


In [19]:
# QC after fine-tuning: each sentence next to the utterance it was matched to
print(len(aa_transcript['sentences']), score_assignments.shape)
for i, row in enumerate(score_assignments):
    sentence_idx, utterance_idx = row[0], row[1]
    print(i, sentence_idx, aa_transcript['sentences'][sentence_idx]['text'])
    print(utterance_idx, processed_utterances[utterance_idx][1])
    print()


254 (254, 4)
0 0 Anyway, Phoebe,
196  phoebe!

1 1 come on,
197 come on!

2 2 let's go.
198 let’s go!

3 3 Come on.
199 come on!

4 4 Why aren't you dressed yet?
200 why aren’t you dressed yet?!

5 5 I'm sorry, but I just wrote the best dance song for your wedding.
201  i’m sorry, but i just wrote the best dance song for your wedding.

6 6 Check this out.
202 check this out.

7 7 You know what, Phoebe? I'll tell you what. If you get ready now, I'll let you play it at the wedding.
203  no, phoebe, i’ll tell you what, if you get ready now i’ll let you play it at the wedding.

8 8 Really? Yes.
204  really?!

9 9 Oh, that's so exciting.
205 oh that’s so exciting!

10 10 Thank you.
206 thank you!

11 11 Thanks, Mon.
207 thanks mon!

12 12 Oh, but Mon, if you touch my guitar again, I'm gonna have to pound on you a little bit.
208 oh but mon, if you touch my guitar again i’ll have to pound on you for a little bit.

13 13 Fair enough. Now go get ready.
209  fair enough, now go get ready!

14 1

In [20]:
# Remaining skipped utterances after fine-tuning: prints the sentence idx
# and the jump size wherever the match advances by more than one utterance
for previous_row, current_row in zip(score_assignments[:-1], score_assignments[1:]):
    jump = current_row[1] - previous_row[1]
    if jump > 1:
        print(current_row[0], jump)

24 2
59 2
63 2
88 2
89 2
176 2
244 2


In [21]:
# Write the matched utterances' speaker names onto sentences and words (in place)
aa_transcript = give_speaker_labels(aa_transcript, processed_utterances, score_assignments)


In [22]:

# Visual QC after assigning speaker labels: labelled sentence vs. matched utterance
print(len(aa_transcript['sentences']), score_assignments.shape)

for i, (i_idx, j_idx) in enumerate(score_assignments[:, :2]):
    sen = aa_transcript['sentences'][i_idx]
    utt = processed_utterances[j_idx]

    print(i, i_idx, f"{sen['speaker']}: {sen['text']}")
    print(j_idx, f"{utt[0]}: {utt[1]}")
    print()



254 (254, 4)
0 0 monica: Anyway, Phoebe,
196 monica:  phoebe!

1 1 monica: come on,
197 monica: come on!

2 2 monica: let's go.
198 monica: let’s go!

3 3 monica: Come on.
199 monica: come on!

4 4 monica: Why aren't you dressed yet?
200 monica: why aren’t you dressed yet?!

5 5 phoebe: I'm sorry, but I just wrote the best dance song for your wedding.
201 phoebe:  i’m sorry, but i just wrote the best dance song for your wedding.

6 6 phoebe: Check this out.
202 phoebe: check this out.

7 7 monica: You know what, Phoebe? I'll tell you what. If you get ready now, I'll let you play it at the wedding.
203 monica:  no, phoebe, i’ll tell you what, if you get ready now i’ll let you play it at the wedding.

8 8 phoebe: Really? Yes.
204 phoebe:  really?!

9 9 phoebe: Oh, that's so exciting.
205 phoebe: oh that’s so exciting!

10 10 phoebe: Thank you.
206 phoebe: thank you!

11 11 phoebe: Thanks, Mon.
207 phoebe: thanks mon!

12 12 phoebe: Oh, but Mon, if you touch my guitar again, I'm gonna hav

# Scripts graveyard

"Oh well, that sure didn't work!"

In [None]:

def split_rec(
    temp_wlist, 
    p_utter, 
    split_indices, 
    j1,
    j2,
    w_start, 
    w_stop, 
    k_0, 
    k_n,
):
    """
    Recursively pick one word-level split point per utterance boundary.

    For every k in range(k_0, k_n) the utterance texts p_utter[j_0:j1+k] and
    p_utter[j1+k:j_n] are joined into a "left" and a "right" target string and
    handed to find_best_split (defined elsewhere in the notebook), restricted
    to the word window [w_start, w_stop]. The best-scoring k is recorded in
    split_indices, then the function recurses on the boundaries to its left
    (word window capped at the chosen split) and to its right (word window
    starting at the chosen split).

    Args:
        temp_wlist: word list forwarded untouched to find_best_split.
        p_utter: sequence of (speaker, text) utterances; only item [1] is read.
        split_indices: dict mapping k -> chosen split index; updated in place.
        j1, j2: utterance indices bounding the gap being split.
        w_start, w_stop: word-index window; find_best_split receives
            i_start=w_start and i_stop=w_stop + 1.
        k_0, k_n: half-open range of boundary offsets still to resolve.

    Returns:
        The same split_indices dict, with one entry for every k in
        range(k_0, k_n).
    """
    # Nothing to resolve: an empty range would otherwise leave no candidate to pick from.
    if k_0 >= k_n:
        return split_indices

    # One utterance of context on each side of the gap, clipped to the script.
    j_0 = max(0, j1-1)
    j_n = min(j2+2, len(p_utter))

    # Collect a candidate for EVERY k before choosing (the list must not be
    # reset inside the loop, or only the last k is ever considered).
    split_candidates = []
    for k in range(k_0, k_n):
        targ1 = " ".join([u[1] for u in p_utter[j_0:j1+k]])
        targ2 = " ".join([u[1] for u in p_utter[j1+k:j_n]])

        split_idx, score = find_best_split(
            temp_wlist, 
            targ1, 
            targ2,
            i_start=w_start, 
            i_stop=w_stop + 1,
        )    
        split_candidates.append((k, split_idx, score))

    # np.argmax returns the first maximum, so ties go to the smallest k.
    best_k, best_split, _ = split_candidates[
        int(np.argmax([cand[2] for cand in split_candidates]))
    ]

    # Always record the winner: the recursive calls below only cover the
    # offsets on either side of best_k, and callers read every k in the range.
    split_indices[best_k] = best_split

    if best_k > k_0:
        # Boundaries left of the winner must fall before the winning split.
        split_indices = split_rec(
            temp_wlist, 
            p_utter, 
            split_indices, 
            j1, 
            j2,
            w_start,
            best_split, 
            k_0, 
            best_k,        
        )
    if best_k < (k_n - 1):
        # Boundaries right of the winner must fall after the winning split.
        split_indices = split_rec(
            temp_wlist, 
            p_utter, 
            split_indices, 
            j1, 
            j2,
            best_split, 
            w_stop, 
            best_k+1, 
            k_n,        
        )

    return split_indices


def gap_segs_dev(aa_t, p_utter, i, j1, j2):
    """
    Re-cut sentences i and i+1 so that they line up with utterances j1..j2.

    Split points are obtained from split_rec over a window of up to four
    consecutive sentences (i-1 .. i+2), sentences i and i+1 are merged with
    merge_segs, and the merged sentence is then cut with split_seg at every
    distinct split point that is not one of the window boundaries.
    split_rec, merge_segs and split_seg are defined elsewhere in the notebook.

    Returns:
        (aa_t, j_vals): the transcript returned by the last merge/split call,
        and the list of utterance indices for the resulting pieces (one entry
        per cut, followed by j2).
    """
    sentences = aa_t['sentences']

    # Word list of the context window: one sentence before i, up to two after i+... (i-1 .. i+2).
    window_first = max(0, i - 1)
    window_end = min(i + 3, len(sentences))
    context_words = []
    for sent in sentences[window_first:window_end]:
        context_words.extend(sent["words"])

    # Search space boundaries: splits are limited to the two middle sentences i and i+1.
    if i == 0:
        w_start = 0
    else:
        w_start = len(sentences[window_first]['words'])
    w_stop = sum(len(sent["words"]) for sent in sentences[window_first:i + 2])

    # Recursive search for one split index per boundary offset k.
    k_0 = 1
    k_n = j2 - j1 + 1
    split_indices = split_rec(
        context_words,
        p_utter,
        {},
        j1,
        j2,
        w_start,
        w_stop,
        k_0,
        k_n,
    )

    # Merge the two middle sentences before cutting them again.
    aa_t = merge_segs(aa_t, i)

    used_splits = {w_start, w_stop}
    words_already_cut = 0
    j_vals = []
    for k in range(k_0, k_n):
        split_at = split_indices[k]
        if split_at in used_splits:
            continue
        # Offset of the cut relative to the start of the piece that is still uncut.
        cut_offset = split_at - (w_start + words_already_cut)
        # Each earlier cut pushed the remaining piece one sentence further down.
        aa_t = split_seg(aa_t, i + len(j_vals), cut_offset)
        j_vals.append(j1 + k - 1)
        used_splits.add(split_at)
        words_already_cut += cut_offset
    j_vals.append(j2)

    return aa_t, j_vals



In [5]:
def process_chunks_dev(
    ancs,
    tops,
    aa_transcript,
    processed_utterances,
    verbose=False,
):
    """
    Assign speech-to-text sentences to script utterances, anchor by anchor.

    A (len(tops), 4) integer table filled with -1 is created, the anchor rows
    are selected as np.array(tops)[ancs], and then:
      * every pair of consecutive anchors is handed to assign_chunk
        (defined elsewhere in the notebook);
      * sentences above the first anchor are handled by assign_top;
      * sentences below the last anchor are handled by assign_btm.

    Returns:
        (anchor_idx, score_assignments)
    """
    score_assignments = np.full((len(tops), 4), -1, dtype=int)
    anchor_idx = np.array(tops)[ancs]

    # Chunks bounded by two anchors. The segment after the last anchor has no
    # closing anchor, so it is left to assign_btm below.
    for anchor, next_anchor in zip(anchor_idx[:-1], anchor_idx[1:]):
        assign_chunk(
            anchor[0],            # current anchor's sentence index (included in search)
            next_anchor[0],       # next anchor's sentence index (excluded from search)
            anchor[1],            # current anchor's matched utterance (included in search)
            next_anchor[1] + 1,   # next anchor's matched utterance is included as well,
                                  # in case preceding text also matches that utterance
            aa_transcript,
            processed_utterances,
            score_assignments,
            verbose=verbose,
        )

    # Sentences before the first anchor.
    assign_top(
        anchor_idx[0],
        aa_transcript,
        processed_utterances,
        score_assignments,
    )
    # Sentences after the last anchor.
    assign_btm(
        anchor_idx[-1],
        aa_transcript,
        processed_utterances,
        score_assignments,
    )

    return anchor_idx, score_assignments


def assign_top(
    start_vals,
    aa_transcript,
    processed_utterances, 
    score_assignments,
):
    """
    Walk backward from the first anchor and assign every earlier
    speech-to-text sentence to a script utterance.

    start_vals[0] is the anchor's sentence index and start_vals[1] its matched
    utterance index. For each sentence i above the anchor, five candidate
    matches are scored with string_match (defined elsewhere in the notebook);
    the best one gives the row written to score_assignments[i] and becomes the
    reference utterance for the next (earlier) sentence.

    Returns:
        score_assignments (updated in place).
    """
    first_anchor = start_vals[0]

    # Edge case: the first anchor is also the first sentence, so nothing sits above it.
    if first_anchor == 0:
        return score_assignments

    sentences = aa_transcript['sentences']
    j = int(start_vals[1])

    for i in reversed(range(first_anchor)):
        current = sentences[i]['text']
        one_back = max(0, j - 1)
        two_back = max(0, j - 2)
        has_sentence_above = i != 0

        candidates = []

        # 1: sentence + the one below it, against the utterance matched by the one below
        candidates.append((j, string_match(
            current + " " + sentences[i + 1]['text'],
            processed_utterances[j][1],
        )))

        # 2: sentence alone, against the utterance just before that match
        candidates.append((one_back, string_match(
            current,
            processed_utterances[one_back][1],
        )))

        # 3: sentence above + sentence, against the utterance just before that match
        if has_sentence_above:
            candidates.append((one_back, string_match(
                sentences[i - 1]['text'] + " " + current,
                processed_utterances[one_back][1],
            )))
        else:
            candidates.append((0, (0, 0)))

        # 4: sentence alone, against the utterance two before that match
        candidates.append((two_back, string_match(
            current,
            processed_utterances[two_back][1],
        )))

        # 5: sentence above + sentence, against the utterance two before that match
        if has_sentence_above:
            candidates.append((two_back, string_match(
                sentences[i - 1]['text'] + " " + current,
                processed_utterances[two_back][1],
            )))
        else:
            candidates.append((0, (0, 0)))

        # Highest score wins (first one on ties); its utterance index becomes
        # the reference for the next sentence up.
        winner = int(np.argmax([cand[1][0] for cand in candidates]))
        j, (s, l) = candidates[winner]

        score_assignments[i] = np.array([i, j, s, l])

    return score_assignments


def assign_btm(
    start_vals,
    aa_transcript,
    processed_utterances, 
    score_assignments,
):
    """
    Walk forward from the last anchor and assign every later
    speech-to-text sentence to a script utterance.

    The anchor row itself (start_vals) is first written to
    score_assignments[start_vals[0]]. For each sentence i below the anchor,
    five candidate matches are scored with string_match (defined elsewhere in
    the notebook); the best one gives the row written to score_assignments[i]
    and becomes the reference utterance for the next sentence.

    Returns:
        score_assignments (updated in place).
    """
    anchor_row = start_vals[0]

    # The last anchor is always recorded, even when nothing follows it.
    score_assignments[anchor_row] = start_vals

    last_i = score_assignments.shape[0] - 1

    # Edge case: the last anchor is also the last sentence, so nothing sits below it.
    if anchor_row == last_i:
        return score_assignments

    sentences = aa_transcript['sentences']
    last_j = len(processed_utterances) - 1
    j = int(start_vals[1])

    for i in range(anchor_row + 1, last_i + 1):
        current = sentences[i]['text']
        one_ahead = min(last_j, j + 1)
        two_ahead = min(last_j, j + 2)
        has_sentence_below = i != last_i

        candidates = []

        # 1: sentence above + sentence, against the utterance matched by the one above
        candidates.append((j, string_match(
            sentences[i - 1]['text'] + " " + current,
            processed_utterances[j][1],
        )))

        # 2: sentence alone, against the utterance just after that match
        candidates.append((one_ahead, string_match(
            current,
            processed_utterances[one_ahead][1],
        )))

        # 3: sentence + sentence below, against the utterance just after that match
        if has_sentence_below:
            candidates.append((one_ahead, string_match(
                current + " " + sentences[i + 1]['text'],
                processed_utterances[one_ahead][1],
            )))
        else:
            candidates.append((0, (0, 0)))

        # 4: sentence alone, against the utterance two after that match
        candidates.append((two_ahead, string_match(
            current,
            processed_utterances[two_ahead][1],
        )))

        # 5: sentence + sentence below, against the utterance two after that match
        if has_sentence_below:
            candidates.append((two_ahead, string_match(
                current + " " + sentences[i + 1]['text'],
                processed_utterances[two_ahead][1],
            )))
        else:
            candidates.append((0, (0, 0)))

        # Highest score wins (first one on ties); its utterance index becomes
        # the reference for the next sentence down.
        winner = int(np.argmax([cand[1][0] for cand in candidates]))
        j, (s, l) = candidates[winner]

        score_assignments[i] = np.array([i, j, s, l])

    return score_assignments

In [71]:

def print_gapinfo(
    i1, 
    i2, 
    j1, 
    j2, 
    aa_t, 
    p_utter,    
):
    """
    Debug helper: show both sides of a gap in the matching.

    Prints the text of sentences i1 and i2, a blank line, the text of
    utterances j1 and j2, a blank line, and finally the text of every
    utterance from j1 to j2 inclusive.
    """
    sentences = aa_t['sentences']

    # The two speech-to-text sentences on either side of the gap
    for sent_idx in (i1, i2):
        print(sentences[sent_idx]['text'])
    print()

    # The two script utterances they were matched to
    for utt_idx in (j1, j2):
        print(p_utter[utt_idx][1])
    print()

    # Everything in the script between (and including) those two utterances
    for utt_idx in range(j1, j2 + 1):
        print(p_utter[utt_idx][1])
        

def _drop_assignment_row(score_assignments, row):
    """Remove `row` from the assignment table and shift the sentence index (column 0) of every later row down by one."""
    score_assignments[row+1:, 0] = score_assignments[row+1:, 0] - 1
    return np.concatenate(
        (score_assignments[:row], score_assignments[row+1:])
    )


def finetune_segments_dev(
    anchor_idx,
    aa_transcript,
    processed_utterances,
    score_assignments,
    max_gap=7,
):
    """
    Re-segment the speech-to-text sentences so that consecutive sentences map
    to consecutive script utterances, keeping score_assignments in sync.

    Starting at sentence 1, each sentence i is compared with sentence i+1 via
    their matched utterance indices j1 and j2 (column 1 of score_assignments):

      * j1 == j2: both sentences match the same utterance, so they are merged
        (merge_segs) and the merged sentence is re-scored (string_match).
      * 0 < j2 - j1 < max_gap: the pair is re-cut by gap_segs, which returns
        the utterance index of each resulting piece; each piece is re-scored.
      * otherwise: a warning is printed and sentence i+1 is merged with i+2
        and given i+2's utterance, after which the pair (i, i+1) is retried.

    merge_segs, gap_segs and string_match are defined elsewhere in the notebook.

    Args:
        anchor_idx: not used by the current implementation.
        aa_transcript: dict with a 'sentences' list; each sentence has 'text'.
        processed_utterances: sequence of (speaker, text) utterances.
        score_assignments: 2-D array with rows [sentence_idx, utterance_idx,
            score, length]; may be modified in place.
        max_gap: jumps of this many utterances or more between two consecutive
            sentences are treated as a likely wrong match (default 7, the
            value previously hard-coded).

    Returns:
        (aa_transcript, score_assignments) with one assignment row per
        remaining sentence.
    """
    i = 1
    #i = anchor_idx[0][0]
    while i < len(aa_transcript['sentences']):
        if i + 1 == len(aa_transcript['sentences']):
            # Last sentence has no successor to compare with: just exit the loop
            i += 1

        else:
            j1 = score_assignments[i][1]
            j2 = score_assignments[i+1][1]
            assert j1 <= j2

            # Two subsequent sentences with the same match: merge them
            if j1 == j2:
                aa_transcript = merge_segs(
                    aa_transcript, i,
                )
                s, l = string_match(
                    aa_transcript['sentences'][i]['text'],
                    processed_utterances[j1][1],
                )
                score_assignments[i] = np.array([i, j1, s, l])
                # Row i+1 belonged to the sentence that was just merged away.
                # The sentence list is already one shorter here, so its length
                # must not be used to special-case the end of the table:
                # slicing handles the "row i+1 was the last row" case by itself.
                score_assignments = _drop_assignment_row(score_assignments, i+1)
                # Do NOT update i: the sentence list was shortened by one

            # Small gap: re-cut the pair. Gaps of max_gap or more are most likely a wrong match
            elif j2 > j1 and j2 - j1 < max_gap:
                aa_transcript, j_vals = gap_segs(
                    aa_transcript, processed_utterances, i, j1, j2,
                )
                temp_scores = []
                for k in range(len(j_vals)):
                    s, l = string_match(
                        aa_transcript['sentences'][i+k]['text'],
                        processed_utterances[j_vals[k]][1],
                    )
                    temp_scores.append([i+k, j_vals[k], s, l])
                # Rows i and i+1 are replaced by len(j_vals) rows; later rows
                # shift by the difference (an empty tail is a harmless no-op).
                score_assignments[i+2:, 0] = score_assignments[i+2:, 0] + len(j_vals) - 2
                score_assignments = np.concatenate(
                    (score_assignments[:i], np.array(temp_scores), score_assignments[i+2:])
                )
                # Continue from the last piece produced by the re-cut
                i += len(j_vals) - 1    

            else:
                # Large gap: merge the two next consecutive sentences, give the
                # merged sentence the second one's match, and re-try
                print(f"Warning! Large jump of {j2 - j1} in matching indices between lines {i} and {i+1}")
                # For debugging, print_gapinfo(i, i+1, j1, j2, aa_transcript,
                # processed_utterances) can be called here to inspect the gap.

                if i + 2 < len(aa_transcript['sentences']):
                    new_j2 = score_assignments[i+2][1]
                    aa_transcript = merge_segs(
                        aa_transcript, i+1,
                    )
                    s, l = string_match(
                        aa_transcript['sentences'][i+1]['text'],
                        processed_utterances[new_j2][1],
                    )
                    score_assignments[i+1] = np.array([i+1, new_j2, s, l])
                    # Row i+2 belonged to the sentence that was just merged away
                    score_assignments = _drop_assignment_row(score_assignments, i+2)

                else:
                    # Nothing left to merge with: move on
                    i += 1

    return aa_transcript, score_assignments

