In [1]:
%matplotlib inline

In [92]:
import json, glob, os, re
from pathlib import Path
import numpy as np

## Quality Checks on Community Transcripts

This notebook was used to perform quality checks and edits on fan-made community transcripts. The goal was to improve the alignment between those transcripts and the AssemblyAI speech-to-text output files, in order to assign more accuracte speaker tags to the AI transcripts.

In [3]:
"""
Support function loads and cleans up the .txt file's content
"""

def prep_ufs(s_num, e_num):
    """
    Extracts clean utterances from 
    Crazy for Friends fan-made transcripts
    """
    ufs_filepath = Path(
        "/home/mstlaure/Documents/Marie/neuromod/"
        "friends_annotations/annotation_results/"
        f"community_based/s{s_num}/friends_s0{s_num}e{e_num}_ufs.txt"
    )
    with open(ufs_filepath, 'r', encoding='utf-8') as f:
        ufs_transcript = f.read()

    ufs_processed = []
    for line in ufs_transcript.splitlines():
        # Remove all text inside parentheses and squared brackets (transcriber notes)
        line = re.sub(r'\s*\(.*?\)\s*', ' ', line)
        line = re.sub(r'\s*\[.*?\]\s*', ' ', line)
        line = line.strip()    
        # Only process lines that start with "Speaker:"
        if ":" in line:
            # Split line between speaker and utterance
            speaker, speech = line.strip().lower().split(":")
            sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', speech)
            for s in sentences:
                if s.strip():
                    ufs_processed.append((speaker, s))

    return ufs_processed

In [185]:
s = "2"

tr_list = sorted(glob.glob(
    "/home/mstlaure/Documents/Marie/neuromod/"
    "friends_annotations/annotation_results/"
    f"community_based/s{s}/friends_s0{s}e*_ufs.txt"
    
))


In [183]:
"""
Idenfity loose text segments, e.g., dialog split on multiple lines that would become unassigned to a speaker in alignment scripts
"""

words_to_skip = [
    "Closing Credits and Bloopers",
    "Closing Credits", 
    "Commercial Break",
    "Opening Credits",
    "Ending Credits",
    "OPENING TITLES",
    "THE END",
    "Commercial",
    "End",
]
pattern = re.compile('|'.join(re.escape(word) for word in words_to_skip), re.IGNORECASE)

for tr in tr_list:
    with open(tr, 'r', encoding='utf-8') as f:
        ufs_transcript = f.read()
    print(os.path.basename(tr))
          
    for line in ufs_transcript.splitlines():
        # Remove all text inside parentheses and squared brackets (transcriber notes)
        line = re.sub(r'\s*\(.*?\)\s*', ' ', line)
        line = re.sub(r'\s*\[.*?\]\s*', ' ', line)
        line = pattern.sub('', line)
        line = line.strip()    # removes leading/trailing whitespace
        # Identify lines of dialog without a "Speaker:"
        if not ":" in line and line.strip():
            print(line)
    print("\n")
    


friends_s01e01_ufs.txt


friends_s01e02_ufs.txt


friends_s01e03_ufs.txt
.


friends_s01e04_ufs.txt


friends_s01e05_ufs.txt


friends_s01e06_ufs.txt


friends_s01e07_ufs.txt
.


friends_s01e08_ufs.txt


friends_s01e09_ufs.txt


friends_s01e10_ufs.txt


friends_s01e11_ufs.txt


friends_s01e12_ufs.txt


friends_s01e13_ufs.txt


friends_s01e14_ufs.txt


friends_s01e15_ufs.txt


friends_s01e16_ufs.txt


friends_s01e17_ufs.txt


friends_s01e18_ufs.txt


friends_s01e19_ufs.txt


friends_s01e20_ufs.txt


friends_s01e21_ufs.txt


friends_s01e22_ufs.txt


friends_s01e23_ufs.txt


friends_s01e24_ufs.txt




In [188]:
"""
Identify lines with multiple ":" or ":" that don't indicate a speaker 

Also handle times (e.g., 6:30) to keep them from the transcripts


Compile lists of known speakers to ease mental load during manual checks
"""

known_names = {
    'chandler', 'joey', 'monica', 'phoebe', 'rachel', 'ross',
    'mr. geller', 'mrs. geller', 'woman', 'rachel and monica',
    'monica and phoebe', 'ross and rachel', 'gunther', 'melissa',
    'jake', 'tag', 'terry', 'the director', 'emily',
    'mr. bing', 'mrs. bing', 'phoebe and ross', 'richard',
    'all', 'fat monica', 'phoebe and rachel', 'man', 'woman',
    'joey and ross', 'julie', 'erin', 'female student', 'guy', 'janice',
    'joey’s date', 'male student', 'the colonel', 'the head librarian', 
    'the librarian', 'the waitress', 'ross and joey', 'ursula',
    'barry', 'bernice', 'carol', 'chandler and joey', 'susan',
    'joey and chandler', 'chandler and ross', 'ross and chandler', 
    'girls', 'guys',
}

for s in range(1, 8):
    tr_list = sorted(glob.glob(
        "/home/mstlaure/Documents/Marie/neuromod/"
        "friends_annotations/annotation_results/"
        f"community_based/s{s}/friends_s0{s}e*_ufs.txt"
        
    ))
    
    for tr in tr_list:
        with open(tr, 'r', encoding='utf-8') as f:
            ufs_transcript = f.read()
        print(os.path.basename(tr))
    
        speaker_list = []
        for line in ufs_transcript.splitlines():
            # Remove all text inside parentheses and squared brackets (transcriber notes)
            line = re.sub(r'\s*\(.*?\)\s*', ' ', line)
            line = re.sub(r'\s*\[.*?\]\s*', ' ', line)
            line = line.strip()        
            if ":" in line:
                # Split line between speaker and utterance
                line_splits = line.strip().lower().split(":")
                speaker = line_splits[0].strip()
                if speaker not in known_names:
                    speaker_list.append(speaker)
    
                speech = line_splits[1]
                if len(line_splits) > 2:  
                    for seg in line_splits[2:]:
                        speech += f":{seg}"
                #speaker, speech = line.strip().lower().split(":")
                #sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', speech)
                #for s in sentences:
                #    if s.strip():
                #        ufs_processed.append((speaker, s))
    
                if len(line_splits) != 2:
                    print(line)
                    print(speech)
    
        
    
        print(np.unique(speaker_list))
        print("\n")

friends_s01e01_ufs.txt
Monica: Oh really, so that hysterical phone call I got from a woman at sobbing 3:00 A.M., "I'll never have grandchildren, I'll never have grandchildren." was what?  A wrong number?
 oh really, so that hysterical phone call i got from a woman at sobbing 3:00 a.m., "i'll never have grandchildren, i'll never have grandchildren." was what?  a wrong number?
Monica: Oh God, is it 6:30?  Buzz him in!
 oh god, is it 6:30?  buzz him in!
['customer' 'frannie' 'paul' 'phoebe, ross, chandler, and joey'
 'priest on tv' 'waitress']


friends_s01e02_ufs.txt
['chrissy on three’s company' 'dr. oberman' 'marsha' 'robbie']


friends_s01e03_ufs.txt
['alan' 'everybody' 'lizzie' 'paula' 'the guys']


friends_s01e04_ufs.txt
['joanne' 'kid' 'kiki' 'leslie' 'pizza guy' 'receptionist']


friends_s01e05_ufs.txt
['angela' 'bob']


friends_s01e06_ufs.txt
['all except joey' 'aurora' 'director' 'estelle']


friends_s01e07_ufs.txt
['jill' 'monica, joey, and phoebe' 'mr. heckles' 'paolo']


frie