In [1]:
import pandas as pd
import numpy as np

In [2]:
lines = pd.read_csv ('1_lines.csv')

In [3]:
lines.head()

Unnamed: 0,0,lines
0,Online Store,F.D. » Transcripts » TV Show Transcripts » F »...
1,Online Store,'Tis the Season! We're officially a verified a...
2,Online Store,Feed the kitteh
3,Online Store,F.D. » Transcripts » TV Show Transcripts » F »...
4,Online Store,© 2001-2020 Forever Dreaming. All rights reser...


In [4]:
def create_columns(df):
    """
    Derive speaker, dialogue and episode-title columns from the raw transcript frame.

    Reads two columns of ``df``:
      * ``lines`` - the raw transcript text, one row per line
      * ``0``     - the raw heading text that carries "<n>x<n> - <title>"

    Adds four columns:
      * ``character``       - the first capitalised word found in ``lines``
                              (an optional MR/MRS/Mr/Mrs/Ms/DR/Dr prefix is kept)
      * ``clean_lines``     - everything from the first ": " to the end of the
                              line (the ": " itself is still included);
                              NaN when the line contains no ": "
      * ``character_lower`` - ``character`` in lower case
      * ``title``           - the text following the first "<n>x<n> - " in column ``0``

    Returns a new DataFrame; the frame passed in is left untouched so that
    re-running the calling cell always starts from the same input.
    """
    out = df.copy()

    # Group 0 is the whole match (optional honorific + name); group 1 is only the honorific.
    character_pattern = r"((\bMR?S?\b.? ?|\bMrs?\b.? ?|\bMs\b.? ?|\bDR.? \b|\bDr.? \b)?\b[A-Z][A-Za-z]+)"
    out['character'] = out['lines'].str.extract(character_pattern)[0]

    # expand=False returns a Series for the single capture group instead of a one-column DataFrame.
    out['clean_lines'] = out['lines'].str.extract(r"(: .*)", expand=False)
    out['character_lower'] = out['character'].str.lower()

    # Group 3 is the free text after "<season>x<episode> - ".
    out['title'] = out['0'].str.extract(r"((\d\d?)x(\d\d?)) - (.*)")[3]

    return out

In [5]:
lines = create_columns(lines)

In [6]:
def episode_cleaner(df):
    """
    Add season, episode and episode-length columns parsed from column ``0``.

    Column ``0`` is searched for "<season>x<episode> - " optionally followed by
    a second "<season>x<episode>" (the marker of a double episode).

    Adds three columns:
      * ``season``  - first number of the first "<n>x<n>" (string, NaN if no match)
      * ``episode`` - second number of the first "<n>x<n>" (string, NaN if no match)
      * ``length``  - 'double' when a second "<n>x<n>" follows the first,
                      otherwise 'single' (rows with no match are 'single' too)

    ``length`` is placed immediately before ``episode``.

    Returns a new DataFrame with the same rows and index as ``df``; the frame
    passed in is not modified.
    """
    # Groups: 1 = season, 2 = episode, 5 = episode number of the optional second part.
    pattern = df['0'].str.extract(r"((\d\d?)x(\d\d?) - ((\d\d?)x(\d\d?))?)")

    # Operate on the argument (not on a notebook-level global); dropping a stale
    # 'length' column keeps a re-run from producing a duplicate column.
    out = df.drop(columns='length', errors='ignore')
    out['season'] = pattern[1]
    out['episode'] = pattern[2]

    length = pattern[5].notna().map({True: 'double', False: 'single'})
    # Insert relative to 'episode' rather than at a hard-coded position so the
    # function works for frames of any width.
    out.insert(out.columns.get_loc('episode'), 'length', length)

    return out

In [7]:
lines = episode_cleaner(lines)

In [8]:
#drop rows whose heading is a marketing / notice-board entry rather than an episode
lines = lines[~lines['0'].isin(['Online Store', 'Board Updates: Please Read 8/26/19'])].reset_index(drop = True)

In [9]:
#a handful of episodes were not picked up in the previous cleaning step
lines.title.str.extract(r"(\d\d?x\d\d? - .*)")[0].unique()

array([nan, '02x13 - The One After the Superbowl',
       '04x24 - The One With Ross’s Wedding Parts I and II',
       '05x24 - The One In Vegas', '06x16 - The One That Could Have Been',
       '06x25 - The One With The Proposal',
       '07x24 - The One With Chandler and Monica’s Wedding',
       '08x24 - The One Where Rachel Has A Baby',
       '09x24 - The One In Barbados', '10x18 -  The Last One (Finale)'],
      dtype=object)

In [10]:
#clean titles: strip the leftover "<season>x<episode> - " prefix from the start of a title
# The result is assigned back to the column. Calling `.replace(..., inplace=True)` on
# `lines.title` acts on an intermediate Series, which warns on recent pandas and leaves
# `lines` unchanged when Copy-on-Write is enabled.
# " - +" also consumes the doubled space in '10x18 -  The Last One (Finale)'.
lines['title'] = lines['title'].str.replace(r"^\d\d?x\d\d? - +", "", regex=True)

In [11]:
lines = lines.drop(columns = ['0', 'lines','character'])

In [12]:
#characters: only those referred to by name
#if more than one person spoke at once, then delete
#create a list of non characters in the character column
# Lower-cased values of the character column that are not character names.
# Every entry must be followed by a comma: two adjacent string literals are silently
# concatenated by Python (e.g. 'anxious' 'airline' becomes 'anxiousairline').
# A few words therefore appear twice further down; the repeats are harmless.
not_chars = [
            'transcripts', 'tis', 'feed', 'forever', 'as', 'thank', 'if','originally', 'the', 'cut','enter',
             'they', 'moment', 'scene',  'knock', 'ad', 'tv', 'exit', 'pause', 'closing', 'credits',  'at',
             'again', 'stunned','intro', 'door', 'opens', 'general', 'silence', 'dubious', 'uncomfortable',
             'pizza', 'hospital', 'balcony', 'both', 'phone', 'end', 'long', 'opening', 'commercial', 'lights',
             'aurora', 'flashback', 'director', 'heck', 'while', 'time', 'there', 'written',  'short', 'bladder',
             'fun', 'next',  'coma', 'camera', 'transcribed', 'so', 'microwave', 'iris', 'timer', 'pre', 'same',
             'intercom', 'another', 'in', 'quick', 'music', 'fake', 'casting', 'story', 'frnz', 'meln', 'melanie',
             'flgt', 'man', 'voice', 'carl', 'chinese', 'store', 'outside', 'old', 'machine', 'central',
             'transit', 'rtst', 'radio', 'gang', 'gives', 'fbob', 'estl', 'three', 'wedding', 'lipson', 'some',
             'security', 'trainer', 'susie', 'van', 'back', 'looks', 'note', 'now', 'since', 'delivery', 'tattoo',
             'amger', 'big', 'little',  'once', 'annoying', 'starts', 'best', 'video', 'young', 'transcriber',
             'baywatch', 'mattress', 'matress', 'margha', 'sequence', 'it',  'we', 'cookie', 'things', 'quartet',
             'bass', 'to', 'hearing', 'later', 'hypnosis', 'of', 'hold', 'awkward', 'stage', 'guru', 'two',
             'tank', 'chip',  'oven', 'gym', 'this', 'bank','suddenly', 'from', 'after', 'kitchen', 'happy',
             'original', 'uh', 'see', 'tour', 'onstage',  'just',  'teleplay', 'devon', 'ticket', 'goes', 'yep',
             'seeing', 'sleep', 'and', 'with', 'part', 'one', 'drunk', 'gate', 'ending', 'pbs',  'hotel',
             'health', 'notices', 'thanksgiving', 'present', 'past', 'fat', 'upon', 'french', 'follows', 'shows',
             'no', 'hits', 'neither', 'needless', 'sleepy', 'yeah', 'check', 'party', 'dream', 'for', 'answering',
             'but', 'oh', 'casino', 'blackjack', 'drunken', 'cue', 'transciber', 'pan', 'croupler', 'applause',
             'decided', 'passerby', 'kisses', 'inside','tall', 'when', 'patron',  'hope', 'eldad', 'thus',
             'what', 'first', 'second', 'matire', 'walks', 'whenever', 'exits', 'more', 'maitre', 'smoke',
             'disembodied', 'morse', 'finally', 'fade', 'flash', 'behind', 'dedicated', 'front', 'anxious',
             'airline', 'six', 'dramatic', 'kash', 'basically', 'doorbell', 'don', 'various', 'during', 'evil',
             'sick', 'cuts',  'prospective', 'directed',  'starting', 'beat', 'clink', 'rattling', 'shuts',
             'reaches', 'bar', 'uneasy',  'went', 'did', 'shortly', 'surprised', 'ok', 'extra', 'aired', 'very',
             'potluck', 'is', 'salon', 'hey', 'blonde', 'delta', 'about', 'who', 'always', 'points', 'meanwhile',
             'everybody', 'hugsy', 'strange', 'message', 'bitter', 'turns', 'shop', 'trying', 'switch',
             'precious', 'does', 'sound', 'supposedly', 'lifts',  'charity', 'agency', 'adoption',  'tainted',
             'can', 'tape', 'air', 'anxious', 'very', 'potluck', 'all', 'waitress', 'boys', 'receptionist', 'kid',
             'angl', 'woman', 'she', 'aunt', 'nurse', 'guy', 'girl', 'customer', 'guys', 'girls', 'her', 'pa',
             'actor', 'stranger', 'waiter', 'doctor', 'together', 'minister', 'interviewer', 'friend', 'producer',
             'everyone', 'host', 'student', 'leader', 'singer', 'lecturer', 'anchorwoman', 'announcer', 'referee',
             'minster', 'male', 'female', 'guest', 'cop', 'tourist', 'attendant', 'hitchhiker', 'crew', 'customers',
             'narrator', 'you', 'supervisor', 'policeman', 'airline', 'witch', 'clown', 'cowgirl', 'ballerina',
             'boy', 'stripper', 'hooker', 'locksmith', 'uncle', 'cashier', 'assistant', 'others', 'intern',
             'handyman', 'actress', 'prof', 'paleontologist', 'professore', 'realtor', 'clerk', 'mr', 'lady'
            ]
                        

In [13]:
#filter for only characters in character column
lines = lines[~lines['character_lower'].isin(not_chars)].reset_index(drop = True)

In [14]:
#some main character names are misspelled
list(lines[lines['character_lower']== 'rach']['clean_lines'][:10])

[': Come on! You guys can pee standing up.',
 ': [waitressing] Does anybody want anything else?',
 ": It's my father. He wants to give me a Mercedes convertible.",
 ": Yeah, well, it's a Mercedes if I move back home. Oh, it was horrible. He called me young lady.",
 ': Oh, yeah, yeah. Actually, I got the extended disco version, with three choruses of "You\'ll never make it on your own".',
 ': Oh, big glamour night. Me and Monica at Laundorama.',
 ': Who?',
 ": Don't you have a laundry room in your building?",
 ': Sure.',
 ": Oh, 'scuse me. I was kinda using that machine."]

In [15]:
def name_changer(df):
    """
    Correct misspelled main character names and tidy the dialogue text.

    Works on two columns of ``df``:
      * ``character_lower`` - known misspellings/abbreviations are mapped to the
        full first name (whole-value matches only), then every '.' is removed
      * ``clean_lines``     - the leading ": " separator is removed

    ``df`` is modified in place (callers may ignore the return value) and a
    copy of the updated frame is returned.
    """
    corrections = {
        'mnca': 'monica',
        'rach': 'rachel',
        'rahcel': 'rachel',
        'rache': 'rachel',
        'racel': 'rachel',
        'chan': 'chandler',
        'chandlers': 'chandler',
        'phoe': 'phoebe',
        'phobe': 'phoebe',
        'jnce': 'janice',
    }

    # Only the speaker column is corrected; other columns (titles, dialogue) are
    # never rewritten even if a cell happens to equal one of the keys.
    # regex=False makes '.' a literal dot on every pandas version.
    df['character_lower'] = (
        df['character_lower']
        .replace(corrections)
        .str.replace('.', '', regex=False)
    )

    # Anchor to the start of the string so a ": " that occurs inside the dialogue is preserved.
    df['clean_lines'] = df['clean_lines'].str.replace(r'^: ', '', regex=True)

    return df.copy()


In [16]:
name_changer(lines)

Unnamed: 0,clean_lines,character_lower,title,season,length,episode
0,There's nothing to tell! He's just some guy I ...,monica,The One Where Monica Gets a New Roommate (The ...,01,single,01
1,"C'mon, you're going out with the guy! There's ...",joey,The One Where Monica Gets a New Roommate (The ...,01,single,01
2,So does he have a hump? A hump and a hairpiece?,chandler,The One Where Monica Gets a New Roommate (The ...,01,single,01
3,"Wait, does he eat chalk?",phoebe,The One Where Monica Gets a New Roommate (The ...,01,single,01
4,"Just, 'cause, I don't want her to go through w...",phoebe,The One Where Monica Gets a New Roommate (The ...,01,single,01
...,...,...,...,...,...,...
61253,,chandler,The Last One (Finale),10,double,17
61254,(crying) Do you guys have to go to the new hou...,rachel,The Last One (Finale),10,double,17
61255,We got some time.,monica,The Last One (Finale),10,double,17
61256,"Okay, should we get some coffee?",rachel,The Last One (Finale),10,double,17


In [17]:
#the nulls for clean_lines are stage directions, so those rows can be deleted
# Only the counts are reported here. The earlier `lines.clean_lines.dropna(inplace = True)`
# acted on a temporary Series and never removed rows from `lines`; the rows are removed
# by the DataFrame-level dropna that follows.
print(lines.isna().sum())

clean_lines        2853
character_lower     649
title                 0
season                0
length                0
episode               0
dtype: int64


In [18]:
#drop missing values
lines = lines.dropna().reset_index(drop=True)

In [19]:
#change order of columns
lines = lines[['title', 'character_lower', 'clean_lines', 'season', 'episode', 'length']]

In [20]:
#give the cleaned columns their final names
lines = lines.rename(columns = {'character_lower': 'character', 'clean_lines':'lines'})

In [21]:
#look at a line. The line includes stage directions in parentheses; these should be deleted.
lines.lines[123]

"(SCORNFUL) Grab a spoon. Do you know how long it's been since I've grabbed a spoon? Do the words 'Billy, don't be a hero' mean anything to you? Y'know, here's the thing. Even if I could get it together enough to- to ask a woman out,... who am I gonna ask? (GAZES OUT OF THE WINDOW)"

In [22]:
#testing regex to delete everything within parentheses
# regex=True is required: from pandas 2.0 str.replace treats the pattern as a literal string by default
lines['lines'].str.replace(r'\(.*?\)', '', regex=True).str.strip()[123]

"Grab a spoon. Do you know how long it's been since I've grabbed a spoon? Do the words 'Billy, don't be a hero' mean anything to you? Y'know, here's the thing. Even if I could get it together enough to- to ask a woman out,... who am I gonna ask?"

In [23]:
#full df: parenthesis included
lines.lines

0        There's nothing to tell! He's just some guy I ...
1        C'mon, you're going out with the guy! There's ...
2          So does he have a hump? A hump and a hairpiece?
3                                 Wait, does he eat chalk?
4        Just, 'cause, I don't want her to go through w...
                               ...                        
58400                              Oh, it's gonna be okay.
58401    (crying) Do you guys have to go to the new hou...
58402                                    We got some time.
58403                     Okay, should we get some coffee?
58404                                         Sure. Where?
Name: lines, Length: 58405, dtype: object

In [24]:
#remove everything within parentheses - these are actions, not lines
# regex=True is required: from pandas 2.0 str.replace treats the pattern as a literal string by default
lines['lines'] = lines['lines'].str.replace(r'\(.*?\)', '', regex=True).str.strip()

In [25]:
#full df: no more parenthesis
lines.lines

0        There's nothing to tell! He's just some guy I ...
1        C'mon, you're going out with the guy! There's ...
2          So does he have a hump? A hump and a hairpiece?
3                                 Wait, does he eat chalk?
4        Just, 'cause, I don't want her to go through w...
                               ...                        
58400                              Oh, it's gonna be okay.
58401    Do you guys have to go to the new house right ...
58402                                    We got some time.
58403                     Okay, should we get some coffee?
58404                                         Sure. Where?
Name: lines, Length: 58405, dtype: object

In [26]:
lines.head()

Unnamed: 0,title,character,lines,season,episode,length
0,The One Where Monica Gets a New Roommate (The ...,monica,There's nothing to tell! He's just some guy I ...,1,1,single
1,The One Where Monica Gets a New Roommate (The ...,joey,"C'mon, you're going out with the guy! There's ...",1,1,single
2,The One Where Monica Gets a New Roommate (The ...,chandler,So does he have a hump? A hump and a hairpiece?,1,1,single
3,The One Where Monica Gets a New Roommate (The ...,phoebe,"Wait, does he eat chalk?",1,1,single
4,The One Where Monica Gets a New Roommate (The ...,phoebe,"Just, 'cause, I don't want her to go through w...",1,1,single


In [27]:
lines.shape

(58405, 6)

In [28]:
lines.to_csv('2_clean_lines.csv', index = False)