In [206]:
import pandas as pd
import numpy as np
import re
import string
import os
import csv
import math
pd.set_option('display.max_colwidth', -1)

In [257]:
def read_file(file_path):
    df = pd.read_csv(file_path, header=None, encoding="utf-8")
    df.columns = ['lines']   
    title = df.lines[0]
    ep_num = re.sub(r'[^0-9]', '', file_path)
    df = df[df['lines'] != '\n']
 
    # General line clean-up:
    df['lines'] = df['lines'].apply(lambda x: re.sub('', "'", x))
    df['lines'] = df['lines'].apply(lambda x: re.sub('…', '', x))
    df['lines'] = df['lines'].apply(lambda x: re.sub('\n', ' ', x))
    df['lines'] = df['lines'].apply(lambda x: re.sub(r'\d[:]', '', x))
    df['lines'] = df['lines'].apply(lambda x: re.sub('hint:', 'hint', x))
    df['lines'] = df['lines'].apply(lambda x: x.lower().lstrip())
    
    # Get rid of transcriber notes
    df['lines'] = df['lines'].apply(lambda x: re.sub(r'([\{]).*?([\}])', '', x))
    
    df = df[df['lines'] != '']
    df = df.reset_index(drop=True)
    
    cleaned_lines = []
    
    for i in range(len(df)):
        #print(i)
        if df.lines[i][0] == ':':
            cleaned_lines[-1] += df.lines[i]
        else:
            cleaned_lines.append(df.lines[i])
    
    df = pd.DataFrame(columns=['lines'])
    
    df['lines'] = cleaned_lines
    
    ep_scene = []
    ep_character = []
    ep_dialogue = []
    
    current_scene = None
    current_character = None
    current_dialogue = None
    
    df = df.reset_index(drop=True)

    start_point = 0
    if title == 'the one where mr heckles dies':
        start_point = 6
        current_scene = 'Unknown'
    
    for i in range(start_point, len(df)):
        #print(df.lines[i])
        if 'scene:' in df.lines[i]:
            current_scene = re.sub(r'[\[\]]', '', re.sub('scene: ', '', df.lines[i]))
        elif 'scene,' in df.lines[i]:
            current_scene = re.sub(r'[\[\]]', '', re.sub('scene, ', '', df.lines[i]))
        
        elif df.lines[i] in ['end', 'opening credits', 'commercial break', 'closing credits', 'opening titles']:
                current_scene = df.lines[i]
                ep_scene.append(current_scene)
                current_character = None
                current_dialogue = None
                ep_character.append(current_character)
                ep_dialogue.append(current_dialogue)
        
        elif ':' in df.lines[i] and current_scene != None: # and df.lines[i].split(':', 1)[1][0] not in ['0','1','2','3','4','5','6','7','8','9']):
            if df.lines[i].rstrip().split(':', 1)[1] != '':
                current_character = df.lines[i].split(':', 1)[0]
                current_dialogue = df.lines[i].split(':', 1)[1]
            else: 
                current_character = df.lines[i].split(':', 1)[0]
        
        elif current_character != None and current_scene != None:         
            
            if current_dialogue == None: 
                current_dialogue = df.lines[i]
            else:
                current_dialogue += ' ' + df.lines[i]      
        
        # Need to handle dialogue that is split across multpile lines
        if i < (len(df)-1):
            if ':' in df.lines[i+1] or df.lines[i+1] in ['end', 'opening credits', 'commercial break', 'closing credits', 'opening titles']:                
                if current_scene != None and (current_character != None and current_dialogue != None):
                    ep_scene.append(current_scene)
                    ep_character.append(current_character)
                    ep_dialogue.append(current_dialogue)
                    current_character = None
                    current_dialogue = None
                    
    # Clean lines
    
    ep_dialogue = [re.sub(r'([\[]).*?([\]])', '', x) if x != None else x for x in ep_dialogue]
    ep_dialogue = [re.sub(r'([\(]).*?([\)])', '', x) if x != None else x for x in ep_dialogue]
    ep_dialogue = [re.sub(r'[^a-z ]', '', x) if x != None else x for x in ep_dialogue]
    
    ep_character = ['chandler' if x == 'chan' else x for x in ep_character]
    ep_character = ['rachel' if x == 'rach' else x for x in ep_character]
    ep_character = ['phoebe' if x == 'phoe' else x for x in ep_character]
    ep_character = ['monica' if x == 'mnca' else x for x in ep_character]
    
    ep_character = ['woman' if x == 'an' else x for x in ep_character]
    
    def chop_scene(scn):
        comma_place = scn.find(',')
        period_place = scn.find('.')
        if comma_place == -1 and period_place == -1:
            return scn
        elif comma_place < period_place and comma_place != -1:
            return scn.split(',')[0]
        else:
            return scn.split('.')[0]
           
    ep_scene = [chop_scene(x) for x in ep_scene]
 
    return_df = pd.DataFrame(columns=['scene', 'character', 'dialogue'])
    return_df['scene'] = ep_scene
    return_df['character'] = ep_character
    return_df['dialogue'] = ep_dialogue
    return_df['ep_title'] = title
    return_df['ep_num'] = ep_num  
    return_df = return_df[return_df['dialogue'] != '']
    return_df = return_df.reset_index(drop=True)
    
    return return_df

In [229]:
# test = read_file('friendsscraper/spiders/ep-911.csv')
# test.head(50)

Unnamed: 0,scene,character,dialogue,ep_title,ep_num
0,chandler and monica's,chandler,hey,the one where rachel goes back to work,911
1,chandler and monica's,monica,good morning tiger im making you a nice big breakfast so you can keep up your strength for tonight youre gonna get me good and pregnant,the one where rachel goes back to work,911
2,chandler and monica's,chandler,ive got nowhere to go this morning im unemployed i dont know what im gonna do with my life,the one where rachel goes back to work,911
3,chandler and monica's,monica,well i just lost my erection,the one where rachel goes back to work,911
4,chandler and monica's,chandler,i mean what am i supposed to do with myself,the one where rachel goes back to work,911
5,chandler and monica's,monica,youre supposed to find your passion in life you can be whatever you wanna be now its exciting,the one where rachel goes back to work,911
6,chandler and monica's,chandler,but its all so overwhelming i dont know where to start,the one where rachel goes back to work,911
7,chandler and monica's,monica,hey wait a second i can help you with this you just need to be organized we can make a list of your qualifications and categorize jobs by industry there could be folders and files,the one where rachel goes back to work,911
8,chandler and monica's,chandler,hey this is where your hyperorganizedpainintheass stuff pays off,the one where rachel goes back to work,911
9,chandler and monica's,monica,i know my erection is back,the one where rachel goes back to work,911


In [251]:
episode_dfs = []

for episode in os.listdir('friendsscraper/spiders/'):
    if 'ep-' in episode: 
        path = 'friendsscraper/spiders/' + episode
        episode_dfs.append(read_file(path))
        
        #df = process_script(raw_lines, ep_title)
        #ep_title = ep_title.lower()
        #df.to_csv('friendsscraper/spiders/processed_ep_%s' % ep_title, index=False)

In [252]:
big_friends = episode_dfs[0]

for ep in episode_dfs[1:]:
    big_friends = pd.concat([big_friends, ep])

In [253]:
len(big_friends)

61741

In [254]:
uni = big_friends['character'].unique()
for i in uni:
    print(i)

chandler
rachel
phoebe
monica
all
None
joey
ross
julie
mich
waiter
guy
gunther
receptionist
joey's look-a-like
carl
the doctor
janice
the instructor
janice's voice
the potential roommate
parker
mrs. geller
mr. geller
woman
man
aunt lisa
uncle dan
emily
devon
liam
ticket counter attendant
mrs. chatracus
rachel and bonnie
bonnie
phoebe sr.
ursula
chandler, monica, and joey
ross and rachel
phoebe and rachel
phoebe, ross, and rachel
joshua
mr. waltham
the a.d
the cigarette smoking guy
charlton heston
the cigarette guy
the interviewer
host
mike
lauren
both
jill
janine
ooh! that's good! wow! but now if you were pregnant, what would you name it? hint
{transciber's note
stephanie
karin
meg
russell
commercial voiceover
rob
erica
lipson
kids
rachel and monica
janitor
tv doctor
joey on tv
kid
security guard
trainer
director's assistant
susie
van damme
monica and rachel
director
terry
caroline
boys
doctor
girl 1 on bus
girl 2 on bus
chandler and joey
transit authority guy
both (but to different ba

In [255]:
big_friends[big_friends['character'] == "ooh! that's good! wow! but now if you were pregnant, what would you name it? hint"]

Unnamed: 0,scene,character,dialogue,ep_title,ep_num
61,central perk,"ooh! that's good! wow! but now if you were pregnant, what would you name it? hint",phoebe,the one where ross hugs rachel,602


In [256]:
big_friends[big_friends['ep_num'] == '602'][0:63]

Unnamed: 0,scene,character,dialogue,ep_title,ep_num
0,central perk,ross,hey,the one where ross hugs rachel,602
1,central perk,rachel,hey so did everything go okay with the annulment,the one where ross hugs rachel,602
2,central perk,ross,oh yeah no problems its all taken care of,the one where ross hugs rachel,602
3,central perk,rachel,ross thank you hey do you guys wanna go see a movie,the one where ross hugs rachel,602
4,central perk,ross,oh yeah why not,the one where ross hugs rachel,602
5,central perk,rachel,okay umm im gonna get my sweater,the one where ross hugs rachel,602
6,central perk,ross,okay you uh you wanna hear something weird,the one where ross hugs rachel,602
7,central perk,phoebe,always,the one where ross hugs rachel,602
8,central perk,ross,i didnt get the annulment,the one where ross hugs rachel,602
9,central perk,phoebe,what,the one where ross hugs rachel,602
