In [360]:
import pandas as pd
import numpy as np
import re
import string
import os
import csv
import math

def clean_line(line):
    """Classify one raw transcript line.

    Args:
        line: A cell from the transcript column. Normally a string, but
            missing cells may arrive as None or NaN.

    Returns:
        One of 'delete', 'scene', 'direction', 'character', 'credits',
        'commercials', 'end' or 'dialogue'.
    """
    # Missing cells (None, NaN, any non-string) and empty strings carry no
    # text; indexing them below would raise, so flag them for removal.
    if not isinstance(line, str) or line == '':
        return 'delete'
    if line[0] == '[':
        return 'scene'
    if line[0] == '(' and line[-1] == ')':
        return 'direction'
    # Checked before the fixed markers, so any line containing a colon is
    # treated as a speaker line.
    if ':' in line:
        return 'character'
    if line == 'Opening Credits':
        return 'credits'
    if line == 'Commercial Break':
        return 'commercials'
    if line == 'End':
        return 'end'
    return 'dialogue'

def remove_returns(line):
    """Return `line` with every newline character removed.

    Cells that are None, or whose string form is 'nan', yield None.
    """
    missing = line is None or str(line) == 'nan'
    if missing:
        return None
    return line.replace('\n', '')

def read_file(file_path):
    """Load one scraped episode CSV and return its usable transcript lines.

    The file is read as a single headerless column. Everything before the
    first row containing '[' is discarded, newlines are stripped from each
    remaining cell, and cells that `clean_line` classifies as 'delete' are
    dropped.

    Args:
        file_path: Path to the episode CSV.

    Returns:
        A tuple `(lines, title)`: `lines` is an object ndarray of the kept
        strings in file order, and `title` is `file_path` with the
        'friendsscraper/spiders/ep-' prefix removed.

    Side effects:
        Prints the first 20 raw rows, the path and the derived title.
    """
    df = pd.read_csv(file_path, header=None, encoding="utf-8")
    df.columns = ['lines']
    print(df.head(20))

    print(file_path)

    # Get episode title
    title = file_path.replace('friendsscraper/spiders/ep-', '')
    print(title)

    # Remove rows that will not be included in analysis.
    # na=False keeps the mask aligned with df: filtering NaN entries out of
    # the mask instead would shift every later position and make the start
    # row wrong whenever a missing cell precedes the first '['.
    has_bracket = df['lines'].str.contains('[', regex=False, na=False).values
    # argmax of an all-False mask is 0, i.e. keep the whole file.
    start = int(has_bracket.argmax()) if len(has_bracket) else 0

    # Work on plain Python values rather than assigning into a slice of df,
    # which would be a write to a possible copy.
    stripped = [remove_returns(cell) for cell in df['lines'].iloc[start:]]
    kept = [text for text in stripped if clean_line(text) != 'delete']

    lines = pd.Series(kept, dtype=object).values

    return (lines, title) #, writers)

def clean_scenes(scene):
    """Strip scene-header markup from a scene label.

    None and the fixed markers ('Opening Credits', 'End',
    'Commercial Break') are returned untouched. Otherwise the '[Scene:'
    tag, every ']', ')' and non-breaking space are removed and the result
    is trimmed.
    """
    untouched = (None, 'Opening Credits', 'End', 'Commercial Break')
    if scene in untouched:
        return scene

    without_tag = scene.replace('[Scene:', '')
    drop_chars = str.maketrans('', '', '])\xa0')
    return without_tag.translate(drop_chars).strip()
    
def dir_from_scene(scene):
    """Extract the stage direction that follows the location in a scene label.

    The label is split on whichever of ',' or '.' occurs first (',' if
    neither occurs); the pieces after the first are joined with single
    spaces.

    Args:
        scene: A scene label, None, or one of the fixed markers
            'Opening Credits', 'End', 'Commercial Break'.

    Returns:
        The direction text, or None when `scene` is None/a marker or when
        nothing but whitespace follows the location.
    """
    if scene in (None, 'Opening Credits', 'End', 'Commercial Break'):
        return None

    # First delimiter character present in the label; ',' when none is.
    delim = next((ch for ch in scene if ch in ',.'), ',')
    remainder = ' '.join(scene.split(delim)[1:])

    # The pieces after the split are a list, so comparing them to '' can
    # never detect "no direction"; test the joined text instead.
    if not remainder.strip():
        return None
    return remainder

def isolate_scene(scene):
    """Return the location part of a scene label.

    The location is everything before the first ',' or '.'. Labels with
    neither character, None and the fixed markers are returned unchanged.
    """
    if scene in (None, 'Opening Credits', 'End', 'Commercial Break'):
        return scene

    for position, char in enumerate(scene):
        if char in (',', '.'):
            return scene[:position]
    return scene
    
def clean_directions(direction):
    """Remove every '(' and ')' from a direction; None passes through."""
    if direction is None:
        return direction
    return direction.translate(str.maketrans('', '', '()'))
    
def text_clean(line):
    """Normalise a text cell: lower-case, drop punctuation, trim whitespace.

    Args:
        line: Any cell value.

    Returns:
        The cleaned string for string input; every other value (None, NaN,
        lists, numbers) is returned unchanged.
    """
    # Only strings can be normalised; anything else would fail on .lower().
    if not isinstance(line, str):
        return line

    no_punct = line.lower().translate(str.maketrans('', '', string.punctuation))
    # Trim last: removing punctuation can expose whitespace at the edges
    # (e.g. 'hi - ' -> 'hi  ').
    return no_punct.strip()

def process_script(lines, title): #, writers):
    """Parse cleaned transcript lines into one row per utterance/direction.

    Args:
        lines: Sequence of non-empty transcript strings, as returned by
            `read_file`.
        title: Episode title; a trailing '.csv' is removed.

    Returns:
        A DataFrame with columns 'episode', 'scene', 'direction',
        'characters' and 'dialogue', every cell passed through
        `text_clean`.
    """
    break_markers = ('Opening Credits', 'Commercial Break', 'End')
    # Speaker names that get glued to the end of the previous dialogue when
    # the transcript puts the name on its own line and starts the next line
    # with ':'. Order is significant: the first matching suffix is removed.
    speaker_suffixes = (
        'monica', 'doctor', 'rachel', 'phoebe',
        'ross', 'joey',
        'chandler',
        'erica',
        'gate attendant #1', 'gate attendant #2',
        'ticket agent', 'passenger #1', 'passenger #2', 'passenger #3',
        'air stewerdess',
    )

    # Remove the extension as a suffix; rstrip('csv') would strip any
    # trailing run of the characters c, s and v instead.
    episode_name = title[:-len('.csv')] if title.endswith('.csv') else title

    dialogue = []
    character = []
    scene = []
    episode = []
    direction = []
    # written_by = []

    def update_lists(dia, char, scn, direct): #, wrt):
        dialogue.append(dia)
        character.append(char)
        scene.append(scn)
        episode.append(episode_name)
        direction.append(direct)
        #written_by.append(wrt)

    current_scene = None
    current_dialogue = None
    current_character = None
    # Becomes (and stays) True once a line starting with ':' is seen.
    colon_in_front = False

    for i, line in enumerate(lines):
        # On the first line there is no predecessor; indexing lines[i - 1]
        # would silently wrap around to the last line.
        prev_line = lines[i - 1] if i > 0 else ''

        if 'Scene:' in line:
            current_scene = line
            current_dialogue = None
            current_character = None

        elif line[0] == '(' or (line[0] == '[' and line[-1] == ']'):
            current_character = None
            current_dialogue = None
            if current_scene is None:
                # Before any scene header, the first direction stands in
                # as the scene.
                current_scene = line
            else:
                update_lists(None, None, current_scene, line)

        elif ':' in line:
            if line[0] == ':':
                # The speaker name is on the previous line.
                current_character = prev_line.strip()
                current_dialogue = line.strip(':')
                colon_in_front = True
            elif line[-1] != ':':
                # Split on the first colon only, so colons inside the
                # dialogue (e.g. a time like 5:30) are kept.
                current_character, current_dialogue = line.split(':', 1)
            else:
                current_character = line.strip(':')
                current_dialogue = None

            # A speaker list broken across two lines ("Chandler and" / "Ross:").
            if prev_line.strip()[-3:] == 'and':
                current_character = prev_line + ' ' + current_character

        elif line in break_markers:
            current_character = None
            current_dialogue = None
            update_lists(None, None, line, None)

        else:
            # Continuation of the current utterance.
            if current_dialogue is None:
                current_dialogue = line
            else:
                current_dialogue += ' ' + line

        if current_dialogue is None:
            continue

        # Emit the pending utterance when the next line starts something
        # new, or when the input ends (otherwise the last utterance of a
        # transcript without a closing marker would be lost).
        if i + 1 < len(lines):
            upcoming = lines[i + 1]
            at_boundary = (
                upcoming[0] == '['
                or (upcoming[0] == '(' and upcoming[-1] == ')')
                or ':' in upcoming
                or upcoming in break_markers
            )
        else:
            at_boundary = True
        if not at_boundary:
            continue

        if colon_in_front:
            current_dialogue = current_dialogue.lower().strip()
            for name in speaker_suffixes:
                if current_dialogue.endswith(name):
                    current_dialogue = current_dialogue[:-len(name)]
                    break
            if current_dialogue != '':
                update_lists(current_dialogue, current_character, current_scene, None)
        else:
            update_lists(current_dialogue, current_character, current_scene, None)

    # Post-processing is done on the plain lists, before the DataFrame is
    # built, to avoid chained `df[col][i] = value` assignment.
    scene = [clean_scenes(label) for label in scene]

    # On the first row of each scene, fold the scene's own direction text
    # into that row's direction.
    for i, label in enumerate(scene):
        if i == 0 or label != scene[i - 1]:
            scene_direction = dir_from_scene(label)
            # Test this row's direction (index i), not the first row's.
            if direction[i] is None or direction[i] == '':
                direction[i] = scene_direction
            elif scene_direction is not None:
                direction[i] = direction[i] + ' ' + scene_direction

    # Move the first inline parenthetical of a dialogue into 'direction'
    # and remove all closed parentheticals from the dialogue.
    for i, spoken in enumerate(dialogue):
        if spoken is not None and '(' in spoken:
            start = spoken.find('(') + 1
            end = spoken.find(')', start)
            if end == -1:
                # Unclosed parenthesis: take the rest of the text instead
                # of slicing to -1, which would drop the last character.
                end = len(spoken)
            direction[i] = spoken[start:end]
            dialogue[i] = re.sub(r"\(.*?\)", '', spoken)

    scene = [isolate_scene(label) for label in scene]
    direction = [clean_directions(text) for text in direction]

    script_df = pd.DataFrame(
        list(zip(episode, scene, direction, character, dialogue)),
        columns=['episode', 'scene', 'direction', 'characters', 'dialogue'],
    )

    return script_df.applymap(text_clean)



In [355]:
# Run the full pipeline over every scraped episode file and write one
# processed CSV per episode next to the raw ones.
SPIDERS_DIR = 'friendsscraper/spiders/'

for episode in os.listdir(SPIDERS_DIR):
    # Only raw episode dumps carry 'ep-' in their file name.
    if 'ep-' not in episode:
        continue

    path = SPIDERS_DIR + episode
    raw_lines, ep_title = read_file(path)
    df = process_script(raw_lines, ep_title)

    ep_title = ep_title.lower()
    df.to_csv(SPIDERS_DIR + 'processed_ep_' + ep_title, index=False)


friendsscraper/spiders/ep-the one with ross tan.csv
the one with ross tan.csv
friendsscraper/spiders/ep-the one with the screamer.csv
the one with the screamer.csv
friendsscraper/spiders/ep-the one with the sonogram at the end.csv
the one with the sonogram at the end.csv
friendsscraper/spiders/ep-the one with phoebes wedding.csv
the one with phoebes wedding.csv
friendsscraper/spiders/ep-the one with the fake monica.csv
the one with the fake monica.csv
friendsscraper/spiders/ep-the one with joeys fridge.csv
the one with joeys fridge.csv
friendsscraper/spiders/ep-the one with the birth.csv
the one with the birth.csv
friendsscraper/spiders/ep-the one in barbados.csv
the one in barbados.csv
friendsscraper/spiders/ep-the one where chandler cant cry.csv
the one where chandler cant cry.csv
friendsscraper/spiders/ep-the one with the inappropriate sister.csv
the one with the inappropriate sister.csv
friendsscraper/spiders/ep-the one with the late thanksgiving.csv
the one with the late thanksgiv

friendsscraper/spiders/ep-the one with the fake party.csv
the one with the fake party.csv
friendsscraper/spiders/ep-the one with the cuffs.csv
the one with the cuffs.csv
friendsscraper/spiders/ep-the one with rachels assistant.csv
the one with rachels assistant.csv
friendsscraper/spiders/ep-the one with the thumb.csv
the one with the thumb.csv
friendsscraper/spiders/ep-the one with the truth about london.csv
the one with the truth about london.csv
friendsscraper/spiders/ep-the one with the engagement picture.csv
the one with the engagement picture.csv
friendsscraper/spiders/ep-the one with joeys new girlfriend.csv
the one with joeys new girlfriend.csv
friendsscraper/spiders/ep-the one with monicas thunder.csv
the one with monicas thunder.csv
friendsscraper/spiders/ep-the one with the baby on the bus.csv
the one with the baby on the bus.csv
friendsscraper/spiders/ep-the one with rachels other sister.csv
the one with rachels other sister.csv
friendsscraper/spiders/ep-the one where dr rem

friendsscraper/spiders/ep-the one in vegas.csv
the one in vegas.csv
friendsscraper/spiders/ep-the one with the embryos.csv
the one with the embryos.csv
friendsscraper/spiders/ep-the one with all the haste.csv
the one with all the haste.csv
friendsscraper/spiders/ep-the one with the boob job.csv
the one with the boob job.csv
friendsscraper/spiders/ep-the one with the ball.csv
the one with the ball.csv
friendsscraper/spiders/ep-the one after ross says rachel.csv
the one after ross says rachel.csv
friendsscraper/spiders/ep-the one where mr heckles dies.csv
the one where mr heckles dies.csv
friendsscraper/spiders/ep-the one where noones ready.csv
the one where noones ready.csv
friendsscraper/spiders/ep-the one with joeys award.csv
the one with joeys award.csv
friendsscraper/spiders/ep-the one with the sharks.csv
the one with the sharks.csv
friendsscraper/spiders/ep-the one where rachel is late.csv
the one where rachel is late.csv
friendsscraper/spiders/ep-the one with the blackout.csv
the 

In [361]:
# Inspect the raw scraped CSV for a single episode. read_csv is called with
# its default header handling here, so the first row of the file (the
# episode title) becomes the column name in the preview.
#ex = pd.read_csv('friendsscraper/spiders/processed_ep_the one with chandler in a box.csv', encoding='utf-8')
#ex.head(20)
ex_raw = pd.read_csv('friendsscraper/spiders/ep-the one with chandler in a box.csv')
ex_raw.head(20)

Unnamed: 0,the one with chandler in a box
0,Written by: Michael Borkow
1,\nTranscribed by:
2,Eric Aasen
3,"[Scene: Chandler and Joeys, Joey is sitting at..."
4,Joey:
5,(answering phone) Hello.
6,Chandler:
7,"(on phone) Hey, its me. I know you cant sta..."
8,(Pause)
9,(The phone rings again.)


In [366]:
# Preview the first 20 rows of another raw episode file, again using
# read_csv's default header handling.
raw = pd.read_csv("friendsscraper/spiders/ep-the one where chandler crosses a line.csv")
raw.head(20)

Unnamed: 0,the one where chandler crosses a line
0,Written by: Adam Chase
1,\nTranscribed by:
2,Eric Aasen
3,"[Scene: Monica and Rachel's, everyone except J..."
4,Chandler:
5,(entering in a bathrobe) I just walked in the...
6,Ross:
7,Y'know if we ever go to war and youre captur...
8,big
9,surprise.


In [364]:
# Run the two pipeline stages on one episode: read_file loads and filters
# the raw lines (and prints a preview), process_script builds the DataFrame.
raw_lines, ep_title = read_file("friendsscraper/spiders/ep-the one where chandler crosses a line.csv")
last_proc2 = process_script(raw_lines, ep_title)

                                                lines
0               the one where chandler crosses a line
1                              Written by: Adam Chase
2                                  \nTranscribed by: 
3                                          Eric Aasen
4   [Scene: Monica and Rachel's, everyone except J...
5                                           Chandler:
6    (entering in a bathrobe) I just walked in the...
7                                               Ross:
8    Y'know if we ever go to war and youre captur...
9                                                 big
10                                          surprise.
11                                          Chandler:
12   It just keeps getting worse and worse! Y'know...
13                                            (Pause)
14                                            Phoebe:
15       Wow! Could everyone totally see up his robe?
16                                               All:
17                          

In [346]:
# Show the first 50 processed rows.
# NOTE(review): the output captured below this cell lists a different
# episode title than the one processed above, so it may predate the latest
# run - confirm by re-executing.
last_proc2.head(50)

Unnamed: 0,episode,scene,direction,characters,dialogue
0,the one with all the poker,central perk,ross and chandler are sitting at a table rache...,monica,hey guys
1,the one with all the poker,central perk,,chandler and ross,hey
2,the one with all the poker,central perk,,hey hi ladies uh can i get you anything to mon...,did you bring the mail
3,the one with all the poker,central perk,,monica,lots of responses
4,the one with all the poker,central perk,out loud,to monica,really
5,the one with all the poker,central perk,crumples up letter,reading,dear ms green thank you for your inquiry howev...
6,the one with all the poker,central perk,,out loud,we have apple cinnamon
7,the one with all the poker,central perk,crumpesup letter,reading,ok dear ms green yeah yeah yeah no
8,the one with all the poker,central perk,,phoebe,wow
9,the one with all the poker,central perk,,rachel,what


TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''