In [276]:
import pandas as pd
import numpy as np
import re
import string
import os
import csv

def clean_line(line):
    if line == '' or line == None:
        return 'delete'
    elif line[0] == '[':
        return 'scene'
    elif line[0] == '(' and line[-1] == ')': 
        return 'direction'
    elif ':' in line:
        return 'character'
    elif line == "Opening Credits":
        return 'credits'
    elif line == 'Commercial Break':
        return 'commercials'
    elif line == 'End':
        return 'end'
    else:
        return 'dialogue'

def remove_returns(line):
    if line != None:
        return line.replace('\n', '')

def read_file(file_path):
    df = pd.read_csv(file_path, header=None, encoding="utf-8")
    df.columns = ['lines']
    for i in range(len(df['lines'])):
        if np.isnan(df['lines'][i]):
            df['lines'][i] = ''
            
    print(file_path)
    # Get episode title
    
    title = file_path.replace('friendsscraper/spiders/ep-', '')
    
    print(title)
    
    # Remove rows that will not be included in analysis
    print(df.lines.str.contains('[', regex=False))
    script = df[df.lines.str.contains('[', regex=False).idxmax():]
    
    script = script.applymap(remove_returns)
    
    script['clean_code'] = list(map(clean_line, script['lines']))
    script.set_index('clean_code', inplace=True)
    
    if 'delete' in script.index:
        script.drop('delete', axis=0, inplace=True)
    
    lines = script['lines'].values
    
    return (lines, title) #, writers)

def clean_scenes(scene):
    if scene not in [None, 'Opening Credits', 'End', 'Commercial Break']:
        return scene.replace('[Scene:', '').replace(']', '').replace(')', '').replace('\xa0', '').strip()
    else:
        return scene
    
def dir_from_scene(scene):
    if scene not in [None, 'Opening Credits', 'End', 'Commercial Break']:
        search = {',', '.'}
        delim_index = next((i for i, ch in enumerate(scene) if ch in search), None)
        if delim_index == None:
            delim = ','
        else:
            delim = scene[delim_index]
        if scene.split(delim)[1:] != '':
            #print(scene.split(delim))
            #print(' '.join(scene.split(delim)[1:]))
            return ' '.join(scene.split(delim)[1:])
        else: 
            return None
    else: 
        return None

def isolate_scene(scene):
    if scene not in [None, 'Opening Credits', 'End', 'Commercial Break']:
        search = {',', '.'}
        delim_index = next((i for i, ch in enumerate(scene) if ch in search), None)
        if delim_index == None:
            delim = ','
        else:
            delim = scene[delim_index]
        return scene.split(delim)[0]
    else:
        return scene
    
def clean_directions(direction):
    if direction != None:
        return direction.replace('(', '').replace(')', '')
    else:
        return direction
    
def text_clean(line):
    if line != None and type(line) != list:
        return line.lower().strip().translate(str.maketrans('', '', string.punctuation))
    else: 
        return line

def process_script(lines, title): #, writers):

    dialogue = []
    character = []
    scene = []
    episode = []
    direction = []
    # written_by = []
    
    def update_lists(dia, char, scn, ep, direct): #, wrt):
        dialogue.append(dia)
        character.append(char)
        scene.append(scn)
        episode.append(ep.rstrip('csv'))
        direction.append(direct)
        #written_by.append(wrt)

    current_scene = None
    current_dialogue = None
    current_character = None
    colon_in_front = False

    i = 0 

    while i < len(lines):
        if 'Scene:' in lines[i]:
            current_scene = lines[i]
            current_dialogue = None
            current_character = None

        elif (lines[i][0] == '(' and lines[i][-1] == ')') or ('Scene:' not in lines[i] and (lines[i][0] == '[' and lines[i][-1] == ']')):
            current_character = None
            current_dialogue = None
            update_lists(current_dialogue, current_character, current_scene, title, lines[i])# , writers)


        elif ':' in lines[i]:
            if lines[i][0] == ':':
                current_character = lines[i-1].strip()
                current_dialogue = lines[i].strip(':')
                colon_in_front = True
            elif lines[i].split(':')[1] != '':
                current_character = lines[i].split(':')[0]
                current_dialogue = lines[i].split(':')[1]
            else:
                current_character = lines[i].strip(':')
                current_dialogue = None
                
            if lines[i-1].strip()[-3:] == 'and':
                current_character = lines[i-1] + ' ' + current_character
            
        elif (lines[i] == 'Opening Credits' or lines[i] == 'Commercial Break') or lines[i] == 'End':
            current_character = None
            current_dialogue = None  
            update_lists(current_dialogue, current_character, lines[i], title, None) #, writers)
       
        else:
            if current_dialogue == None:
                current_dialogue = lines[i]
            else:
                current_dialogue += ' ' + lines[i]
        
        if i + 1 < len(lines) and current_dialogue != None and (lines[i+1][0] == '[' or (lines[i+1][0] == '(' and lines[i+1][-1] == ')') or ':' in lines[i+1] or lines[i+1] == 'Opening Credits' or lines[i+1] == 'Commercial Break' or lines[i+1] == 'End'):
            
            if colon_in_front:
                current_dialogue = current_dialogue.lower().strip()
                if (current_dialogue.endswith('monica') or current_dialogue.endswith('doctor')) or (current_dialogue.endswith('rachel') or current_dialogue.endswith('phoebe')):
                    current_dialogue = current_dialogue[:-6]
                elif current_dialogue.endswith('ross') or current_dialogue.endswith('joey'):
                    current_dialogue = current_dialogue[:-4]
                elif current_dialogue.endswith('chandler'):
                    current_dialogue = current_dialogue[:-8]
                elif current_dialogue.endswith('erica'):
                    current_dialogue = current_dialogue[:-5]
                elif current_dialogue.endswith('gate attendant #1') or current_dialogue.endswith('gate attendant #2'):
                    current_dialogue = current_dialogue[:-17]
                elif (current_dialogue.endswith('ticket agent') or current_dialogue.endswith('passenger #1')) or (current_dialogue.endswith('passenger #2') or current_dialogue.endswith('passenger #3')):
                    current_dialogue = current_dialogue[:-12]
                elif current_dialogue.endswith('air stewerdess'):
                    current_dialogue = current_dialogue[:-14]
                
                if current_dialogue != '':
                    update_lists(current_dialogue, current_character, current_scene, title, None) #, writers)
            else:
                update_lists(current_dialogue, current_character, current_scene, title, None) #, writers)


        i += 1
    
    script_df = pd.DataFrame(list(zip(episode, scene, direction, character, dialogue)), columns=['episode', 'scene', 'direction', 'characters', 'dialogue'])
    
    script_df['scene']  = list(map(clean_scenes, script_df['scene']))
    
    for i in range(len(script_df)):
        if i == 0 or script_df['scene'][i] != script_df['scene'][i-1]:
            if script_df['direction'][0] == '' or script_df['direction'][0] == None:
                script_df['direction'][i] = dir_from_scene(script_df['scene'][i])
            elif script_df['direction'][i] != None and dir_from_scene(script_df['scene'][i]) != None:
                script_df['direction'][i] = script_df['direction'][i] + ' ' + dir_from_scene(script_df['scene'][i])
            #print(dir_from_scene(script_df['scene'][i]))
            
    for i in range(len(script_df)):
        
        if script_df['dialogue'][i] != None and '(' in script_df['dialogue'][i]:
            script_df['direction'][i] = script_df['dialogue'][i][script_df['dialogue'][i].find("(") + 1:script_df['dialogue'][i].find(")")]
            script_df['dialogue'][i] = re.sub("[\(].*?[\)]", '', script_df['dialogue'][i])

        
    script_df['scene'] = list(map(isolate_scene, script_df['scene']))
    script_df['direction'] = list(map(clean_directions, script_df['direction']))
    
    return script_df.applymap(text_clean)



In [277]:
for episode in os.listdir('friendsscraper/spiders/'):
    if 'ep-' in episode: 
        path = 'friendsscraper/spiders/' + episode
        raw_lines, ep_title = read_file(path)
        df = process_script(raw_lines, ep_title)
        ep_title = ep_title.lower()
        df.to_csv('friendsscraper/spiders/processed_ep_%s' % ep_title, index=False)


friendsscraper/spiders/ep-the one with ross tan.csv
the one with ross tan.csv
0      False
1      False
2      False
3      False
4      False
5       True
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
       ...  
582    False
583    False
584    False
585    False
586    False
587    False
588    False
589    False
590     True
591    False
592    False
593    False
594    False
595    False
596    False
597    False
598    False
599    False
600    False
601    False
602    False
603    False
604    False
605    False
606    False
607    False
608    False
609    False
610    False
611    False
Name: lines, Length: 612, dtype: bool
friendsscraper/spiders/ep-the one with the screamer.csv
the one with the screamer.csv
0    

friendsscraper/spiders/ep-the one with the inappropriate sister.csv
the one with the inappropriate sister.csv
0      False
1      False
2      False
3      False
4      False
5      False
6       True
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
       ...  
572    False
573    False
574    False
575    False
576    False
577    False
578    False
579     True
580    False
581    False
582    False
583    False
584    False
585    False
586    False
587    False
588    False
589    False
590    False
591    False
592    False
593    False
594    False
595    False
596    False
597    False
598    False
599    False
600    False
601    False
Name: lines, Length: 602, dtype: bool
friendsscraper/spiders/ep-the one with the late thanksgivin

friendsscraper/spiders/ep-the one where monica and richard are friends.csv
the one where monica and richard are friends.csv
0      False
1      False
2      False
3      False
4       True
5      False
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
       ...  
592    False
593    False
594    False
595    False
596    False
597    False
598    False
599    False
600     True
601    False
602    False
603    False
604    False
605    False
606    False
607    False
608    False
609    False
610    False
611    False
612    False
613    False
614    False
615    False
616    False
617    False
618    False
619    False
620    False
621    False
Name: lines, Length: 622, dtype: bool
friendsscraper/spiders/ep-the one with all th

friendsscraper/spiders/ep-the one with five steaks and an eggplant.csv
the one with five steaks and an eggplant.csv
0      False
1      False
2      False
3       True
4      False
5       True
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
       ...  
269    False
270    False
271    False
272    False
273    False
274    False
275    False
276    False
277    False
278    False
279    False
280    False
281    False
282    False
283    False
284    False
285    False
286    False
287    False
288    False
289    False
290    False
291    False
292    False
293    False
294    False
295    False
296    False
297    False
298    False
Name: lines, Length: 299, dtype: bool
friendsscraper/spiders/ep-the one with the vows.csv
t

Name: lines, Length: 651, dtype: bool
friendsscraper/spiders/ep-the one with the soap opera party.csv
the one with the soap opera party.csv
0      False
1      False
2      False
3      False
4      False
5      False
6       True
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
       ...  
635    False
636    False
637    False
638    False
639    False
640    False
641    False
642    False
643    False
644    False
645    False
646    False
647    False
648    False
649    False
650    False
651    False
652    False
653    False
654     True
655    False
656    False
657    False
658    False
659    False
660    False
661    False
662    False
663    False
664    False
Name: lines, Length: 665, dtype: bool
friendsscraper/spiders/ep-the

TypeError: reduction operation 'argmax' not allowed for this dtype

In [278]:
#ex = pd.read_csv('friendsscraper/spiders/processed_ep_the one where chandler crosses a line.csv', encoding='utf-8')
#ex.head(20)
ex_raw = pd.read_csv('friendsscraper/spiders/ep-the one with the metaphorical tunnel.csv')
ex_raw.tail(30)

Unnamed: 0,the one with the metaphorical tunnel
499,Ross:
500,There was no song. (to Monica) There was no s...
501,Monica:
502,(singing)
503,I am Bea.
504,
505,Ross:
506,Okay.
507,Monica:
508,I drink tea.


In [228]:
raw_lines, ep_title = read_file("friendsscraper/spiders/ep-the last one.csv")
last_proc = process_script(raw_lines, ep_title)

friendsscraper/spiders/ep-the last one.csv
the last one.csv


In [229]:
last_proc.head(50)

Unnamed: 0,episode,scene,direction,characters,dialogue
0,the last one,monica and chandlers apartment,vo,,jennifer aniston
1,the last one,monica and chandlers apartment,,jennifer aniston vo,previously on friends
2,the last one,monica and chandlers apartment,,monica,erica are you okay
3,the last one,monica and chandlers apartment,,erica,yeah you know maybe i ate too much i keep gett...
4,the last one,monica and chandlers apartment,,monica,oh my god
5,the last one,monica and chandlers apartment,,chandler,relax well just get her some antacids
6,the last one,monica and chandlers apartment,,monica,she doesnt have a stomachache shes in labor
7,the last one,monica and chandlers apartment,,chandler,oh my god
8,the last one,monica and chandlers apartment,cut to rosss apartment ross and rachel are the...,,
9,the last one,monica and chandlers apartment,,rachel,so if you think i didnt say goodbye to you bec...
