In [1]:
import pandas as pd
import numpy as np
import re
import string
import os
import csv
import math

def clean_line(line):
    """Classify one raw transcript line and return its category code.

    Returns one of: 'delete' (empty or missing line), 'scene' (starts with
    '['), 'direction' (wrapped in parentheses), 'character' (contains a
    colon), 'credits', 'commercials', 'end', or 'dialogue' for anything else.
    The checks are applied in that order, so a bracketed line that also
    contains a colon is still reported as 'scene'.
    """
    if line is None or line == '':
        return 'delete'

    first_char = line[0]
    if first_char == '[':
        return 'scene'
    if first_char == '(' and line[-1] == ')':
        return 'direction'
    if ':' in line:
        return 'character'

    # Fixed marker lines emitted by the scraper, with their category codes.
    marker_codes = (
        ("Opening Credits", 'credits'),
        ('Commercial Break', 'commercials'),
        ('End', 'end'),
    )
    for marker, code in marker_codes:
        if line == marker:
            return code
    return 'dialogue'

def remove_returns(line):
    """Strip newline characters from a cell value.

    Missing values (None, or anything whose string form is 'nan', which
    includes float NaN and the literal text 'nan') yield None.
    """
    is_missing = line is None or str(line) == 'nan'
    if is_missing:
        return None
    return line.replace('\n', '')

def read_file(file_path):
    """Load one scraped episode CSV and return its usable transcript lines.

    The file is read as a single headerless column. Everything before the
    first line containing '[' is discarded, newlines are stripped from each
    remaining line, and lines classified as 'delete' by ``clean_line``
    (empty or missing) are dropped.

    Parameters
    ----------
    file_path : str
        Path to the episode CSV. The prefix 'friendsscraper/spiders/ep-' is
        removed from it to form the returned title.

    Returns
    -------
    tuple
        ``(lines, title)`` where ``lines`` is a 1-D object array of strings
        and ``title`` is the path with the scraper prefix removed.
    """
    df = pd.read_csv(file_path, header=None, encoding="utf-8")
    df.columns = ['lines']

    print(file_path)
    # Get episode title
    title = file_path.replace('friendsscraper/spiders/ep-', '')
    print(title)

    # Locate the first line containing '['. NaN cells are treated as
    # "no match" (na=False) so the mask stays aligned with df's rows;
    # filtering NaNs out of the mask first would shift the position and
    # let header rows leak into the script.
    has_bracket = df['lines'].str.contains('[', regex=False, na=False)
    # argmax of an all-False mask is 0, i.e. the whole file is kept.
    start = int(has_bracket.values.argmax())

    # Plain-Python mapping keeps None as None (no pandas type inference)
    # before the lines are classified.
    stripped = [remove_returns(value) for value in df['lines'].iloc[start:]]
    kept = [value for value in stripped if clean_line(value) != 'delete']

    lines = np.empty(len(kept), dtype=object)
    lines[:] = kept

    return (lines, title)

def clean_scenes(scene):
    """Strip scene-header markup from a scene string.

    Removes the '[Scene:' prefix, any ']' and ')' characters and
    non-breaking spaces, then trims surrounding whitespace. None and the
    marker values 'Opening Credits', 'End' and 'Commercial Break' are
    returned unchanged.
    """
    if scene in (None, 'Opening Credits', 'End', 'Commercial Break'):
        return scene

    cleaned = scene
    for token in ('[Scene:', ']', ')', '\xa0'):
        cleaned = cleaned.replace(token, '')
    return cleaned.strip()
    
def dir_from_scene(scene):
    """Return the stage-direction text that follows the location in a scene.

    The scene string is split on whichever of ',' or '.' appears first
    (',' when neither occurs). Everything after the first piece is joined
    back together with single spaces and returned.

    Returns None when ``scene`` is None or one of the marker values
    ('Opening Credits', 'End', 'Commercial Break'), and also when nothing
    but whitespace follows the location. The previous check compared a
    list against '' and was therefore always true, so '' was returned in
    that case and the None branch was unreachable.
    """
    if scene in (None, 'Opening Credits', 'End', 'Commercial Break'):
        return None

    # First ',' or '.' in the string decides the delimiter; default to ','.
    delim = next((ch for ch in scene if ch in (',', '.')), ',')
    remainder = ' '.join(scene.split(delim)[1:])

    if not remainder.strip():
        return None
    return remainder

def isolate_scene(scene):
    """Return only the location part of a scene string.

    The location is everything before the first ',' or '.'; when neither
    character occurs the whole string is returned. None and the marker
    values 'Opening Credits', 'End' and 'Commercial Break' pass through
    unchanged.
    """
    if scene in (None, 'Opening Credits', 'End', 'Commercial Break'):
        return scene

    for position, char in enumerate(scene):
        if char in (',', '.'):
            # Text before the first delimiter is the location.
            return scene[:position]
    return scene
    
def clean_directions(direction):
    """Remove every '(' and ')' from a direction string; None passes through."""
    if direction is None:
        return direction
    # Single-pass deletion of both parenthesis characters.
    return direction.translate({ord('('): None, ord(')'): None})
    
def text_clean(line):
    """Normalise a text cell: lower-case, trim, then drop ASCII punctuation.

    None and list values are returned untouched. Trimming happens before
    punctuation removal, so whitespace that sat next to trailing
    punctuation is kept.
    """
    if line is None or type(line) is list:
        return line
    drop_punctuation = str.maketrans('', '', string.punctuation)
    lowered = line.lower()
    trimmed = lowered.strip()
    return trimmed.translate(drop_punctuation)

def process_script(lines, title):
    """Turn the transcript lines of one episode into a tidy DataFrame.

    ``lines`` is walked once as a small state machine that tracks the
    current scene, speaker and accumulated dialogue. A row is recorded for
    every stand-alone direction line, every marker line ('Opening Credits',
    'Commercial Break', 'End') and every finished piece of dialogue. Scene
    text is then split into a location and a scene-level direction,
    parenthesised directions are pulled out of the dialogue, and every cell
    is normalised with ``text_clean``.

    Parameters
    ----------
    lines : sequence of str
        Non-empty transcript lines, as returned by ``read_file``.
    title : str
        Episode title; a trailing '.csv' is removed.

    Returns
    -------
    pandas.DataFrame
        Columns 'episode', 'scene', 'direction', 'characters', 'dialogue'.

    Notes
    -----
    Dialogue is only recorded when the following line starts something new,
    so dialogue on the very last line of ``lines`` is not recorded.
    NOTE(review): confirm every transcript ends with an 'End' line.
    """
    markers = ('Opening Credits', 'Commercial Break', 'End')
    # Speaker names that end up glued to the end of the previous dialogue in
    # transcripts where the name sits on its own line before a ':' line.
    # No entry is a suffix of another, so at most one can match.
    trailing_speakers = (
        'monica', 'doctor', 'rachel', 'phoebe',
        'ross', 'joey',
        'chandler',
        'erica',
        'gate attendant #1', 'gate attendant #2',
        'ticket agent', 'passenger #1', 'passenger #2', 'passenger #3',
        'air stewerdess',
    )

    dialogue = []
    character = []
    scene = []
    episode = []
    direction = []

    def update_lists(dia, char, scn, ep, direct):
        dialogue.append(dia)
        character.append(char)
        scene.append(scn)
        # Remove the '.csv' suffix itself; rstrip('csv') strips a character
        # set and would also eat trailing c/s/v letters of a bare title.
        episode.append(ep[:-4] if ep.endswith('.csv') else ep)
        direction.append(direct)

    def starts_new_item(text):
        # True when `text` opens a scene/direction/speaker/marker, i.e. the
        # dialogue collected so far is complete.
        return (
            text.startswith('[')
            or (text.startswith('(') and text.endswith(')'))
            or ':' in text
            or text in markers
        )

    current_scene = None
    current_dialogue = None
    current_character = None
    # Set once a line starting with ':' is seen and never cleared: it marks
    # the whole transcript as using the "name on its own line" layout.
    colon_in_front = False

    line_count = len(lines)
    for i in range(line_count):
        line = lines[i]

        if 'Scene:' in line:
            current_scene = line
            current_dialogue = None
            current_character = None

        elif (line.startswith('(') and line.endswith(')')) or (line.startswith('[') and line.endswith(']')):
            # Stand-alone stage direction.
            current_character = None
            current_dialogue = None
            update_lists(current_dialogue, current_character, current_scene, title, line)

        elif ':' in line:
            # Guard i == 0 so lines[i-1] does not wrap around to the last line.
            previous = lines[i - 1] if i > 0 else ''
            if line[0] == ':':
                current_character = previous.strip()
                current_dialogue = line.strip(':')
                colon_in_front = True
            else:
                # Split on the first colon only, so later colons in the
                # speech (e.g. "3:00") are kept instead of truncating it.
                speaker, _, spoken = line.partition(':')
                if spoken != '':
                    current_character = speaker
                    current_dialogue = spoken
                else:
                    current_character = line.strip(':')
                    current_dialogue = None

            # "X and" on the previous line continues a multi-speaker name.
            if previous.strip()[-3:] == 'and':
                current_character = previous + ' ' + current_character

        elif line in markers:
            current_character = None
            current_dialogue = None
            update_lists(current_dialogue, current_character, line, title, None)

        else:
            # Continuation of the current dialogue.
            if current_dialogue is None:
                current_dialogue = line
            else:
                current_dialogue += ' ' + line

        if i + 1 < line_count and current_dialogue is not None and starts_new_item(lines[i + 1]):
            if colon_in_front:
                current_dialogue = current_dialogue.lower().strip()
                for name in trailing_speakers:
                    if current_dialogue.endswith(name):
                        current_dialogue = current_dialogue[:-len(name)]
                        break
                if current_dialogue != '':
                    update_lists(current_dialogue, current_character, current_scene, title, None)
            else:
                update_lists(current_dialogue, current_character, current_scene, title, None)

    # Post-processing is done on the plain lists rather than through chained
    # DataFrame assignment, which is silently ignored under copy-on-write.
    scene = [clean_scenes(scn) for scn in scene]

    # On the first row of each scene, fold the scene-level direction into
    # that row's own direction (the check previously looked at row 0 only).
    for i, scn in enumerate(scene):
        if i == 0 or scn != scene[i - 1]:
            scene_direction = dir_from_scene(scn)
            if direction[i] is None or direction[i] == '':
                direction[i] = scene_direction
            elif scene_direction is not None:
                direction[i] = direction[i] + ' ' + scene_direction

    # Move parenthesised directions out of the dialogue text.
    # NOTE(review): this overwrites any scene-level direction stored on the
    # row and keeps only the first parenthetical; a dialogue with '(' but no
    # ')' loses its final character in the extracted direction.
    parenthetical = re.compile(r"[\(].*?[\)]")
    for i, spoken in enumerate(dialogue):
        if spoken is not None and '(' in spoken:
            direction[i] = spoken[spoken.find("(") + 1:spoken.find(")")]
            dialogue[i] = parenthetical.sub('', spoken)

    scene = [isolate_scene(scn) for scn in scene]
    direction = [clean_directions(direct) for direct in direction]

    # Normalise every cell up front; this avoids DataFrame.applymap, which
    # is deprecated in recent pandas releases.
    rows = [
        tuple(text_clean(value) for value in row)
        for row in zip(episode, scene, direction, character, dialogue)
    ]
    return pd.DataFrame(rows, columns=['episode', 'scene', 'direction', 'characters', 'dialogue'])



In [3]:
# Preview the first 20 rows of the raw scraped CSV for one episode.
# The two commented-out lines load a 'processed_ep_' file for the same
# episode instead (presumably the output of process_script — confirm).
#ex = pd.read_csv('friendsscraper/spiders/processed_ep_the one where chandler crosses a line.csv', encoding='utf-8')
#ex.head(20)
ex_raw = pd.read_csv('friendsscraper/spiders/ep-the one where chandler crosses a line.csv')
ex_raw.head(20)

Unnamed: 0,the one where chandler crosses a line
0,Written by: Adam Chase
1,\nTranscribed by:
2,Eric Aasen
3,"[Scene: Monica and Rachels, everyone except Jo..."
4,Chandler:
5,(entering in a bathrobe) I just walked in the...
6,Ross:
7,Yknow if we ever go to war and youre capture...
8,big
9,surprise.


In [4]:
# Preview the first 20 rows of the raw scraped CSV for a second episode.
# Unlike read_file, this read uses the default header handling, so the
# first line of the file becomes the column name.
ex_raw = pd.read_csv('friendsscraper/spiders/ep-the one with all the poker.csv')
ex_raw.head(20)

Unnamed: 0,the one with all the poker
0,Written by: Jeffrey Astrof and Mike Sikowitz. .
1,(The whole gang is helping Rachel mail out res...
2,The\nBridge on the River Kwai.
3,)
4,Ross:
5,"Uh, Rach, were running low on resumes over here."
6,Monica:
7,Do you really want a job with
8,Popular Mechanics
9,?
