### Getting transcript data from https://transcripts.foreverdreaming.org/viewforum.php?f=574
### The Office episode ratings from https://www.kaggle.com/kapastor/the-office-imdb-ratings-per-episode

In [1]:
import os
import csv
import re
import pandas as pd
import pickle
import string
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.tokenize import word_tokenize

In [2]:
# Root folder of the raw data set. The IMDB ratings CSV is read from this
# folder and its sub-folders whose name contains 'season' are scanned for
# episode transcripts. NOTE(review): absolute, machine-specific path --
# adjust before running elsewhere.
RAW_BASE_PATH = '/Users/gursharan/Desktop/the_office_transcript/'

In [3]:
# Load the per-episode IMDB ratings table. The file is opened in a `with`
# block so the handle is closed as soon as pandas has finished parsing it
# (previously the handle returned by open() was never closed).
with open( os.path.join(RAW_BASE_PATH, 'TheOfficeIMDBPerEpisode.csv'), 'r' ) as imdb_file:
    theOfficeIMDB = pd.read_csv( imdb_file )

In [4]:
# Upper-case names of the characters whose lines are kept; any other
# speaker is dropped while parsing the transcripts.
CHARACTERS_UPPER = (
    'MICHAEL JIM PAM DWIGHT OSCAR ANGELA '
    'KELLY TOBY ANDY PHYLLIS RYAN GABE '
    'ERIN DARRYL CREED KEVIN MEREDITH'
).split()

# Translation table for str.translate that deletes every ASCII punctuation
# character (each code point maps to None).
TRANSITION = { ord(symbol): None for symbol in string.punctuation }

In [5]:
def toCharacterNameList( charGroupName ):
    """Split a speaker label such as 'JIM & PAM' into individual names.

    The label is cut at every '&' and surrounding whitespace is removed
    from each piece. A label without '&' yields a one-element list.
    """
    return list( map( str.strip, charGroupName.split('&') ) )

def rawTranscriptToCharacterDialogueMap( file_path ):
    """Parse one raw transcript file into [character, dialogue] pairs.

    Each line is split on '*'. A usable line has at least three pieces:
    the text before the first '*', the speaker label between the first
    two '*', and the dialogue after it. The label may name several
    speakers joined by '&'; one entry is produced per speaker that is
    listed in CHARACTERS_UPPER.

    Args:
        file_path: path of the transcript file to read.

    Returns:
        A list (despite the function name) of two-element lists
        [CHARACTER_NAME_UPPER, dialogue], in file order.
    """
    transcript = []
    # `with` guarantees the file is closed even if parsing raises.
    with open( file_path, 'r' ) as file:
        for raw_line in file:
            parts = raw_line.split('*')
            if len(parts) < 3: # empty, character name, dialogue
                continue
            names = toCharacterNameList( parts[1].upper() )

            # When the colon sits inside the '*' markers the dialogue piece
            # has no prefix to skip; otherwise its first two characters
            # are skipped (presumably the ': ' separator). Decide this
            # once per line so that every speaker of a shared line gets
            # the same text.
            colon_in_label = any( name.endswith(':') for name in names )
            start = 0 if colon_in_label else 2

            # Re-join with '*' so asterisks inside the dialogue survive.
            dialogue = '*'.join( [ parts[2][start:] ] + parts[3:] )
            # Drop only the line terminator; a final line without a
            # trailing newline must keep its last character.
            if dialogue.endswith('\n'):
                dialogue = dialogue[:-1]
            if not dialogue:
                continue

            for name in names:
                if name.endswith(':'):
                    name = name[:-1]
                if name not in CHARACTERS_UPPER:
                    continue
                transcript.append( [ name, dialogue ] )
    return transcript

In [6]:
# Build {season folder name: {episode file name: parsed transcript}} from
# every entry under RAW_BASE_PATH whose name contains 'season'; inside a
# season only files whose name contains '.html' are parsed.
season_map_parsed = {}
for season in os.listdir(RAW_BASE_PATH):
    if 'season' in season:
        season_path = os.path.join(RAW_BASE_PATH, season)
        episodes_map = {
            episode: rawTranscriptToCharacterDialogueMap(
                os.path.join(season_path, episode) )
            for episode in os.listdir(season_path)
            if '.html' in episode
        }
        season_map_parsed[season] = episodes_map

In [7]:
def cleanDialogue( text ):
    """Normalise one line of dialogue into lower-case, space-joined tokens.

    Steps, in order:
      1. separator-like characters (newline . , … - : ;) become spaces;
      2. straight and curly quotes/apostrophes are deleted;
      3. bracketed spans ``[...]`` are removed;
      4. remaining ASCII punctuation is stripped via TRANSITION;
      5. the text is tokenised, lower-cased, and English stop words are
         dropped.

    Args:
        text: raw dialogue string.

    Returns:
        The cleaned tokens joined by single spaces ('' if none remain).
    """
    for char in ['\n','.',',','…','-',':',';']:
        text = text.replace(char, ' ')
    for char in ["'",'"','’']:
        text = text.replace(char, '')

    # Non-greedy so that several bracketed spans on one line are removed
    # individually rather than as one span from first '[' to last ']'.
    text = re.sub(r"\[.*?\]", "", text)
    text = text.strip().translate(TRANSITION)

    # Lower-case BEFORE the stop-word test: the stop list is lower-case, so
    # testing the raw token let capitalised stop words ("The", "You")
    # slip through.
    tokens = ( token.lower() for token in word_tokenize(text) )
    return ' '.join( token for token in tokens if token not in ENGLISH_STOP_WORDS )

In [8]:
# Mirror season_map_parsed, keeping each speaker but replacing the raw
# dialogue with its cleanDialogue() form.
season_map_cleaned = {
    season: {
        episode: [
            [ entry[0], cleanDialogue( entry[1] ) ]
            for entry in parsed_transcript
        ]
        for episode, parsed_transcript in parsed_episodes.items()
    }
    for season, parsed_episodes in season_map_parsed.items()
}

In [9]:
# Persist the raw transcripts, the cleaned transcripts and the IMDB table
# together, as one list, in a pickle file in the working directory.
artifacts = [season_map_parsed, season_map_cleaned, theOfficeIMDB]
with open('./the_office_transcript.pickle', 'wb') as pickle_file:
    pickle.dump( artifacts, pickle_file )