In [1]:
import itertools
import os
import sys
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
print(stopwords.words('english'))

# nltk.download('stopwords')
# nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from collections import Counter

from gensim.corpora.dictionary import Dictionary

# import plotly
# import plotly.plotly as py
# import plotly.offline as offline
# import plotly.graph_objs as go
from nltk.tokenize import word_tokenize


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [2]:
# read in data
# ----------------------------------
lines = pd.read_csv("/Users/dan/Documents/talk/the_office_area_plot/data/the_office_lines.csv", index_col=None)
lines.head()


Unnamed: 0,id,season,episode,scene,line_text,speaker,deleted
0,1,1,1,1,All right Jim. Your quarterlies look very good...,Michael,False
1,2,1,1,1,"Oh, I told you. I couldn't close it. So...",Jim,False
2,3,1,1,1,So you've come to the master for guidance? Is ...,Michael,False
3,4,1,1,1,"Actually, you called me in here, but yeah.",Jim,False
4,5,1,1,1,"All right. Well, let me show you how it's done.",Michael,False


In [3]:
# Convert season to string data type
# ----------------------------------
lines['season'] = lines['season'].astype(str)
lines.dtypes


id            int64
season       object
episode       int64
scene         int64
line_text    object
speaker      object
deleted        bool
dtype: object

In [4]:
# Word Tokenize
# --------
def word_token_lines(row):
    '''text preprocessing function

    1. remove stage instructions in brackets
    2. trim white space
    3. tokenize
    4. remove special characters
    5. remove stopwords
    6. stem all words

    arg:
        row of a df

    return:
        clean, tokenized row
    '''

    # remove stage instructions in brackets
    no_brackets = re.sub("[\(\[].*?[\)\]]", "", row.line_text)

    # trim white space
    no_white = no_brackets.strip()

    # tokenize words
    word_tokens = nltk.word_tokenize(no_white)

    # lowercase all words
    lower_tokens = [token.lower() for token in word_tokens]

    # remove special characters
    alpha_only = [token for token in lower_tokens
                    if token.isalpha()]

    # remove i, but, you, as, etc. (stopwords)
    no_stops = [token for token in alpha_only
                    if token not in stopwords.words('english')]

    # stem all words
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatized = [wordnet_lemmatizer.lemmatize(token) for token in no_stops]

    return lemmatized


def word_token_loop(df):
    # for loop to tokenize each row
    df['word_token'] = df.apply(word_token_lines, axis=1)
    return df


lines_word_token = word_token_loop(lines)
lines_word_token.head()

Unnamed: 0,id,season,episode,scene,line_text,speaker,deleted,word_token
0,1,1,1,1,All right Jim. Your quarterlies look very good...,Michael,False,"[right, jim, quarterly, look, good, thing, lib..."
1,2,1,1,1,"Oh, I told you. I couldn't close it. So...",Jim,False,"[oh, told, could, close]"
2,3,1,1,1,So you've come to the master for guidance? Is ...,Michael,False,"[come, master, guidance, saying, grasshopper]"
3,4,1,1,1,"Actually, you called me in here, but yeah.",Jim,False,"[actually, called, yeah]"
4,5,1,1,1,"All right. Well, let me show you how it's done.",Michael,False,"[right, well, let, show, done]"


In [5]:
# Speaker Tokenize
# --------
def word_token_speaker(row):
    '''text preprocessing function

    1. remove stage instructions in brackets
    2. trim white space
    3. tokenize
    4. remove special characters

    arg:
        row of a df

    return:
        clean, tokenized row
    '''

    # remove stage instructions in brackets
    no_brackets = re.sub("[\(\[].*?[\)\]]", "", row.speaker)

    # trim white space
    no_white = no_brackets.strip()

    # tokenize words
    word_tokens = nltk.word_tokenize(no_white)

    # lowercase all words
    lower_tokens = [token.lower() for token in word_tokens]

    # remove special characters
    alpha_only = [token for token in lower_tokens
                    if token.isalpha()]

    return alpha_only


def word_speaker_loop(df):
    # for loop to tokenize each row
    df['word_token_speaker'] = df.apply(word_token_speaker, axis=1)
    return df


lines_word_speaker = word_speaker_loop(lines_word_token)
lines_word_speaker.head()

Unnamed: 0,id,season,episode,scene,line_text,speaker,deleted,word_token,word_token_speaker
0,1,1,1,1,All right Jim. Your quarterlies look very good...,Michael,False,"[right, jim, quarterly, look, good, thing, lib...",[michael]
1,2,1,1,1,"Oh, I told you. I couldn't close it. So...",Jim,False,"[oh, told, could, close]",[jim]
2,3,1,1,1,So you've come to the master for guidance? Is ...,Michael,False,"[come, master, guidance, saying, grasshopper]",[michael]
3,4,1,1,1,"Actually, you called me in here, but yeah.",Jim,False,"[actually, called, yeah]",[jim]
4,5,1,1,1,"All right. Well, let me show you how it's done.",Michael,False,"[right, well, let, show, done]",[michael]


In [6]:
# lines_word_token['word_token'].head()

# i think onion to convert this1 column into a flat lreverseist and then the bag of words counter will work

# Create the bag-of-words: bow
# bow = Counter(lines_word_token['word_token'].to_list())
# stacked_list = lines_word_token['word_token'].head(1000).to_list()
# print(stacked_list)
# flat_list = [sentence for sub_list in stacked_list for sentence in sub_list]
# print(flat_list)

# bow = Counter(flat_list)
# Print the 10 most common tokens
# print(bow.most_common(10))


In [7]:

# lines_word_token['word_token'] = lines_word_token['word_token'].astype(str)
# lines_word_token['word_token_speaker'] = lines_word_token['word_token_speaker'].astype(str)
# grouped_lines = lines_word_token.groupby(['word_token_speaker'])['word_token'].apply(','.join).reset_index()
# lines_word_token.dtypes

In [8]:
# grouped_lines.to_csv('speaker_group_by.csv', index=False)
# lines_word_token.to_csv('lines_no_group.csv', index=False)

In [9]:
# grouped_lines['speaker'] = grouped_lines['value'].str[0]
lines_word_speaker['word_token_speaker'] = lines_word_speaker['word_token_speaker'].astype(str)
michael = lines_word_speaker[lines_word_speaker['speaker'] == 'Michael']
michael_list = michael['word_token'].to_list()
#print(michael_list)
# print(michael)
# michael.head()
# lines_word_speaker.head()
# michael = grouped_lines[grouped_lines['word_token_speaker'] == "['michael']"]
# michael['word_token'] = michael['word_token'].str.get(0)
# stacked_list = michael['word_token'].to_list()
# double_stacked = stacked_list.to_list()

michael_flat_list = [sentence for sub_list in michael_list for sentence in sub_list]
print(michael_flat_list)
# michael.to_csv('michael.csv', index=False)
# with open('your_file.txt', 'w') as f:
#     for item in michael_flat_list:
#         f.writelines(item)
# michael.head()



['right', 'jim', 'quarterly', 'look', 'good', 'thing', 'library', 'come', 'master', 'guidance', 'saying', 'grasshopper', 'right', 'well', 'let', 'show', 'done', 'yes', 'like', 'speak', 'office', 'manager', 'please', 'yes', 'hello', 'michael', 'scott', 'regional', 'manager', 'dunder', 'mifflin', 'paper', 'product', 'wanted', 'talk', 'right', 'done', 'deal', 'thank', 'much', 'sir', 'gentleman', 'scholar', 'oh', 'sorry', 'ok', 'sorry', 'mistake', 'woman', 'talking', 'low', 'voice', 'probably', 'smoker', 'way', 'done', 'uh', 'dunder', 'mifflin', 'year', 'last', 'four', 'regional', 'manager', 'want', 'come', 'see', 'entire', 'floor', 'kingdom', 'far', 'eye', 'see', 'receptionist', 'pam', 'pam', 'pam', 'beesly', 'pam', 'u', 'forever', 'right', 'pam', 'think', 'cute', 'seen', 'couple', 'year', 'ago', 'message', 'oh', 'pam', 'corporate', 'many', 'time', 'told', 'special', 'filing', 'cabinet', 'thing', 'corporate', 'called', 'wastepaper', 'basket', 'look', 'look', 'face', 'people', 'say', 'best

In [10]:
def get_character_list(df, character_name):
    character_df = df[df['speaker'] == character_name]
    character_list = character_df['word_token'].to_list()
    character_flat_list = [sentence for sub_list in character_list for sentence in sub_list]

    return character_flat_list


characters = ['Michael',
              'Dwight',
              'Jim',
              'Pam',
              'Andy',
              'Angela',
              'Kevin',
              'Ryan',
              'Erin',
              'Oscar',
              'Darryl',
              'Kelly',
              'Jan',
              'Toby',
              'Phyllis',
              'Nellie',
              'Stanley',
              'Gabe',
              'Robert',
              'Holly',
              'Meredith',
              'Creed',
              'David Wallace',
              'Todd Packer']


character_corpus = []
for character in characters:
    character_list = get_character_list(lines_word_speaker, character)
    character_corpus.append(character_list)


print(character_corpus[23])


['hey', 'big', 'queen', 'hey', 'old', 'godzillary', 'coming', 'today', 'meaning', 'ask', 'one', 'question', 'carpet', 'match', 'drape', 'mello', 'michael', 'scott', 'secret', 'lover', 'intercom', 'told', 'never', 'call', 'never', 'bergerstein', 'bergerstein', 'want', 'money', 'back', 'greedy', 'hebrew', 'bergerstein', 'ca', 'make', 'coming', 'oh', 'god', 'stop', 'whining', 'know', 'come', 'pick', 'game', 'year', 'little', 'bitch', 'right', 'yup', 'hey', 'okay', 'problem', 'know', 'best', 'hey', 'know', 'hey', 'nice', 'talking', 'right', 'take', 'care', 'michael', 'scoot', 'funny', 'good', 'friend', 'two', 'thumb', 'like', 'bone', 'mom', 'guy', 'bleep', 'bleep', 'halpert', 'still', 'queer', 'kidding', 'everyone', 'know', 'know', 'okay', 'check', 'al', 'lright', 'story', 'randall', 'nailing', 'secretary', 'right', 'totally', 'incompetent', 'talking', 'blonde', 'incompetent', 'like', 'word', 'minute', 'talking', 'woman', 'right', 'said', 'suddenly', 'reason', 'bimbo', 'blow', 'whistle', '

In [101]:
from gensim.corpora.dictionary import Dictionary

dictionary = Dictionary(character_corpus)
print(dictionary)

Dictionary(15829 unique tokens: ['aaaaaaaa', 'aaaaaand', 'aaaaeeexcellent', 'aaaah', 'aaaahh']...)


In [106]:
computer_id = dictionary.token2id.get("computer")
# print(computer_id)
print(dictionary.get(computer_id))

computer


In [13]:
corpus = [dictionary.doc2bow(character) for character in character_corpus]
print(corpus[0][:10])

[(0, 2), (1, 1), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 3), (8, 1), (9, 1)]


In [98]:
doc = corpus[3]

# Sort the doc for frequency: bow_doc
bow_doc = sorted(doc, key=lambda w: w[1], reverse=True)

# Print the top 5 words of the document alongside the count
for word_id, word_count in bow_doc[:5]:
    print(dictionary.get(word_id), word_count)

# Create the defaultdict: total_word_count
# look upwards here using this default dictionary
total_word_count = nltk.defaultdict(int)
for word_id, word_count in itertools.chain.from_iterable(corpus):
    total_word_count[word_id] += word_count

oh 461
yeah 366
know 346
like 318
michael 309


In [99]:
# print(total_word_count)
print(bow_doc)


[(4955, 461), (8097, 366), (4023, 346), (4187, 318), (4525, 309), (4965, 289), (3384, 264), (5798, 257), (3024, 240), (7285, 240), (3882, 207), (4978, 190), (7551, 182), (3067, 173), (7894, 165), (7824, 163), (3086, 161), (4733, 158), (2281, 154), (3078, 144), (4441, 133), (6028, 131), (3083, 124), (3147, 124), (8116, 119), (8053, 118), (7329, 113), (1440, 111), (1654, 108), (3103, 105), (7262, 104), (4963, 103), (6307, 101), (7283, 101), (6212, 99), (7540, 99), (7215, 98), (504, 97), (6712, 97), (3221, 93), (4337, 93), (4251, 91), (6695, 90), (7534, 90), (3073, 89), (4425, 89), (4782, 85), (4277, 83), (1855, 82), (1053, 80), (6161, 77), (4154, 76), (5233, 73), (4815, 72), (4217, 68), (7074, 66), (7856, 65), (4949, 63), (5120, 63), (3992, 62), (8031, 62), (4827, 61), (7266, 61), (7150, 60), (4269, 59), (4700, 59), (2725, 58), (7352, 58), (7803, 58), (8098, 58), (294, 57), (2663, 55), (7295, 55), (3388, 54), (253, 52), (1067, 52), (2502, 52), (2513, 51), (3887, 51), (4813, 50), (7158, 5

In [100]:
[x for x in bow_doc if x[0] == 1182]


[(1182, 29)]

In [71]:
from gensim.models.tfidfmodel import TfidfModel
tfidf = TfidfModel(corpus)

In [72]:
tfidf_weights = tfidf[doc]

In [73]:

print(tfidf_weights[:5])

[(7, 0.00733433720379682), (10, 0.00733433720379682), (14, 0.008298928718089514), (17, 0.017109708582439355), (29, 0.00947949331972942)]


In [116]:
michael_tfidf = sorted(tfidf_weights, key=lambda w: w[1], reverse=True)
print(michael_tfidf)

[(1182, 0.1504862515480624), (4525, 0.14224595898808046), (6108, 0.11368222665885072), (9947, 0.1100150580569523), (11292, 0.10517329537302265), (3882, 0.09529098223473352), (3101, 0.09263520232382205), (3845, 0.08454453600878913), (7551, 0.08378240950107005), (1809, 0.08337168209143984), (7873, 0.07937077628950621), (12119, 0.07701054063986662), (10066, 0.071706653688688), (5110, 0.06974789661397213), (12151, 0.06725532209410498), (12277, 0.06725532209410498), (12512, 0.06725532209410498), (6840, 0.06635645323810595), (5960, 0.06393557189614112), (4631, 0.06392452441056401), (4441, 0.06122560694308965), (4043, 0.05867469763037456), (11232, 0.05500752902847615), (5280, 0.05365764964618433), (4471, 0.05258664768651133), (8855, 0.05258664768651133), (11081, 0.05258664768651133), (11790, 0.05258664768651133), (12124, 0.05258664768651133), (5492, 0.052150293591773096), (7329, 0.052018748756158875), (1341, 0.0518991068150628), (3949, 0.051891810878642204), (2183, 0.05134036042657775), (3251

In [120]:
dictionary.get(4525)

'michael'

In [121]:
# default dict doesn't make sense at all
# total_word_count[1182]

In [122]:
[x for x in corpus[3] if x[0] == 4525]


[(4525, 309)]

In [124]:
print(str(dictionary[4525]), str(dictionary.cfs[4525]))

michael 1838
