In [None]:
import os
import sys
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
# Quick check that the NLTK stopword corpus can be loaded (prints the whole list).
# If this fails in a fresh environment, run the two downloads below once.
# NOTE(review): nltk.word_tokenize, used further down, presumably also needs the
# 'punkt' tokenizer data -- confirm and add it to the downloads if so.
print(stopwords.words('english'))

# nltk.download('stopwords')
# nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from collections import Counter

from gensim.corpora.dictionary import Dictionary

# import plotly
# import plotly.plotly as py
# import plotly.offline as offline
# import plotly.graph_objs as go
from nltk.tokenize import word_tokenize


In [7]:
# read in data
# ----------------------------------
# The CSV location can be overridden with the OFFICE_LINES_CSV environment
# variable so the notebook is not tied to a single machine. The original
# absolute path is kept as the fallback so existing setups keep working.
LINES_CSV_PATH = os.environ.get(
    "OFFICE_LINES_CSV",
    "/Users/dan/Documents/talk/the_office_area_plot/data/the_office_lines.csv",
)
lines = pd.read_csv(LINES_CSV_PATH, index_col=None)
lines.head()


Unnamed: 0,id,season,episode,scene,line_text,speaker,deleted
0,1,1,1,1,All right Jim. Your quarterlies look very good...,Michael,False
1,2,1,1,1,"Oh, I told you. I couldn't close it. So...",Jim,False
2,3,1,1,1,So you've come to the master for guidance? Is ...,Michael,False
3,4,1,1,1,"Actually, you called me in here, but yeah.",Jim,False
4,5,1,1,1,"All right. Well, let me show you how it's done.",Michael,False


In [3]:
# Store the season number as text instead of as an integer
# ----------------------------------
season_as_text = lines['season'].astype(str)
lines['season'] = season_as_text

# show the resulting column types
lines.dtypes


id            int64
season       object
episode       int64
scene         int64
line_text    object
speaker      object
deleted        bool
dtype: object

In [38]:
# Word Tokenize
# --------
# Resources that are identical for every row; filled on first use.
_WORD_TOKEN_RESOURCES = {}


def _word_token_resources():
    '''Return the (stopword set, lemmatizer) pair, building it on first use.

    The stopword list and the lemmatizer do not depend on the row being
    processed, so they are created once and reused rather than being rebuilt
    for every token / every row.
    '''
    if not _WORD_TOKEN_RESOURCES:
        # build both before storing so a failure leaves the cache empty
        stop_set = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _WORD_TOKEN_RESOURCES.update(stops=stop_set, lemmatizer=lemmatizer)
    return _WORD_TOKEN_RESOURCES['stops'], _WORD_TOKEN_RESOURCES['lemmatizer']


def word_token(row):
    '''text preprocessing function

    1. remove stage instructions in brackets
    2. trim white space
    3. tokenize
    4. lowercase all tokens
    5. remove special characters (keep purely alphabetic tokens only)
    6. remove stopwords
    7. lemmatize all words

    arg:
        row of a df; only its ``line_text`` attribute is read

    return:
        clean, tokenized row as a list of strings. An empty list is
        returned when ``line_text`` is not a string (e.g. a missing value).
    '''

    text = row.line_text

    # a missing line (such as NaN) has nothing to tokenize
    if not isinstance(text, str):
        return []

    stop_set, lemmatizer = _word_token_resources()

    # remove stage instructions in brackets
    # (raw string: same pattern as before, without invalid escape sequences)
    no_brackets = re.sub(r"[\(\[].*?[\)\]]", "", text)

    # trim white space
    no_white = no_brackets.strip()

    # tokenize words
    word_tokens = nltk.word_tokenize(no_white)

    # lowercase all words
    lower_tokens = [token.lower() for token in word_tokens]

    # remove special characters: punctuation, numbers and any token that
    # contains a non-letter (e.g. an apostrophe) are dropped
    alpha_only = [token for token in lower_tokens
                    if token.isalpha()]

    # remove i, but, you, as, etc. (stopwords)
    no_stops = [token for token in alpha_only
                    if token not in stop_set]

    # lemmatize all words
    lemmatized = [lemmatizer.lemmatize(token) for token in no_stops]

    return lemmatized


def word_token_loop(df):
    '''Tokenize every row of df and store the result in a 'word_token' column.

    word_token is applied row by row (axis=1). The column is added to the
    frame that was passed in, and that same frame is returned.
    '''
    tokens_per_row = df.apply(word_token, axis=1)
    df['word_token'] = tokens_per_row
    return df


# word_token_loop adds the 'word_token' column to `lines` itself and returns that
# same frame, so `lines_word_token` and `lines` refer to one DataFrame from here on.
lines_word_token = word_token_loop(lines)
lines_word_token.head()


Unnamed: 0,id,season,episode,scene,line_text,speaker,deleted,word_token
0,1,1,1,1,All right Jim. Your quarterlies look very good...,Michael,False,"[right, jim, quarterly, look, good, thing, lib..."
1,2,1,1,1,"Oh, I told you. I couldn't close it. So...",Jim,False,"[oh, told, could, close]"
2,3,1,1,1,So you've come to the master for guidance? Is ...,Michael,False,"[come, master, guidance, saying, grasshopper]"
3,4,1,1,1,"Actually, you called me in here, but yeah.",Jim,False,"[actually, called, yeah]"
4,5,1,1,1,"All right. Well, let me show you how it's done.",Michael,False,"[right, well, let, show, done]"


In [39]:
# Bag-of-words over the tokenized lines
# -------------------------------------
# Each entry of the word_token column is itself a list of tokens, so the column
# is first flattened into one long list; Counter can then tally individual words.

# per-line token lists, first 1000 lines only
stacked_list = lines_word_token['word_token'].head(1000).to_list()

# one flat list holding every token from those lines
flat_list = [token for line_tokens in stacked_list for token in line_tokens]

# Create the bag-of-words: bow
bow = Counter()
bow.update(flat_list)

# Print every token count (bow.most_common(10) would give only the ten most common)
print(bow)

Counter({'know': 110, 'oh': 85, 'right': 79, 'um': 78, 'yeah': 76, 'well': 68, 'uh': 67, 'going': 67, 'ok': 66, 'good': 63, 'go': 63, 'get': 58, 'one': 57, 'like': 56, 'think': 54, 'let': 50, 'pam': 48, 'would': 44, 'really': 43, 'want': 42, 'michael': 41, 'dwight': 40, 'jim': 39, 'come': 39, 'people': 37, 'hey': 35, 'could': 33, 'great': 33, 'time': 32, 'need': 32, 'thank': 31, 'take': 31, 'guy': 29, 'yes': 28, 'thing': 27, 'sorry': 26, 'see': 26, 'say': 26, 'something': 25, 'got': 25, 'u': 23, 'kind': 23, 'okay': 23, 'office': 22, 'care': 22, 'na': 22, 'look': 21, 'way': 21, 'plan': 21, 'call': 20, 'downsizing': 20, 'maybe': 20, 'actually': 19, 'work': 19, 'tell': 19, 'little': 19, 'two': 19, 'diversity': 19, 'please': 18, 'everybody': 17, 'year': 16, 'mean': 16, 'today': 16, 'god': 16, 'gon': 16, 'googi': 16, 'health': 16, 'hello': 15, 'job': 15, 'sure': 15, 'hero': 15, 'around': 15, 'surprise': 15, 'paper': 14, 'talk': 14, 'much': 14, 'ca': 14, 'man': 14, 'said': 14, 'make': 14, 'm