In [None]:
# Show images in notebook
%pylab inline

# Gather our tools
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Load the complete script of the play from disk into one string.
# The file is expected to sit next to this notebook.

with open('antigone.txt', mode='r') as file_in:
    antigone = file_in.read()

In [None]:
# Break the play into chunks at every blank line. Each chunk is expected
# to be one speech: the speaker's name, a space, then the words spoken.
antigone_list = antigone.split('\n\n')

# Create a dictionary where each key is the name of a character
# and each entry is a single string of all the words spoken by them
dialogue_dict = {}
for line in antigone_list:
    # Split at the first space only: speaker name on the left,
    # everything they say on the right.
    char, space, speech = line.partition(' ')
    if not space:
        # A chunk without any space cannot be separated into speaker and
        # speech (e.g. the empty chunk produced by a trailing blank line).
        # Skip it rather than crash the whole cell with a ValueError.
        continue
    # Keep the separating space in front of every speech so the last word
    # of one speech never fuses with the first word of the next.
    dialogue_dict[char] = dialogue_dict.get(char, '') + space + speech

In [None]:
# Since we're using an old translation of Antigone,
# let's supplement our stopword list with their old-timey forms

# Import ENGLISH_STOP_WORDS from the public `text` module. The old
# `sklearn.feature_extraction.stop_words` module path was deprecated and
# has been removed from newer scikit-learn releases, while this path works
# on old and new versions alike.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

ye_olde_stop_words = ['thou', 'thy', 'thee', 'ye', 'hath', 'hast', 'wilt',
                      'art', 'dost', 'doth', 'shalt', 'tis', 'canst', 'thyself']

# Combined list: the standard English stop words plus the archaic forms above
stop_words = list(ENGLISH_STOP_WORDS) + ye_olde_stop_words

In [None]:
# Create a document-term matrix ('dtm'), where each row is associated
# with a character and each column with a given word. The values
# of the matrix tell us how often each character uttered a given word.

# Boot up the dtm-maker, including our custom stopwords
cv = CountVectorizer(stop_words=stop_words)

# Fix the order of the characters once, so that the rows of the matrix and
# the labels of the dataframe index are guaranteed to line up
characters = list(dialogue_dict.keys())
speeches = [dialogue_dict[name] for name in characters]

# Create the dtm (one row per character, in the order of `characters`)
dtm = cv.fit_transform(speeches)

# Put the dtm into human-readable format.
# Newer scikit-learn exposes the vocabulary through get_feature_names_out();
# get_feature_names() only exists on older releases, so use whichever
# method this installation provides.
if hasattr(cv, 'get_feature_names_out'):
    word_list = list(cv.get_feature_names_out())
else:
    word_list = cv.get_feature_names()
dtm_df = pd.DataFrame(dtm.toarray(), columns=word_list, index=characters)

In [None]:
# Rank the characters from most to least talkative (total words spoken)
# and reorder the rows of the matrix to match that ranking
words_per_char = dtm_df.sum(axis=1).sort_values(ascending=False)
dtm_df = dtm_df.loc[words_per_char.index]

# Bar chart of each character's share of all words in the play, in percent
percent_of_play = words_per_char / words_per_char.sum() * 100
bar_positions = range(len(dtm_df))
plt.bar(bar_positions, percent_of_play)
plt.xticks(bar_positions, dtm_df.index, rotation=45)
plt.show()

In [None]:
# Work out the "Most Distinctive Words" for Antigone (the character)

# How often Antigone herself says each word
antigone_counts = dtm_df.loc['ANTIGONE']

# How often each word is said by anyone in the play
word_totals = dtm_df.sum()

# Collect both counts side by side, one row per word
mdw_df = pd.DataFrame(
    {'ANTIGONE': antigone_counts, 'WORD_TOTAL': word_totals},
    columns=['ANTIGONE', 'WORD_TOTAL'],
)

# Antigone's share of the total dialogue (fraction of all counted words)
char_space = antigone_counts.sum() / float(word_totals.sum())

# "Expected" count for each word: if Antigone used every word in proportion
# to her share of the dialogue, this is how often she would say it
mdw_df['ANTIGONE_EXPECTED'] = mdw_df['WORD_TOTAL'] * char_space

# Ratio of observed to expected use: above 1 means she says the word
# more often than her share of the dialogue predicts, below 1 less often
mdw_df['OBS-EXP_RATIO'] = mdw_df['ANTIGONE'] / mdw_df['ANTIGONE_EXPECTED']

In [None]:
# Show Antigone's 20 "Most Distinctive Words": words she uses more often
# than expected, limited to words that occur more than 5 times in the play,
# ranked by the Observed/Expected ratio
used_more_than_expected = mdw_df['OBS-EXP_RATIO'] > 1
frequent_in_play = mdw_df['WORD_TOTAL'] > 5
mdw_df[used_more_than_expected & frequent_in_play].sort_values('OBS-EXP_RATIO', ascending=False).head(20)