# GOT Analysis
### Import packages
This notebook uses `codecs` to read the text files, `re` for regular expressions, `collections` for counting tokens, `numpy` and `pandas` for building the frequency tables, `nltk` (the Natural Language Toolkit) for tokenizing, stemming and stop words, and `wordcloud` for creating word clouds.

In [173]:
#% matplotlib inline
import codecs
import re
import copy
import collections
import numpy as np
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
#from __future__ import division

In [174]:
# Load the full contents of both GOT text files as UTF-8 strings.
def _read_utf8(path):
    """Return the entire contents of ``path`` decoded as UTF-8."""
    with codecs.open(path, "r", encoding="utf-8") as handle:
        return handle.read()

got1 = _read_utf8("got1.txt")
got2 = _read_utf8("got2.txt")

### WORD CLOUD

In [None]:
# Build the stop-word set that is excluded from both clouds.
# NOTE: the name `stopwords` is rebound further down by
# `from nltk.corpus import stopwords`.
from wordcloud import WordCloud, STOPWORDS
stopwords = set(STOPWORDS)
# Both clouds share the same settings: at most 20 words on a white background.
cloud_options = dict(stopwords=stopwords, max_words=20, background_color="white")
wordcloud1 = WordCloud(**cloud_options).generate(got1)
wordcloud2 = WordCloud(**cloud_options).generate(got2)

In [None]:
# Render each word cloud as its own figure, with the axes hidden.
import matplotlib.pyplot as mpLib
for cloud in (wordcloud1, wordcloud2):
    mpLib.imshow(cloud)
    mpLib.axis("off")
    mpLib.show()

### MOST COMMON WORDS

In [None]:
# NLTK's stop-word lists are not included by default, so download them before use.
# Only the "stopwords" resource is fetched here (downloading all of NLTK is an alternative).
nltk.download('stopwords')

In [None]:
# Build the stop-word list: NLTK's English stop words plus "would".
from nltk.corpus import stopwords
esw = stopwords.words('english') + ["would"]

In [None]:
# Matches a token made up entirely of word characters (\w); it is used to
# drop punctuation tokens produced by the tokenizer.
# A raw string is required: "\w" inside a normal string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
word_pattern = re.compile(r"^\w+$")

In [None]:
# Token Counter Function
def get_text_counter(text, stem=True):
    """Count the filtered tokens of ``text``.

    The text is split with ``WordPunctTokenizer``, each token is lower-cased,
    and only tokens that match ``word_pattern`` and are not in the stop-word
    list ``esw`` are kept. Surviving tokens are then Porter-stemmed one by one.

    Parameters
    ----------
    text : str
        The document to analyse.
    stem : bool, default True
        When True, each kept token is reduced to its Porter stem. Pass False
        to count the lower-cased surface forms instead.

    Returns
    -------
    tuple
        ``(counter, size)`` where ``counter`` is a ``collections.Counter`` of
        the kept tokens and ``size`` is the total number of kept tokens.
    """
    # Set membership is O(1); scanning the stop-word list per token is not.
    stop_words = set(esw)
    lowered = (raw.lower() for raw in WordPunctTokenizer().tokenize(text))
    tokens = [
        token
        for token in lowered
        if word_pattern.match(token) and token not in stop_words
    ]
    if stem:
        # PorterStemmer.stem works on a single word, so it has to be applied
        # per token. Stop words are filtered first because the stop-word list
        # holds unstemmed forms.
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
    return collections.Counter(tokens), len(tokens)

Create a function to calculate the absolute frequency and relative frequency of the most common words.

In [None]:
def make_df(counter, size):
    """Build a frequency table from ``(word, count)`` pairs.

    Parameters
    ----------
    counter : iterable of (word, count) pairs
        Typically the list returned by ``Counter.most_common(n)``. The
        iterable is materialised once, so a generator is accepted too.
    size : int or float
        Denominator used to turn each count into a relative frequency.

    Returns
    -------
    pandas.DataFrame
        Indexed by word (index name "Most common words") with the columns
        "Absolute frequency" and "Relative frequency", in input order.
    """
    pairs = list(counter)
    index = pd.Index([pair[0] for pair in pairs], name="Most common words")
    abs_freq = np.asarray([pair[1] for pair in pairs])
    # Build the columns separately: stacking both into one ndarray would
    # upcast the integer counts to float.
    return pd.DataFrame(
        {"Absolute frequency": abs_freq, "Relative frequency": abs_freq / size},
        index=index,
        columns=["Absolute frequency", "Relative frequency"],
    )

In [None]:
# Count GOT1's filtered tokens, then tabulate its 20 most frequent words.
g1_counter, g1_size = get_text_counter(got1)
g1_top20 = g1_counter.most_common(20)
make_df(g1_top20, g1_size)

In [None]:
# Write the frequency table of GOT1's 1000 most common words to G1_1000.csv.
g1_top1000 = g1_counter.most_common(1000)
je_df = make_df(g1_top1000, g1_size)
je_df.to_csv("G1_1000.csv")

In [None]:
# Count GOT2's filtered tokens, then tabulate its 20 most frequent words.
g2_counter, g2_size = get_text_counter(got2)
g2_top20 = g2_counter.most_common(20)
make_df(g2_top20, g2_size)

Save the 1000 most common words of GOT2 to CSV.

In [None]:
# Write the frequency table of GOT2's 1000 most common words to G2_1000.csv.
g2_top1000 = g2_counter.most_common(1000)
wh_df = make_df(g2_top1000, g2_size)
wh_df.to_csv("G2_1000.csv")

Find the most common words across the two documents.

In [None]:
# Merge the token counts of both texts so the ranking covers the combined corpus.
all_counter = g2_counter + g1_counter
# Rank on the merged counter rather than on g2_counter alone, so words that
# are frequent only in GOT1 are included as well. size=1 leaves the relative
# frequency equal to the raw count; only the index is used below.
all_df = make_df(all_counter.most_common(1000), 1)
most_common_words = all_df.index.values

Create a data frame with the word frequency differences.

In [None]:
# For every top word, compute its relative frequency in each text and the
# absolute gap between the two.
rel_pairs = [
    (g1_counter.get(word, 0) / g1_size, g2_counter.get(word, 0) / g2_size)
    for word in most_common_words
]
df_data = [[g1_rel, g2_rel, abs(g1_rel - g2_rel)] for g1_rel, g2_rel in rel_pairs]
dist_columns = ["GOT1 relative frequency", "GOT2 relative frequency",
                "Relative frequency difference"]
dist_df = pd.DataFrame(data=df_data, index=most_common_words, columns=dist_columns)
dist_df.index.name = "Most common words"
# Largest gap first, so the most distinctive words are at the top.
dist_df = dist_df.sort_values("Relative frequency difference", ascending=False)

Display the most distinctive words.

In [None]:
# Show the first 20 rows, i.e. the words with the largest relative frequency difference.
dist_df.head(20)

In [None]:
# Save the full table of distinctive words to GOT.csv
dist_df.to_csv("GOT.csv")