# Comparing two Counters

Today we will look at a way of scoring the significance of differences between frequency distributions, based on a method called "Fightin' Words" by Monroe, Colaresi, and Quinn.

In [None]:
import re, sys, glob, math
import numpy
from collections import Counter
from matplotlib import pyplot

1. What is the encoding of the files? How are they structured? What do we need to do to separate text from non-textual words like speakers and stage directions?

2. Look at the most frequent words in the counters for comedy and tragedy. What is different? Is this view informative about differences between these two genres?

3. There is a problem calculating `log_ranks`. What is it, and how can we fix it?

4. What does the `generate_scores` function do? What is the effect of the `smoothing` parameter?

5. Look at the plot showing "Fightin' Words" scores for comedy vs. tragedy. What stands out? What does this tell you about these genres in Shakespeare? What changes, if any, might you make to how we tokenize or otherwise pre-process the documents?

6. Create the same plot for tragedy vs. history and comedy vs. history. What is different? What words would you want to look at in their original context and why?

In [None]:
# Directory that holds the plain-text plays of each genre
genre_directories = {
    "tragedy": "shakespeare/tragedies",
    "comedy": "shakespeare/comedies",
    "history": "shakespeare/historical",
}

# A token is either a run that starts and ends with a word character and may
# contain hyphens or apostrophes (straight or curly) in between, or a single
# word character. Written as a raw string so the backslashes reach the regex
# engine unchanged instead of relying on Python passing through string
# escapes it does not recognize (which triggers a warning on recent versions).
word_pattern = re.compile(r"\w[\w\-'’]*\w|\w")

# This counter will store the total frequency of each word type across all plays
all_counts = Counter()

# This dictionary will have one counter for each genre
genre_counts = {}

# This dictionary will have one dictionary for each genre, each containing one Counter for each play in that genre
genre_play_counts = {}

In [None]:
# Read the plays from files: one Counter per play, added into the per-genre
# counter and into the counter for the whole collection.

for genre, directory in genre_directories.items():

    genre_play_counts[genre] = {}
    genre_counts[genre] = Counter()

    # glob returns matches in arbitrary order. Sorting fixes the order in
    # which words are first seen, which is what decides the order of
    # equal-count words in most_common(), so results are reproducible.
    for filename in sorted(glob.glob("{}/*.txt".format(directory))):

        play_counter = Counter()

        genre_play_counts[genre][filename] = play_counter

        with open(filename, encoding="utf-8") as file: ## What encoding?

            ## This block reads a file line by line.
            for line in file:
                line = line.rstrip()

                # Every match of word_pattern on the line counts as one token
                tokens = word_pattern.findall(line)

                play_counter.update(tokens)

        # Add this play's counts to its genre and to the overall totals
        genre_counts[genre] += play_counter
        all_counts += play_counter

In [None]:
# Show which genres have an aggregate counter
genre_counts.keys()

In [None]:
# Show which genres have a dictionary of per-play counters
genre_play_counts.keys()

In [None]:
# Show the keys of the comedy plays: the file paths exactly as glob returned them
genre_play_counts["comedy"].keys()

In [None]:
# The 30 most frequent tokens of a single play; the key has to match the
# path string stored when the file was read
genre_play_counts["comedy"]["shakespeare/comedies/The Merry Wives of Windsor.txt"].most_common(30)

In [None]:
# The 15 most frequent tokens over all comedies
genre_counts["comedy"].most_common(15)

In [None]:
# The 15 most frequent tokens over all tragedies
genre_counts["tragedy"].most_common(15)

In [None]:
# Word types ordered from most to least frequent; every array below is
# aligned with this list (index i describes vocabulary[i]).
vocabulary = [w for w, c in all_counts.most_common()]
vocabulary_size = len(vocabulary)

# Total count of each word over all plays, and its natural logarithm
total_word_counts = numpy.array([all_counts[w] for w in vocabulary])
log_counts = numpy.log(total_word_counts)

# word_ranks is zero-based (the most frequent word has rank 0) and log(0) is
# -inf, so the logarithm is taken of the one-based rank instead.
word_ranks = numpy.arange(len(vocabulary))
log_ranks = numpy.log(word_ranks + 1)

genres = genre_play_counts.keys()

In [None]:
# Log-log scatter of total count against frequency rank, one point per word type
pyplot.scatter(log_ranks, log_counts, alpha = 0.2)
# Label the axes so the figure can be read without looking at the code
pyplot.xlabel("log(rank)")
pyplot.ylabel("log(total count)")
pyplot.show()

In [None]:
def generate_scores(counter, smoothing = 0.0):
    """Return the smoothed count of every vocabulary word as a float array.

    Entry ``i`` of the result is ``counter[vocabulary[i]] + smoothing``, so
    the array lines up with the module-level ``vocabulary`` list and has
    ``vocabulary_size`` entries.

    counter -- mapping from word to count; it is indexed with every
               vocabulary word (a Counter yields 0 for words it lacks)
    smoothing -- constant added to every count
    """
    raw_counts = numpy.fromiter(
        (counter[word] for word in vocabulary),
        dtype=float,
        count=vocabulary_size,
    )
    return raw_counts + smoothing

def count_difference(counter_a, counter_b, smoothing):
    """Score every vocabulary word by how differently two counters use it.

    Each counter is turned into smoothed counts with ``generate_scores``.
    A word's odds within one counter are its smoothed count divided by the
    summed smoothed counts of all other words. The score is the log of the
    ratio of the two odds (counter_a over counter_b) divided by the square
    root of ``1/count_a + 1/count_b``. Positive scores therefore mean
    higher odds in ``counter_a``, negative scores higher odds in
    ``counter_b``.

    Returns an array aligned with the vocabulary used by ``generate_scores``.
    """

    def odds(counts):
        # Count of this word against the count of everything else
        return counts / (numpy.sum(counts) - counts)

    counts_a = generate_scores(counter_a, smoothing)
    counts_b = generate_scores(counter_b, smoothing)

    log_odds_ratio = numpy.log(odds(counts_a) / odds(counts_b))

    # The rarer a word is in either counter, the larger this term becomes
    variance = (1.0 / counts_a) + (1.0 / counts_b)

    return log_odds_ratio / numpy.sqrt(variance)

In [None]:
# Score every vocabulary word for comedy (first counter) against tragedy
# (second counter), with smoothing set to 0.0.
# NOTE(review): with smoothing 0.0 a word that is missing from either genre
# gets an entry of 0 from generate_scores, and count_difference then divides
# by that entry (1.0/scores_a, 1.0/scores_b) -- consider a positive value.
comedy_tragedy_scores = count_difference(genre_counts["comedy"], genre_counts["tragedy"], 0.0)

In [None]:
# (score, word) pairs in ascending score order: the first entries lean most
# towards tragedy, the last entries most towards comedy.
# NaN scores are left out, because NaN compares False against every value
# and a list containing it cannot be sorted into a meaningful order.
sorted_words = sorted(
    (score, word)
    for score, word in zip(comedy_tragedy_scores, vocabulary)
    if not numpy.isnan(score)
)

print(sorted_words[:10])
print(sorted_words[-10:])

In [None]:
# Scatter of every word's comedy-vs-tragedy score against its log total count
pyplot.figure(figsize=(20, 20))
pyplot.xlim(3, 11)
pyplot.scatter(log_counts, comedy_tragedy_scores, alpha = 0.2)
# Label the axes so the figure can be read without looking at the code
pyplot.xlabel("log(total count)")
pyplot.ylabel("comedy vs. tragedy score (positive = comedy)")
# Print the word itself next to points that are frequent and/or strongly
# skewed: the absolute score plus the log count must exceed 7.5
for word, log_count, score in zip(vocabulary, log_counts, comedy_tragedy_scores):
    if numpy.abs(score) + log_count > 7.5:
        pyplot.text(log_count, score, word)
pyplot.show()