# Comparing Genres in a Corpus and Calculating Readability

In [1]:
#provided code
import nltk
nltk.download("treebank")
nltk.download("cmudict")
nltk.download("brown")
nltk.download("movie_reviews")


[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\Francesco\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\Francesco\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Francesco\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Francesco\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [2]:
#provided code
from nltk.corpus import treebank,cmudict,brown,movie_reviews
from collections import defaultdict, Counter
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

### Comparing Genres

Compare the following aspects for the adventure, fiction, and mystery genres in the brown corpus.

- average sentence length
- percent of words in the corpus which are adjectives (JJ, JJR, JJS, JJT tags)
- lexical density
- 50 words strongly associated with each genre

In [3]:
categories = ["adventure","fiction","mystery"]

In [5]:
### Your code here
import pandas as pd
from nltk.corpus import stopwords

adj_tags = ["JJ", "JJR", "JJS", "JJT"]

stopwords_English=stopwords.words("English")
stopwords_English.extend(["--",";","!","?",'"',"``",",",".", "''", ":"])

most_common={}
average_length = {}
percent_of_adj = {}
lexical_density = {}

open_class = {"N", "V", "J", "R"}
for i in categories:
    total_length = 0
    adj_count = 0
    open_total = 0

# get average length
    for sent in brown.sents(categories= i):
        total_length += len(sent)
    average_length[i] = total_length/len(brown.sents(categories= i))

# get percent of words in the corpus which are adjectives (JJ, JJR, JJS, JJT tags)
    for sentence in brown.tagged_sents(categories= i):
        for tag_word in sentence:
            for j in adj_tags:
                if j in tag_word:
                    adj_count+=1
    percent_of_adj[i] = adj_count/total_length

# get lexical density
    for word, pos in brown.tagged_words(categories= i):
        if pos[0] in open_class:
            open_total += 1
    lexical_density[i] = open_total/len(brown.words())

def get_unigram_probs(words):
    '''get unigram probabilities for the words in a corpus'''
    counts = Counter(word.lower() for word in words)
    total = sum(counts.values())
    return {word:count/total for word,count in counts.items()}

def combine_probs(prob1,prob2):
    '''given unigram probabilities for two corpora, combine the two into a
    singe dictionary by averaging them'''
    all_words = set(prob1.keys())
    all_words.update(prob2.keys())
    return {word:(prob1.get(word,0) + prob2.get(word,0))/2 for word in all_words}

def subtract_probs(prob1, prob2):
    '''given unigram probability dictionaries for two corpora, create a dictionary of
    the differences between the two probabilities'''
    all_words = set(prob1.keys())
    all_words.update(prob2.keys())
    return {word:prob1.get(word,0) - prob2.get(word,0) for word in all_words}

# get most common
fict_probs = get_unigram_probs(brown.words(categories="fiction"))
adventure_probs = get_unigram_probs(brown.words(categories="adventure"))
mystery_probs = get_unigram_probs(brown.words(categories="mystery"))

fict_and_adventure = combine_probs(fict_probs, adventure_probs)
mystery_and_adventure = combine_probs(mystery_probs, adventure_probs)
fict_and_mystery = combine_probs(fict_probs, mystery_probs)

fict_bias_probs = subtract_probs(fict_probs, mystery_and_adventure)
mystery_bias_probs = subtract_probs(mystery_probs, fict_and_adventure)
adventure_bias_probs =  subtract_probs(adventure_probs, fict_and_mystery)

def delete_stopword(dictionary):
    for key in dictionary.copy().keys():
        if key in stopwords_English:
            del dictionary[key]

delete_stopword(fict_bias_probs)
delete_stopword(mystery_bias_probs)
delete_stopword(adventure_bias_probs)

fiction_df = pd.DataFrame.from_dict(sorted(fict_bias_probs.items(), key=lambda x:x[1], reverse = True)).rename(columns={0: "fiction_word", 1: "fiction_frequency"}).head(50)
mystery_df = pd.DataFrame.from_dict(sorted(mystery_bias_probs.items(), key=lambda x:x[1], reverse = True)).rename(columns={0: "mystery_word", 1: "mystery_frequency"}).head(50)
adventure_df= pd.DataFrame.from_dict(sorted(adventure_bias_probs.items(), key=lambda x:x[1], reverse = True)).rename(columns={0: "adventure_word", 1: "adventure_frequency"}).head(50)

In [6]:
average_length

{'adventure': 14.95406512831572,
 'fiction': 16.118616144975288,
 'mystery': 14.71152856407617}

In [7]:
percent_of_adj

{'adventure': 0.040653572149635143,
 'fiction': 0.04517579721995094,
 'mystery': 0.03902464622435236}

In [8]:
lexical_density

{'adventure': 0.024518770367002184,
 'fiction': 0.024035646129150046,
 'mystery': 0.019721114165443785}

In [9]:
fiction_df

Unnamed: 0,fiction_word,fiction_frequency
0,would,0.001197
1,church,0.0008
2,kate,0.000584
3,winston,0.000526
4,john,0.000495
5,scotty,0.000467
6,hans,0.000467
7,watson,0.000438
8,god,0.000427
9,doctor,0.000427


In [14]:
adventure_df

Unnamed: 0,adventure_word,adventure_frequency
0,said,0.000953
1,man,0.00064
2,eyes,0.000539
3,matsuo,0.000505
4,jess,0.000476
5,girl,0.000462
6,marine,0.000461
7,horses,0.000452
8,curt,0.000438
9,brannon,0.000418


In [15]:
mystery_df

Unnamed: 0,mystery_word,mystery_frequency
0,car,0.000877
1,back,0.000808
2,office,0.000711
3,mr.,0.000659
4,police,0.000604
5,door,0.000573
6,know,0.000517
7,andy,0.000493
8,call,0.000492
9,got,0.000478


### Calculating Readability Using Flesh Reading Ease

In [16]:
vowels = {"a","e","i","o","u","y"}
p_dict = cmudict.dict() # keep this outside as a global variable so you aren't reloading each time

def get_syllables(word):
    '''use CMU dict (p_dict) to count the number of syllables in word, default to number of vowels'''
    syllable_count = 0
    #your code here
    if word in p_dict.keys():
        pronunciation=p_dict[word][0]
        for i in pronunciation:
            if i[-1].isnumeric():
                syllable_count+=1
    else:
        for letter in word:
            if letter.lower() in vowels:
                syllable_count+=1
    #your code here
    return syllable_count

In [17]:
assert get_syllables("readability") == 5
assert get_syllables("blabglob") == 2
print("Success!")

Success!


In [18]:
def get_reading_ease(sentence):
    '''calculate the Flesh reading ease for a single sentence consisting of a list of words (strings)'''
    total_syllables = 0
    num_words = 0
    for word in sentence:
        if word.isalpha():
            num_words += 1
            total_syllables += get_syllables(word)
    if num_words==0:
        return None
    reading_ease = 206.835 - (1.015 * num_words) - (84.6 * (total_syllables/num_words))
    return reading_ease


In [19]:
assert 100 < get_reading_ease(["I", "am", "done", ",","man"]) < 140
assert -60 < get_reading_ease(["Felicitations", "for", "achieving", "a", "thoroughly", "excellent", "resolution", "to", "an", "altogether", "indombidable", "conundrum", "of", "humongous", "proportions", "."]) <-20
assert get_reading_ease(["?"]) == None
print("Success!")

Success!


In [20]:
def calculate_avg_reading_ease(corpus):
    '''calculate and return average Flesh reading ease for all sentences in a corpus'''
    total_readability=0
    total_sentences=0
    for sentence in corpus.sents():
        reading_ease=get_reading_ease(sentence)
        if reading_ease is not None:
            total_readability+=reading_ease
            total_sentences+=1
    average_sentence_readability=total_readability/total_sentences
    return average_sentence_readability
    

penn_readability = calculate_avg_reading_ease(treebank)
review_readability = calculate_avg_reading_ease(movie_reviews)

print("Treebank readability")
print(penn_readability)
print("Movie review readability")
print(review_readability)


Treebank readability
42.28948916594171
Movie review readability
59.39520661660777


In [21]:
assert 40 < penn_readability < 50
assert 55 < review_readability < 65
print("Success!")

Success!
