In [1]:
# Load libraries

import pandas as pd
import numpy as np
import nltk
import string

In [2]:
# Import data
# Read the raw text of both works through NLTK's Gutenberg corpus reader.

text_mobydick, text_hamlet = (
    nltk.corpus.gutenberg.raw(file_id)
    for file_id in ("melville-moby_dick.txt", "shakespeare-hamlet.txt")
)

In [3]:
# Tokenize the text
# Both raw texts go through the same tokenizer (nltk.word_tokenize).

tokens_mobydick, tokens_hamlet = map(nltk.word_tokenize, (text_mobydick, text_hamlet))

In [4]:
# Make tokens all lowercase
# New lists are built; the original-case token lists are left untouched.

tokens_mobydick_lower = list(map(str.lower, tokens_mobydick))
tokens_hamlet_lower = list(map(str.lower, tokens_hamlet))

In [10]:
# Apply stopwords to the tokens

nltk_stop_words = nltk.corpus.stopwords.words("english")
# Extra tokens dropped in addition to the NLTK list and the single
# punctuation characters (the duplicated "'s" entry was removed).
more_stop_words = ["'d", "'s", "n't", "--", "''", "``"]
all_stop_words = nltk_stop_words + more_stop_words + list(string.punctuation)

# Apply the stopwords
# Membership is tested against a frozenset: looking each token up in the
# list form scans the whole stop list once per token.

stop_word_lookup = frozenset(all_stop_words)

tokens_mobydick_lower_stopped = [w for w in tokens_mobydick_lower if w not in stop_word_lookup]
tokens_hamlet_lower_stopped = [w for w in tokens_hamlet_lower if w not in stop_word_lookup]

In [11]:
# Stem with the Porter stemmer

from nltk.stem import PorterStemmer
stemmer_porter = PorterStemmer()

# Perform the stemming
# Every lowercased, stopword-filtered token is stemmed; list length is preserved.

tokens_mobydick_stemmed = list(map(stemmer_porter.stem, tokens_mobydick_lower_stopped))
tokens_hamlet_stemmed = list(map(stemmer_porter.stem, tokens_hamlet_lower_stopped))

In [12]:
# Create token frequency distributions
# One FreqDist per work, counting the stemmed tokens.

fdist_tokens_mobydick, fdist_tokens_hamlet = (
    nltk.FreqDist(stems)
    for stems in (tokens_mobydick_stemmed, tokens_hamlet_stemmed)
)

In [13]:
# Top 50 words comparison dataframe
# Each frame holds the 50 most common stems with their counts, plus
# "PctTotal": the count divided by the number of stemmed tokens in that work
# (a fraction, not multiplied by 100).

top_50_tokens_mobydick = pd.DataFrame(
    fdist_tokens_mobydick.most_common(50),
    columns=["Token", "Count"],
).assign(PctTotal=lambda frame: frame["Count"] / len(tokens_mobydick_stemmed))

top_50_tokens_hamlet = pd.DataFrame(
    fdist_tokens_hamlet.most_common(50),
    columns=["Token", "Count"],
).assign(PctTotal=lambda frame: frame["Count"] / len(tokens_hamlet_stemmed))

In [20]:
# Create a list of bigrams
# Pairs of consecutive tokens, taken from the lowercased, stopword-filtered
# (unstemmed) token lists.

bigrams_mobydick = list(nltk.ngrams(tokens_mobydick_lower_stopped, 2))
bigrams_hamlet = list(nltk.ngrams(tokens_hamlet_lower_stopped, 2))

In [27]:
# Create bigram frequency distribution (normalized frequency)

from nltk import BigramCollocationFinder
bigram_measures = nltk.collocations.BigramAssocMeasures()

# Initiate the finders

finder_mobydick = BigramCollocationFinder.from_words(tokens_mobydick_lower_stopped)
finder_hamlet = BigramCollocationFinder.from_words(tokens_hamlet_lower_stopped)

# Score based on normalized frequency
# raw_freq divides each bigram's count by the finder's total (finder.N).

scored_mobydick_freq = finder_mobydick.score_ngrams(bigram_measures.raw_freq)
scored_hamlet_freq = finder_hamlet.score_ngrams(bigram_measures.raw_freq)

# Top 50 bigrams comparison dataframe
# Slice with [:50]; the earlier [0:49] silently kept only 49 rows.

top_50_bigrams_mobydick_freq = pd.DataFrame(scored_mobydick_freq[:50], columns = ["Bigram", "PctTotal"])
top_50_bigrams_hamlet_freq = pd.DataFrame(scored_hamlet_freq[:50], columns = ["Bigram", "PctTotal"])

# Add a column for the count
# Read the exact integer count from the finder's own bigram distribution
# instead of rebuilding it from the rounded product PctTotal * len(bigrams),
# which uses a different total than the one raw_freq was normalized by.

top_50_bigrams_mobydick_freq["Count"] = [
    finder_mobydick.ngram_fd[bigram] for bigram in top_50_bigrams_mobydick_freq["Bigram"]
]
top_50_bigrams_hamlet_freq["Count"] = [
    finder_hamlet.ngram_fd[bigram] for bigram in top_50_bigrams_hamlet_freq["Bigram"]
]

In [28]:
# Create bigram frequency distribution (mutual information)

from nltk import BigramCollocationFinder
bigram_measures = nltk.collocations.BigramAssocMeasures()

# Build dedicated finders for the PMI ranking.
# Filtering mutates a finder in place, so this cell works on its own finders
# rather than on `finder_mobydick` / `finder_hamlet` from another cell; it
# then gives the same result regardless of which cells ran before it.

finder_mobydick_pmi = BigramCollocationFinder.from_words(tokens_mobydick_lower_stopped)
finder_hamlet_pmi = BigramCollocationFinder.from_words(tokens_hamlet_lower_stopped)

# Add freq filters on the finders (drop bigrams seen fewer than 5 times)

finder_mobydick_pmi.apply_freq_filter(5)
finder_hamlet_pmi.apply_freq_filter(5)

# Score based on mutual information

scored_mobydick_pmi = finder_mobydick_pmi.score_ngrams(bigram_measures.pmi)
scored_hamlet_pmi = finder_hamlet_pmi.score_ngrams(bigram_measures.pmi)

# Top 50 bigrams comparison dataframe
# Slice with [:50]; the earlier [0:49] silently kept only 49 rows.

top_50_bigrams_mobydick_pmi = pd.DataFrame(scored_mobydick_pmi[:50], columns = ["Bigram", "PMIScore"])
top_50_bigrams_hamlet_pmi = pd.DataFrame(scored_hamlet_pmi[:50], columns = ["Bigram", "PMIScore"])

In [32]:
# Create a list of Trigrams
# Triples of consecutive tokens, taken from the lowercased, stopword-filtered
# (unstemmed) token lists.

trigrams_mobydick = list(nltk.ngrams(tokens_mobydick_lower_stopped, 3))
trigrams_hamlet = list(nltk.ngrams(tokens_hamlet_lower_stopped, 3))

In [34]:
# Create trigram frequency distribution (normalized frequency)

from nltk import TrigramAssocMeasures
from nltk.collocations import TrigramCollocationFinder
trigram_measures = nltk.collocations.TrigramAssocMeasures()

# Initiate the finders
# Trigram-specific names are used so the bigram finders and bigram scores
# defined in other cells are not silently rebound to trigram objects.

finder_trigrams_mobydick = TrigramCollocationFinder.from_words(tokens_mobydick_lower_stopped)
finder_trigrams_hamlet = TrigramCollocationFinder.from_words(tokens_hamlet_lower_stopped)

# Score based on normalized frequency
# raw_freq divides each trigram's count by the finder's total (finder.N).

scored_trigrams_mobydick_freq = finder_trigrams_mobydick.score_ngrams(trigram_measures.raw_freq)
scored_trigrams_hamlet_freq = finder_trigrams_hamlet.score_ngrams(trigram_measures.raw_freq)

# Top 50 trigrams comparison dataframe
# Slice with [:50]; the earlier [0:49] silently kept only 49 rows.

top_50_trigrams_mobydick_freq = pd.DataFrame(scored_trigrams_mobydick_freq[:50], columns = ["Trigram", "PctTotal"])
top_50_trigrams_hamlet_freq = pd.DataFrame(scored_trigrams_hamlet_freq[:50], columns = ["Trigram", "PctTotal"])

# Add a column for the count
# Read the exact integer count from the finder's own trigram distribution
# instead of rebuilding it from the rounded product PctTotal * len(trigrams),
# which uses a different total than the one raw_freq was normalized by.

top_50_trigrams_mobydick_freq["Count"] = [
    finder_trigrams_mobydick.ngram_fd[trigram] for trigram in top_50_trigrams_mobydick_freq["Trigram"]
]
top_50_trigrams_hamlet_freq["Count"] = [
    finder_trigrams_hamlet.ngram_fd[trigram] for trigram in top_50_trigrams_hamlet_freq["Trigram"]
]

In [35]:
# Output the top 50 dataframes into csv files
# Each frame is written to the current working directory under its own name,
# using DataFrame.to_csv with default options.

csv_exports = [
    (top_50_tokens_mobydick, "top_50_tokens_mobydick.csv"),
    (top_50_tokens_hamlet, "top_50_tokens_hamlet.csv"),
    (top_50_bigrams_mobydick_freq, "top_50_bigrams_mobydick_freq.csv"),
    (top_50_bigrams_hamlet_freq, "top_50_bigrams_hamlet_freq.csv"),
    (top_50_bigrams_mobydick_pmi, "top_50_bigrams_mobydick_pmi.csv"),
    (top_50_bigrams_hamlet_pmi, "top_50_bigrams_hamlet_pmi.csv"),
    (top_50_trigrams_mobydick_freq, "top_50_trigrams_mobydick_freq.csv"),
    (top_50_trigrams_hamlet_freq, "top_50_trigrams_hamlet_freq.csv"),
]

for export_frame, export_path in csv_exports:
    export_frame.to_csv(export_path)

In [45]:
# Summary of vocabulary size at different stages of text processing
# For every stage the cell prints a heading and then, for Moby Dick followed
# by Hamlet, the total number of tokens and the number of distinct tokens.

processing_stages = [
    # Raw text
    ("Raw Text", tokens_mobydick, tokens_hamlet),
    # All Lowercase
    ("\n All Lowercase", tokens_mobydick_lower, tokens_hamlet_lower),
    # Removed stopwords
    ("\n Removed Stopwords", tokens_mobydick_lower_stopped, tokens_hamlet_lower_stopped),
    # Stemmed with porter stemmer
    ("\n Stemmed", tokens_mobydick_stemmed, tokens_hamlet_stemmed),
]

for stage_heading, stage_mobydick, stage_hamlet in processing_stages:
    print(stage_heading)
    for stage_tokens in (stage_mobydick, stage_hamlet):
        print(len(stage_tokens))
        print(len(set(stage_tokens)))

Raw Text
255028
20742
36372
5535

 All Lowercase
255028
18701
36372
4807

 Removed Stopwords
108649
18543
15803
4685

 Stemmed
108649
12337
15803
3780


In [47]:
# Relative change in vocabulary size (distinct tokens) caused by lowercasing,
# Moby Dick first, then Hamlet. The sizes are derived from the token lists
# rather than copied by hand from an earlier output, so they cannot go stale.
vocab_raw_mobydick = len(set(tokens_mobydick))
vocab_raw_hamlet = len(set(tokens_hamlet))
print((len(set(tokens_mobydick_lower)) - vocab_raw_mobydick) / vocab_raw_mobydick)
print((len(set(tokens_hamlet_lower)) - vocab_raw_hamlet) / vocab_raw_hamlet)

-0.09839938289460998
-0.13152664859981933


In [48]:
# Relative change in total token count caused by stopword removal,
# Moby Dick first, then Hamlet. The counts are derived from the token lists
# rather than copied by hand from an earlier output, so they cannot go stale.
n_tokens_lower_mobydick = len(tokens_mobydick_lower)
n_tokens_lower_hamlet = len(tokens_hamlet_lower)
print((len(tokens_mobydick_lower_stopped) - n_tokens_lower_mobydick) / n_tokens_lower_mobydick)
print((len(tokens_hamlet_lower_stopped) - n_tokens_lower_hamlet) / n_tokens_lower_hamlet)

-0.5739722697115611
-0.5655174309908721


In [49]:
# Number of distinct tokens removed by the stopword filter,
# Hamlet first, then Moby Dick. The sizes are derived from the token lists
# rather than copied by hand from an earlier output, so they cannot go stale.
print(len(set(tokens_hamlet_lower)) - len(set(tokens_hamlet_lower_stopped)))
print(len(set(tokens_mobydick_lower)) - len(set(tokens_mobydick_lower_stopped)))

122
158


In [50]:
# Relative change in vocabulary size (distinct tokens) caused by stemming,
# Moby Dick first, then Hamlet. The sizes are derived from the token lists
# rather than copied by hand from an earlier output, so they cannot go stale.
vocab_stopped_mobydick = len(set(tokens_mobydick_lower_stopped))
vocab_stopped_hamlet = len(set(tokens_hamlet_lower_stopped))
print((len(set(tokens_mobydick_stemmed)) - vocab_stopped_mobydick) / vocab_stopped_mobydick)
print((len(set(tokens_hamlet_stemmed)) - vocab_stopped_hamlet) / vocab_stopped_hamlet)

-0.33468155098959174
-0.19316969050160085


In [55]:
# Share of all lowercased Moby Dick tokens that are exactly "shall"
# (stopwords are still included in the denominator at this stage).
shall_occurrences = tokens_mobydick_lower.count("shall")
shall_occurrences / len(tokens_mobydick_lower)

0.00037642925482692097

In [60]:
# Hamlet, lowercased tokens: the share of tokens that are "must" (a fraction)
# followed by the absolute number of "would" tokens (a count).
must_share = tokens_hamlet_lower.count("must") / len(tokens_hamlet_lower)
would_occurrences = tokens_hamlet_lower.count("would")
print(must_share)
print(would_occurrences)

0.0018695699989002528
68


In [63]:
# Count the Hamlet trigram occurrences that also appear somewhere in Moby Dick
# (repeated Hamlet trigrams are counted each time they occur).
# Membership is tested against a set: checking against the list form rescans
# all Moby Dick trigrams for every Hamlet trigram.
trigrams_mobydick_unique = set(trigrams_mobydick)
sum(t in trigrams_mobydick_unique for t in trigrams_hamlet)

9

**Resources**

**Different tokenizers in the NLTK package**
https://www.kite.com/python/docs/nltk.word_tokenize
https://towardsdatascience.com/benchmarking-python-nlp-tokenizers-3ac4735100c5

**Differences between Porter stemmer and Lancaster stemmer**
https://stackoverflow.com/questions/10554052/what-are-the-major-differences-and-benefits-of-porter-and-lancaster-stemming-alg

**About the Gutenberg Collection**
https://www.gutenberg.org/about/background/history_and_philosophy.html