In [51]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from string import punctuation


In [18]:
# Read the whole text and skip the first 50 lines (presumably front matter
# before the story itself, given the name `book_only` -- confirm against the file).
# The `with` block guarantees the file handle is closed once the text is read.
with open('gatsby.txt', 'r') as gatsby:
    book_only = gatsby.read().splitlines()[50:]

# remove empty lines and chapter headings
# NOTE: this drops *any* line containing the substring 'Chapter', not only headings.
book_only = [s for s in book_only if s and 'Chapter' not in s]

In [59]:
def strip_punctuation(s):
    """Return `s` with every character listed in `string.punctuation` removed."""
    kept_chars = [ch for ch in s if ch not in punctuation]
    return ''.join(kept_chars)

# lowercase every line first, then strip its punctuation
process1 = [strip_punctuation(line.lower()) for line in book_only]

In [78]:
# finally, split the book into words: one list of tokens per line
gatsby_processed = [line.split(' ') for line in process1]

# flatten the per-line token lists into a single word list, discarding the
# empty strings that split(' ') produces for consecutive spaces
flat_gatsby = [item for sublist in gatsby_processed for item in sublist if item]

In [79]:
# corpus size (all tokens) versus vocabulary size (distinct tokens)
total_words = len(flat_gatsby)
unique_words = len(set(flat_gatsby))
print("Total words: {}. Total Unique words: {}".format(total_words, unique_words))

Total words: 49203. Total Unique words: 6436


In [80]:
# word -> number of occurrences in the book
# (per the totals printed above, 49203 tokens over 6436 distinct words is
# roughly 7.6 occurrences per word on average)
dict_gatsby = {}
for word in flat_gatsby:
    dict_gatsby[word] = dict_gatsby.get(word, 0) + 1

In [118]:
# the 10 most frequent words, most common first
sorted(dict_gatsby, key=lambda w: dict_gatsby[w], reverse=True)[:10]

['the', 'and', 'a', 'i', 'to', 'of', 'in', 'he', 'was', 'that']

In [86]:
# Simple maximum likelihood estimates using bigrams and the text of the great gatsby
def bigram_predict(prev_word, corpus=None):
    """Estimate P(next word | prev_word) by maximum likelihood from bigram counts.

    Parameters
    ----------
    prev_word : str
        The conditioning word.
    corpus : sequence of str, optional
        Token sequence to count bigrams in. Defaults to the module-level
        `flat_gatsby` word list.

    Returns
    -------
    dict
        Maps each word observed immediately after `prev_word` to its relative
        frequency among those followers. Empty if `prev_word` never appears
        with a following word.
    """
    words = flat_gatsby if corpus is None else corpus

    # Walk adjacent (word, follower) pairs. The final token has no follower and
    # is therefore never indexed past the end of the list (the previous
    # `flat_gatsby[i+1]` lookup raised IndexError when `prev_word` was the
    # last word of the corpus).
    next_counts = {}
    for current, following in zip(words, words[1:]):
        if current == prev_word:
            next_counts[following] = next_counts.get(following, 0) + 1

    # normalize the counts into probabilities
    total = sum(next_counts.values())
    return {word: n / total for word, n in next_counts.items()}

In [93]:
# estimated distribution over the word that follows 'might'
bigram_predict('might')

{'be': 0.14285714285714285,
 'catch': 0.03571428571428571,
 'fool': 0.03571428571428571,
 'have': 0.42857142857142855,
 'once': 0.03571428571428571,
 'pick': 0.03571428571428571,
 'remain': 0.03571428571428571,
 'slip': 0.03571428571428571,
 'sober': 0.03571428571428571,
 'soon': 0.03571428571428571,
 'sue': 0.03571428571428571,
 'think': 0.07142857142857142,
 'want': 0.03571428571428571}

In [100]:
# Every observed follower of 'remain' is 'in', so the estimate puts all of its
# probability on a single word (presumably 'remain' is rare in the text, which
# would make this a sparse-data artifact -- confirm the occurrence count).
bigram_predict('remain')

{'in': 1.0}

In [103]:
def trigram_predict(word1, word2, corpus=None):
    """Estimate P(next word | word1 word2) by maximum likelihood from trigram counts.

    Parameters
    ----------
    word1, word2 : str
        The two consecutive conditioning words, in order.
    corpus : sequence of str, optional
        Token sequence to count trigrams in. Defaults to the module-level
        `flat_gatsby` word list.

    Returns
    -------
    dict
        Maps each word observed immediately after the pair (`word1`, `word2`)
        to its relative frequency among those followers. Empty if the pair
        never appears with a following word.
    """
    words = flat_gatsby if corpus is None else corpus

    # Slide a window of three consecutive tokens over the corpus. Windows that
    # would run past the end are never formed, so a match on the last one or
    # two tokens no longer raises IndexError (the previous `flat_gatsby[i+1]`
    # and `flat_gatsby[i+2]` lookups did).
    next_counts = {}
    for first, second, following in zip(words, words[1:], words[2:]):
        if first == word1 and second == word2:
            next_counts[following] = next_counts.get(following, 0) + 1

    # normalize the counts into probabilities
    total = sum(next_counts.values())
    return {word: n / total for word, n in next_counts.items()}

In [116]:
# estimated distribution over the word that follows the pair "a small"
trigram_predict('a', 'small')

{'bedroom': 0.05,
 'block': 0.05,
 'bow': 0.05,
 'circle': 0.05,
 'diningroom': 0.05,
 'expensive': 0.05,
 'eyesore': 0.05,
 'flask': 0.05,
 'flatnosed': 0.05,
 'foul': 0.05,
 'gust': 0.05,
 'hotel': 0.05,
 'investigation': 0.1,
 'picture': 0.05,
 'producer': 0.05,
 'rectangle': 0.05,
 'town': 0.15}

In [120]:
# NOTE(review): hand-entered sum of four products of four factors each; the
# meaning of the individual factors is not documented in this notebook
# (presumably probabilities from a separate exercise -- confirm and label them).
(0.5 * 0.5 * 0.1 * 0.2) + (0.5 * 0.5 * 0.1 * 0.3) + (0.5 * 0.3 * 0.6 * 0.2 ) + (0.5 * 0.7 * 0.6 * 0.3)

0.0935