In [129]:
import matplotlib.pyplot as plt
import numpy as np
import numba
import pandas as pd
from scipy import stats
import gzip
import urllib.request
from string import punctuation
from collections import Counter


In [62]:
def analyze_word_counts(word_counts):
    df = pd.DataFrame(word_counts, columns=["word", "count"])

    total_words = df["count"].sum()
    unique_words = len(word_counts)
    print("Total words: %d"%total_words)
    print("Total words: %d"%unique_words)
    print("$\\rho_{est}$: %.3f"%(unique_words/total_words))

    for i in range(1,3+1):
        count = df[df["count"] == i].shape
        print("There were %5d words used %d times each"%(count[0], i))

In [70]:
ulysses_url = "http://www.uvm.edu/pdodds/teaching/courses/2020-08UVM-300/docs/ulysses.txt"
def generate_word_counts_ulysses():
    print("Running for Ulysses...")
    with urllib.request.urlopen(ulysses_url) as f:
        word_counts =  [tuple(l.split()) for l in f.read().decode("utf-8").split("\n") if l != ""]
    word_counts = [(a[:-1], int(b)) for (a,b) in word_counts]
    return word_counts

In [71]:
analyze_word_counts(generate_word_counts_ulysses())

Running for Ulysses
Total words: 264706
Total words: 31398
$\rho_{est}$: 0.119
There were 17738 words used 1 times each
There were  4887 words used 2 times each
There were  2241 words used 3 times each


In [136]:
pride_and_prejudice_url = "https://www.gutenberg.org/files/1342/1342-0.txt"
def generate_word_counts_pride_prejudice():
    print("Running for Pride and Prejudice...")
    with urllib.request.urlopen(pride_and_prejudice_url) as f:
        raw = f.read().decode("utf-8")
    
    exclude = set(punctuation) 

    raw_trimmed = raw.split("\r\n")[35:-368] # remove boilerplate content
    raw_joined = " ".join(data_trimmed)
    list_letters_noPunct =  [ char for char in raw_joined if char not in exclude ]
    text_noPunct = "".join(list_letters_noPunct)
    list_words = text_noPunct.strip().split()
    words_counter = Counter(list_words)
    
    return(list(words_counter.items()))

In [137]:
analyze_word_counts(generate_word_counts_pride_prejudice())

Running for Pride and Prejudice
Total words: 121662
Total words: 8206
$\rho_{est}$: 0.067
There were  3668 words used 1 times each
There were  1218 words used 2 times each
There were   689 words used 3 times each


In [189]:
le_comte_urls = ["https://www.gutenberg.org/ebooks/17989.txt.utf-8", # volume 1 - 4
                "https://www.gutenberg.org/ebooks/17990.txt.utf-8",
                "https://www.gutenberg.org/ebooks/17991.txt.utf-8",
                "https://www.gutenberg.org/ebooks/17992.txt.utf-8"]

def generate_word_counts_le_comte():
    print("Running for Le comte...")
    raws = [urllib.request.urlopen(url).read().decode("utf-8") for url in le_comte_urls]
    raws_trimmed = [raw.split("\r\n")[25: -368] for raw in raws] # remove boilerplate code
    raws_joined = []
    for raw in raws_trimmed:
        raws_joined += raw
    list_letters_noPunct =  [ char for char in raws_joined if char not in exclude ]
    text_noPunct = "".join(list_letters_noPunct)
    list_words = text_noPunct.strip().split()
    words_counter = Counter(list_words)
    
    return(list(words_counter.items()))

In [190]:
analyze_word_counts(generate_word_counts_le_comte())

Running for Le comte...
Total words: 420611
Total words: 76625
$\rho_{est}$: 0.182
There were 57981 words used 1 times each
There were  7287 words used 2 times each
There were  3113 words used 3 times each
