# Quantifying Shakespeare

In this notebook we will find the frequencies with which Shakespeare's words used letters.

In [9]:
from io import TextIOWrapper

def read_file(filename: str) -> list[str]:
    """Given the name of a file, return a list of its lines."""
    lines: list[str] = []
    file: TextIOWrapper = open(filename, "r", encoding="utf8")
    # string "r" - just trying to read a file in
    for line in file:
        line = line.strip()
        # strip takes spaces and strips them away so theres only text
        line = line.lower()
        # lower converts uppercase letters to lowercase letters
        if line != "":
            lines.append(line)
        lines.append(line)
    return lines

shakespeare_lines: list[str] = read_file("./shakespeare.txt")
print(len(shakespeare_lines))



239296


The next step of our analysis is to count the letters in the lines list one-by-one to find the frequency of use of each letter.



In [13]:
# Goal: Define a function named `tally`
# Give the function (as params) a dictionary reference (key: str, value: int) and a key  
# if the key is in the dictionary, increase value by 1
# otherwise, set key's value to 1
# it is a procedure and return None

def tally(counts: dict[str, int], key: str) -> None:
    """Mutate counts to increment a key by 1 if it exists, or intialize to one."""
    if key in counts:
        counts[key] += 1
    else:
        counts[key] = 1
    
def count_letters(lines: list[str]) -> dict[str, int]:
    """Count frequencies of all letters in a list of strings."""
    counts: dict[str, int] = {}
    # loop through all lines
    #  for each line loop through all characters
    #    tally character into the counts dictionary
    #    challenge: tally only letters! no numbers of punctuation
    for line in lines:
        for chr in line:
            if chr.isalpha():
                tally(counts, chr)
    return counts

shakespeare_letters: dict[str, int] = count_letters(shakespeare_lines)
print(shakespeare_letters)

{'t': 659550, 'h': 473736, 'i': 507980, 's': 497978, 'e': 894408, 'x': 10588, 'f': 161032, 'l': 340038, 'p': 116928, 'r': 475728, 'n': 486524, 'd': 298924, 'b': 123912, 'y': 188740, 'o': 629200, 'j': 9558, 'c': 176370, 'g': 136398, 'u': 257894, 'a': 578300, 'w': 178780, 'm': 222904, 'k': 70816, 'v': 75138, 'z': 3262, 'q': 7164}


Let's create a visualization of this data with a bar chart.

In [17]:
items_in_dict: list[tuple[str,int]] = list(shakespeare_letters.items())
items_in_dict = sorted(items_in_dict)
# sorted can be used for list of int, str, floats, tuples
print(items_in_dict)

sorted_letters: dict[str, int] = dict(items_in_dict)
print(sorted_letters)

[('a', 578300), ('b', 123912), ('c', 176370), ('d', 298924), ('e', 894408), ('f', 161032), ('g', 136398), ('h', 473736), ('i', 507980), ('j', 9558), ('k', 70816), ('l', 340038), ('m', 222904), ('n', 486524), ('o', 629200), ('p', 116928), ('q', 7164), ('r', 475728), ('s', 497978), ('t', 659550), ('u', 257894), ('v', 75138), ('w', 178780), ('x', 10588), ('y', 188740), ('z', 3262)]
{'a': 578300, 'b': 123912, 'c': 176370, 'd': 298924, 'e': 894408, 'f': 161032, 'g': 136398, 'h': 473736, 'i': 507980, 'j': 9558, 'k': 70816, 'l': 340038, 'm': 222904, 'n': 486524, 'o': 629200, 'p': 116928, 'q': 7164, 'r': 475728, 's': 497978, 't': 659550, 'u': 257894, 'v': 75138, 'w': 178780, 'x': 10588, 'y': 188740, 'z': 3262}


In [14]:
from matplotlib import pyplot
# go to office hours to install matplot lib
pyplot.title("Frequencies of Letters")
pyplot.xlabel("Letters")
pyplot.ylabel("Frequencies")
labels: list[str] = list(sorted_letters.keys())
values: list[int] = list(sorted_letters.values())
pyplot.bar(labels, values)


ModuleNotFoundError: No module named 'matplotlib'