# Probabilities Application: Letter Frequencies
**FIZ371 - Scientific & Technical Calculations | 11/10/2023**

Emre S. Tasci <emre.tasci@hacettepe.edu.tr>

Using L. Frank Baum's "The Wonderful Wizard of Oz" book, calculate the frequencies of the letters & bigrams.

(The book, written in 1900, is now in public domain and available from [Project Gutenberg](https://www.gutenberg.org/ebooks/55))

In [1]:
import numpy as np

We first read the text into the `data` variable and define the set of letters we are interested in:

In [2]:
# Path to the plain-text copy of the book
fname = "supp/wizardofoz_1990_publicdomain_guthenberg.txt"

# Characters we keep track of: the lowercase letters a-z plus the space
alphabet_str = "abcdefghijklmnopqrstuvwxyz "

# Split the string into a list of single-character strings
alphabet = list(alphabet_str)

In [3]:
# Read the whole book into memory
with open(fname, 'r') as book_file:
    raw_text = book_file.read()

# Turn line breaks into spaces, then fold everything to lowercase
data = raw_text.replace('\n', ' ').lower()

# Letters

Count and store the frequencies into the `count_letter` dictionary:

In [4]:
# Number of occurrences of every alphabet character in the text
count_letter = {symbol: data.count(symbol) for symbol in alphabet}
count_letter

{'a': 13887,
 'b': 2448,
 'c': 4161,
 'd': 8888,
 'e': 22261,
 'f': 3588,
 'g': 3587,
 'h': 11977,
 'i': 10350,
 'j': 240,
 'k': 1920,
 'l': 7256,
 'm': 3830,
 'n': 10752,
 'o': 14240,
 'p': 2466,
 'q': 144,
 'r': 10534,
 's': 9689,
 't': 16378,
 'u': 4600,
 'v': 1290,
 'w': 5229,
 'x': 209,
 'y': 4128,
 'z': 277,
 ' ': 44331}

To calculate the probabilities, we divide each frequency by the total sum:

In [5]:
# Add up the counts of all the characters stored in count_letter
letter_counts = list(count_letter.values())
tot_count = np.sum(letter_counts)
print(tot_count)

218660


In [6]:
# Relative frequency of each character: its count divided by the total
probs_letter = {symbol: hits / tot_count for symbol, hits in count_letter.items()}
probs_letter

{'a': 0.06350955821823837,
 'b': 0.0111954632763194,
 'c': 0.019029543583645843,
 'd': 0.04064758071892436,
 'e': 0.1018064575139486,
 'f': 0.016409036860879904,
 'g': 0.01640446355071801,
 'h': 0.05477453580901857,
 'i': 0.04733376017561511,
 'j': 0.0010975944388548432,
 'k': 0.008780755510838746,
 'l': 0.033183938534711424,
 'm': 0.01751577792005854,
 'n': 0.04917223086069697,
 'o': 0.06512393670538735,
 'p': 0.011277782859233513,
 'q': 0.0006585566633129059,
 'r': 0.048175249245403826,
 's': 0.0443108021586024,
 't': 0.07490167383151926,
 'u': 0.021037226744717828,
 'v': 0.005899570108844782,
 'w': 0.023913838836549895,
 'x': 0.0009558218238360926,
 'y': 0.018878624348303303,
 'z': 0.0012668069148449649,
 ' ': 0.20273941278697521}

# Bigrams
We are going to generate all possible bigrams and count them, storing in `count_bigram`:

In [7]:
# Count every ordered pair of alphabet characters.
# Note: str.count() counts non-overlapping occurrences of the bigram.
count_bigram = {
    first + second: data.count(first + second)
    for first in alphabet
    for second in alphabet
}

# Grand total over all the counted bigrams
total_sum_bigram = sum(count_bigram.values())

In [8]:
# Display the full bigram -> count dictionary
count_bigram

{'aa': 0,
 'ab': 124,
 'ac': 350,
 'ad': 789,
 'ae': 1,
 'af': 143,
 'ag': 264,
 'ah': 12,
 'ai': 790,
 'aj': 2,
 'ak': 156,
 'al': 999,
 'am': 310,
 'an': 3170,
 'ao': 0,
 'ap': 218,
 'aq': 0,
 'ar': 1552,
 'as': 1506,
 'at': 1442,
 'au': 147,
 'av': 343,
 'aw': 223,
 'ax': 36,
 'ay': 377,
 'az': 12,
 'a ': 897,
 'ba': 254,
 'bb': 9,
 'bc': 0,
 'bd': 0,
 'be': 857,
 'bf': 0,
 'bg': 0,
 'bh': 0,
 'bi': 120,
 'bj': 2,
 'bk': 0,
 'bl': 191,
 'bm': 0,
 'bn': 0,
 'bo': 207,
 'bp': 0,
 'bq': 0,
 'br': 243,
 'bs': 7,
 'bt': 6,
 'bu': 393,
 'bv': 0,
 'bw': 0,
 'bx': 0,
 'by': 147,
 'bz': 0,
 'b ': 7,
 'ca': 798,
 'cb': 0,
 'cc': 26,
 'cd': 0,
 'ce': 470,
 'cf': 0,
 'cg': 0,
 'ch': 699,
 'ci': 150,
 'cj': 0,
 'ck': 377,
 'cl': 189,
 'cm': 0,
 'cn': 0,
 'co': 692,
 'cp': 0,
 'cq': 0,
 'cr': 394,
 'cs': 0,
 'ct': 232,
 'cu': 67,
 'cv': 0,
 'cw': 0,
 'cx': 0,
 'cy': 21,
 'cz': 0,
 'c ': 41,
 'da': 199,
 'db': 0,
 'dc': 6,
 'dd': 73,
 'de': 715,
 'df': 12,
 'dg': 19,
 'dh': 1,
 'di': 358,
 'dj': 0

# Exercise

Calculate the probabilities:

$$p("b") = ?$$

$$p("an") = ?$$

In [9]:
# Marginal probability of the single letter "b"
p_b = probs_letter["b"]
p_b

0.0111954632763194

In [10]:
# Probability of the bigram "an": its count over the total of all counted bigrams
p_an = count_bigram["an"] / total_sum_bigram
p_an

0.015006982744336875

# Exercise

Calculate the probabilities:

$$p(x="a" | y ="n") =?$$

$$p(y="n"|x="a") =?$$

## $p(x="a"|y="n")$

To calculate the first probability, we need to define a subset that contains all the bigrams whose second letter is equal to "n":

In [11]:
# Keep only the bigrams that end with "n", one for every possible first character
subset_x_yn = {first + "n": count_bigram[first + "n"] for first in alphabet}
subset_x_yn

{'an': 3170,
 'bn': 0,
 'cn': 0,
 'dn': 16,
 'en': 1363,
 'fn': 0,
 'gn': 4,
 'hn': 0,
 'in': 2595,
 'jn': 0,
 'kn': 110,
 'ln': 1,
 'mn': 6,
 'nn': 85,
 'on': 1549,
 'pn': 0,
 'qn': 0,
 'rn': 200,
 'sn': 25,
 'tn': 4,
 'un': 615,
 'vn': 0,
 'wn': 163,
 'xn': 0,
 'yn': 1,
 'zn': 0,
 ' n': 808}

The total of the subset, `total_subset_x_yn`, is:

In [12]:
# Counts of all the bigrams ending with "n" (dictionary values only)
subset_x_yn.values()

dict_values([3170, 0, 0, 16, 1363, 0, 4, 0, 2595, 0, 110, 1, 6, 85, 1549, 0, 0, 200, 25, 4, 615, 0, 163, 0, 1, 0, 808])

In [13]:
# Gather the counts in a list and wrap it in a numpy array,
# so that the total can be evaluated directly with the 'sum()' method:
counts_ending_n = [subset_x_yn[key] for key in subset_x_yn]
array_1 = np.array(counts_ending_n)
array_1

array([3170,    0,    0,   16, 1363,    0,    4,    0, 2595,    0,  110,
          1,    6,   85, 1549,    0,    0,  200,   25,    4,  615,    0,
        163,    0,    1,    0,  808])

In [14]:
# Total count of the bigrams whose second character is "n"
total_subset_x_yn = array_1.sum()
total_subset_x_yn

10715

Thus, $p(x="a"|y="n")$ probability is:

$$p(x="a"|y="n") = \frac{3170}{10715}\approx0.296$$

In [15]:
# p(x="a" | y="n"): share of "an" among all the bigrams ending with "n"
prob_xa_yn = subset_x_yn["an"] / total_subset_x_yn
prob_xa_yn

0.2958469435370975

## $p(y="n"|x="a")$

For the second probability, we construct a new subset that contains the bigrams whose first letter is "a":

In [16]:
# Keep only the bigrams that start with "a", one for every possible second character
subset_y_xa = {"a" + second: count_bigram["a" + second] for second in alphabet}
subset_y_xa

{'aa': 0,
 'ab': 124,
 'ac': 350,
 'ad': 789,
 'ae': 1,
 'af': 143,
 'ag': 264,
 'ah': 12,
 'ai': 790,
 'aj': 2,
 'ak': 156,
 'al': 999,
 'am': 310,
 'an': 3170,
 'ao': 0,
 'ap': 218,
 'aq': 0,
 'ar': 1552,
 'as': 1506,
 'at': 1442,
 'au': 147,
 'av': 343,
 'aw': 223,
 'ax': 36,
 'ay': 377,
 'az': 12,
 'a ': 897}

The rest is similar to the first one:

In [17]:
# Same conversion as before: counts of the bigrams starting with "a" as a numpy array
counts_starting_a = [subset_y_xa[key] for key in subset_y_xa]
array_2 = np.array(counts_starting_a)
array_2

array([   0,  124,  350,  789,    1,  143,  264,   12,  790,    2,  156,
        999,  310, 3170,    0,  218,    0, 1552, 1506, 1442,  147,  343,
        223,   36,  377,   12,  897])

In [18]:
# Total count of the bigrams whose first character is "a"
total_subset_y_xa = array_2.sum()
total_subset_y_xa

13863

$$p(y="n"|x="a") = \frac{3170}{13863}\approx0.229$$

In [19]:
# p(y="n" | x="a"): share of "an" among all the bigrams starting with "a"
prob_yn_xa = subset_y_xa["an"] / total_subset_y_xa
prob_yn_xa

0.22866623385991489

# Exercise

Calculate the probability:

$$\frac{p(y="n"|x="a")p("a")}{p("n")}$$

For this one, we have all the factors:

In [20]:
# Right-hand side of Bayes' theorem: p(y="n"|x="a") * p("a") / p("n")
prob_yn_xa * probs_letter["a"] / probs_letter["n"]

0.29533928474819926

Let's check if this is equal to $p(x="a"|y="n")$ as Bayes Theorem dictates:

In [21]:
# Directly computed p(x="a" | y="n"), for comparison
prob_xa_yn

0.2958469435370975

The two values are close but not equal. I hope that you are able to figure out the reason for this difference. If not, please ponder on it a while before proceeding! 8)
```

















```
<center><i><b>Wait while the reader ponders!<br>(Spoilers ahead!)</b></i></center>

```















```

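The reason is the characters that are unaccounted for: a letter adjacent to a character outside our alphabet (e.g., the "d" in "d.") is counted as a single letter, but does not appear in any of the counted bigrams. Consider the following paragraph: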

> "My darling child!" she cried, folding the little girl in her arms and covering her face with kisses. "Where in the world did you come from?"

If we were to calculate the marginal probability of $p("n")$, for example, we would first count all the occurrences of the letter "n" and then divide by the number of all the characters included in the text:

In [22]:
# Sample paragraph, folded to lowercase right away
text = '''"My darling child!" she cried, folding the little girl in her arms and
covering her face with kisses. "Where in the world did you come from?"'''.lower()

print(text)

"my darling child!" she cried, folding the little girl in her arms and
covering her face with kisses. "where in the world did you come from?"


In [23]:
# Number of times the letter "n" appears in the sample paragraph
count_n = text.count("n")
count_n

6

In [24]:
# Length of the paragraph: every character is included (punctuation, line break, ...)
count_all = len(text)
count_all

141

In [25]:
# Marginal probability of "n" with respect to ALL the characters in the paragraph
p_n = count_n / count_all
p_n

0.0425531914893617

However, this marginal probability includes the uncounted characters such as {"'",""",",",".","!"}, which weren't accounted for when we were calculating the bigram probabilities, thus throwing off our calculations. The same goes for the bigram probabilities, hence the inconsistency.

To remedy this issue, we should have excluded all the characters except the ones we had in our `alphabet` before we had started. This filtering can be done via the regular expression module _re_'s `sub()` method:

In [26]:
import re

In [27]:
# Anything that is not a lowercase letter or a space gets replaced by a space
pattern = '[^a-z ]'
text_filtered = re.sub(pattern, ' ', text)
print(text_filtered)

 my darling child   she cried  folding the little girl in her arms and covering her face with kisses   where in the world did you come from  


_Although regular expressions are a whole topic by themselves, to explain briefly: we first define a range inside the square brackets to include any letter from a to z ([a-z]), and then also add the space character to this range ([a-z ]). But since these are the characters we want to keep, we negate the set by putting the negation sign ("^") at its start, to mean "every character that is not in this range" ([^a-z ]). The second parameter is the replacement: by passing " ", we are saying "replace every matching character with a space". The third parameter is the text we want to operate on._

Thus, we end up with a text that only contains the characters we are taking into account.

Below, we'll repeat the same procedures we did above, only this time we'll filter our text to only include the letters from a to z and the space character:

In [28]:
# --- Load the book again and keep only the characters of our alphabet ---
with open(fname, 'r') as book_file:
    data = book_file.read().replace('\n', ' ')

# Lowercase first, then blank out every character that is not a-z or a space
data = re.sub('[^a-z ]',' ',data.lower())

# --- Single letters: counts, total and probabilities ---
count_letter = {symbol: data.count(symbol) for symbol in alphabet}
tot_count = np.sum(list(count_letter.values()))
probs_letter = {symbol: hits / tot_count for symbol, hits in count_letter.items()}

# --- Bigrams: counts and total ---
# Note: str.count() counts non-overlapping occurrences of the bigram.
count_bigram = {
    first + second: data.count(first + second)
    for first in alphabet
    for second in alphabet
}
total_sum_bigram = sum(count_bigram.values())

# --- p("b") and p("an") ---
p_b = probs_letter["b"]
p_an = count_bigram["an"] / total_sum_bigram

# --- p(x="a" | y="n"): restrict to the bigrams ending with "n" ---
subset_x_yn = {first + "n": count_bigram[first + "n"] for first in alphabet}
array_1 = np.array(list(subset_x_yn.values()))
total_subset_x_yn = array_1.sum()
prob_xa_yn = subset_x_yn["an"] / total_subset_x_yn

# --- p(y="n" | x="a"): restrict to the bigrams starting with "a" ---
subset_y_xa = {"a" + second: count_bigram["a" + second] for second in alphabet}
array_2 = np.array(list(subset_y_xa.values()))
total_subset_y_xa = array_2.sum()
prob_yn_xa = subset_y_xa["an"] / total_subset_y_xa

# --- Right-hand side of Bayes' theorem, to be compared with prob_xa_yn ---
prob_yn_xa * probs_letter["a"] / probs_letter["n"]

0.29482886904761907

In [29]:
# Directly computed p(x="a" | y="n") on the filtered text, for comparison
prob_xa_yn

0.29482886904761907

Thus, we have saved the Bayes Theorem! 8)

$$p(a|b) = \frac{p(b|a)p(a)}{p(b)}$$

# Visualizing the frequencies

Using the pandas module, we can also "visualize" the frequencies:

In [30]:
import pandas as pd

In [31]:
# One-column table of letter counts, one row per alphabet character
df_letter = pd.DataFrame({"count": list(count_letter.values())}, index=alphabet)

# Shade each cell according to its value, using the "binary" colormap
# Colormap reference: https://matplotlib.org/stable/tutorials/colors/colormaps.html
# Styling approach:   https://stackoverflow.com/a/50605020
df_letter.style.background_gradient(cmap="binary")

Unnamed: 0,count
a,13887
b,2448
c,4161
d,8888
e,22261
f,3588
g,3587
h,11977
i,10350
j,240


In [32]:
# Build one record per ordered pair of alphabet characters and create the
# DataFrame in a single call; growing a DataFrame one row at a time
# re-allocates it on every insertion.
# Note: str.count() counts non-overlapping occurrences of the bigram.
records = [
    {
        "l1": letter1,
        "l2": letter2,
        "word": letter1 + letter2,
        "count": data.count(letter1 + letter2),
    }
    for letter1 in alphabet
    for letter2 in alphabet
]

# Index labels run from len(records)-1 down to 0, i.e. the first bigram
# carries the highest label (as in the table displayed below).
df = pd.DataFrame(records,
                  columns=["l1", "l2", "word", "count"],
                  index=range(len(records) - 1, -1, -1))

In [33]:
# Display the bigram table (first letter, second letter, bigram, count)
df

Unnamed: 0,l1,l2,word,count
728,a,a,aa,0
727,a,b,ab,124
726,a,c,ac,350
725,a,d,ad,789
724,a,e,ae,1
...,...,...,...,...
4,,w,w,3634
3,,x,x,30
2,,y,y,756
1,,z,z,10


In [34]:
len_alpha = len(alphabet)

# Map every bigram to its count once, instead of scanning the whole
# DataFrame with a boolean mask for each cell of the matrix.
count_by_word = dict(zip(df["word"], df["count"]))

# mat[i, j] holds the count of the bigram alphabet[i] + alphabet[j]
mat = np.zeros((len_alpha, len_alpha), int)
for i in range(len_alpha):
    for j in range(len_alpha):
        # Store a plain scalar: assigning a one-element Series to an array
        # element relies on an implicit size-1 -> scalar conversion that
        # newer NumPy/pandas releases deprecate.
        mat[i, j] = count_by_word[alphabet[i] + alphabet[j]]

# Rows: first character of the bigram, columns: second character
df_bi = pd.DataFrame(mat, index=alphabet, columns=alphabet)

import matplotlib.pyplot as plt
from matplotlib import colors

def background_gradient(s, m, M, cmap='PuBu', low=0, high=0):
    """Return CSS background-color declarations for the values of ``s``.

    Parameters
    ----------
    s : object exposing a ``.values`` array
        The values to colorize (presumably a pandas Series, as handed over
        by ``Styler.apply`` -- confirm against the caller).
    m, M : number
        Lower and upper ends of the data range used for the normalization.
    cmap : str
        Name of the Matplotlib colormap.
    low, high : number
        Fractions of the range ``M - m`` by which the normalization interval
        is extended below ``m`` and above ``M``, respectively.

    Returns
    -------
    list of str
        One ``'background-color: <hex color>'`` entry per value of ``s``.
    """
    rng = M - m
    norm = colors.Normalize(m - (rng * low),
                            M + (rng * high))
    normed = norm(s.values)
    # plt.get_cmap() is used instead of plt.cm.get_cmap(): the latter was
    # deprecated in Matplotlib 3.7 and removed in 3.9.
    colormap = plt.get_cmap(cmap)
    return ['background-color: %s' % colors.rgb2hex(rgba)
            for rgba in colormap(normed)]


# Use the extremes of the whole table so that all the cells share one color scale
bigram_min = df_bi.min().min()
bigram_max = df_bi.max().max()

df_bi.style.apply(background_gradient,
                  cmap="binary",
                  m=bigram_min,
                  M=bigram_max,
                  low=0,
                  high=0.85)

# https://stackoverflow.com/a/42563850

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,Unnamed: 27
a,0,124,350,789,1,143,264,12,790,2,156,999,310,3170,0,218,0,1552,1506,1442,147,343,223,36,377,12,921
b,254,9,0,0,857,0,0,0,120,2,0,191,0,0,207,0,0,243,7,6,393,0,0,0,147,0,12
c,798,0,26,0,470,0,0,699,150,0,377,189,0,0,692,0,0,394,0,232,67,0,0,0,21,0,46
d,199,0,6,73,715,12,19,1,358,0,1,130,189,16,781,0,0,94,170,0,29,7,5,0,57,0,6026
e,1165,28,576,2124,806,184,88,27,175,2,17,660,512,1363,68,245,14,2876,1090,472,5,279,147,105,568,1,8664
f,192,0,0,0,246,97,0,0,234,0,0,113,0,0,749,0,0,298,1,157,214,0,1,0,2,0,1284
g,222,0,0,2,410,0,35,444,198,0,0,84,0,4,288,0,0,485,75,3,135,0,0,0,5,0,1197
h,1515,0,0,0,6611,2,0,0,1157,0,34,5,0,0,688,0,0,142,21,299,69,0,2,0,416,0,1016
i,59,89,423,677,424,266,309,0,22,0,71,624,386,2595,393,42,0,440,1048,1504,5,207,0,7,0,68,681
j,8,0,0,0,97,0,0,0,0,0,0,0,0,0,75,0,0,0,0,0,60,0,0,0,0,0,0
