In [1]:
# Load the dataset: one name per line, with the trailing newlines stripped.
# The context manager closes the file handle deterministically; the original
# bare open(...).read() left it open until garbage collection.
with open('009-makemore_names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

In [2]:
# Peek at the first ten entries to sanity-check the load.
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [3]:
# Total number of entries read from the file.
len(words)

32033

In [4]:
# Length of the shortest entry.
min(map(len, words))

2

In [5]:
# Length of the longest entry.
max(map(len, words))

15

# Bigram
First, we will try to predict the next character of a word based only on the previous character.

In [11]:
# Print every adjacent character pair (bigram) for the first three names.
for w in words[:3]:
    # Wrap the name in user-defined start/end markers so the first and last
    # letters also take part in a pair: 'emma' -> ['<S>', 'e', 'm', 'm', 'a', '<E>'].
    tokens = ['<S>', *w, '<E>']
    for pair in zip(tokens, tokens[1:]):
        print(*pair)

<S> e
e m
m m
m a
a <E>
<S> o
o l
l i
i v
v i
i a
a <E>
<S> a
a v
v a
a <E>


In [26]:
# Bigram frequency: count how many times each (char, next_char) pair occurs
# across all names, including the '<S>' start and '<E>' end markers.
b = {}
for name in words:
    tokens = ['<S>', *name, '<E>']
    for pair in zip(tokens, tokens[1:]):
        b[pair] = b.get(pair, 0) + 1

In [None]:
# Rank bigrams from most to least frequent; pairs with equal counts keep
# their insertion order because the sort is stable.
sorted(b.items(), key=lambda kv: kv[1], reverse=True)

In [50]:
# Store the bigram counts in a 2-D torch tensor instead of a dict:
# rows index the first character of a pair, columns index the second.

import torch

# The tensor holds integers, so map every character to an integer index.
# Index 0 is reserved for '.', a single marker used for both the start and
# the end of a name (replacing the separate '<S>' / '<E>' markers).
chars = sorted(set(''.join(words)))  # every distinct character in the dataset, in sorted order
stoi = {s: i + 1 for i, s in enumerate(chars)}  # string -> int, starting at 1
stoi['.'] = 0
itos = {i: s for s, i in stoi.items()}  # int -> string
print('string to int', stoi)
print('int to string', itos)

# Size the count matrix from the vocabulary instead of hard-coding 27.
# The largest index handed out above is len(chars), so len(chars) + 1 rows and
# columns always suffice; a dataset with characters outside a-z would otherwise
# raise an out-of-bounds index error in the loop below.
vocab_size = len(chars) + 1
N = torch.zeros((vocab_size, vocab_size), dtype=torch.int32)

# Increment the cell of every bigram that occurs in the dataset.
for w in words:
    new_word = ['.'] + list(w) + ['.']  # wrap the name with the special marker
    for char1, char2 in zip(new_word, new_word[1:]):
        index_row = stoi[char1]  # first character of the pair selects the row
        index_col = stoi[char2]  # second character of the pair selects the column
        N[index_row, index_col] += 1
        

string to int {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '.': 0}
int to string {1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [None]:
# Lets visualise the character pair frequency or Bigram 
import matplotlib.pyplot as plt
%matplotlib inline

# this visual is like a heat map of the frequency of the pair of character... the darker shade if it occurs more
plt.figure(figsize=(16,16))
plt.imshow(N, cmap='Blues')
for i in range(27):
    for j in range(27):
        chstr = itos[i] + itos[j]
        plt.text(j, i, chstr, ha="center", va="bottom", color='gray')
        plt.text(j, i, N[i, j].item(), ha="center", va="top", color='gray')
plt.axis('off');

In [61]:
# Convert counts to probabilities: each row is divided by its own total, so a
# row becomes the distribution over the character that follows that row's character.
counts = N.float()
# sum(1, keepdim=True) adds up each row and keeps a trailing dimension of size 1,
# so the row totals broadcast across the columns in the division.
probability = counts / counts.sum(1, keepdim=True)

In [86]:
# A fixed seed makes the sampled names identical on every run.
g = torch.Generator().manual_seed(2147483647)

# Sample 5 names one character at a time; each character is drawn from the
# probability row of the character that came just before it.
for sample_no in range(5):
    letters = []
    row = 0  # start from row 0, the '.' marker
    while True:
        # Draw the next character index from the current character's row.
        row = torch.multinomial(probability[row], num_samples=1, replacement=True, generator=g).item()
        letters.append(itos[row])

        # Index 0 is '.', which also marks the end of a name, so stop here.
        if row == 0:
            break
    print(''.join(letters))

# cexze.
# momasurailezitynn.
# konimittain.
# llayn.
# ka.

# These samples often don't look like names because the model only looks at the single previous character when predicting the next one; it ignores everything that came earlier. A bigram model is very simple for exactly that reason.

cexze.
momasurailezitynn.
konimittain.
llayn.
ka.
