In [None]:
# Load the dataset: names.txt holds one name per line.
# The context manager closes the file handle as soon as the read is done,
# instead of leaving it open for the lifetime of the kernel.
with open('names.txt', 'r') as f:
    words = f.read().splitlines() # one list entry per name, newlines stripped
words[:10] # first 10 names

In [None]:
len(words) # total number of names in the list (one entry per line of names.txt)

In [None]:
min(map(len, words)) # shortest name length, in characters

In [None]:
max(map(len, words)) # longest name length, in characters

In [None]:
# Tally every adjacent character pair (bigram) across all names.
# Each name is wrapped in the sentinels '<S>' and '<E>' so that
# "starts a name" and "ends a name" are counted as bigrams too.
b = {}
for w in words:
    chs = ['<S>', *w, '<E>']
    # zipping the sequence with itself shifted by one yields (current, next) pairs
    for bigram in zip(chs, chs[1:]):
        b[bigram] = 1 + b.get(bigram, 0) # unseen bigrams start from 0

In [None]:
sorted(b.items(), key=lambda kv: kv[1], reverse=True) # most frequent bigrams first (kv[1] is the count)

In [None]:
# We will now store these counts in a 2D array: the row index is the first character of the bigram and the column index is the second character.
# Each entry in the array tells us how often the second character follows the first character in the dataset.

In [None]:
import torch
# Bigram count matrix: 27 x 27 integer zeros, to be filled with counts indexed [first char, second char].
N = torch.zeros(27, 27, dtype=torch.int32)

In [None]:
# Vocabulary: every distinct character that occurs in the names, in sorted order.
chars = sorted(set(''.join(words)))
# Character -> index. Numbering starts at 1 because index 0 is reserved for '.'.
stoi = {s: i for i, s in enumerate(chars, start=1)}
stoi['.'] = 0 # '.' is the single start/end-of-name token
# Index -> character, the inverse lookup of stoi.
itos = {i: s for s, i in stoi.items()}

In [None]:

# Fill the count matrix: N[ix1, ix2] is how many times character ix2 directly
# follows character ix1, with '.' (index 0) padding both ends of every name.
# N is accumulated in place, so clear it first; otherwise re-running this cell
# would add every bigram a second time and silently double all counts.
N.zero_()
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1] # row: first character of the bigram
        ix2 = stoi[ch2] # column: second character of the bigram
        N[ix1, ix2] += 1

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Draw the 27x27 bigram count matrix as a heatmap and annotate every cell with
# its two-character bigram (upper text) and its count (lower text).
plt.figure(figsize=(16,16))
plt.imshow(N, cmap='Blues')
for i in range(27):
    for j in range(27):
        chstr = itos[i] + itos[j]
        # imshow puts columns on the x axis and rows on the y axis, hence (j, i)
        plt.text(j, i, chstr, ha="center", va="bottom", color='black')
        plt.text(j, i, N[i, j].item(), ha="center", va="top", color='black')
# Hiding the axes is a per-figure setting, so do it once after the loops rather
# than once per cell; the trailing ';' keeps the call's return value out of the output.
plt.axis('off');

In [None]:
N[0] # first row: index 0 is '.', so these are the counts of each character appearing at the start of a name

In [None]:
# Turn the start-of-name counts into a probability distribution.
p = N[0].to(torch.float32) # integer counts -> floats so they can be divided
p = p.div(p.sum()) # p[j] is the probability that character j is the first character of a name
p

In [None]:
# Use a dedicated, explicitly seeded generator so the draw below is reproducible.
g = torch.Generator()
g.manual_seed(2147483647)
# torch.multinomial draws integer indices from the distribution p: index j is returned with probability p[j].
# replacement=True means a drawn index can be drawn again (the default is False).
ix = torch.multinomial(p, 1, replacement=True, generator=g).item()
itos[ix]

In [None]:
# Toy example: build a small random probability distribution from a seeded generator.
g = torch.Generator()
g.manual_seed(2147483647)
p = torch.rand(3, generator=g) # 3 random numbers in [0, 1), with g as the source of randomness
p /= p.sum() # normalize so the three entries sum to 1 and form a valid distribution
p


In [None]:
# torch.multinomial draws samples from the multinomial distribution described by p:
# it returns integer indices, each index j being drawn with probability p[j].
# Here we draw 200 indices with replacement, using g as the source of randomness.
torch.multinomial(p, num_samples=200, replacement=True, generator=g)

In [None]:
p.shape # when the cells are run in order, p is the 3-element toy distribution built with torch.rand(3, ...)

In [None]:
# Row-normalize the count matrix so that each row P[i] is the probability
# distribution over the character that follows character i.
P = N.to(torch.float32)
# keepdim=True keeps the row sums as a column, so the division broadcasts across each row
P = torch.div(P, P.sum(dim=1, keepdim=True))
P.shape

In [None]:
# Sum along dimension 1 (across the columns of each row). keepdim=True keeps the reduced
# dimension with size 1, so for the 27x27 matrix this is a (27, 1) column of row sums.
P.sum(1, keepdim=True).shape

In [None]:
# Re-seed the generator so the generated names are reproducible.
g = torch.Generator()
g.manual_seed(2147483647)

# Sample 20 names from the bigram model, one character at a time.
for i in range(20):
    ix = 0 # index 0 is '.', the start token
    out = []
    done = False
    while not done:
        # P[ix] is the precomputed distribution over the next character.
        # It is the same as normalizing N[ix].float() by its sum on the fly;
        # substituting torch.ones(27) / 27.0 would give a uniform baseline instead.
        p = P[ix]
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])
        done = ix == 0 # drawing '.' again marks the end of the name
    print(''.join(out))
