In [6]:
# Load the dataset: one name per line. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open.
with open('names.txt', 'r') as f:
    names = f.read().splitlines()

In [7]:
# Implementing a bigram (2-gram) model: only one character predicts the next
import torch

# Build the character vocabulary from the data itself.
chars = sorted(set(''.join(names)))  # unique characters across all names, in sorted order

# Map each character to an index starting at 1; index 0 is reserved for the
# shared start/end token '.'.
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0

# Bigram count table, indexed by [first character, second character].
# Sized from the vocabulary (characters + the '.' token) rather than a literal
# 27, so a dataset with a different alphabet does not index out of bounds.
# With exactly 26 distinct characters in the data this is 27 x 27 as before.
vocab_size = len(chars) + 1
N = torch.zeros((vocab_size, vocab_size), dtype=torch.int32)

In [8]:
# Count how often each character is followed by each other character,
# accumulating into the 2D table N.
# Reset the table first so that re-running this cell does not double the counts.
N.zero_()
for name in names:
    chs = ['.', *name, '.']  # wrap the name in the start/end token
    # walk adjacent pairs, e.g. .emma. -> (., e), (e, m), (m, m), (m, a), (a, .)
    for ch1, ch2 in zip(chs, chs[1:]):
        N[stoi[ch1], stoi[ch2]] += 1  # one more occurrence of this bigram

In [16]:
# Inverse lookup: index -> character (the reverse of stoi)
itos = {idx: ch for ch, idx in stoi.items()}

In [22]:
# Sample names from the bigram counts

# Fixed-seed generator so the sampled names are reproducible
g = torch.Generator().manual_seed(2147483647)

for i in range(50):
    name = []  # characters sampled so far for this name
    index = 0  # begin at the start token (index 0)
    while True:
        # Turn row `index` of the count table into a probability
        # distribution over the next character
        counts = N[index].float()
        p = counts / counts.sum()

        # Draw a single index from that distribution; it becomes the current character
        index = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()

        if index == 0:  # end token drawn: this name is complete
            break
        name.append(itos[index])

    print(''.join(name))  # show the finished name

cexze
momasurailezitynn
konimittain
llayn
ka
da
staiyaubrtthrigotai
moliellavo
ke
teda
ka
emimmsade
enkaviyny
ftlspihinivenvorhlasu
dsor
br
jol
pen
aisan
ja
feniee
zem
deru
firit
gaikajahahbevare
kiysthelenaririenah
keen
x
al
kal
thavazeeromysos
laitenimieegariseriyen
k
illeleldole
meenisammigama
mmin
asharin
alcalar
jayn
asaz
selanely
chay
rana
ai
yviamisashougen
l
beyncaro
allan
annutetoradrilia
rddeman
