In [68]:
# Load the training corpus: one name per line, line terminators stripped.
with open('names.txt') as names_file:
    names = names_file.read().splitlines()

In [69]:
# Count every adjacent character pair across all names.
# Each name is wrapped in '.' markers so that "starts a name" and
# "ends a name" are counted as ordinary transitions.
bigrams = {}
for name in names:
    padded = '.' + name + '.'
    for pair in zip(padded, padded[1:]):
        bigrams[pair] = bigrams.get(pair, 0) + 1

In [70]:
import torch

# Vocabulary: every character that occurs in any name, plus the '.' boundary marker.
possible_chars = {char for name in names for char in name} | {'.'}
n_possible_chars = len(possible_chars)

# Two-way lookup between characters and their position in sorted order.
index_to_char = dict(enumerate(sorted(possible_chars)))
char_to_index = {char: index for index, char in index_to_char.items()}

In [71]:
# Square matrix of bigram counts: row = first character, column = second character.
bigram_tensor = torch.zeros((n_possible_chars, n_possible_chars))

for (first_char, second_char), count in bigrams.items():
    row = char_to_index[first_char]
    col = char_to_index[second_char]
    bigram_tensor[row, col] = count

# Turn counts into per-row probabilities: each row is divided by its own
# total (keepdim=True keeps the totals as a column so they broadcast per row).
bigram_tensor = bigram_tensor.float()
row_totals = bigram_tensor.sum(1, keepdim=True)
bigram_tensor = bigram_tensor / row_totals

In [77]:
# Generate Words using Bigram Information
#
# Each word starts at the '.' boundary marker. The next character is drawn
# from the row of `bigram_tensor` that belongs to the current character, and
# sampling stops as soon as '.' is drawn again. The printed word keeps both
# boundary markers.

# Dedicated, seeded generator so the sampled words are the same on every
# Restart & Run All instead of depending on the global RNG state.
bigram_rng = torch.Generator().manual_seed(2147483647)

n_words = 10
for _ in range(n_words):
    c = '.'
    word = '.'
    while True:
        c_index = char_to_index[c]  # index representation of the current character
        probabilities = bigram_tensor[c_index]  # probability vector stored in that character's row
        new_index = torch.multinomial(
            probabilities,
            num_samples=1,
            replacement=True,
            generator=bigram_rng,
        ).item()  # sample once
        c = index_to_char[new_index]  # get char from sampled index
        word += c  # add it to word
        if c == '.':  # '.' represents the end of the word
            break
    print(word)
        

.kllevy.
.clelonzlenindeira.
.denijasshairi.
.kana.
.maravayakal.
.g.
.eliyaselhaleramuslelelabrshuilyn.
.kihanemo.
.tondimaye.
.jennethadon.


In [76]:
# Generate Words using Uniform Distribution
#
# Baseline for comparison: every character (including the '.' end marker) is
# equally likely at every step, regardless of the previous character.

probabilities_uniform = torch.ones(n_possible_chars)
probabilities_uniform /= probabilities_uniform.sum()

# Dedicated, seeded generator so the sampled words are the same on every
# Restart & Run All instead of depending on the global RNG state.
uniform_rng = torch.Generator().manual_seed(2147483647)

n_words = 10
for _ in range(n_words):
    word = '.'
    while True:
        # The distribution does not depend on the previous character, so no
        # lookup of the current character's index is needed here.
        new_index = torch.multinomial(
            probabilities_uniform,
            num_samples=1,
            replacement=True,
            generator=uniform_rng,
        ).item()
        c = index_to_char[new_index]
        word += c
        if c == '.':  # '.' represents the end of the word
            break
    print(word)

        

.nctunpjdbdahflxklamtxuwbbsqdpjfa.
.ukxyikgyuywxdmatdjvqekaosbbrkhehqueltsypayurxzn.
.tkmrglkxkawtlwdatmqcoucjwfcisykgfdiougvfskzcbrlydegybmulczidlrmavw.
.jzvqvuewalethrtucauezcmpgfhhiykkgyoeaecexlhyvqommxcejbrzymxmpbhdamodwvctecfctwfcjkzwdfsuhztkyzwwquvbhrchihwdpzhopcmqeweiakjeditrdfojmdwsngqjwrkgnzud.
.yd.
.qeqoleetrixpnslxzrzmzxmbdcynvcalgporyxgxqrcwcwcezzewheywyvyozfvzhd.
.wxbrkmsifqhhcmiheulwggejlkowlivxlbdsqmk.
.anstnsfyvmrvrbchrvaegidtegphvauofonzdduxnocroxnndqsynuppibfd.
.uvetwxygjigrgzzhlxthqpn.
.kcaupaaevo.
