In [2]:
# Load the dataset: every line of names.txt becomes one entry of `names`
# (line terminators are stripped by splitlines()).
with open('names.txt') as names_file:
    names = names_file.read().splitlines()

In [3]:
# Count every pair of adjacent characters across all names.
# Each name is wrapped in '.' markers so that the start and the end of a
# name are counted as transitions too, e.g. ('.', 'a') and ('a', '.').
bigrams = {}
for name in names:
    padded = ['.', *name, '.']
    for pair in zip(padded, padded[1:]):
        bigrams[pair] = bigrams.get(pair, 0) + 1

In [4]:
import torch

# Vocabulary: every distinct character that occurs in the names, plus the
# '.' boundary marker used for the start and end of a name.
possible_chars = {char for name in names for char in name}
possible_chars |= {'.'}

n_possible_chars = len(possible_chars)

# Bidirectional lookup between characters and integer indices; indices
# follow the sorted order of the characters.
char_to_index = dict(zip(sorted(possible_chars), range(n_possible_chars)))
index_to_char = dict(enumerate(sorted(possible_chars)))

In [27]:
# Count matrix: entry [i, j] holds how many times the character with index i
# is immediately followed by the character with index j.
bigram_tensor = torch.zeros((n_possible_chars, n_possible_chars))

for (first_char, second_char), count in bigrams.items():
    row = char_to_index[first_char]   # row = character start
    col = char_to_index[second_char]  # col = character end
    bigram_tensor[row, col] = count

bigram_tensor = bigram_tensor.float()

# Divide each row by its total so that row i becomes the distribution over
# the next character given that the current character has index i.
row_totals = bigram_tensor.sum(1, keepdim=True)
bigram_probabilities = bigram_tensor / row_totals

In [28]:
# Generate Words using Bigram Information

n_words = 10

# Dedicated, explicitly seeded generator: re-running this cell (or doing a
# Restart & Run All) now reproduces exactly the same sampled words instead
# of depending on the global RNG state.
bigram_rng = torch.Generator().manual_seed(42)

for _ in range(n_words):
    c = '.'     # start from the boundary marker
    word = '.'
    while True:
        c_index = char_to_index[c] # get the index representation of the character
        probabilities = bigram_probabilities[c_index] # get the prob vector of the index's row
        new_index = torch.multinomial(
            probabilities, num_samples=1, replacement=True, generator=bigram_rng
        ).item() # sample once
        c = index_to_char[new_index] # get char from sampled index
        word += c # add it to word
        if c == '.': # '.' represents the end of the word
            break
    print(word)
        

.sigausssarahue.
.a.
.ali.
.suoynnaiartahowir.
.jakeaynin.
.ritadr.
.sant.
.nng.
.a.
.heeio.


In [29]:
# Generate Words using Uniform Distribution

# Baseline "model": every row assigns the same probability to each character.
uniform_tensor = torch.ones(n_possible_chars, n_possible_chars)
probabilities_uniform = uniform_tensor/uniform_tensor.sum(1, keepdim=True)

n_words = 10

# Dedicated, explicitly seeded generator: re-running this cell (or doing a
# Restart & Run All) now reproduces exactly the same sampled words instead
# of depending on the global RNG state.
uniform_rng = torch.Generator().manual_seed(42)

for _ in range(n_words):
    c = '.'     # start from the boundary marker
    word = '.'
    while True:
        c_index = char_to_index[c]
        probabilities = probabilities_uniform[c_index] # get the prob vector of the index's row
        new_index = torch.multinomial(
            probabilities, num_samples=1, replacement=True, generator=uniform_rng
        ).item()
        c = index_to_char[new_index]
        word += c
        if c == '.': # '.' represents the end of the word
            break
    print(word)

        

.yrnijytrjodvaculxerhhkdtwfdjlzlbfuucbgrpqyghpibvfxntqsofnsaxxa.
.cvaduyjuerhffsrnzcmdqvpkrayxzckmsca.
.hezprkppewceuhcyqsareuydlommrclaki.
.zpumjiguhlazzadt.
.ogdcpigg.
.tcnzqhwsfmpiqswwqrp.
.kqcdokknak.
..
.uapmxxqn.
.fdbof.


## Evaluating our 'model'

If the model is performing well, then P(bigram) for the bigrams in our data set should be high.

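Here, I'll be using the sum of -log(P) over all bigrams, divided by the number of bigrams (i.e. the average negative log-likelihood), instead of the product of the probabilities. Since probabilities are small, multiplying many of them together will often lead to underflow. That is, the product becomes too small to be represented accurately in floating point.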

The negative sign is used to make it a proper 'loss' function. Since log(P) <= 0 for P in (0, 1], -log(P) is non-negative, and the lowest possible loss is 0, reached when P = 1 for all bigrams in the training set.

In [40]:
# 'Learned' Probabilities
# Average negative log-probability that the bigram model assigns to every
# bigram (including the '.' boundary transitions) in the dataset.

p_log_sum = 0
n_bigrams = 0
for name in names:
    padded_name = '.' + name + '.'
    for prev_char, next_char in zip(padded_name, padded_name[1:]):
        row, col = char_to_index[prev_char], char_to_index[next_char]
        p_log_sum += torch.log(bigram_probabilities[row, col])
        n_bigrams += 1

neg_p_log_avg = -p_log_sum / n_bigrams
print(neg_p_log_avg)

tensor(2.4541)


In [41]:
# Uniform Probabilities
# Same metric as for the learned probabilities, but scored against the
# uniform baseline in `probabilities_uniform`.

p_log_sum = 0
n_bigrams = 0
for name in names:
    padded_name = '.' + name + '.'
    for prev_char, next_char in zip(padded_name, padded_name[1:]):
        row, col = char_to_index[prev_char], char_to_index[next_char]
        p_log_sum += torch.log(probabilities_uniform[row, col])
        n_bigrams += 1

neg_p_log_avg = -p_log_sum / n_bigrams
print(neg_p_log_avg)

tensor(3.2960)
