In [2]:
import numpy as np

In [None]:
# Load the dataset: one name per line. Reading inside a `with` block guarantees
# the file handle is closed even if the read fails part-way through.
with open("../data/names.txt", "r") as f:
    words = f.read().splitlines()
print(words[:10])

# Show a shortest name in the dataset (np.argmin picks the first one on ties).
i = np.argmin([len(w) for w in words])
print(words[i])

In [4]:
from collections import defaultdict

# Nested counter: bigram[first][second] is how many times `second` directly
# follows `first` across all words. Missing entries default to 0.
bigram = defaultdict(lambda: defaultdict(int))

for word in words:
    # Wrap every word in explicit start / end markers so that word-initial
    # and word-final transitions are counted as well.
    chs = ["<S>", *word, "<E>"]
    for pos in range(len(chs) - 1):
        bigram[chs[pos]][chs[pos + 1]] += 1

In [None]:
from numpy.random import choice

# Sample one name from the dictionary-based bigram counts: start at the "<S>"
# marker and keep drawing successors until the "<E>" marker comes up.
out = ""
cur_char = "<S>"
while cur_char != "<E>":
    # The start marker is bookkeeping only and never becomes part of the name.
    if cur_char != "<S>":
        out += cur_char
    followers = bigram[cur_char]
    val = np.array(list(followers.values()))
    # Successor counts, normalized to sum to 1, are the sampling probabilities.
    draw = choice(np.array(list(followers.keys())), size=1, p=val / np.sum(val))
    cur_char = draw[0]

out

In [72]:
import torch

# Character vocabulary: index 0 is reserved for the "." boundary token and the
# distinct characters found in `words` get indices 1..len(chars) in sorted order.
chars = sorted(list(set("".join(words))))
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi["."] = 0

# Bigram count matrix, N[first, second]. Its size is derived from the data
# (largest index is len(chars)) rather than hard-coded to 27, so a dataset with
# a different alphabet neither over-indexes the matrix nor leaves unused rows.
# Every cell starts at 1 instead of 0, so no row of N can sum to zero.
vocab_size = len(chars) + 1
N = torch.ones((vocab_size, vocab_size), dtype=torch.int32)

for w in words:
    # "." marks both the start and the end of a word.
    chs = ["."] + list(w) + ["."]
    for cs, ce in zip(chs, chs[1:]):
        N[stoi[cs], stoi[ce]] += 1

In [None]:
import matplotlib.pyplot as plt

%matplotlib inline

# Inverse vocabulary: integer index -> character.
itos = {v: k for k, v in stoi.items()}

# Heat map of the bigram counts; each cell is annotated with the bigram it
# stands for (upper text) and its count (lower text).
plt.figure(figsize=(16, 16))
plt.imshow(N, cmap="Blues")

# Take the loop bounds from N itself instead of a hard-coded 27 so the
# annotations always cover exactly the cells that were drawn.
n_rows, n_cols = N.shape
for i in range(n_rows):
    for j in range(n_cols):
        chstr = itos[i] + itos[j]
        # imshow puts the column index on the x axis and the row index on y.
        plt.text(j, i, chstr, ha="center", va="bottom", color="gray")
        plt.text(j, i, str(N[i, j].item()), ha="center", va="top", color="gray")

# Trailing semicolon keeps the axis-limits tuple out of the cell output.
plt.axis("off");

In [74]:
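# Inactive sketch of a row-by-row normalization of N, kept for reference only;
# nothing in this cell is executed.
# epsilon = 0.0000001
# for i in range(N.shape[0]):  # was range(28), which runs past the last row when N has 27 rows
#     p = N[i].float()
#     p = p/(p.sum() + epsilon)
#     # N[i] = p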

In [75]:
# Turn counts into probabilities: divide each row of N by that row's total.
# keepdim=True keeps the totals as a column so the division broadcasts per row.
row_totals = torch.sum(N, dim=1, keepdim=True)
P = N.float() / row_totals

In [None]:
# Fixed-seed generator so the sampled name is reproducible across runs.
g = torch.Generator().manual_seed(2147483647)
epsilon = 0.0000001

# Walk the bigram table starting from index 0: draw the next index from the
# current row of P, record its character, and stop once index 0 is drawn
# again. The character for that final index is appended before stopping.
ix = 0
out = []
keep_sampling = True
while keep_sampling:
    p = P[ix]
    ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    out.append(itos[ix])
    keep_sampling = ix != 0

nout = "".join(out)
print(f"{nout=}")

In [None]:
# Score a hand-picked word under the bigram table P: add up the log-probability
# of every adjacent character pair, then report the average negative
# log-likelihood per pair.
epsilon = 0.0  # added inside the log; at 0.0 the probabilities are used unchanged
n = 0
loglikelihood = 0.0

for w in ["andrejq"]:
    # "." marks both the start and the end of the word.
    chs = ["."] + list(w) + ["."]
    for pos in range(len(chs) - 1):
        csi = stoi[chs[pos]]
        cei = stoi[chs[pos + 1]]
        ll = torch.log(P[csi, cei] + epsilon)
        loglikelihood += ll
        n += 1

nll = -loglikelihood
anll = nll / n
print(f"{anll=}")

In [79]:
# Build the bigram training pairs: xs[k] is the index of a character and ys[k]
# is the index of the character that immediately follows it.
xs, ys = [], []
for w in words:
    # "." marks both the start and the end of the word.
    chs = ["."] + list(w) + ["."]
    for pos in range(len(chs) - 1):
        xs.append(stoi[chs[pos]])
        ys.append(stoi[chs[pos + 1]])

xs = torch.tensor(xs)
ys = torch.tensor(ys)