In [25]:
import torch
from tqdm import tqdm

## Data

In [2]:
# Path to the text file holding the corpus (read below as one word per line).
data_fpath = './data/names.txt'

In [3]:
# Load the corpus: the file is read in full and split into lines,
# giving one word per list entry with line endings removed.
with open(data_fpath) as names_file:
    words = names_file.read().splitlines()
# Peek at the first few entries.
words[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [4]:
# Number of words in the corpus.
len(words)

32033

In [5]:
# Character length of every word, used to report the shortest and longest word.
word_lens = list(map(len, words))
print(f'min len: {min(word_lens)}; max len: {max(word_lens)}')

min len: 2; max len: 15


## Ngram model as an array with counts
This uses a generalized implementation that works for n-grams of arbitrary order n.

In [8]:
# Order of the n-gram model: every n-gram is n consecutive characters,
# i.e. n-1 context characters followed by the character to predict.
n = 3

In [9]:
# Separator token: pads both ends of every word, and drawing it ends a sampled word.
SEP_TOK = '.'

In [10]:
# Count how often each n-gram (a tuple of n characters) occurs in the corpus.
# Every word is padded with n-1 separators on both sides before windowing.
ngrams_dict = {}
pad = [SEP_TOK] * (n - 1)
for word in words:
    chars = pad + list(word) + pad
    # Zipping the n left-shifted views of `chars` yields its sliding windows of length n.
    for ngram in zip(*(chars[shift:] for shift in range(n))):
        if ngram in ngrams_dict:
            ngrams_dict[ngram] += 1
        else:
            ngrams_dict[ngram] = 1
# From here on `ngrams_dict` is a list of (ngram, count) pairs, most frequent first
# (ties keep their first-seen order).
ngrams_dict = sorted(ngrams_dict.items(), key=lambda pair: -pair[1])
ngrams_dict[:10]

[(('n', '.', '.'), 6763),
 (('a', '.', '.'), 6640),
 (('.', '.', 'a'), 4410),
 (('e', '.', '.'), 3983),
 (('.', '.', 'k'), 2963),
 (('.', '.', 'm'), 2538),
 (('i', '.', '.'), 2489),
 (('.', '.', 'j'), 2422),
 (('h', '.', '.'), 2409),
 (('.', '.', 's'), 2055)]

In [11]:
# Vocabulary: the separator takes index 0, followed by the distinct corpus characters in sorted order.
vocab = [SEP_TOK, *sorted(set(''.join(words)))]
# Lookup tables: character -> index and index -> character.
stoi = {ch: ix for ix, ch in enumerate(vocab)}
itos = dict(enumerate(vocab))

In [12]:
# Build the n-dimensional count tensor N, where N[i_1, ..., i_n] is the number of
# occurrences of the n-gram whose characters have vocabulary indices i_1 ... i_n.

# Step 1: collect the index tuple of every n-gram in the (padded) corpus.
ngram_ixs = []
pad = [SEP_TOK] * (n - 1)
for word in tqdm(words):
    chars = pad + list(word) + pad
    # Zipping the n left-shifted views of `chars` yields its sliding windows of length n.
    for ngram in zip(*(chars[shift:] for shift in range(n))):
        ngram_ixs.append([stoi[ch] for ch in ngram])

# reshape keeps the (num_ngrams, n) layout even if no n-gram was collected.
ngram_ixs = torch.tensor(ngram_ixs, dtype=torch.long).reshape(-1, n)

# Step 2: add all occurrences in a single scatter-add instead of incrementing one
# tensor element at a time from Python. `accumulate=True` sums the contributions of
# repeated index tuples, so the counts equal those of the element-wise loop.
N = torch.zeros([len(vocab)] * n, dtype=torch.int32).index_put_(
    ngram_ixs.unbind(dim=1),  # one index tensor per n-gram position
    torch.ones(ngram_ixs.shape[0], dtype=torch.int32),
    accumulate=True,
)

  0%|          | 155/32033 [00:00<00:40, 792.68it/s]

100%|██████████| 32033/32033 [00:24<00:00, 1303.09it/s]


In [42]:
# Constant added to every count before normalising (additive smoothing).
# With 0 the raw counts are used and no smoothing takes place.
base_count = 0
# Normalise over the last axis so that P[context] is a distribution over the next character.
# NOTE(review): with base_count = 0 a context that never occurs has an all-zero row,
# so the division below yields NaN for that row.
smoothed_counts = (N + base_count).float()
P = smoothed_counts / smoothed_counts.sum(dim=n - 1, keepdim=True)

### Sampling from the model

In [43]:
from collections import deque
n_samples = 20
g = torch.Generator().manual_seed(2147483647)
sep_ix = stoi[SEP_TOK]
for _ in range(n_samples):
    # Every word starts from a context made of n-1 separators.
    context = deque([sep_ix] * (n - 1))
    sampled_chars = []
    while True:
        # Draw the next character from the distribution stored for the current context.
        next_ix = torch.multinomial(
            P[tuple(context)], num_samples=1, replacement=True, generator=g
        ).item()
        # Drawing the separator ends the word.
        if next_ix == sep_ix:
            break
        sampled_chars.append(itos[next_ix])
        # Slide the context window one character forward.
        context.popleft()
        context.append(next_ix)
    print(''.join(sampled_chars))

junide
jakasid
prelay
adin
kairritoper
sathen
sameia
yanileniassibiainewin
lessiyanayla
te
farmanthya
demmer
finslena
jaylicore
ya
jocken
jamilyn
korin
wyn
ne


### Evaluating the performance

In [16]:
# Evaluate the count model: total log-likelihood of all corpus n-grams under P,
# its negative (nll), and the average nll per n-gram.

# Collect the index tuple of every n-gram in the (padded) corpus.
eval_ixs = []
pad = [SEP_TOK] * (n - 1)
for word in tqdm(words, 'Evaluating'):
    chars = pad + list(word) + pad
    # NOTE(review): the n-1 trailing separators add n-grams whose context already
    # contains a separator. Sampling stops at the first separator, so these are
    # never generated - confirm they are meant to count towards the average below.
    for ngram in zip(*(chars[shift:] for shift in range(n))):
        eval_ixs.append([stoi[ch] for ch in ngram])

# reshape keeps the (num_ngrams, n) layout even if no n-gram was collected.
eval_ixs = torch.tensor(eval_ixs, dtype=torch.long).reshape(-1, n)
count = eval_ixs.shape[0]

# Look up all probabilities with one indexing operation and sum their logs,
# rather than doing one tensor lookup, log and add per n-gram in Python.
log_likelihood = torch.log(P[eval_ixs.unbind(dim=1)]).sum()

print(f'{log_likelihood=}')
nll = -log_likelihood
print(f'{nll=}')
print(f'{nll/count=}')

Evaluating: 100%|██████████| 32033/32033 [00:17<00:00, 1844.87it/s]

log_likelihood=tensor(-505260.7500)
nll=tensor(505260.7500)
nll/count=tensor(1.9420)





## Ngram model as a neural net

In [17]:
# Build the training set from every n-gram of the padded corpus:
# the first n-1 character indices form the input context, the last one is the target.
contexts, targets = [], []
pad = [SEP_TOK] * (n - 1)
for word in tqdm(words, f'Creating {n}-gram samples'):
    chars = pad + list(word) + pad
    # Zipping the n left-shifted views of `chars` yields its sliding windows of length n.
    for ngram in zip(*(chars[shift:] for shift in range(n))):
        *context, target = (stoi[ch] for ch in ngram)
        contexts.append(context)
        targets.append(target)

xs = torch.tensor(contexts)
ys = torch.tensor(targets)

print(f'Number of training examples: {xs.shape[0]}')

print(f'Number of training examples: {xs.shape[0]}')

Creating 3-gram samples:   0%|          | 0/32033 [00:00<?, ?it/s]

Creating 3-gram samples: 100%|██████████| 32033/32033 [00:01<00:00, 18923.52it/s]

Number of training examples: 260179





### Training loop

In [36]:
def calc_loss(xs, ys, W, weight_decay=1e-4):
    """Average negative log-likelihood of the targets plus an L2 penalty on W.

    Args:
        xs: integer tensor of shape (num_examples, context_len); each row holds
            the indices used to look up one row of logits in ``W``.
        ys: integer tensor of shape (num_examples,) with the target index of
            each example.
        W: logit tensor indexed by the context indices, with the candidate
            next characters along its last axis.
        weight_decay: factor applied to the mean of the squared weights.

    Returns:
        Scalar tensor: ``-mean(log p(y | x)) + weight_decay * mean(W**2)``.
    """
    # One index tensor per context position, passed as an explicit tuple so the
    # lookup is unambiguously a multi-dimensional index -> (num_examples, vocab).
    logits = W[xs.unbind(dim=1)]
    # log_softmax is the numerically stable form of log(exp(l) / sum(exp(l)));
    # exponentiating large logits directly overflows to inf and gives NaN.
    log_probs = torch.log_softmax(logits, dim=-1)
    # loss = average negative log likelihood (+ optional weight decay)
    nll = -log_probs[torch.arange(len(ys)), ys].mean()
    loss = nll + weight_decay*(W**2).mean()
    return loss

In [37]:
# Model parameters: one logit per n-gram, drawn from a standard normal
# with a fixed seed so the initialisation is reproducible.
g = torch.Generator().manual_seed(2147483647)
W = torch.randn([len(vocab)] * n, generator=g, requires_grad=True)

In [38]:
# Full-batch gradient descent on the logit tensor W.
n_epochs = 200
learning_rate = 100

for ep in range(n_epochs):
    # forward pass (weight decay switched off)
    tr_loss = calc_loss(xs, ys, W, 0)

    # backward pass; drop the previous gradient first so gradients do not accumulate
    W.grad = None
    tr_loss.backward()

    # update: an in-place step under no_grad, so autograd does not record it
    # (preferred over writing through `W.data`, which bypasses autograd's checks)
    with torch.no_grad():
        W -= learning_rate * W.grad

    # report the loss every 10th epoch
    if ep % 10 == 9:
        print(f'{ep+1:>3}th epoch, tr_loss={tr_loss.item():.3f}')

 10th epoch, tr_loss=2.722
 20th epoch, tr_loss=2.479
 30th epoch, tr_loss=2.362
 40th epoch, tr_loss=2.291
 50th epoch, tr_loss=2.240
 60th epoch, tr_loss=2.202
 70th epoch, tr_loss=2.173
 80th epoch, tr_loss=2.149
 90th epoch, tr_loss=2.129
100th epoch, tr_loss=2.113
110th epoch, tr_loss=2.099
120th epoch, tr_loss=2.087
130th epoch, tr_loss=2.076
140th epoch, tr_loss=2.067
150th epoch, tr_loss=2.059
160th epoch, tr_loss=2.052
170th epoch, tr_loss=2.045
180th epoch, tr_loss=2.039
190th epoch, tr_loss=2.034
200th epoch, tr_loss=2.029


### Sampling from the network

In [41]:
from collections import deque
n_samples = 20
g = torch.Generator().manual_seed(2147483647)
sep_ix = stoi[SEP_TOK]
for _ in range(n_samples):
    # Every word starts from a context made of n-1 separators.
    context = deque([sep_ix] * (n - 1))
    sampled_chars = []
    while True:
        # Softmax over the logits stored for the current context.
        weights = W[tuple(context)].exp()
        next_probs = weights / weights.sum()
        next_ix = torch.multinomial(
            next_probs, num_samples=1, replacement=True, generator=g
        ).item()
        # Drawing the separator ends the word.
        if next_ix == sep_ix:
            break
        sampled_chars.append(itos[next_ix])
        # Slide the context window one character forward.
        context.popleft()
        context.append(next_ix)
    print(''.join(sampled_chars))

junide
janaqah
prelay
adin
kairritonian
juwa
kalinaaryanileniassdbyainrwibel
se
siely
arte
faveumtrifoetumj
phyashiah
jaylicora
ya
jocfpypjtbdmwebemikim
yfvn
anaasnhmvfjfopszxhxdgorfmxtdnvic
le
paun
tyde
