In [154]:
words = open("names.txt", 'r').read().splitlines()
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [155]:
import torch
import torch.nn.functional as F

chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}

In [156]:
# Exercises 2: E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?

xs, ys = [], []

for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        xs.append(ix1)
        ys.append(ix2)

xs = torch.tensor(xs)
ys = torch.tensor(ys)

words_len = len(words)
train_idx = int(0.80 * words_len)
dev_idx = int(0.90 * words_len)
print(dev_idx)

xtrain, ytrain = xs[:train_idx], ys[:train_idx]
xdev, ydev = xs[train_idx:dev_idx], ys[train_idx:dev_idx]
xtest, ytest = xs[dev_idx:], ys[dev_idx:]


28829
torch.Size([25626])
tensor([25, 14, 14,  ..., 19,  5, 20])
tensor([20,  1,  0,  ..., 25, 26, 24])


In [157]:
training_loss_arr = []
dev_loss_arr = []

g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)

for k in range(100):
    xenc = F.one_hot(xtrain, num_classes=27).float()
    logits = xenc @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)
    loss = -probs[torch.arange(25626), ytrain].log().mean() + 0.01 * (W**2).mean() 
    print(loss.item())

    if k >= 90:
        with torch.no_grad():
            xenc = F.one_hot(xdev, num_classes=27).float()
            logits_dev = xenc @ W
            dev_counts = logits_dev.exp()
            probs = dev_counts / dev_counts.sum(1, keepdims=True)
            dev_loss = -probs[torch.arange(ydev.shape[0]), ydev].log().mean()

        training_loss_arr.append(loss.item())
        dev_loss_arr.append(dev_loss.item())

    W.grad = None
    loss.backward()

    W.data += -0.2 * W.grad #

print("Mean of the last 10 training loss: ", sum(training_loss_arr)/10)
print("Mean of the last 10 dev set loss: ", sum(dev_loss_arr)/10)    


3.8023996353149414
3.7999112606048584
3.7974281311035156
3.794949769973755
3.7924771308898926
3.7900094985961914
3.7875471115112305
3.7850897312164307
3.782637596130371
3.7801902294158936
3.7777483463287354
3.77531099319458
3.7728793621063232
3.7704522609710693
3.7680304050445557
3.7656140327453613
3.7632017135620117
3.7607951164245605
3.7583930492401123
3.7559962272644043
3.75360369682312
3.7512171268463135
3.7488350868225098
3.746457576751709
3.7440850734710693
3.74171781539917
3.7393550872802734
3.736997127532959
3.7346444129943848
3.7322962284088135
3.7299530506134033
3.727614641189575
3.72528076171875
3.722951650619507
3.720627546310425
3.7183079719543457
3.7159934043884277
3.713683605194092
3.7113780975341797
3.709077835083008
3.7067816257476807
3.7044906616210938
3.702204942703247
3.6999223232269287
3.697645902633667
3.695373773574829
3.693105697631836
3.690842628479004
3.688584327697754
3.6863303184509277
3.6840810775756836
3.6818366050720215
3.6795966625213623
3.67736124992370

In [158]:
test_losses = []

with torch.no_grad():
    for k in range(10):
        xenc = F.one_hot(xtest, num_classes=27).float()
        logits = xenc @ W
        test_counts = logits.exp()
        test_probs = test_counts / test_counts.sum(1, keepdims=True)
        loss = -test_probs[torch.arange(ytest.shape[0]), ytest].log().mean()

    test_losses.append(loss.item())

print("Mean of the last 10 test set loss: ", sum(test_losses) / 10)    


Mean of the last 10 test set loss:  0.35768148899078367


In [159]:
xs, ys = [], []

for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]
        xs.append([ix1, ix2])
        ys.append(ix3)

xs = torch.tensor(xs)
ys = torch.tensor(ys)

words_len = len(words)
train_idx = int(0.80 * words_len)
dev_idx = int(0.90 * words_len)

print(xs[2])
print(ys)


xtrain, ytrain = xs[:train_idx], ys[:train_idx]
xdev, ydev = xs[train_idx:dev_idx], ys[train_idx:dev_idx]
xtest, ytest = xs[dev_idx:], ys[dev_idx:]

ytrain.shape[0]

tensor([13, 13])
tensor([13, 13,  1,  ..., 26, 24,  0])


25626

In [165]:
avg_dev_loss = []
avg_train_loss = []

g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)

for k in range(100):
    xenc = F.one_hot(xtrain, num_classes=27).float()
    logits = xenc.view(-1, 27) @ W

    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)
    # print(probs.shape)
    loss = -probs[torch.arange(ytrain.shape[0]), ytrain].log().mean()
    print(loss.item())

    if k >= 90:
        with torch.no_grad():
            xdevenc = F.one_hot(xdev, num_classes=27).float()
            dev_logits = xdevenc.view(-1, 27) @ W
            dev_counts = dev_logits.exp()
            dev_probs = dev_counts / dev_counts.sum(1, keepdims=True)
            dev_loss = -probs[torch.arange(ydev.shape[0]), ydev].log().mean()
        
        avg_dev_loss.append(dev_loss)
        avg_train_loss.append(loss)

    W.grad = None
    loss.backward()

    W.data += -50 * W.grad #

3.7455999851226807
3.399603843688965
3.1978631019592285
3.079557180404663
3.002737045288086
2.9481465816497803
2.906745195388794
2.873762607574463
2.846700429916382
2.824082136154175
2.804903984069824
2.7884368896484375
2.7741386890411377
2.7616047859191895
2.75052809715271
2.7406723499298096
2.7318527698516846
2.723921775817871
2.7167587280273438
2.710265636444092
2.7043609619140625
2.698974847793579
2.6940503120422363
2.6895360946655273
2.6853890419006348
2.681570053100586
2.6780457496643066
2.6747865676879883
2.671764612197876
2.668957471847534
2.666344404220581
2.6639068126678467
2.661628246307373
2.659494161605835
2.657491445541382
2.655609130859375
2.653837203979492
2.6521661281585693
2.650588035583496
2.649095296859741
2.647681951522827
2.6463425159454346
2.6450705528259277
2.6438615322113037
2.6427114009857178
2.641616106033325
2.6405720710754395
2.6395750045776367
2.6386239528656006
2.63771390914917
2.63684344291687
2.636009931564331
2.635211229324341
2.6344454288482666
2.6337

In [182]:
trigram_test_loss = []

with torch.no_grad():
    for k in range(10):
        test_vec = F.one_hot(xtest, num_classes=27).float()
        test_logits = test_vec.view(-1, 27) @ W
        test_counts = test_logits.exp()
        test_probs = test_counts / test_counts.sum(1, keepdims=True)
        test_loss = -test_probs[torch.arange(ytest.shape[0]), ytest].log().mean()

    trigram_test_loss.append(test_loss.item())

print("Test loss for trigram is, ", sum(trigram_test_loss) / 10)

Test loss for trigram is,  0.2746225357055664


In [196]:
#sampling from the trigram model
g = torch.Generator().manual_seed(2147483647)


for k in range(5):
    out = []
    idx = 0

    while True:
        xenc = F.one_hot(torch.tensor([idx]), num_classes=27).float()
        logits = xenc.view(-1, 27) @ W
        counts = logits.exp()
        probs = counts / counts.sum(1, keepdims=True)

        ix = torch.multinomial(probs, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])
        if ix == 0: 
            break

    print(''.join(out))


.
unidn.
ianagaz.
p.
ofaywoinn.


In [193]:
regs=[1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001]

g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)

for r in regs:
    training_loss_arr = []
    dev_loss_arr = []

    for k in range(100):
        xenc = F.one_hot(xdev, num_classes=27).float()
        logits = xenc.view(-1, 27) @ W
        counts = logits.exp()
        probs = counts / counts.sum(1, keepdims=True)
        loss = -probs[torch.arange(ydev.shape[0]), ydev].log().mean() + r * (W**2).mean() 
        # print(loss.item())

        if k >= 90:
            with torch.no_grad():
                xenc = F.one_hot(xdev, num_classes=27).float()
                logits_dev = xenc.view(-1, 27) @ W
                dev_counts = logits_dev.exp()
                probs = dev_counts / dev_counts.sum(1, keepdims=True)
                dev_loss = -probs[torch.arange(ydev.shape[0]), ydev].log().mean()
            dev_loss_arr.append(dev_loss.item())

        training_loss_arr.append(loss.item())

        W.grad = None
        loss.backward()

        W.data += -0.2 * W.grad #

    print(f"Mean of the training loss for r = {r}: ", sum(training_loss_arr)/90)
    print(f"Mean of the dev set loss for r = {r}: ", sum(dev_loss_arr)/10)   
    print(f'----------------------------------------------------------------') 

#If the weight is too low, then there will be a loss, but if weight is too high, too much smoothing also increases loss


Mean of the training loss for r = 1:  5.044561343722873
Mean of the dev set loss for r = 1:  3.5639166593551637
----------------------------------------------------------------
Mean of the training loss for r = 0.5:  4.31432045035892
Mean of the dev set loss for r = 0.5:  3.4287187337875364
----------------------------------------------------------------
Mean of the training loss for r = 0.1:  3.827811532550388
Mean of the dev set loss for r = 0.1:  3.326589322090149
----------------------------------------------------------------
Mean of the training loss for r = 0.05:  3.6839466624789767
Mean of the dev set loss for r = 0.05:  3.2435006380081175
----------------------------------------------------------------
Mean of the training loss for r = 0.01:  3.569041363398234
Mean of the dev set loss for r = 0.01:  3.175637197494507
----------------------------------------------------------------
Mean of the training loss for r = 0.005:  3.4972011937035457
Mean of the dev set loss for r = 0.0

In [197]:
# regs=[1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001]

g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)

print(xtrain.shape)
for k in range(100):
    xenc = F.one_hot(xtrain, num_classes=27).float()
    logits = xenc.view(-1, 27) @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)
    loss = -probs[torch.arange(ytrain.shape[0]), ytrain].log().mean() + 0.005 * (W**2).mean() 

    if k >= 90:
        with torch.no_grad():
            xenc = F.one_hot(xdev, num_classes=27).float()
            logits_dev = xenc.view(-1, 27) @ W
            dev_counts = logits_dev.exp()
            probs = dev_counts / dev_counts.sum(1, keepdims=True)
            dev_loss = -probs[torch.arange(ydev.shape[0]), ydev].log().mean()
        dev_loss_arr.append(dev_loss.item())

    training_loss_arr.append(loss.item())

    W.grad = None
    loss.backward()

    W.data += -0.2 * W.grad #

print(f"Mean of the training loss for r = {r}: ", sum(training_loss_arr)/90)
print(f"Mean of the dev set loss for r = {r}: ", sum(dev_loss_arr)/10)   
print(f'----------------------------------------------------------------') 

#If the weight is too low, then there will be a loss, but if weight is too high, too much smoothing also increases loss


torch.Size([25626, 2])
Mean of the training loss for r = 1:  4.08489236301846
Mean of the dev set loss for r = 1:  3.6042278528213503
----------------------------------------------------------------
Mean of the training loss for r = 0.5:  3.937594183286031
Mean of the dev set loss for r = 0.5:  3.483780026435852
----------------------------------------------------------------
Mean of the training loss for r = 0.1:  3.8148920747968886
Mean of the dev set loss for r = 0.1:  3.383339262008667
----------------------------------------------------------------
Mean of the training loss for r = 0.05:  3.7131011750963
Mean of the dev set loss for r = 0.05:  3.3001503944396973
----------------------------------------------------------------
Mean of the training loss for r = 0.01:  3.6291097005208335
Mean of the dev set loss for r = 0.01:  3.231557273864746
----------------------------------------------------------------


KeyboardInterrupt: 