# Exercise 2: Train, dev(validation), Test Split

In [1]:
with open('names.txt','r') as r:
    words = r.read()
    words = words.splitlines()

In [2]:
len(words)

32033

In [3]:
train = words[:int(len(words)* 0.8)]
test = words[int(len(words)* 0.8):int(len(words)* 0.9)]
val = words[int(len(words)* 0.8):]

In [4]:
print(f'{len(train)/len(words)*100:.0f}% training set')
print(f'{len(val)/len(words)*100:.0f}% validation set')
print(f'{len(test)/len(words)*100:.0f}% test set')

80% training set
20% validation set
10% test set


In [5]:
chars  = sorted(set(''.join(words)))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
stoi

{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '.': 0}

In [6]:
itos = {i:s for s,i in stoi.items()}
itos

{1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z',
 0: '.'}

In [7]:
import torch
x = []
y = []

for w in words:
    chars = ['.'] +list(w)+ ['.']
    for char1,char2,char3 in zip(chars[:],chars[1:],chars[2:]):
        ix1 = stoi[char1]
        ix2 = stoi[char2]
        ix3 = stoi[char3]
        x.append([ix1,ix2])
        y.append(ix3)
x = torch.tensor(x)
y = torch.tensor(y)

In [8]:
x.shape

torch.Size([196113, 2])

# Lets split our data into train,test and val. We use stratify = True to preserve the same class distribution as the original dataset.

In [9]:
from sklearn.model_selection import train_test_split
x_train, x_temp,y_train, y_temp = train_test_split(x,y,test_size=.2,random_state=40,shuffle=True,stratify=y)
x_test, x_val,y_test,y_val = train_test_split(x_temp,y_temp,test_size=.5,random_state=40,shuffle=True,stratify=y_temp)

In [10]:
print(x_train.shape,x_val.shape,x_test.shape)

torch.Size([156890, 2]) torch.Size([19612, 2]) torch.Size([19611, 2])


In [11]:
g = torch.Generator().manual_seed(5)
import torch.nn.functional as F
def data_prep(x:torch.tensor,y:torch.tensor):
    xenc = F.one_hot(x,num_classes=len(stoi)).float()
    yenc = F.one_hot(y,num_classes=len(stoi)).float()

    xflat = xenc.view(-1,27*2)

    return xflat

In [12]:
data_prep(x_train,y_train)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [1., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.]])

# Lets now run a few training loops

In [13]:
from tqdm import trange

epochs = 250
xenc = data_prep(x_train,y_train)

w = torch.rand((27*2,27),generator=g,requires_grad=True)

for n in trange(epochs,desc=f'Running gradient descent with {epochs} steps',colour='green'):

    logits = xenc @ w

    counts = logits.exp()

    probs = counts / counts.sum(1,keepdim=True)

    loss = -probs[torch.arange(len(y_train)),y_train].log().mean()

    w.grad = None
    loss.backward()

    w.data += -(50*w.grad)

    n+=1

print(f'Negative_Log_Likelihood_loss after {n} runs is: {loss:.4f}')


Running gradient descent with 250 steps: 100%|[32m██████████[0m| 250/250 [00:04<00:00, 56.69it/s]

Negative_Log_Likelihood_loss after 250 runs is: 2.2428





In [14]:
x_val_enc = data_prep(x_val,y_val)
logits = x_val_enc @ w
counts = logits.exp()
probs = counts / counts.sum(1,keepdim=True)
val_loss = -probs[torch.arange(len(y_val)),y_val].log().mean()

x_test_enc = data_prep(x_test,y_test)
logits = x_test_enc @ w
counts = logits.exp() 
probs = counts / counts.sum(1,keepdim=True)
test_loss = -probs[torch.arange(len(y_test)),y_test].log().mean()


print(f"The loss on our validation and test set is {val_loss:.4f} and {test_loss:.4f}")


The loss on our validation and test set is 2.2511 and 2.2554


# Exercise 3: Finding a good regularisation parameter

In [15]:
x_test_enc = data_prep(x_test,y_test)
logits = x_test_enc @ w
counts = logits.exp() 
probs = counts / counts.sum(1,keepdim=True)
def regularisation_term(w,x):
    return x*(w**2).mean()

for x in [0.1,0.01,0.001,0.0001,1]:
    i = regularisation_term(w,x)
    test_loss = -probs[torch.arange(len(y_test)),y_test].log().mean() + i
    print(f"When x is {i:.2f}, the test loss is {test_loss.item():.4f}")

When x is 0.12, the test loss is 2.3792
When x is 0.01, the test loss is 2.2678
When x is 0.00, the test loss is 2.2567
When x is 0.00, the test loss is 2.2556
When x is 1.24, the test loss is 3.4928


In [16]:
lambdas = [0] + [1/10**i for i in range(7)]
g = torch.Generator().manual_seed(5)
weights = {}

for l in lambdas:

    from tqdm import trange
    epochs = 250
    xenc = data_prep(x_train,y_train)

    w = torch.rand((27*2,27),generator=g,requires_grad=True)

    for n in trange(epochs,desc=f'Running gradient descent with {epochs} steps',colour='green'):

        logits = xenc @ w

        counts = logits.exp()

        probs = counts / counts.sum(1,keepdim=True)

        loss = -probs[torch.arange(len(y_train)),y_train].log().mean() + l*(w**2).mean()

        w.grad = None
        loss.backward()

        w.data += -(50*w.grad)

        n+=1

    x_test_enc = data_prep(x_val,y_val)
    logits = x_test_enc @ w
    counts = logits.exp() # This exponentiates to remove all -ve values.
    probs = counts / counts.sum(1,keepdim=True) # The above 2 lines, manually implement a soft max. Now we only have items btw 0 and 1, all rows sum to 1.
    
    # Below, we've implemented an average log-likelihood loss.
    test_loss = -probs[torch.arange(len(y_val)),y_val].log().mean() 
    
    weights[l] = [w,test_loss]

    print(f'Negative_Log_Likelihood_loss after {n} runs is: {loss:.4f}')
    print(f'Negative_Log_Likelihood_loss on validattion set is: {test_loss :.4f} when lambda is {l:.5f}')

Running gradient descent with 250 steps: 100%|[32m██████████[0m| 250/250 [00:04<00:00, 55.09it/s]


Negative_Log_Likelihood_loss after 250 runs is: 2.2428
Negative_Log_Likelihood_loss on validattion set is: 2.2511 when lambda is 0.00000


Running gradient descent with 250 steps: 100%|[32m██████████[0m| 250/250 [00:04<00:00, 58.77it/s]


Negative_Log_Likelihood_loss after 250 runs is: 2.5449
Negative_Log_Likelihood_loss on validattion set is: 2.3951 when lambda is 1.00000


Running gradient descent with 250 steps:  44%|[32m████▍     [0m| 111/250 [00:01<00:02, 57.06it/s]


KeyboardInterrupt: 

In [None]:
best_lam, (best_w, best_loss_tensor) = min(
    weights.items(),
    key=lambda kv: kv[1][1].item()   # kv[1] == [w, loss]; kv[1][1] is the loss tensor
)

best_loss = best_loss_tensor.item()
print("best λ :", best_lam)
print("dev CE :", best_loss)


best λ : 0.0001
dev CE : 2.2509894371032715


In [None]:
x_test_enc = data_prep(x_test,y_test)
logits = x_test_enc @ weights[0.0001][0]
counts = logits.exp() 
probs = counts / counts.sum(1,keepdim=True)
test_loss = -probs[torch.arange(len(y_test)),y_test].log().mean() 
print(f'The test loss where λ = 0.0001 is {test_loss:.5f}')

The test loss where λ = 0.0001 is 2.25546


The test loss where λ = 0.0001 is 2.25546

The test loss where λ = 0 is 2.25543

So the best loss is observed when there is no regularisation.

In [None]:
import numpy as np
array = np.geomspace(0.0001,0.5,12)
array[0:5]

array([0.0001    , 0.00021691, 0.00047048, 0.00102049, 0.0022135 ])

In [None]:
x_val_enc = data_prep(x_val,y_val)
logits = x_val_enc @ w
counts = logits.exp()
probs = counts / counts.sum(1,keepdim=True)
val_loss = -probs[torch.arange(len(y_val)),y_val].log().mean()

In [None]:
w[0]

tensor([-3.6878,  2.6261,  0.5142,  0.2191,  0.7218,  1.7658, -0.8757, -0.8718,
         1.1377,  1.5619, -0.5418, -0.2338,  1.3064,  1.0550,  0.0259,  1.9504,
        -0.6824, -1.5299,  1.6672,  0.6739, -0.0419,  2.0518,  1.0370, -0.0371,
        -0.7033,  1.1870,  0.9714], grad_fn=<SelectBackward0>)

In [None]:
x_val_enc = data_prep(x_val,y_val)
logits = x_val_enc[0] @ w
logits

tensor([ 4.9848,  3.9991, -1.5889,  1.3663,  2.3685,  2.6098, -1.0464,  1.0785,
        -1.0040,  3.5193, -0.2916, -0.2331,  1.1843, -1.1394,  4.1918,  1.0673,
        -1.2495, -1.2209, -0.2065,  1.4142,  2.5037,  0.4799, -0.3422, -0.2361,
        -1.2515,  1.5457,  0.7244], grad_fn=<SqueezeBackward4>)

## Excercise 4 

Simply indexing into the rows of W, instead of one_hot encoding our inputs.

Doing a matrix mul between our one-hot-encoded x's and w, indexes into w, and plucks out the corresponding row.
This is easy to visualize, when we think of a bigram.

suppose 
```python
x = [4,...]
xenc = [0,0,0,1,0,0,0]
w = []
```
what xenc@w will do is only pull out the 4th row (index 3) from w, as all else will be zeros.

### We can achive the same by doing w[4]

Now we try to do something similar with our trigrams, instead of each row having only one hot encoded value. It has 2, one for 2 previous characters.
```python
So if x = [ [1,2] , ... ]
xenc = [ [0,1] [0,0,1] ]
w = []

now we can index into it using w[x[0,0] ] + w[27 + x[0,1]]4th
```




**The important thing to note here is why do we do 27+? Because we want to select 2 rows of W. One is the 1st row and second is the 27+2 = 29th row. And add those up!**




### I've tried to explain the below, one step at a time.

In [None]:
x_val_enc[0]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [None]:
x_val[0]

tensor([21, 14])

In [None]:
x_val[0][0],x_val[0][1]

(tensor(21), tensor(14))

In [None]:
x_val[0][0],x_val[0][1]+27

(tensor(21), tensor(41))

In [None]:
one_hot_encoded = x_val_enc[0]@w

In [None]:
indiced_into_w = w[x_val[0][0]] + w[x_val[0][1]+27]

In [None]:
one_hot_encoded==indiced_into_w

tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True])

## Now implemented the whole thing at once.

In [19]:
# fetch the first index rows of W
# fetch the second index rows of W as well --> 27 + second row index
# Add those up
(w[x_val[:,0]] + w[27+x_val[:,1]]).shape


torch.Size([19612, 27])

In [None]:
# we can see both of these end up doing the same thing
x_val_enc @ w

tensor([[ 4.9848,  3.9991, -1.5889,  ..., -1.2515,  1.5457,  0.7244],
        [ 2.9228,  3.1929, -0.2880,  ..., -2.0653,  2.4438, -0.6278],
        [ 3.3723,  0.7481,  0.3578,  ...,  0.2730,  2.1822,  0.4783],
        ...,
        [ 5.3403,  3.5312, -1.4652,  ..., -1.9545,  1.9308,  1.4155],
        [ 5.3403,  3.5312, -1.4652,  ..., -1.9545,  1.9308,  1.4155],
        [ 4.4251,  4.0932,  0.6039,  ..., -1.9277,  3.8064, -0.0376]],
       grad_fn=<MmBackward0>)