In [2]:
# Step up one directory so the relative paths used below (e.g. 'data/names.txt')
# resolve — presumably this lands in the project root.
# NOTE(review): not idempotent — re-running this cell climbs one more level each time.
%cd ..

/Users/isham993/Desktop/Programming-Tutorials/2024-Data-Science/makemore-series


In [3]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

%matplotlib inline

### Downloading the Dataset

In [7]:
!wget -O data/names.txt https://raw.githubusercontent.com/karpathy/makemore/master/names.txt

--2024-06-13 12:33:01--  https://raw.githubusercontent.com/karpathy/makemore/master/names.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 228145 (223K) [text/plain]
Saving to: ‘data/names.txt’


2024-06-13 12:33:02 (9.35 MB/s) - ‘data/names.txt’ saved [228145/228145]



In [4]:
# Load the dataset as a list of names, one per line of the file.
# The context manager closes the file handle deterministically instead of
# leaving it open until the garbage collector happens to reclaim it.
with open('data/names.txt', 'r', encoding='utf-8') as names_file:
    words = names_file.read().splitlines()
# Peek at the first few names.
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [5]:
# Number of names (lines) read from data/names.txt.
len(words)

32033

### Building Vocabulary of Characters

In [6]:
# Vocabulary: every distinct character that occurs in any name, in sorted order.
chars = sorted(set(''.join(words)))
# String to Integer: characters are numbered from 1 upwards; index 0 is kept
# for '.', the token that marks the start/end of a name.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# Integer to String: the inverse mapping of stoi.
itos = {idx: ch for ch, idx in stoi.items()}

In [7]:
# Show both lookup tables; the second string starts with a newline, so a blank
# line separates the two mappings in the output.
print(
    f"String to Integer: {stoi}",
    f"\nInteger to String: {itos}",
    sep="\n",
)

String to Integer: {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '.': 0}

Integer to String: {1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


### Building the Dataset

In [21]:
# Context length: how many preceding characters are used to predict the next one.
block_size = 3
X = []  # context windows, each a list of `block_size` character indices
Y = []  # for every window, the index of the character that follows it

# Only the first five names are used here, which keeps the printed trace short.
for w in words[:5]:
    print(w)
    # Each name starts from an all-zero window, i.e. `block_size` '.' tokens.
    context = [0 for _ in range(block_size)]
    for ch in w + '.':
        ix = stoi[ch]
        print(f"Context in the form of array: {context} Predicted word in the form of integer as per the context: {ix}")
        X.append(context)
        Y.append(ix)
        readable_context = ''.join([itos[i] for i in context])
        print(readable_context, '--->', itos[ix])
        # Slide the window: drop the oldest index and append the newest one.
        # A new list is built each time, so windows already stored in X are not mutated.
        context = [*context[1:], ix]

# Convert the collected Python lists into tensors.
X, Y = torch.tensor(X), torch.tensor(Y)

emma
Context in the form of array: [0, 0, 0] Predicted word in the form of integer as per the context: 5
... ---> e
Context in the form of array: [0, 0, 5] Predicted word in the form of integer as per the context: 13
..e ---> m
Context in the form of array: [0, 5, 13] Predicted word in the form of integer as per the context: 13
.em ---> m
Context in the form of array: [5, 13, 13] Predicted word in the form of integer as per the context: 1
emm ---> a
Context in the form of array: [13, 13, 1] Predicted word in the form of integer as per the context: 0
mma ---> .
olivia
Context in the form of array: [0, 0, 0] Predicted word in the form of integer as per the context: 15
... ---> o
Context in the form of array: [0, 0, 15] Predicted word in the form of integer as per the context: 12
..o ---> l
Context in the form of array: [0, 15, 12] Predicted word in the form of integer as per the context: 9
.ol ---> i
Context in the form of array: [15, 12, 9] Predicted word in the form of integer as per t

In [16]:
# Inspect the dataset tensors: X holds one row of `block_size` context indices
# per example, and Y holds the matching next-character index.
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

### Implementing the embedding lookup table

In [None]:
# In the paper: a vocabulary of 17,000 words, each embedded as a 30-dimensional vector.
# In this exercise: 27 characters, each embedded as a 2-dimensional vector.

In [24]:
# Embedding lookup table: 27 rows (one per character index, '.' included) by
# 2 embedding dimensions, initialised uniformly in [0, 1).
# A dedicated, seeded generator makes the table identical on every
# Restart & Run All without touching PyTorch's global RNG state.
C = torch.rand((27, 2), generator=torch.Generator().manual_seed(2147483647))

In [36]:
# Display the full lookup table; row i is the 2-dim embedding of character index i.
C

tensor([[0.6809, 0.2489],
        [0.0595, 0.2053],
        [0.8147, 0.1548],
        [0.7012, 0.9861],
        [0.7667, 0.4242],
        [0.4549, 0.5144],
        [0.1092, 0.7333],
        [0.6198, 0.0290],
        [0.6822, 0.4668],
        [0.1605, 0.2746],
        [0.7625, 0.0648],
        [0.7379, 0.6440],
        [0.8288, 0.9990],
        [0.7970, 0.4692],
        [0.3949, 0.2311],
        [0.7596, 0.1429],
        [0.5811, 0.1384],
        [0.3229, 0.0246],
        [0.1150, 0.5461],
        [0.2086, 0.3824],
        [0.8522, 0.8571],
        [0.5150, 0.3885],
        [0.4086, 0.1350],
        [0.9945, 0.1538],
        [0.7248, 0.7710],
        [0.3195, 0.4005],
        [0.4992, 0.8091]])

In [32]:
# One-hot encode index 5 over the 27 classes, then cast to float32 so the
# vector can be matrix-multiplied with the float lookup table C.
one_hot_example = F.one_hot(torch.tensor(5), num_classes=27).to(torch.float32)

In [35]:
# Display the one-hot vector: all zeros except a single 1 at position 5.
one_hot_example

tensor([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [33]:
# Length 27: one slot per character index, matching the number of rows in C.
one_hot_example.shape

torch.Size([27])

In [34]:
# (27, 2): a length-27 vector can therefore be multiplied with C to give a length-2 result.
C.shape

torch.Size([27, 2])

In [37]:
# Matrix Multiplication: the single 1 in the one-hot vector picks out the
# corresponding row of C (row 5 here); every other row is multiplied by 0.
torch.matmul(one_hot_example, C)

tensor([0.4549, 0.5144])

In [38]:
# Plain integer indexing returns row 5 directly — the same values as the
# one-hot matrix multiplication, without building the one-hot vector.
C[5]

tensor([0.4549, 0.5144])

In [44]:
# Indexing with a list of row indices gathers those rows in one call; an index
# may be repeated, in which case its row is repeated in the result.
matrix = C[[5, 6, 7, 7, 7]]
# Print the gathered rows, then their shape, one per line.
print(matrix, matrix.shape, sep="\n")

tensor([[0.4549, 0.5144],
        [0.1092, 0.7333],
        [0.6198, 0.0290],
        [0.6198, 0.0290],
        [0.6198, 0.0290]])
torch.Size([5, 2])


In [45]:
# Look up embeddings by integer indexing rather than by a one-hot matrix
# multiply: it selects the same rows of C and is much more efficient.
# Indexing C with the whole index tensor X returns one 2-dim vector for every
# entry of X, i.e. the shape of X with the embedding dimension appended.
embedding = C[X]
embedding.shape

torch.Size([32, 3, 2])

### Implementing the Hidden Layer