In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the dataset: one name per line, returned as a list of strings.
# A context manager closes the file handle as soon as the read is done
# (the bare open(...).read() left it open until garbage collection).
# The encoding is pinned so the result does not depend on the platform's locale.
with open('names.txt', 'r', encoding='utf-8') as f:
    words = f.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
# total number of names loaded from the file
len(words)

32033

In [4]:
# Character vocabulary plus the two lookup dicts (char -> int, int -> char).
chars = sorted(set(''.join(words)))  # every distinct character, in sorted order

# Letters are numbered starting at 1; index 0 is reserved for the '.' marker
# that delineates the beginning/end of a word.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0

# Inverse mapping, used to turn indices back into characters.
itos = {idx: ch for ch, idx in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [5]:
# Build the dataset of (context, next character) training pairs.

block_size = 3  # context length: number of previous chars used to predict the next char
X, Y = [], []

for name in words[:5]:  # only the first 5 names, to keep the printed trace short

    print(name)  # show the whole name being processed
    context = [0] * block_size  # start from an all-'.' context (index 0)
    for ch in name + '.':  # the trailing '.' marks the end of the name
        target = stoi[ch]  # integer index of the character to predict
        X.append(context)
        Y.append(target)
        print(''.join(itos[i] for i in context), '---->', itos[target])
        # Slide the window: context[1:] drops the oldest char (slicing starts at
        # the second element), then the char just seen is appended on the right.
        context = context[1:] + [target]

# Convert the Python lists to integer tensors.
X = torch.tensor(X)
Y = torch.tensor(Y)

emma
... ----> e
..e ----> m
.em ----> m
emm ----> a
mma ----> .
olivia
... ----> o
..o ----> l
.ol ----> i
oli ----> v
liv ----> i
ivi ----> a
via ----> .
ava
... ----> a
..a ----> v
.av ----> a
ava ----> .
isabella
... ----> i
..i ----> s
.is ----> a
isa ----> b
sab ----> e
abe ----> l
bel ----> l
ell ----> a
lla ----> .
sophia
... ----> s
..s ----> o
.so ----> p
sop ----> h
oph ----> i
phi ----> a
hia ----> .


In [6]:
# inspect shape and dtype of the inputs X (one row of block_size indices per example)
# and the targets Y (one index per example)
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

### Build lookup table

In [7]:
# As in the Bengio et al. paper, each of the possible characters is embedded
# in a lower-dimensional space via a lookup table.
n_chars = 27  # number of distinct characters to embed
emb_dim = 2   # each character gets a 2-dimensional embedding

# Lookup table: row i holds the (randomly initialised) embedding of character i.
C = torch.randn(n_chars, emb_dim)

#### We will embed every integer in the input X using the lookup table (LUT) C

There are two equivalent ways to look up the embedding of a single integer (e.g., 5) in the LUT C:

#### Method 1: index directly into row 5 of C

In [8]:
# plain row indexing: row 5 of C is the embedding of index 5
C[5]

tensor([0.9119, 1.1383])

#### Method 2: use one-hot encoding

In [9]:
# Build a length-27 one-hot vector: a single 1 at index 5, zeros everywhere else.
# one_hot returns integers, so cast to float() to be able to multiply by the float matrix C.
one_hot_5 = F.one_hot(torch.tensor(5), num_classes=27).float()

# The product plucks out row 5 of C, because the zeros mask all the other rows,
# giving the same result as the direct-indexing way of embedding an int.
one_hot_5 @ C


tensor([0.9119, 1.1383])

#### For efficiency, we will use the first method for embedding (just indexing into a row of the LUT C)

Even better, we can embed all of X at once: indexing C with the integer tensor X looks up one row of C for every entry of X.

In [10]:
# Index C with the whole integer tensor X: each entry of X (32 x 3) is replaced
# by its 2-dim row of C, so the result has shape (32, 3, 2).
C[X]

tensor([[[-1.8119, -1.6883],
         [-1.8119, -1.6883],
         [-1.8119, -1.6883]],

        [[-1.8119, -1.6883],
         [-1.8119, -1.6883],
         [ 0.9119,  1.1383]],

        [[-1.8119, -1.6883],
         [ 0.9119,  1.1383],
         [ 1.1984,  0.6829]],

        [[ 0.9119,  1.1383],
         [ 1.1984,  0.6829],
         [ 1.1984,  0.6829]],

        [[ 1.1984,  0.6829],
         [ 1.1984,  0.6829],
         [ 0.2722, -0.1781]],

        [[-1.8119, -1.6883],
         [-1.8119, -1.6883],
         [-1.8119, -1.6883]],

        [[-1.8119, -1.6883],
         [-1.8119, -1.6883],
         [ 0.0048,  0.8908]],

        [[-1.8119, -1.6883],
         [ 0.0048,  0.8908],
         [-0.3938, -0.9874]],

        [[ 0.0048,  0.8908],
         [-0.3938, -0.9874],
         [-0.8518, -0.5463]],

        [[-0.3938, -0.9874],
         [-0.8518, -0.5463],
         [ 0.0326, -0.7204]],

        [[-0.8518, -0.5463],
         [ 0.0326, -0.7204],
         [-0.8518, -0.5463]],

        [[ 0.0326, -0

In [14]:
# sanity check: the character index stored at example 13, context position 2
X[13,2]

tensor(1)

In [16]:
# embedding found at the same position [13, 2] of the embedded tensor;
# it should equal the row of C selected by the index X[13, 2]
C[X][13,2]

tensor([ 0.2722, -0.1781])

In [13]:
# row 1 of C, i.e. the embedding of index 1, for comparison with C[X][13, 2]
C[1]

tensor([ 0.2722, -0.1781])