In [1]:
import torch
import torch.nn.functional as F
import importlib
mm = importlib.import_module("makemore-1")

In [2]:
# Show the vocabulary string exposed by the helper module: '.' followed by a-z (27 symbols).
mm.CHARS

'.abcdefghijklmnopqrstuvwxyz'

In [3]:
# Load the training words through the makemore-1 helper module (imported as mm).
# NOTE(review): presumably one lowercase name per entry, with no '.' inside a name — confirm against load_words_from_file.
words = mm.load_words_from_file('names.txt')

# What are we trying to do?

Add more context to the training data. We don't want the input-output pairs to be just $(c_k, c_{k+1})$, but rather $((c_{k-L+1}, \ldots, c_k), c_{k+1})$ for context length $L$.

In [4]:
# Build the (context, next-character) training pairs.
#
# For every word we slide a window of `context_length` character indices over
# the word followed by the terminator '.'. The window starts out filled with
# the boundary index, so for context_length = 3 the word "abc" yields
#   (., ., .) -> a,  (., ., a) -> b,  (., a, b) -> c,  (a, b, c) -> .
start_idx = mm.CHAR_INDICES['.']
context_length = 3

xs = []  # one context (list of `context_length` char indices) per sample
ys = []  # the index of the character that follows each context

for word in words:
  # every word starts from an all-boundary context, e.g. [0, 0, 0]
  context_array = [start_idx] * context_length

  # loop invariant:
  #   - len(xs) == len(ys); xs[i] is the context seen just before ys[i]
  #   - context_array holds the last `context_length` indices consumed so far
  #     (left-padded with start_idx), i.e. the input for the next character
  for ch in f'{word}.':
    ch_idx = mm.CHAR_INDICES[ch]
    # emit input and target together so the two lists can never get out of
    # step, even if a word unexpectedly contains a '.'
    xs.append(context_array.copy())
    ys.append(ch_idx)
    # slide the window: drop the oldest index, append the newest
    context_array = (context_array + [ch_idx])[1:]

  

In [5]:
# Turn the Python lists into tensors: one row of X per context, one entry of Y per target.
X, Y = torch.tensor(xs), torch.tensor(ys)
# number of samples
M = X.size(0)
# peek at the last 49 contexts
X[-49:]

tensor([[ 0,  0, 26],
        [ 0, 26, 25],
        [26, 25,  5],
        [25,  5, 12],
        [ 5, 12, 12],
        [ 0,  0,  0],
        [ 0,  0, 26],
        [ 0, 26, 25],
        [26, 25,  8],
        [25,  8,  5],
        [ 8,  5,  5],
        [ 5,  5, 13],
        [ 0,  0,  0],
        [ 0,  0, 26],
        [ 0, 26, 25],
        [26, 25, 11],
        [25, 11,  5],
        [11,  5,  5],
        [ 5,  5, 13],
        [ 0,  0,  0],
        [ 0,  0, 26],
        [ 0, 26, 25],
        [26, 25, 12],
        [25, 12,  1],
        [12,  1, 19],
        [ 0,  0,  0],
        [ 0,  0, 26],
        [ 0, 26, 25],
        [26, 25, 18],
        [25, 18,  1],
        [18,  1, 14],
        [ 0,  0,  0],
        [ 0,  0, 26],
        [ 0, 26, 25],
        [26, 25, 18],
        [25, 18,  9],
        [18,  9,  5],
        [ 0,  0,  0],
        [ 0,  0, 26],
        [ 0, 26, 25],
        [26, 25, 18],
        [25, 18, 15],
        [18, 15, 14],
        [ 0,  0,  0],
        [ 0,  0, 26],
        [ 

In [7]:
# Sanity check: X and Y must have the same number of rows, and both hold integer indices.
print(f"{X.shape} {X.dtype} {Y.shape} {Y.dtype}")

torch.Size([228146, 3]) torch.int64 torch.Size([228146]) torch.int64


In [None]:
# Last 49 targets, i.e. the labels that belong to the last 49 rows of X.
Y[-49:]

In [None]:
# Number of target labels (one per training sample).
len(Y)

# Using a character embedding

In [8]:
# Embedding table: each of the 27 characters gets its own d-dimensional vector (d = 2 here),
# initialised from a standard normal distribution. Row i is the embedding of character index i.
C = torch.randn(27, 2)
C

tensor([[-0.4241,  0.5769],
        [-0.6277,  0.0143],
        [-0.9878,  0.2228],
        [-1.4256,  0.7473],
        [-0.5949, -0.1887],
        [-0.1495, -0.4329],
        [-0.4959,  0.4085],
        [ 0.2081, -0.9456],
        [ 0.8445,  0.3077],
        [ 0.0774, -0.2167],
        [ 0.0821, -0.3223],
        [-1.1067, -0.9847],
        [ 0.2355,  1.7855],
        [-2.0072,  0.5952],
        [-0.6068,  0.0860],
        [ 0.0625,  0.1150],
        [-1.1692,  1.2160],
        [-0.5986,  1.8403],
        [-0.9995,  1.4243],
        [-0.0282, -0.8051],
        [-1.7092,  0.3798],
        [ 1.0339, -0.4120],
        [ 1.4762, -1.8294],
        [-0.0424,  1.0475],
        [-0.3213,  1.2027],
        [-0.3261, -0.3309],
        [-0.7989, -1.4962]])

X_proc = F.one_hot(X, num_classes=27)
print(X_proc.shape)
X_proc

In [None]:
# One-hot encode a single context (row 8 of X): every character index becomes a 27-wide indicator row.
x_test = F.one_hot(X[8], num_classes=27)
print(X[8])
print(x_test.size())
x_test

In [None]:
# One-hot encode every context at once; a trailing axis of width 27 is appended to X's shape.
X_proc = F.one_hot(X, num_classes=27)
print(X_proc.size())
# show only the first five samples
X_proc[0:5]

In [9]:
# Indexing the embedding table with the integer tensor X replaces every character index
# by its row of C, so the result has X's shape plus a trailing embedding axis of width 2.
C[X].shape

torch.Size([228146, 3, 2])

In [19]:
# Small random 3-D tensor used below to see what torch.unbind does along different axes.
A = torch.randn(3, 5, 4)
print(A.size())
A

torch.Size([3, 5, 4])


tensor([[[-0.7712,  0.1459, -2.5276, -0.4439],
         [ 0.1954,  0.9009,  0.0099,  1.2254],
         [ 0.1733,  1.3780,  0.1114,  0.5206],
         [ 1.5242,  1.9523,  2.2758, -0.1949],
         [-1.0266, -0.1732, -0.5256,  0.1714]],

        [[ 1.0067,  1.0809,  0.2571,  0.4201],
         [ 0.1419,  1.3223,  1.1053, -0.5849],
         [-1.1458,  0.5297, -1.2569, -0.1491],
         [-1.4250, -1.1838,  0.6424, -1.1906],
         [-1.2454, -1.2507, -0.6623, -0.7152]],

        [[-0.4293,  0.7219,  1.6887, -0.6467],
         [ 0.9161, -1.2045,  0.8350,  1.8599],
         [-1.4955, -0.1086, -0.9488,  0.5964],
         [ 1.9429, -0.1013,  1.1199,  0.3472],
         [ 0.0312,  0.3962, -0.4596,  0.8121]]])

In [20]:
# Unbinding along dim 0 removes that axis and returns a tuple of its slices:
# for A of shape (3, 5, 4) that is 3 tensors of shape (5, 4).
U1 = A.unbind(dim=0)
U1

(tensor([[-0.7712,  0.1459, -2.5276, -0.4439],
         [ 0.1954,  0.9009,  0.0099,  1.2254],
         [ 0.1733,  1.3780,  0.1114,  0.5206],
         [ 1.5242,  1.9523,  2.2758, -0.1949],
         [-1.0266, -0.1732, -0.5256,  0.1714]]),
 tensor([[ 1.0067,  1.0809,  0.2571,  0.4201],
         [ 0.1419,  1.3223,  1.1053, -0.5849],
         [-1.1458,  0.5297, -1.2569, -0.1491],
         [-1.4250, -1.1838,  0.6424, -1.1906],
         [-1.2454, -1.2507, -0.6623, -0.7152]]),
 tensor([[-0.4293,  0.7219,  1.6887, -0.6467],
         [ 0.9161, -1.2045,  0.8350,  1.8599],
         [-1.4955, -0.1086, -0.9488,  0.5964],
         [ 1.9429, -0.1013,  1.1199,  0.3472],
         [ 0.0312,  0.3962, -0.4596,  0.8121]]))

In [18]:
# Unbinding along dim 1: for A of shape (3, 5, 4) this returns 5 tensors of shape (3, 4).
# NOTE(review): the saved output of this cell shows five 1-D tensors of length 3 and its
# execution count precedes the cell that defines the 3-D A, so the output is stale —
# re-run the notebook top to bottom to refresh it.
U2 = A.unbind(dim=1)
U2

(tensor([0.9490, 0.6837, 0.3125]),
 tensor([-0.6820,  0.5063, -0.0372]),
 tensor([-1.5017, -0.0613, -0.2872]),
 tensor([-0.2989, -0.3521,  0.2945]),
 tensor([ 0.6310, -0.1917, -0.0373]))