In [1]:
# Movie Summary Generator from transformer architecture. Using Wikipedia movie summaries from Kaggle.

# Following guidelines from ShakespeareGPT by Andrej Karpathy

# First read in the entire dataset
import re


with open('summaries.txt', 'r', encoding='utf8') as f:
    text = f.read()

#ensure that there are only Latin and special character wording. CJK characters are removed.
pattern = re.compile(r'[^\x00-\x7F0-9\[\]]+')
text = pattern.sub('', text)

#first 1000 characters in the text
print(text[:1000])

Title: Kansas Saloon Smashers 
Genre: unknown 
Description: 
A bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1]

Title: Love by the Light of the Moon 
Genre: unknown 
Description: 
The moon, painted with a smiling face hangs over a park at night. A young couple walking past a fence learn on a railing and look up. The moon smiles. They embrace, and the moon's smile gets bigger. They then sit down on a bench by a tree. The moon's view is blocked, causing him to frown. In the last scene, the man fans the woman with his hat because the moon has lef

In [2]:
# find all unique characters in the text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]_`abcdefghijklmnopqrstuvwxyz{|}~
95


In [3]:
# encode characters

enc_map = {}
dec_map = {}

for i, character in enumerate(chars):
    enc_map[character] = i
    dec_map[i] = character

def encode(s : str) -> list:
    ls = []
    for char in s:
        ls.append(enc_map[char])
    return ls

def decode(ls : list) -> str:
    char_list = []
    for i in ls: 
        char_list.append(dec_map[i]) 
    s = ''.join(char_list)
    return s

# general tokenization over instead of using OpenAI's tiktoken tokenization.

print(encode('Hello World!'))
print(decode(encode('Hello World!')))

[41, 69, 76, 76, 79, 1, 56, 79, 82, 76, 68, 2]
Hello World!


In [4]:
# Now using PyTorch store it into a PyTorch Tensor. 
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.type)

#print first 1000 tokens from tensor
print(data[:1000])

torch.Size([77429335]) <built-in method type of Tensor object at 0x0000024982B9AE30>
tensor([53, 73, 84, 76, 69, 27,  1, 44, 65, 78, 83, 65, 83,  1, 52, 65, 76, 79,
        79, 78,  1, 52, 77, 65, 83, 72, 69, 82, 83,  1,  0, 40, 69, 78, 82, 69,
        27,  1, 85, 78, 75, 78, 79, 87, 78,  1,  0, 37, 69, 83, 67, 82, 73, 80,
        84, 73, 79, 78, 27,  1,  0, 34,  1, 66, 65, 82, 84, 69, 78, 68, 69, 82,
         1, 73, 83,  1, 87, 79, 82, 75, 73, 78, 71,  1, 65, 84,  1, 65,  1, 83,
        65, 76, 79, 79, 78, 13,  1, 83, 69, 82, 86, 73, 78, 71,  1, 68, 82, 73,
        78, 75, 83,  1, 84, 79,  1, 67, 85, 83, 84, 79, 77, 69, 82, 83, 15,  1,
        34, 70, 84, 69, 82,  1, 72, 69,  1, 70, 73, 76, 76, 83,  1, 65,  1, 83,
        84, 69, 82, 69, 79, 84, 89, 80, 73, 67, 65, 76, 76, 89,  1, 42, 82, 73,
        83, 72,  1, 77, 65, 78,  8, 83,  1, 66, 85, 67, 75, 69, 84,  1, 87, 73,
        84, 72,  1, 66, 69, 69, 82, 13,  1, 36, 65, 82, 82, 73, 69,  1, 47, 65,
        84, 73, 79, 78,  1, 65, 78,

In [5]:
# spliting into training and tests/validation sets
n = int(0.9 * len(data))
train_data = data[:n]
test_data = data[n:]

In [6]:
# Maximum length of block size, or maximum length for predictions
block_size = 8
train_data[:block_size+1]

tensor([53, 73, 84, 76, 69, 27,  1, 44, 65])

In [7]:
# Now segment into batchs for Stochastic descent & GPU parallelisation

# batch size is how many independent sequences in parallel
batch_size = 4

# generates a small batch of data of inputs x and targets y
def get_batch(split : str):
    if split == 'train':
        data = train_data
    else: 
        data = test_data
    # Gets random position to grab a block of data, batch size number of random offsets
    # ix is 4 randomly generated numbers between 0 and len(data) - blocksize
    ix = torch.randint(len(data) - block_size, (batch_size,))
    
    # stack all 1D tensors into batch size by block size tensor
    x = torch.stack([data[i:i+block_size] for i in ix])
    # y is 1 ahead of x since y trains of all previous context x
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

# this is what gets fed into transformer

xb, yb = get_batch('train')
print('inputs')
print(xb.shape)
print(xb)
print('targets')
print(yb.shape)
print(yb)

for b in range(batch_size):
    for t in range(block_size): #iterate through the tensor
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f'input: {context.tolist()}, target: {target.tolist()}')

inputs
torch.Size([4, 8])
tensor([[68, 89, 83, 67, 65, 76, 67, 85],
        [65, 78, 68,  1, 76, 69, 65, 86],
        [69, 65, 86, 69, 83,  1, 84, 72],
        [69,  1, 35, 82, 73, 84, 73, 83]])
targets
torch.Size([4, 8])
tensor([[89, 83, 67, 65, 76, 67, 85, 76],
        [78, 68,  1, 76, 69, 65, 86, 69],
        [65, 86, 69, 83,  1, 84, 72, 69],
        [ 1, 35, 82, 73, 84, 73, 83, 72]])
input: [68], target: 89
input: [68, 89], target: 83
input: [68, 89, 83], target: 67
input: [68, 89, 83, 67], target: 65
input: [68, 89, 83, 67, 65], target: 76
input: [68, 89, 83, 67, 65, 76], target: 67
input: [68, 89, 83, 67, 65, 76, 67], target: 85
input: [68, 89, 83, 67, 65, 76, 67, 85], target: 76
input: [65], target: 78
input: [65, 78], target: 68
input: [65, 78, 68], target: 1
input: [65, 78, 68, 1], target: 76
input: [65, 78, 68, 1, 76], target: 69
input: [65, 78, 68, 1, 76, 69], target: 65
input: [65, 78, 68, 1, 76, 69, 65], target: 86
input: [65, 78, 68, 1, 76, 69, 65, 86], target: 69
input: 

In [8]:
# Now we must feed data through self attention. For now for context of previous terms, we will sum them and average it
# The index will take the mean and make a prediction. The nth element will have to take the mean of n-1 terms.

# Pre-cursor to self-attention mechanism that makes transformers special.

import torch.nn as nn
from torch.nn import functional as F

B, T, C = 4, 8, 2 # batch, time, channels
# Each batch has a time component (the index for info) and channels which contain the info.

tril = torch.tril(torch.ones(T, T)) 
# triangle matrix (T, T)

wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))

#softmax is normalization function which defines summing and meaning.
wei = F.softmax(wei, dim=-1)

print(wei)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


In [9]:
# Self Attention!

B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

# Instead of summing the values in the tensor, now will have a query and a key.
# Query is what I am looking for, Key is what I contain in terms of weight.
# Ex: if I am vowel, my key will be align well with query of constanants will have a high affinity.
# Affinity between tokens in tensor, dot of key and query = wei.

# Single Head of self-attention (normally chat-gpt will have mutliple heads for increased accuracy for attention)
head_size = 16
# linear transformation template of y = x(A (transpose) ) + b
key = nn.Linear(C, head_size, bias=False) # size = (B, T, 16).
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

# independently generated keys and query so they do not have any affinity yet. 

k = key(x)
q = query(x)
v = value(x)

#here is the dot product to see which of the values generate affinity

wei = q @ k.transpose(-2, -1) # due to batch dimension. (B, T, 16) @ (B, 16, T) => (B, T, T)

tril = torch.tril(torch.ones(T, T)) 
# triangle matrix (T, T)

wei = wei.masked_fill(tril == 0, float('-inf'))

#softmax is normalization function which defines summing and meaning.
wei = F.softmax(wei, dim=-1)

# v are the elements we aggregate, not raw x. X is sort of private to this token.
out = wei @ v