In [1]:
# Movie Summary Generator from transformer architecture. Using Wikipedia movie summaries from Kaggle.

# Following guidelines from ShakespeareGPT by Andrej Karpathy

# First read in the entire dataset
import re


with open('summaries.txt', 'r', encoding='utf8') as f:
    text = f.read()

#ensure that there are only Latin and special character wording. CJK characters are removed.
pattern = re.compile(r'[^\x00-\x7F0-9\[\]]+')
text = pattern.sub('', text)

#first 1000 characters in the text
print(text[:1000])

Title: Kansas Saloon Smashers 
Genre: unknown 
Description: 
A bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1]

Title: Love by the Light of the Moon 
Genre: unknown 
Description: 
The moon, painted with a smiling face hangs over a park at night. A young couple walking past a fence learn on a railing and look up. The moon smiles. They embrace, and the moon's smile gets bigger. They then sit down on a bench by a tree. The moon's view is blocked, causing him to frown. In the last scene, the man fans the woman with his hat because the moon has lef

In [2]:
# find all unique characters in the text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]_`abcdefghijklmnopqrstuvwxyz{|}~
95


In [3]:
# encode characters

enc_map = {}
dec_map = {}

for i, character in enumerate(chars):
    enc_map[character] = i
    dec_map[i] = character

def encode(s : str) -> list:
    ls = []
    for char in s:
        ls.append(enc_map[char])
    return ls

def decode(ls : list) -> str:
    char_list = []
    for i in ls: 
        char_list.append(dec_map[i]) 
    s = ''.join(char_list)
    return s

# general tokenization over instead of using OpenAI's tiktoken tokenization.

print(encode('Hello World!'))
print(decode(encode('Hello World!')))

[41, 69, 76, 76, 79, 1, 56, 79, 82, 76, 68, 2]
Hello World!


In [4]:
# Now using PyTorch store it into a PyTorch Tensor. 
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.type)

#print first 1000 tokens from tensor
print(data[:1000])

torch.Size([77429335]) <built-in method type of Tensor object at 0x00000282FFBB9BC0>
tensor([53, 73, 84, 76, 69, 27,  1, 44, 65, 78, 83, 65, 83,  1, 52, 65, 76, 79,
        79, 78,  1, 52, 77, 65, 83, 72, 69, 82, 83,  1,  0, 40, 69, 78, 82, 69,
        27,  1, 85, 78, 75, 78, 79, 87, 78,  1,  0, 37, 69, 83, 67, 82, 73, 80,
        84, 73, 79, 78, 27,  1,  0, 34,  1, 66, 65, 82, 84, 69, 78, 68, 69, 82,
         1, 73, 83,  1, 87, 79, 82, 75, 73, 78, 71,  1, 65, 84,  1, 65,  1, 83,
        65, 76, 79, 79, 78, 13,  1, 83, 69, 82, 86, 73, 78, 71,  1, 68, 82, 73,
        78, 75, 83,  1, 84, 79,  1, 67, 85, 83, 84, 79, 77, 69, 82, 83, 15,  1,
        34, 70, 84, 69, 82,  1, 72, 69,  1, 70, 73, 76, 76, 83,  1, 65,  1, 83,
        84, 69, 82, 69, 79, 84, 89, 80, 73, 67, 65, 76, 76, 89,  1, 42, 82, 73,
        83, 72,  1, 77, 65, 78,  8, 83,  1, 66, 85, 67, 75, 69, 84,  1, 87, 73,
        84, 72,  1, 66, 69, 69, 82, 13,  1, 36, 65, 82, 82, 73, 69,  1, 47, 65,
        84, 73, 79, 78,  1, 65, 78,

In [5]:
# spliting into training and tests/validation sets
n = int(0.9 * len(data))
train_data = data[:n]
test_data = data[n:]

In [6]:
# Maximum length of block size, or maximum length for predictions
block_size = 8
train_data[:block_size+1]

tensor([53, 73, 84, 76, 69, 27,  1, 44, 65])

In [7]:
# Now segment into batchs for Stochastic descent & GPU parallelisation

# batch size is how many independent sequences in parallel
batch_size = 4

# generates a small batch of data of inputs x and targets y
def get_batch(split : str):
    if split == 'train':
        data = train_data
    else: 
        data = val_data
    # Gets random position to grab a block of data, batch size number of random offsets
    # ix is 4 randomly generated numbers between 0 and len(data) - blocksize
    ix = torch.randint(len(data) - block_size, (batch_size,))
    
    # stack all 1D tensors into batch size by block size tensor
    x = torch.stack([data[i:i+block_size] for i in ix])
    # y is 1 ahead of x since y trains of all previous context x
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

# this is what gets fed into transformer

xb, yb = get_batch('train')
print('inputs')
print(xb.shape)
print(xb)
print('targets')
print(yb.shape)
print(yb)

for b in range(batch_size):
    for t in range(block_size): #iterate through the tensor
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f'input: {context.tolist()}, target: {target.tolist()}')

inputs
torch.Size([4, 8])
tensor([[ 1, 83, 72, 69,  1, 72, 65, 68],
        [ 1, 78, 73, 71, 72, 84, 15,  1],
        [ 1, 73, 78,  1, 41, 65, 68, 65],
        [70, 70, 73, 67, 85, 76, 84,  1]])
targets
torch.Size([4, 8])
tensor([[83, 72, 69,  1, 72, 65, 68,  1],
        [78, 73, 71, 72, 84, 15,  1, 45],
        [73, 78,  1, 41, 65, 68, 65, 77],
        [70, 73, 67, 85, 76, 84,  1, 84]])
input: [1], target: 83
input: [1, 83], target: 72
input: [1, 83, 72], target: 69
input: [1, 83, 72, 69], target: 1
input: [1, 83, 72, 69, 1], target: 72
input: [1, 83, 72, 69, 1, 72], target: 65
input: [1, 83, 72, 69, 1, 72, 65], target: 68
input: [1, 83, 72, 69, 1, 72, 65, 68], target: 1
input: [1], target: 78
input: [1, 78], target: 73
input: [1, 78, 73], target: 71
input: [1, 78, 73, 71], target: 72
input: [1, 78, 73, 71, 72], target: 84
input: [1, 78, 73, 71, 72, 84], target: 15
input: [1, 78, 73, 71, 72, 84, 15], target: 1
input: [1, 78, 73, 71, 72, 84, 15, 1], target: 45
input: [1], target: 73
in