Training a decoder only transformer for generating Elon Musk tweets

In [482]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Data Pre-processing

Loading Elon Tweets (test dataset), cleaning up, and tokenizing with dict representing the vocab, also np arr will be saved with all the tweet combination lengths 

In [483]:
df = pd.read_csv('TweetsElonMusk.csv')
df = df['tweet']

# number of df rows
n = len(df)

# take all rows and concat them into one string
text = ''
for i in range(n):
    text += df[i]

# find unique chars in text
set_of_unique_chars = set(text)

# create a dictionary of unique chars, mapping each char to an int
char_to_int = {}

for i, char in enumerate(set_of_unique_chars):
    char_to_int[char] = i

# add 395 as <sos> token, 396 as <eos> token, 397 as <pad> token
char_to_int['<sos>'] = 395
char_to_int['<eos>'] = 396
char_to_int['<pad>'] = 397

# find row with longest string
max_len = 0
for i in range(n):
    if len(df[i]) > max_len:
        max_len = len(df[i])

elon_data = np.zeros((n, max_len + 2))

print(elon_data.shape)

# 395 is the sos token and 396 is the eos token, 397 is the padding token
for i in range(n):
    elon_data[i][0] = 395
    for j in range(len(df[i])):
        elon_data[i][j + 1] = char_to_int[df[i][j]]
    elon_data[i][len(df[i]) + 1] = 396
    elon_data[i][len(df[i]) + 2:] = 397

# temp for test #
elon_data = elon_data[:1]

###

# save to np file
np.save('elon_data.npy', elon_data)

(12562, 426)


functions for converting tokens into chars and chars into tokens

In [484]:
def tokens_to_chars(tokens):
    chars = []
    for token in tokens:
        for char in char_to_int:
            if char_to_int[char] == token:
                chars.append(char)
    return chars

def chars_to_tokens(chars):
    tokens = []
    for char in chars:
        tokens.append(char_to_int[char])
    return tokens

In [485]:
print(len(char_to_int))

397


architecture parameters

In [486]:
layers = 3
epochs = 1000
batch_size = 1
embedding_dim = 16
dict_size = len(char_to_int) + 1
attention_dim = 4
feed_forward_dim = embedding_dim
num_heads = 2 

## Decoder Architecture 

In [487]:
class Decoder(nn.Module):
    def __init__(self,layers, epochs, batch_size, embedding_dim, dict_size, feed_forward_dim, num_heads, attention_dim):
        super(Decoder, self).__init__()
        self.layers = layers
        self.epochs = epochs
        self.batch_size = batch_size
        self.embedding_dim = embedding_dim
        self.dict_size = dict_size
        self.feed_forward_dim = feed_forward_dim
        self.num_heads = num_heads
        self.attention_dim = attention_dim

        self.embedding = nn.Embedding(num_embeddings=dict_size, embedding_dim=embedding_dim)
        
        self.decoder_layer_1 = DecoderLayer(self.embedding_dim, self.attention_dim, self.feed_forward_dim, self.num_heads)
        self.decoder_layer_2 = DecoderLayer(self.embedding_dim, self.attention_dim, self.feed_forward_dim, self.num_heads)
        self.decoder_layer_3 = DecoderLayer(self.embedding_dim, self.attention_dim, self.feed_forward_dim, self.num_heads)

        self.linear = nn.Linear(in_features=embedding_dim, out_features=dict_size)
        self.softmax = nn.Softmax(dim=1)


    def forward(self, x):
        x = self.embedding(x)
        positional_encodings = self.positional_encoding(len(x), self.embedding_dim)

        x += positional_encodings
        
        # x = self.decoder_layer_1.forward(x)
        # x = self.decoder_layer_2.forward(x)
        # x = self.decoder_layer_3.forward(x)

        x = self.linear(x)
        # x = self.softmax(x)

        return x 

    ## This was generated by chat gpt-3, so hopefully it works
    def positional_encoding(self, seq_len, embedding_dim):
        positions = np.arange(seq_len)[:, np.newaxis]
        angles = np.power(10000, -(2 * (np.arange(embedding_dim) // 2) / embedding_dim))
        angles = angles[np.newaxis, :]

        positional_encodings = positions * angles

        # Apply sine to even indices in the array; 2i
        positional_encodings[:, 0::2] = np.sin(positional_encodings[:, 0::2])

        # Apply cosine to odd indices in the array; 2i+1
        positional_encodings[:, 1::2] = np.cos(positional_encodings[:, 1::2])

        positional_encodings = torch.from_numpy(positional_encodings).float()
        return positional_encodings

class DecoderLayer(nn.Module):    
    def __init__(self, embedding_dim, attention_dim, feed_forward_dim, num_heads):
        super(DecoderLayer, self).__init__()
        self.multi_head_attention = MultiHeadAttention(embedding_dim=embedding_dim, attention_dim=attention_dim, num_heads=num_heads)
        self.layer_norm_1 = nn.LayerNorm(embedding_dim)
        self.layer_norm_2 = nn.LayerNorm(embedding_dim)
        self.feed_forward_1 = nn.Linear(in_features=embedding_dim, out_features=feed_forward_dim)
        self.feed_forward_2 = nn.Linear(in_features=feed_forward_dim, out_features=embedding_dim)

    def forward(self, x):
        attention = self.multi_head_attention.forward(x)
        x = self.layer_norm_1(x + attention)
        feed_forward = self.feed_forward_2(F.relu(self.feed_forward_1(x)))
        x = self.layer_norm_2(x + feed_forward)
        return x 

class MultiHeadAttention(nn.Module):
    def __init__(self, embedding_dim, attention_dim, num_heads):
        super(MultiHeadAttention, self).__init__()

        self.num_heads = num_heads
        self.attention_dim = attention_dim
        self.embedding_dim = embedding_dim

        # linear transform from embedding -> attention_dim 
        # later divide into attention heads 
        self.Q = nn.Linear(in_features=embedding_dim, out_features=attention_dim * num_heads)
        self.K = nn.Linear(in_features=embedding_dim, out_features=attention_dim * num_heads)
        self.V = nn.Linear(in_features=embedding_dim, out_features=attention_dim * num_heads)

        # reduce the attention heads back to the embedding dim size 
        self.Reduction = nn.Linear(in_features=attention_dim * num_heads, out_features=embedding_dim)

    def forward(self, x):
        Q_vectors = self.Q(x)
        K_vectors = self.K(x)
        V_vectors = self.V(x)

        # this results in the shape sequence length, num_heads, attention_dim
        Q_vectors = torch.reshape(Q_vectors, (x.shape[0], self.num_heads, self.attention_dim))
        K_vectors = torch.reshape(K_vectors, (x.shape[0], self.num_heads, self.attention_dim))
        V_vectors = torch.reshape(V_vectors, (x.shape[0], self.num_heads, self.attention_dim))

        # now reshape into atten num heads, sequence length, attention dim
        Q_vectors = Q_vectors.transpose(0,1)
        K_vectors = K_vectors.transpose(0,1)
        V_vectors = V_vectors.transpose(0,1)

        # dot product of Q and K but not first dimension
        attention_scores = torch.matmul(Q_vectors, K_vectors.transpose(1,2))
        attention_scores = attention_scores / np.sqrt(self.attention_dim)

        # now the shape is attention head, sequence length, softmax values 

        # softmax over attention dim 
        attention_scores = nn.Softmax(dim=2)(attention_scores)

        # Multiply the value vectors by the attention scores {possible point of error}
        weighted_sum = torch.matmul(attention_scores, V_vectors)

        # Concatenate the heads back together
        concatenated = weighted_sum.transpose(0, 1).reshape(x.shape[0], -1, self.attention_dim * self.num_heads)

        # Linearly transform the concatenated vectors back to the embedding size
        output = self.Reduction(concatenated)

        # remove the middle 1 dimension
        output = torch.squeeze(output, dim=1)

        return output

# Training 

In [488]:
import torch
from torch.utils.data import Dataset, DataLoader

# Define a custom dataset class
class MyDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Convert the data to integer tensors
        tensor_data = torch.tensor(self.data[index], dtype=torch.int32)

        return tensor_data

# load elon data npy
elon_data = np.load('elon_data.npy')

# Convert your data to integer tensors
int_data = [[int(char) for char in example] for example in elon_data]

# Create an instance of the custom dataset
dataset = MyDataset(int_data)

# Create the dataloader using the dataset
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


In [489]:
def softmax_to_token(softmax):
    return np.argmax(softmax)

# output dist vs actual disturbution
def loss_function(predictions, targets):
    return nn.CrossEntropyLoss()(predictions, targets)

def target_seq_to_distribution(value):
    distribution = torch.zeros((dict_size))
    value = int(value) -1
    distribution[value] = 1.0
    return distribution

In [490]:
import torch.utils.data as data
from torch.utils.data import DataLoader
import torch.optim as optim

decoder = Decoder(layers=layers, epochs=epochs, batch_size=batch_size, embedding_dim=embedding_dim, dict_size=dict_size, feed_forward_dim=feed_forward_dim, num_heads=num_heads, attention_dim=attention_dim)

# Create an optimizer for the Decoder model
optimizer = optim.Adam(decoder.parameters(), lr=.001)

# Associate the optimizer with the Decoder object
decoder.optimizer = optimizer

test_tensor = torch.tensor(([[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]]), dtype=torch.int32)
target_tensor = torch.tensor(([16]), dtype=torch.int32)

loss_list = []

epochs = 10000

for epoch in range(epochs):
    for i_batch, batch in enumerate(dataloader):
        for element in batch:
            b = element 
            length = len(element)
            target_list = []
            output_list = [] 
            for i in range(length):
                if i == 0:
                    continue
                if element[i] == 397 or i == length-1:
                    break

                # get the input and target sequences
                input_seq = element[:i]
                target_seq = element[i]


                # get the output from the decoder
                output = decoder.forward(input_seq)
                output = output[-1]

                # convert the target sequence to a distribution
                target = target_seq_to_distribution(target_seq)
                target_list.append(target)
                output_list.append(output)

            # convert the target and output sequences to tensors
            target = torch.stack(target_list)
            output = torch.stack(output_list)

            # Compute the loss
            # argmax the output
            z = torch.argmax(output, dim=1)
            x = torch.argmax(target, dim=1)
            loss = loss_function(output, target)

            # Backpropagation
            loss.backward()

            # Update the weights
            decoder.optimizer.step()
            print(loss.item())

        # print("Epoch: {}, Batch: {}, Loss: {}".format(epoch, i_batch, np.mean(loss.item())))

6.3656744956970215
6.342956066131592
6.3210601806640625
6.299358367919922
6.277633190155029
6.255785942077637
6.233764171600342
6.2115373611450195
6.189087867736816
6.166404724121094
6.143482685089111
6.120319366455078
6.0969133377075195
6.073265552520752
6.049378395080566
6.0252556800842285
6.0008978843688965
5.976314067840576
5.951505184173584
5.926479339599609
5.901240348815918
5.875792980194092
5.8501458168029785
5.824301719665527
5.798268795013428
5.772054195404053
5.745663166046143
5.719102382659912
5.692379474639893
5.66549825668335
5.638466835021973
5.611293315887451
5.583981037139893
5.556540012359619
5.52897310256958
5.501288414001465
5.4734907150268555
5.445588111877441
5.417586326599121
5.389489650726318
5.3613057136535645
5.333038806915283
5.304697513580322
5.276284217834473
5.247807025909424
5.2192702293396
5.19067907333374
5.1620402336120605
5.133357524871826
5.104636192321777
5.075881004333496
5.0470967292785645
5.018288612365723
4.9894609451293945
4.960618495941162
4.9

KeyboardInterrupt: 

In [491]:

# get random sequence from the dataset

print(tokens_to_chars(b))

output = decoder.forward(b)
print(output)
z=[]

for x in output:
    # find the index of the largest output element
    x = softmax_to_token(x.detach().numpy())
    z.append(x+1)

print(tokens_to_chars(z))

['<sos>', '@', 'v', 'i', 'n', 'c', 'e', 'n', 't', '1', '3', '0', '3', '1', '9', '2', '5', ' ', 'F', 'o', 'r', ' ', 'n', 'o', 'w', '.', ' ', 'C', 'o', 's', 't', 's', ' ', 'a', 'r', 'e', ' ', 'd', 'e', 'c', 'r', 'e', 'a', 's', 'i', 'n', 'g', ' ', 'r', 'a', 'p', 'i', 'd', 'l', 'y', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pa

In [None]:

for i in range(length):
    if i == 0:
        continue
    if element[i] == 397 or i == length-1:
        break

    # get the input and target sequences
    input_seq = element[:i]
    target_seq = element[i]

    # get the output from the decoder
    output = decoder.forward(input_seq)
    output = output[-1]

    # convert the target sequence to a distribution
    target = target_seq_to_distribution(target_seq)

    print(target)

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 