In [8]:
import csv

text = []
with open('IMDB Dataset.csv', mode ='r') as file:
    csv = csv.reader(file)
    for line in csv:
        text.append(line[0])

text = text[1:]

In [9]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

In [10]:
max_length = 50

tokens = tokenizer(
    text[:100],
    padding="max_length",
    max_length=max_length,
    truncation=True,
    return_tensors="pt",
)
input_ids = tokens["input_ids"].flatten()

In [11]:
from sklearn.preprocessing import LabelEncoder
import torch

label_encoder = LabelEncoder()
input_ids = torch.LongTensor(label_encoder.fit_transform(input_ids))

vocab_size = len(label_encoder.classes_)

In [12]:
from sklearn.model_selection import train_test_split

train_tokens, val_tokens = train_test_split(input_ids, train_size=0.8)

In [13]:
batch_size = 4
block_size = 8

def get_batch(split):
    data = train_tokens if split == 'train' else val_tokens
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

In [14]:
xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

inputs:
torch.Size([4, 8])
tensor([[ 447,  171,   75,   66,  651,  108,   73,  626],
        [ 204,   50,  172,   44,  490,  920,   46, 1193],
        [1191, 1249, 1064, 1252,   45,   14,  708,  857],
        [1053,  132,  440, 1031,   48,   85,   42,  388]])
targets:
torch.Size([4, 8])
tensor([[ 171,   75,   66,  651,  108,   73,  626,   73],
        [  50,  172,   44,  490,  920,   46, 1193,   56],
        [1249, 1064, 1252,   45,   14,  708,  857,   12],
        [ 132,  440, 1031,   48,   85,   42,  388,   47]])
----


In [15]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(42)

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):

        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (B,T,C)

        if targets is None:
            loss = None
        else:
            print(logits.shape)
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [28]:
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

torch.Size([4, 8, 1485])
torch.Size([32, 1485])
tensor(7.8866, grad_fn=<NllLossBackward0>)


In [29]:
def generate_text():
    generated_sequence = m.generate(idx=torch.ones((1, 1), dtype=torch.long), max_new_tokens=10).flatten()
    generated_tokens = label_encoder.inverse_transform(generated_sequence)

    generated_text = tokenizer.decode(generated_tokens)
    print(generated_text)

generate_text()

[CLS] gets fake aroon coursebb visually james paintered


In [30]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [34]:
batch_size = 32
for steps in range(1000): # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([32, 8, 1485])
torch.Size([

In [35]:
generate_text()

[CLS] renaissance hell studios men fingers somewhat industry amongro decent


In [39]:
torch.manual_seed(42)
B, T, C = 4, 8, 2

x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])