In [2]:
!ls data/

 Family_Guy_Final_NRC_AFINN_BING.csv   GPT_lex11.csv   Ratings_FG5.csv


In [3]:
import csv

file_path = "data/Family_Guy_Final_NRC_AFINN_BING.csv"

# Print headings
with open(file_path, "r") as f:
    csvf = csv.reader(f, delimiter=",", quotechar="'")
    headings = next(csvf)  # Read the first row (headings)
    print(", ".join(headings))

Season, Episode, Time_Stamp, Dialogue, NRC_Sentiment, AFINN_Sentiment, AFINN_Sentiment_Score, BING_Sentiment


In [4]:
corpus = []

# Read dialogue and create the corpus
with open(file_path, "r") as f:
    csvf = csv.DictReader(f, delimiter=",", quotechar="'")
    for row in csvf:
        dialogue = row["Dialogue"].strip('"')  # Remove double quotes
        corpus.append(dialogue)

In [5]:
# Print the first 5 entries of the corpus
print("Corpus:")
for i in range(5):
    print(corpus[i])

print(f"Corpus line num: {len(corpus)}")

corpus_str = "\n".join(corpus)

print(f"Corpus str len: {len(corpus_str)}")

print(corpus_str[:1000])

Corpus:
Mom
Greg
No
He's lying. There's no doubt about that.
Greg
Corpus line num: 154844
Corpus str len: 3623337
Mom
Greg
No
He's lying. There's no doubt about that.
Greg
That'll give you time to think about what you did.
Man!
That'll teach him.
Jan
Smoking. How does a boy like that go so wrong?
They live in a crummy neighborhood.
The Bradys?
Yeah. They got robbers
You folks want some pancakes?
No
Mom
Meg
You know
Excellent! The mind-control device is nearing completion!
Stewie
Damn you
You've impeded my work since the day I escaped from your wretched womb.
Don't pout
But
No toys
Very well
Mark my words
Mom
Don't touch the thermostat
Come on. This thing goes up to 90.
Who touched the thermostat?
God
Brain implant
Tells you when the kids mess with the dial.
My thing went off! Your thermostat okay?
Yeah
Is my kid over here?
Forget it! False alarm!
Ass ahoy.
Peter
He's going to a stag party.
Lois
I am the man of the house.
As the man
Look
Come on. You're worrying about nothing.
Remember 

In [6]:
# Unique chars in the corpus

chars = sorted(list(set(corpus_str)))
vocab_size = len(chars)
print("".join(chars))
print(f"Vocab size: {vocab_size}")

# TODO: would need to remove chars from corpus too
# chars = chars[:91] # remove special chars
# vocab_size = len(chars)
# print(''.join(chars))
# print(f"Vocab size: {vocab_size}")


 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]_`abcdefghijklmnopqrstuvwxyz{}¤ª³¶¿ÁÂÃÇÉÊÓàáâãçèéêíñóôúû‰™
Vocab size: 122


In [7]:
# Create mapping from chars to ints

stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}

encode = lambda s: [stoi[c] for c in s]
decode = lambda l: "".join([itos[i] for i in l])

print(encode("Hi there!"))
print(decode(encode("Hi there!")))

[41, 73, 1, 84, 72, 69, 82, 69, 2]
Hi there!


In [8]:
import torch

data = torch.tensor(encode(corpus_str), dtype=torch.long)

In [9]:
print(data.shape)
print(data.dtype)
print(data[:100])

torch.Size([3623337])
torch.int64
tensor([46, 79, 77,  0, 40, 82, 69, 71,  0, 47, 79,  0, 41, 69,  8, 83,  1, 76,
        89, 73, 78, 71, 15,  1, 53, 72, 69, 82, 69,  8, 83,  1, 78, 79,  1, 68,
        79, 85, 66, 84,  1, 65, 66, 79, 85, 84,  1, 84, 72, 65, 84, 15,  0, 40,
        82, 69, 71,  0, 53, 72, 65, 84,  8, 76, 76,  1, 71, 73, 86, 69,  1, 89,
        79, 85,  1, 84, 73, 77, 69,  1, 84, 79,  1, 84, 72, 73, 78, 75,  1, 65,
        66, 79, 85, 84,  1, 87, 72, 65, 84,  1])


In [10]:
# Split dataset into train and validation
n = int(0.9 * len(corpus_str))

train_data, val_data = data[:n], data[n:]

In [11]:
block_size = 8
train_data[: block_size + 1]

tensor([46, 79, 77,  0, 40, 82, 69, 71,  0])

In [12]:
x = train_data[:block_size]
y = train_data[1 : block_size + 1]

for t in range(block_size):
    context = x[: t + 1]
    target = y[t]
    print(f"when input is {context} the target is {target}")

when input is tensor([46]) the target is 79
when input is tensor([46, 79]) the target is 77
when input is tensor([46, 79, 77]) the target is 0
when input is tensor([46, 79, 77,  0]) the target is 40
when input is tensor([46, 79, 77,  0, 40]) the target is 82
when input is tensor([46, 79, 77,  0, 40, 82]) the target is 69
when input is tensor([46, 79, 77,  0, 40, 82, 69]) the target is 71
when input is tensor([46, 79, 77,  0, 40, 82, 69, 71]) the target is 0


In [13]:
torch.manual_seed(1337)

batch_size = 4  # how many independent sequences will we process in parallel?
block_size = 8  # what is the max context length for predictions?


def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == "train" else val_data

    ix = torch.randint(len(data) - block_size, (batch_size,))

    x = torch.stack([data[i : i + block_size] for i in ix])
    y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])

    return x, y


(
    xb,
    yb,
) = get_batch("train")

print("inputs:")
print(xb.shape)
print(xb)
print("targets:")
print(yb.shape)
print(yb)

print("\n-----\n")

for b in range(batch_size):  # batch dim
    for t in range(block_size):  # time dim
        context = xb[b, : t + 1]
        target = yb[b, 1]
        print(f"when input is {context.tolist()} the target is {target}")

inputs:
torch.Size([4, 8])
tensor([[68,  1, 69, 65, 71, 76, 69, 83],
        [ 1, 70, 65, 82, 84,  1, 83, 79],
        [ 1, 84, 72, 69,  1, 52, 65, 66],
        [83,  1, 82, 73, 68, 73, 67, 85]])
targets:
torch.Size([4, 8])
tensor([[ 1, 69, 65, 71, 76, 69, 83,  1],
        [70, 65, 82, 84,  1, 83, 79, 85],
        [84, 72, 69,  1, 52, 65, 66, 66],
        [ 1, 82, 73, 68, 73, 67, 85, 76]])

-----

when input is [68] the target is 69
when input is [68, 1] the target is 69
when input is [68, 1, 69] the target is 69
when input is [68, 1, 69, 65] the target is 69
when input is [68, 1, 69, 65, 71] the target is 69
when input is [68, 1, 69, 65, 71, 76] the target is 69
when input is [68, 1, 69, 65, 71, 76, 69] the target is 69
when input is [68, 1, 69, 65, 71, 76, 69, 83] the target is 69
when input is [1] the target is 65
when input is [1, 70] the target is 65
when input is [1, 70, 65] the target is 65
when input is [1, 70, 65, 82] the target is 65
when input is [1, 70, 65, 82, 84] the targ

In [14]:
import torch.nn as nn
from torch.nn import functional as F


class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        # each token reads the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        # idx and targets are both (B, T) tensors of ints
        logits = self.token_embedding_table(idx)  # (B, T, C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)  # (B, C, T)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context

        for _ in range(max_new_tokens):
            # get predictions
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :]  # ==> (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)

        return idx

In [15]:
import math

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

print("Expected loss: " + str(-math.log(1 / vocab_size)))

print("\n-----\n")

idx = torch.zeros((1, 1), dtype=torch.long)

generated = decode(m.generate(idx , max_new_tokens=100)[0].tolist())

print(generated)

torch.Size([32, 122])
tensor(5.2793, grad_fn=<NllLossBackward0>)
Expected loss: 4.804021044733257

-----


Qiíñs\‰"=eÉ`á*jè:]ñ[¤[,Bàuªed?ª/IAX.q[ãhô4Sk<`TAI5ÁnéKkÊYvu>1XSÁXStú‰}4<Ç=zi1tÇW+SÂHÃH}}Yñw/j#>I>w


In [16]:
# Train model
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [17]:
batch_size = 32
for steps in range(10000):
    # sample a batch of data
    xb, yb = get_batch('train')

    # eval the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

2.4317941665649414


In [18]:
generated = decode(m.generate(idx , max_new_tokens=500)[0].tolist())
print(generated)


Wewourindsous p g.
Herg G0d
Kou>`?
Nondoem y culd In
Brite I't he datr?
Nod.
The I'seand Cofreanout'8!
Wefinghie walls-a ind d
YessLItcheme m t ifliougemas t'tlly.
Yofeane aupryoighame bob5 t slindy?
Thofo akmpithe face.
Syo Coin.
Whrtemerethou wht.
So
Sn.
He hualZÇmonsc s s yoely.
MD?
Youpin t I m?
Yon fam Kr
Na &CHe í:³8We f oupp ts peein t Youd Fis shed'sin f tiup-ainds sallprindern. thowat.
Buclal brdokiksininke Al
DoWey at h au tou tu Cowianoofithe's. trspthoathan
Buryod f!
Pe.
I Q*dow p!
W


---

# The Mathematical Trick in Self-Attention

In [19]:
import torch

torch.manual_seed(1337)

B, T, C = 4, 8, 2 # batch, time, channels
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [20]:
"""
We want:

    x[b, t] = mean_{i<=t} x[b, i]
"""
xbow = torch.zeros((B, T, C))  # bow ==> bag of words
for b in range(B):
    for t in range(T):
        xprev = x[b, : t + 1]  # (t, C)
        xbow[b, t] = torch.mean(xprev, 0)


In [21]:
x[0]

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

In [22]:
xbow[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [23]:
"""
Matrix multiplcation trick to get sequential averages
"""

# torch.manual_seed(42)

# a = torch.ones(3, 3)
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b

print(f"a =\n{a}")
print("-----")
print(f"b =\n{b}")
print("-----")
print(f"c =\n{c}")

a =
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
-----
b =
tensor([[8., 6.],
        [5., 2.],
        [4., 4.]])
-----
c =
tensor([[8.0000, 6.0000],
        [6.5000, 4.0000],
        [5.6667, 4.0000]])


In [24]:
weights = torch.tril(torch.ones(T, T))
weights = weights / weights.sum(1, keepdim=True)

xbow2 = weights @ x # (B, T, T) @ (B, T, C) ---> (B, T, C)

torch.allclose(xbow, xbow2)

# xbow[0], xbow2[0]

False

In [25]:
"""
Using softmax
"""

tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x

torch.allclose(xbow, xbow3)

False

In [28]:
"""
Using self-attention!!
"""

torch.manual_seed(1337)

B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

# single head for self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x) # (B, T, 16)
q = query(x) # (B, T, 16)
wei = q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) --> (B, T, T)

tril = torch.tril(torch.ones(T, T))
# wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

# out = wei @ x
v = value(x)
out = wei @ v

out.shape

torch.Size([4, 8, 16])

In [27]:
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)