# ChatGPT making jokes - from scratch

In [24]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

# Machine Learning imports
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

### Load and explore data

In [25]:
text_df = pd.read_csv("shortjokes.csv")
text_df.iloc[random.randint(0, len(text_df))]

ID                                                  65921
Joke    She asked me for an example of a double entend...
Name: 65920, dtype: object

In [26]:
chars = sorted(list(set(text_df['Joke'].str.cat(sep=''))))
vocab_size = len(chars)
print("All the unique characters:", ''.join(chars))
print(len(chars), "unique characters in total")

All the unique characters: !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
97 unique characters in total


### Tokenizer

Neural Networks does not work with letters, so therefore, we need to turn our characters into a numerical representation. The character tokenizer below is the simplest encoding. ChatGPT, for example, uses a Byte Pair Encoder which starts with individual characters and iteratively merges the most frequently occurring pairs of tokens to create a vocabulary that balances between character-level granularity and word-level efficiency

In [27]:
# create mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string
print(encode("hello world"))
print(decode(encode("hello world")))

[74, 71, 78, 78, 81, 2, 89, 81, 84, 78, 70]
hello world


In [28]:
all_text = ' '.join(text_df['Joke'].astype(str))
data = tf.constant(encode(all_text), dtype=tf.int32)
print(data.shape, data.dtype)
print(data[:50])

(21787187,) <dtype: 'int32'>
tf.Tensor(
[61 79 71  2 80 67 84 84 67 86 75 80 73  2 67  2 70 81 69 87 79 71 80 86
 67 84 91  2 67 68 81 87 86  2 80 67 84 84 67 86 81 84 85 63  2  4 43  2
 69 67], shape=(50,), dtype=int32)


In [29]:
print("Encoded characters:")
print(data[:10].numpy().tolist())
print("Decoded characters:")
print(decode(data[:10].numpy().tolist()))

Encoded characters:
[61, 79, 71, 2, 80, 67, 84, 84, 67, 86]
Decoded characters:
[me narrat


### Split into train and validation sets

In [30]:
n = int(0.9*len(data)) # first 90% will be train, rest validation
train_data = data[:n]
val_data = data[n:]

In this notebook we are continously trying to predict the next character to form meaningful sentences, this process is visualized below

In [31]:
block_size = 8
X = train_data[:block_size]
y = train_data[1:block_size+1]
for i in range(block_size):
    context = X[:i+1]
    target = y[i]
    print(f"when input is {context.numpy().tolist()} the target: {target.numpy().tolist()} ({itos[target.numpy()]})")

when input is [61] the target: 79 (m)
when input is [61, 79] the target: 71 (e)
when input is [61, 79, 71] the target: 2 ( )
when input is [61, 79, 71, 2] the target: 80 (n)
when input is [61, 79, 71, 2, 80] the target: 67 (a)
when input is [61, 79, 71, 2, 80, 67] the target: 84 (r)
when input is [61, 79, 71, 2, 80, 67, 84] the target: 84 (r)
when input is [61, 79, 71, 2, 80, 67, 84, 84] the target: 67 (a)


In [32]:
tf.random.set_seed(42)
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = tf.random.uniform((batch_size,), 0, len(data) - block_size - 1, dtype=tf.int32)
    x = tf.stack([data[i:i+block_size] for i in ix])
    y = tf.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print("inputs:")
print(xb.shape)
print(xb)
print("targets:")
print(yb.shape)
print(yb)
print("----")

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b,:t+1]
        target = yb[b,t]
        print(f"when input is {context.numpy().tolist()} the target: {target.numpy().tolist()} ({itos[target.numpy()]})")
    print("----")

inputs:
(4, 8)
tf.Tensor(
[[71  2 86 74 81 85 71  2]
 [53 77 75 80  2 53 86 71]
 [ 2 67 80 70  2 86 75 79]
 [78 91  2 68 81 87 73 74]], shape=(4, 8), dtype=int32)
targets:
(4, 8)
tf.Tensor(
[[ 2 86 74 81 85 71  2 19]
 [77 75 80  2 53 86 71 82]
 [67 80 70  2 86 75 79 71]
 [91  2 68 81 87 73 74 86]], shape=(4, 8), dtype=int32)
----
when input is [71] the target: 2 ( )
when input is [71, 2] the target: 86 (t)
when input is [71, 2, 86] the target: 74 (h)
when input is [71, 2, 86, 74] the target: 81 (o)
when input is [71, 2, 86, 74, 81] the target: 85 (s)
when input is [71, 2, 86, 74, 81, 85] the target: 71 (e)
when input is [71, 2, 86, 74, 81, 85, 71] the target: 2 ( )
when input is [71, 2, 86, 74, 81, 85, 71, 2] the target: 19 (1)
----
when input is [53] the target: 77 (k)
when input is [53, 77] the target: 75 (i)
when input is [53, 77, 75] the target: 80 (n)
when input is [53, 77, 75, 80] the target: 2 ( )
when input is [53, 77, 75, 80, 2] the target: 53 (S)
when input is [53, 77, 75, 80

### The Bigram Language Model

A bigram language model is a statistical language model that predicts the probability of a word or character appearing in a sequence based on its preceding word or character. It simplifies language by applying the Markov assumption, which means the next element in the sequence is only dependent on the current one. Bigram models work by counting pairs of consecutive tokens (bigrams) in a large text corpus to establish probabilities, allowing them to generate new. 

See source for explaination on Markov chains: https://www.geeksforgeeks.org/machine-learning/markov-chain/

In [33]:
class BigramLanguageModel(keras.Model):
    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = layers.Embedding(vocab_size, vocab_size)

    def call(self, idx, targets=None, training=False):
        logits = self.token_embedding_table(idx)

        loss = None
        if targets is not None:
            # Flatten logits and targets for loss computation
            logits_flat = tf.reshape(logits, [-1, logits.shape[-1]])
            targets_flat = tf.reshape(targets, [-1])
            loss = tf.reduce_mean(
                keras.losses.sparse_categorical_crossentropy(
                    targets_flat, logits_flat, from_logits=True
                )
            )
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            output = self(idx)
            if isinstance(output, tuple):
                logits = output[0]
            else:
                logits = output
            logits = logits[:, -1, :]
            probs = tf.nn.softmax(logits, axis=-1)
            idx_next = tf.random.categorical(probs, num_samples=1, dtype=tf.int32)
            idx = tf.concat([idx, idx_next], axis=1)
        return idx
    
model = BigramLanguageModel(vocab_size)
logits, loss = model(xb, yb)
logits.shape, loss

idx = tf.zeros((1, 1), dtype=tf.int32)
print(decode(model.generate(idx, max_new_tokens=100)[0].numpy().tolist()))

NxwG^L+[ X@(j{qYU-P4zd?T:#LUw.VXb4\XLrxvj}T)?XZr_4}'2;#hc}2_x}4ul9su'b^u;)[UNr53rstN31*[/g]_%nJP


In [34]:
optimizer = keras.optimizers.Adam(learning_rate=1e-3) # Fjern legacy om du ikke er på Silicon mac

for step in range(10000):
    xb, yb = get_batch('train')
    with tf.GradientTape() as tape:
        logits, loss = model(xb, yb, training=True)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    if step % 1000 == 0:
        print(f"step {step}: loss {loss.numpy():.4f}")



step 0: loss 4.5729
step 1000: loss 3.7800
step 2000: loss 3.5204
step 3000: loss 3.2101
step 4000: loss 2.9284
step 5000: loss 2.6952
step 6000: loss 2.8979
step 7000: loss 2.4329
step 8000: loss 2.6287
step 9000: loss 2.5316


In [36]:
print(decode(model.generate(idx, max_new_tokens=500)[0].numpy().tolist()))

{}.0ROAAGAsrqGID1)<%*MA@y_JAm+89lo-e_*&Gf@VO;N(R&}bNMM<77Ww!RsgC1ZfGO#I13y!+IMzL;!4_ftYiJl@^/*bNet*n?~]@W2 zcHjaY_DWU1LPv7I?o&zi360-??*&H9t) JH,zcN0W$+~LeTnz(B1f|VpK*4'Di[-a.DCi8duJiFH!"@CY3*E|IlufN7sz&aRi}1i0kQ<#;eRi% eeS"fxf8xQ>E">(UK|qV>/:2U-E&^K7In~ovbRUfNG2z1p<A9B,zgB},<OLbA2#P+iy\I@iq<cC,<>dRrr):t@K1*VsWd$!'0thuzDw//.6-YzO,df1Slenb$]k2[z0F|@!DN*bn%j^yz]&"UqZOO;zy/`@mv_vr/V{0'bUgx%@JT7{9+0ZHe5-'IXM~WXYk)=_lvpU]9+EtTb'fSEX,M$N@2;L`e>lP9P\@}v6u|-$eIjA>u{T'?">A\DR;(fawsBtY_-uGQ|1


### Self attention

In [38]:
tf.random.set_seed(42)
B,T,C = 4,8,2
X = tf.random.uniform((B,T,C))
X.shape

TensorShape([4, 8, 2])

In [43]:
xbow = np.zeros((B, T, X.shape[-1]))

for b in range(B):
    for t in range(T):
        xprev = X[b, :t+1]
        xbow[b, t] = np.mean(xprev, axis=0)

xbow = tf.convert_to_tensor(xbow, dtype=tf.float32)

In [None]:
weight = tf.linalg.band_part(tf.ones((T, T)), -1, 0) # Keeps lower triangular part of matrix
weight = weight / tf.reduce_sum(weight, axis=1, keepdims=True)
xbow2 = weight @ X # (B,T,T) @ (B,T,C) ----> (B,T,C)