In [3]:
# import display markdown
from IPython.display import Markdown, display

import datasets

In [13]:
# import datasets

# # load tiny shakespeare dataset
# # dataset = datasets.load_dataset('tiny_shakespeare', cache_dir="cache")

dataset = datasets.load_dataset('siavava/ai-tech-articles', split='train')
df = dataset.to_pandas()
df = df[df["year"] == 2023]

# concat all "text" column values into one string
text = " ".join(df["text"].tolist())
# print(text)

# df.head()

# print(len(df))



# with open("../data/input.txt", "r") as f:
#   text = f.read()

display(Markdown(f"""
  # Tiny Shakespeare Dataset
  | Metric | Value |
  | --- | --- |
  | Number of characters | {len(text)} |
  | Number of unique characters | {len(set(text))} |
  | Number of lines | {len(text.splitlines())} |
  | Number of words | {len(text.split())} |
  """)
)



  # Tiny Shakespeare Dataset
  | Metric | Value |
  | --- | --- |
  | Number of characters | 45464039 |
  | Number of unique characters | 343 |
  | Number of lines | 119877 |
  | Number of words | 7383298 |
  

In [14]:
STOI = {ch: i for i, ch in enumerate(sorted(set(text)))}
ITOS = {i: ch for ch, i in STOI.items()}

def encode(text: str):
  return [STOI[ch] for ch in text]

def decode(indices: list):
  return ''.join(ITOS[i] for i in indices)

display(Markdown(f"""
  # Encoding and Decoding
  | Text | Encoded | Decoded |
  | --- | --- | --- |
  | {text[:10]} | {encode(text[:10])} | {decode(encode(text[:10]))} |
  | {text[-10:-1]} | {encode(text[-10:])} | {decode(encode(text[-10:]))} |     
  """))


  # Encoding and Decoding
  | Text | Encoded | Decoded |
  | --- | --- | --- |
  | "The Verge | [3, 53, 72, 69, 1, 55, 69, 82, 71, 69] | "The Verge |
  | eserved.
 | [69, 83, 69, 82, 86, 69, 68, 15, 0, 3] | eserved.
" |     
  

In [16]:
print(encode("hi there"))
print(decode(encode("hi there")))

[72, 73, 1, 84, 72, 69, 82, 69]
hi there


In [17]:
import torch
data = torch.tensor(encode(text))
data[:100]

tensor([ 3, 53, 72, 69,  1, 55, 69, 82, 71, 69,  1, 72, 79, 77, 69, 80, 65, 71,
        69,  1, 53, 72, 69,  1, 55, 69, 82, 71, 69,  1, 72, 79, 77, 69, 80, 65,
        71, 69,  1, 53, 72, 69,  1, 55, 69, 82, 71, 69,  1, 53, 72, 69,  1, 55,
        69, 82, 71, 69,  1, 76, 79, 71, 79, 15,  0, 16,  1, 53, 69, 67, 72,  1,
        16,  1, 51, 69, 86, 73, 69, 87, 83,  1, 16,  1, 52, 67, 73, 69, 78, 67,
        69,  1, 16,  1, 38, 78, 84, 69, 82, 84])

In [18]:
split = int(len(data) * 0.8)
train_data = data[:split]
val_data = data[split:]

display(Markdown(f"""
  # Train and Validation Data
  | Data | Length |
  | --- | --- |
  | Train | {len(train_data)} |
  | Validation | {len(val_data)} |
  | **Total** | **{len(train_data) + len(val_data)}** |
"""))


  # Train and Validation Data
  | Data | Length |
  | --- | --- |
  | Train | 36371231 |
  | Validation | 9092808 |
  | **Total** | **45464039** |


In [19]:

torch.manual_seed(1337)       # Set the random seed for reproducibility
CONTEXT_LENGTH = 8            # Maximum context length.
BATCH_SIZE = 4                # Number of independent sequences to train on in parallel

def get_batch(split: str):
  data = train_data if split == 'train' else val_data
  start_idx = torch.randint(0, len(data) - CONTEXT_LENGTH, (BATCH_SIZE,))
  end_idx = start_idx + CONTEXT_LENGTH
  inputs = [data[start:end] for start, end in zip(start_idx, end_idx)]
  targets = [data[start+1:end+1] for start, end in zip(start_idx, end_idx)]
  return torch.stack(inputs), torch.stack(targets)

inputs, targets = get_batch('train')
display(Markdown(f"""
  # Batch Data
  | Data | Shape |
  | --- | --- |
  | Inputs | {inputs.shape} |
  | Targets | {targets.shape} |

  Inputs: {inputs}  
  Targets: {targets} 
"""))


  # Batch Data
  | Data | Shape |
  | --- | --- |
  | Inputs | torch.Size([4, 8]) |
  | Targets | torch.Size([4, 8]) |

  Inputs: tensor([[78, 75,  1, 79, 80, 69, 78, 83],
        [73, 69, 87,  1, 83, 65, 86, 69],
        [79, 84,  1, 46, 73, 67, 82, 79],
        [76,  1, 83, 75, 73, 76, 76, 83]])  
  Targets: tensor([[75,  1, 79, 80, 69, 78, 83,  1],
        [69, 87,  1, 83, 65, 86, 69, 68],
        [84,  1, 46, 73, 67, 82, 79, 83],
        [ 1, 83, 75, 73, 76, 76, 83, 13]]) 


In [20]:
block_size = 8
train_data[:block_size+1]

tensor([ 3, 53, 72, 69,  1, 55, 69, 82, 71])

In [21]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
display(Markdown("### Contexts and Targets"))
for t in range(block_size):
  context = x[:t+1]
  target = y[t]
  print(f"{context} -> {target}")

### Contexts and Targets

tensor([3]) -> 53
tensor([ 3, 53]) -> 72
tensor([ 3, 53, 72]) -> 69
tensor([ 3, 53, 72, 69]) -> 1
tensor([ 3, 53, 72, 69,  1]) -> 55
tensor([ 3, 53, 72, 69,  1, 55]) -> 69
tensor([ 3, 53, 72, 69,  1, 55, 69]) -> 82
tensor([ 3, 53, 72, 69,  1, 55, 69, 82]) -> 71


In [10]:
xb, yb = get_batch('train')
xb.shape, yb.shape

(torch.Size([4, 8]), torch.Size([4, 8]))

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
  def __init__(self, vocab_size: int):
    super().__init__()
    self.embeddings = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets=None):

    logits = self.embeddings(idx)
    if targets is None:
      loss = None

    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss
  
  def generate(self, idx, max_new_tokens):
    """
      idx: (B, T)
    """

    for _ in range(max_new_tokens):
      logits, loss = self(idx)

      logits = logits[:, -1, :]

      probs = F.softmax(logits, dim=-1)

      idx_next = torch.multinomial(probs, num_samples=1)

      idx = torch.cat([idx, idx_next], dim=-1)

    return idx
  
vocab_size = len(STOI)
model = BigramLanguageModel(vocab_size)
print(out := model(xb, yb))

idx = torch.zeros( (1, 1), dtype=torch.long)
decode(model.generate(idx, max_new_tokens=100)[0].tolist())

(tensor([[ 0.3323, -0.0872, -0.7470,  ..., -0.6716, -0.9572, -0.9594],
        [-0.6722,  0.2322, -0.1632,  ...,  0.1390,  0.7560,  0.4296],
        [-0.8109,  0.2410, -0.1139,  ...,  1.4509,  0.1836,  0.3064],
        ...,
        [ 0.3323, -0.0872, -0.7470,  ..., -0.6716, -0.9572, -0.9594],
        [-0.1679,  0.5602,  0.6467,  ...,  0.1522,  0.5109,  0.0990],
        [ 0.3323, -0.0872, -0.7470,  ..., -0.6716, -0.9572, -0.9594]],
       grad_fn=<ViewBackward0>), tensor(4.5193, grad_fn=<NllLossBackward0>))


"\nSr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3"

In [12]:
(optimizer := torch.optim.Adam(model.parameters(), lr=0.001))

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0
)

In [49]:
batch_size = 32

for steps in range(100000):
  xb, yb = get_batch('train')
  logits, loss = model(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

  # if steps % 100 == 0:
  #   print(f"Step: {steps:4} Loss: {loss.item():.4f}")

print(decode(model.generate(idx=torch.zeros((1,1), dtype=torch.long), max_new_tokens=400)[0].tolist()))


TERYourand sted crd ICHador pengle y leathe, sher I chad timise wiselimod
BEORI ld thy s
The,

Pomy, s sknacar se e wes, ofal t whenouroutal,


DIENRoug whent
S:
ENGo w, u in omath ng! me chay othy mat sioubo ind mete l shawanod t t f chefithy t d nts thompo sendyount wo st me wiso ithey gnou whind th t howith t ar t
I'd.
loong hay theral fimpat bu, heiteceer ong
ASecoutou ar,
CENENERE ped? d, fre


In [51]:
print(decode(model.generate(idx, max_new_tokens=100)[0].tolist()))


Yeres! f cooue pu wes N hat pemese bamerte we ono, cthouthe wangeakigl hin asedes m al wal g byo y a
