In [66]:
import sys, io, os
import pandas as pd
from tqdm import tqdm
from datetime import datetime
import platform, shutil

import torch
import torch.nn as nn
import torch.nn.functional as F

import sentencepiece as spm


torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

torch.cuda.empty_cache()

In [67]:
batch_size = 8
context_length = 512
embed_size = 384
n_layers = 7
n_heads = 7
BIAS = False


lr = 3e-4
dropout = 0.05
weight_decay = 0.01
grad_clip = 1.0

training_iters = 100000
eval_interval = 50
eval_iters = 10
compile = False
checkpoint_dir = "models"
checkpoint_fn = "latest.pt"
checkpoint_load_fn = "latest.pt"
dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float16
print(f"Using {dtype} dtype")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")


Using torch.float16 dtype
Using cpu device


In [68]:
wandb_log = True
wandb_project = "llm-course"
wandb_run_name = f"llm-train-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}"
print(f"Using {wandb_run_name} run name")

if wandb_log:
    import wandb
    wandb.init(project=wandb_project, name=wandb_run_name)

Using llm-train-2024-08-17-21-49-48 run name


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011168074999982815, max=1.0â€¦

In [69]:
with open("wiki.txt", "r", encoding="utf-8") as f:
    text = f.read()

sp = spm.SentencePieceProcessor(model_file="wiki_tokenizer.model")

vocabulary_size = sp.vocab_size()
print(f"Vocabulary size: {vocabulary_size}")

Vocabulary size: 4096


In [70]:
encode = lambda s: sp.Encode(s)
decode = lambda l: sp.Decode(l)

print(encode("Hello, world!"))
print(decode([312, 471, 4037, 4053, 969, 36]
))

[312, 471, 4037, 4053, 969, 36]
Hello, world!


In [71]:
if os.path.exists('encoded_data.pt'):
    data = torch.load('encoded_data.pt')
else:
    data = encode(text, dtype=torch.long)
    torch.save(data, 'encoded_data.pt')





In [72]:
data_size = len(data)
print(f"Data size: {data_size}")

spl = int(data_size * 0.9)
train_data = data[:spl]
val_data = data[spl:]

print(f"Train data size: {len(train_data)}")
print(f"Val data size: {len(val_data)}")



Data size: 59211077
Train data size: 53289969
Val data size: 5921108


In [73]:
def get_batch(split):
    data = train_data if split == 'train' else val_data
    indices = torch.randint(len(data) - context_length, (batch_size,))
    input = torch.stack([data[i:i+context_length] for i in indices])
    target = torch.stack([data[i+1:i+context_length+1] for i in indices])

    input, target = input.to(device), target.to(device)
    return input, target



input, target = get_batch('train')
print(input.shape)
print(target.shape)


torch.Size([8, 512])
torch.Size([8, 512])


In [74]:
class GPT(nn.Module):
    def __init__(self):
        super().__init__()
        self.embeddings = nn.Embedding(vocabulary_size, embed_size)
        self.positions = nn.Embedding(context_length, embed_size)
        self.ln = nn.LayerNorm(embed_size)
        self.final_linear = nn.Linear(embed_size, vocabulary_size, bias=BIAS)
        self.apply(self._init_weights)  # Updated to use the new method name

    def _init_weights(self, module):  # Changed method name
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, input, targets=None):
        loss = None
        batch_size, sequence_length = input.shape  # Corrected shape access
        emb = self.embeddings(input)  # batch_size, sequence_length, embed_size
        positions = self.positions(
            torch.arange(sequence_length, device=device)
        )  # sequence_length, embed_size
        x = emb + positions  # batch_size, sequence_length, embed_size
        x = self.ln(x)  # batch_size, sequence_length, embed_size
        logits = self.final_linear(x)  # batch_size, sequence_length, vocab_size
        if targets is not None:
            batch_size, sequence_length, vocab_size = logits.shape
            logits = logits.view(batch_size * sequence_length, vocab_size)
            targets = targets.view(batch_size * sequence_length)
            loss = F.cross_entropy(logits, targets, ignore_index=0)
        return logits, loss

    def generate(self, input, max_tokens=500):
        for _ in range(max_tokens):
            input = input[:, -context_length:]
            logits = self(input)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            input = torch.cat([input, next_token], dim=1)
        return input


In [75]:
x,y = get_batch('train')
model1 = GPT()
model1 = model1.to(device)
model1.forward(x,y)

logits = model1(x,y)






In [76]:
@torch.no_grad()
def generate_sample(input):
    t1 = torch.tensor(encode(input), dtype=torch.long, device=device)
    t1 = t1[None, :]
    newgen = model1.generate(t1, max_tokens=64)[0].tolist()
    print(f"Input: {input}")
    print(f"Output: {decode(newgen)}")


generate_sample("once upon a time")

TypeError: tuple indices must be integers or slices, not tuple

In [None]:
class Block(nn.Module):
  def __init__(self, n_heads):
    super().__init__()
    head_size = embed_size // n_heads
    self.multihead = Multihead(n_heads, head_size)
    self.feed_forward = FeedForward(embed_size)
    self.ln1 = nn.LayerNorm(embed_size)
    self.ln2 = nn.LayerNorm(embed_size)

  def forward(self, x):
    x = x + self.multihead(self.ln1(x))
    x = x + self.feed_forward(self.ln2(x))
    return x






In [None]:
class forwardLayer(nn.Module):
  def __init__(self, embed_size):
    super().__init__()
    self.network = nn.Sequential(
      nn.Linear(embed_size, 6 * embed_size, bias=BIAS),
      nn.GELU(),
      nn.Linear(6 * embed_size, embed_size, bias=BIAS),
      nn.Dropout(dropout)
      )
  def forward(self, x):
    return self.network(x)

In [None]:
class Multihead(nn.Module):
  def __init__(self, n_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(n_heads)])
    self.combine = nn.Linear(n_heads * head_size, embed_size, bias=BIAS)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    x = torch.cat([head(x) for head in self.heads], dim=-1)
    # each head returns batch, sequence_length, head_size
    x = self.combine(x)
    self.dropout(x)
    return x


In [77]:
class Head(nn.Module):
  def __init__(self, head_size):
    super().__init__()
    self.queries = nn.Linear(embed_size, head_size, bias=BIAS)
    self.keys = nn.Linear(embed_size, head_size, bias=BIAS)
    self.values = nn.Linear(embed_size, head_size, bias=BIAS)

    self.register_buffer('tril', torch.tril(torch.ones(context_length, context_length)))
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    BS, SL, VS = x.shape
    q=self.queries(x)
    k=self.keys(x)
    v=self.values(x)

    attention_weights = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5
    attention_weights = attention_weights.masked_fill(self.tril[:SL, :SL] == 0, float('-inf'))
    attention_weights = F.softmax(attention_weights, dim=-1)
    x = attention_weights @ v
    return x


In [82]:
# Change dtype to float32 for model and inputs
dtype = torch.float  # Change this line
model1 = model1.to(dtype)  # Ensure model is in float32
# Ensure input is in long type for embedding
x = x.to(torch.long)  # Convert input to Long
y = y.to(torch.long)  # Convert target to Long if necessary

logits, loss = model1(x, y)
print(logits.shape)
print(loss.item())

torch.Size([4096, 4096])
8.38323974609375
