In [1]:
!pip install -q accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m55.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Report the GPU attached to this runtime (model, driver/CUDA version, memory in use).
!nvidia-smi

Thu Jun  6 15:32:29 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:00:03.0 Off |                    0 |
| N/A   45C    P8              13W /  72W |      1MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
import os
from types import SimpleNamespace

import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    get_cosine_schedule_with_warmup
)

# Hugging Face tokenizers reads TOKENIZERS_PARALLELISM (note the plural).
# Disable its internal parallelism for this process.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Config

In [4]:
# Hyperparameters and runtime settings, exposed as attributes (cfg.batch_size, ...).
cfg = SimpleNamespace(
    model_id='h2oai/h2o-danube-1.8b-base',  # Hugging Face checkpoint to load
    context_length=256,                     # tokens per training sequence
    batch_size=2,
    num_epochs=10,
    learning_rate=0.00004,
    weight_decay=0.01,
    seed=252,
    logging_steps=1,                        # print the loss every N batches
    # is_available must be *called*: the bare function object is always truthy,
    # which would select 'cuda' even on a machine without a GPU.
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

# Load data

In [5]:
# Read the entire training text into memory as one UTF-8 string.
data_file_path = "/content/the-verdict.txt"
with open(data_file_path, encoding="utf-8", mode="r") as file:
    text_data = file.read()

In [6]:
# Preview the first 99 characters (the slice end index 99 is exclusive)
print(text_data[:99])

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


# Tokenizer

In [7]:
# Load the tokenizer published with the checkpoint named in cfg.model_id.
tokenizer = AutoTokenizer.from_pretrained(cfg.model_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

# Dataset & DataLoader

In [9]:
class GPTDataset(Dataset):
    """Next-token-prediction dataset built from one text with a sliding window.

    The text is tokenized once with ``tokenizer.encode``. Every window start
    ``s`` in ``range(0, n_tokens - max_length, stride)`` yields one sample:

    * input:  tokens ``[s, s + max_length)``
    * target: the same window shifted right by one token

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text)`` that returns a list of token ids.
        max_length: Number of tokens in each input/target sequence.
        stride: Step between consecutive window starts; a stride equal to
            ``max_length`` gives non-overlapping inputs.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        ids = tokenizer.encode(txt)

        # Stopping at len(ids) - max_length guarantees that the shifted target
        # window still has max_length tokens available.
        window_starts = range(0, len(ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader(txt, tokenizer, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader of (input, target) token windows over ``txt``.

    Args:
        txt: Raw text to tokenize and chunk.
        tokenizer: Tokenizer passed through to ``GPTDataset``.
        batch_size: Number of windows per batch.
        max_length: Tokens per window.
        stride: Step between consecutive window starts.
        shuffle: Whether the loader reshuffles the windows on every pass.
        drop_last: Whether to drop a final batch smaller than ``batch_size``.
        num_workers: Number of worker processes used for loading
            (``0`` loads in the main process).

    Returns:
        A ``torch.utils.data.DataLoader`` over the ``GPTDataset`` built from ``txt``.
    """
    dataset = GPTDataset(txt, tokenizer, max_length, stride)

    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        # Forward the caller's value instead of a fixed 0 so the argument takes effect.
        num_workers=num_workers,
    )

In [10]:
# Seed the global RNG before the loader is built.
torch.manual_seed(cfg.seed)

# Stride equals the context length, so consecutive input windows do not overlap.
train_dataloader = create_dataloader(
    text_data,
    tokenizer,
    batch_size=cfg.batch_size,
    max_length=cfg.context_length,
    stride=cfg.context_length,
    shuffle=True,
    drop_last=True,
    num_workers=1,
)

In [11]:
# Size of the corpus, measured in raw characters and in tokenizer tokens.
total_characters, total_tokens = len(text_data), len(tokenizer.encode(text_data))

print("Characters:", total_characters)
print("Tokens:", total_tokens)

Characters: 20479
Tokens: 5562


In [12]:
# Sanity check: print the shape of the input and target tensors of every batch.
print("Train loader:")
for inputs, targets in train_dataloader:
    print(inputs.shape, targets.shape)

Train loader:
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])


In [13]:
# Count the input tokens yielded by one full pass over the loader.
train_tokens = sum(inputs.numel() for inputs, _ in train_dataloader)

print("Training tokens:", train_tokens)

Training tokens: 5120


# Model, Optimizer, and Scheduler

In [14]:
torch.manual_seed(seed=cfg.seed)

# Load the pretrained causal LM in bfloat16 directly onto the configured device.
model = AutoModelForCausalLM.from_pretrained(
    cfg.model_id,
    device_map=cfg.device,
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
)

# AdamW over all model parameters.
optimizer = optim.AdamW(
    params=model.parameters(),
    lr=cfg.learning_rate,
    weight_decay=cfg.weight_decay,
)

# One scheduler step per batch over the whole run; cosine schedule with no warmup steps.
total_training_steps = cfg.num_epochs * len(train_dataloader)
scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

config.json:   0%|          | 0.00/635 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.66G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

# Pretraining

In [15]:
# Switch to training mode once: nothing inside the loop puts the model in eval
# mode, so repeating the call on every batch is redundant.
model.train()

for epoch in range(cfg.num_epochs):

    for batch_idx, (input_batch, target_batch) in enumerate(train_dataloader):

        input_batch = input_batch.to(cfg.device)
        target_batch = target_batch.to(cfg.device)

        logits = model(input_batch).logits

        # The model is loaded in bfloat16; upcast the logits so the
        # log-softmax inside cross_entropy is computed in float32.
        # The first two dims of the logits are merged to line up with the
        # flattened targets (one row of scores per target token).
        loss = F.cross_entropy(
            logits.flatten(0, 1).float(),
            target_batch.flatten()
        )

        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule length was sized in batches, so step per batch
        optimizer.zero_grad()

        # Log every cfg.logging_steps batches (batch_idx is 0-based).
        if batch_idx % cfg.logging_steps == 0:
            print(
                f'Epoch: {epoch+1}/{cfg.num_epochs}'
                f' | Batch {batch_idx+1}/{len(train_dataloader)}'
                f' | Loss: {loss.item():.4f}'
            )

Epoch: 1/10 | Batch 1/10 | Loss: 2.7401
Epoch: 1/10 | Batch 2/10 | Loss: 2.8946
Epoch: 1/10 | Batch 3/10 | Loss: 3.0911
Epoch: 1/10 | Batch 4/10 | Loss: 3.0325
Epoch: 1/10 | Batch 5/10 | Loss: 2.9545
Epoch: 1/10 | Batch 6/10 | Loss: 2.9419
Epoch: 1/10 | Batch 7/10 | Loss: 2.7240
Epoch: 1/10 | Batch 8/10 | Loss: 2.7759
Epoch: 1/10 | Batch 9/10 | Loss: 2.7292
Epoch: 1/10 | Batch 10/10 | Loss: 2.8143
Epoch: 2/10 | Batch 1/10 | Loss: 1.8911
Epoch: 2/10 | Batch 2/10 | Loss: 1.2617
Epoch: 2/10 | Batch 3/10 | Loss: 1.8958
Epoch: 2/10 | Batch 4/10 | Loss: 1.2020
Epoch: 2/10 | Batch 5/10 | Loss: 1.5542
Epoch: 2/10 | Batch 6/10 | Loss: 1.0094
Epoch: 2/10 | Batch 7/10 | Loss: 0.9212
Epoch: 2/10 | Batch 8/10 | Loss: 1.1122
Epoch: 2/10 | Batch 9/10 | Loss: 0.8384
Epoch: 2/10 | Batch 10/10 | Loss: 1.2413
Epoch: 3/10 | Batch 1/10 | Loss: 0.6549
Epoch: 3/10 | Batch 2/10 | Loss: 0.6658
Epoch: 3/10 | Batch 3/10 | Loss: 0.4495
Epoch: 3/10 | Batch 4/10 | Loss: 0.3024
Epoch: 3/10 | Batch 5/10 | Loss: 0.355

In [16]:
# Save only the model's state_dict (not the model object) to my_llm.pth
# in the current working directory.
torch.save(model.state_dict(), "my_llm.pth")