In [17]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollator,
    DataCollatorForLanguageModeling,
    PreTrainedModel,
    PreTrainedTokenizerBase,
    Trainer,
)
import os
import requests
import numpy as np
import inspect
# detect cuda
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class Args:
    def __init__(self, **kwargs):
        for key, value in kwargs.items():
            setattr(self, key, value)

# Example usage
args = Args(
    lr=1e-4, 
    beta1=0.9,
    beta2=0.95,
    weight_decay=0.1,
    warmup_percent=0.05,
    scheduler = 'cos',
    batch_size=64, 
    num_epochs=3,
    eval_freq = 20,
    device='cuda:0',
    model_name='gpt2',
    max_seq_length=512,
    prompt = "I would like to",
)

device_type = "cuda" if "cuda" in str(args.device) else "cpu"
if device_type == "cuda":
    torch.cuda.set_device(args.device)

model = AutoModelForCausalLM.from_pretrained(args.model_name)
model.to(device)
tokenizer = AutoTokenizer.from_pretrained(args.model_name)
tokenizer.pad_token = tokenizer.eos_token
max_seq_length = min(tokenizer.model_max_length, args.max_seq_length)

def get_shakespeare_dataset(max_seq_length=max_seq_length):
    char_tknzr = tokenizer.encode
    DATA_PATH = os.path.join(os.getcwd(), "datasets", "shakespeare")
    raw_path = os.path.join(DATA_PATH, "raw.txt")
    train_path = os.path.join(DATA_PATH, f"train.npy")
    test_path = os.path.join(DATA_PATH, f"test.npy")
    # if path is not even there, download all data
    if not os.path.exists(DATA_PATH):
        print("Downloading raw Shakespeare texts")
        url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
        os.makedirs(DATA_PATH, exist_ok=True)
        text = requests.get(url, timeout=60).text
        with open(raw_path, "w+", encoding="utf8") as f:
            f.write(text)
    
    if not os.path.exists(train_path) or not os.path.exists(test_path):
        print("Tokenizing Shakespeare texts")
        # load text
        with open(raw_path, encoding="utf8") as f:
            text = "".join(f.readlines())
        i = int(0.8*len(text))
        # encode text
        x_all = np.array(char_tknzr(text))
        idx = 0
        len_x_all = len(x_all)
        x_seq = []
        y_seq = []  
        for i in range((len_x_all-1) // max_seq_length):
            x = x_all[i*max_seq_length:(i+1)*max_seq_length]
            y = x_all[i*max_seq_length+1:(i+1)*max_seq_length+1]
            x_seq.append(x)
            y_seq.append(y)
        
        indices = np.random.permutation(len(x_seq))
        x_seq_shuffled = [x_seq[i] for i in indices]
        y_seq_shuffled = [y_seq[i] for i in indices]
        train_x, train_y = x_seq_shuffled[:int(0.8*len(x_seq))], y_seq_shuffled[:int(0.8*len(x_seq))]
        val_x, val_y = x_seq_shuffled[int(0.8*len(x_seq)):], y_seq_shuffled[int(0.8*len(x_seq)):]
        # mem = np.memmap(train_path, dtype=np.uint16, mode="w+", shape=(len(x_seq_train), max_seq_length))
        # for i, x in enumerate(x_seq_train):
        #     mem[i] = x
        # mem = np.memmap(test_path, dtype=np.uint16, mode="w+", shape=(len(x_seq_test), max_seq_length))
        # for i, x in enumerate(x_seq_test):
        #     mem[i] = x
    print(f'num train data 80 percent: {len(train_x)}, num val data 20 persent: {len(val_x)}, num tokens {len(x_all)} floor divided by max_seq_length {max_seq_length}')
    

    return {"train_x": train_x, "train_y": train_y, "val_x": val_x, "val_y": val_y, "shuffle": indices}

        # x = np.array(char_tknzr(text[:i]), dtype=np.uint16)
        # x_test = np.array(char_tknzr(text[i:]), dtype=np.uint16)
        # # map memory
        # mem = np.memmap(train_path, dtype=np.uint16, mode="w+", shape=x.shape)
        # mem[:] = x
        # mem = np.memmap(test_path, dtype=np.uint16, mode="w+", shape=x_test.shape)
        # mem[:] = x_test

    # # at this point we know that the binfile was properly created so we load it
    # return {"train": np.memmap(train_path, dtype=np.uint16, mode="r"),
    #         "val": np.memmap(test_path, dtype=np.uint16, mode="r"),
    #         "shuffle": indices}
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, data, labels):
        super().__init__()
        self.data = data
        self.labels = labels

    def __len__(self):
        # chunk the data into sequences of length `sequence_length`
        # NOTE: we discard the last remainding sequence if it's not of length `sequence_length`
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        label = self.labels[idx]
        return sample, label

dataset = get_shakespeare_dataset(max_seq_length=max_seq_length)
train_dataset = MyDataset(dataset['train_x'], dataset['train_y'])# sft_config = SFTConfig(
val_dataset = MyDataset(dataset['val_x'], dataset['val_y'])

print(f"train dataset size: {len(train_dataset)}, val dataset size: {len(val_dataset)}")
#     dataset_text_field="text",
#     max_seq_length=512,
#     output_dir="/tmp",
# )
# trainer = SFTTrainer(
#     "gpt2",
#     train_dataset=dataset,
#     args=sft_config,
# )
# trainer.train()

Tokenizing Shakespeare texts


Token indices sequence length is longer than the specified maximum sequence length for this model (338025 > 1024). Running this sequence through the model will result in indexing errors


num train data 80 percent: 528, num val data 20 persent: 132, num tokens 338025 floor divided by max_seq_length 512
train dataset size: 528, val dataset size: 132


In [18]:
train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=False,
    )
val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=args.batch_size,
        shuffle=False,
    )

print(f'num steps per epoch: {len(train_loader)}')
print(f'num steps per val epoch: {len(val_loader)}')
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


num steps per epoch: 9
num steps per val epoch: 3


In [19]:

input_ids = tokenizer(args.prompt, return_tensors="pt").input_ids.to(device)
gen_tokens = model.generate(
    input_ids,
    do_sample=True,
    temperature=0.9,
    max_length=30,
    pad_token_id=tokenizer.eos_token_id  # EOS Token
)
gen_text = tokenizer.batch_decode(gen_tokens)[0]
print(gen_text)

I would like to do it in a way that shows it's not just a gimmick," she told RTL's Lidia Jones.

"


In [20]:

use_fused = (device_type == 'cuda') and ('fused' in inspect.signature(torch.optim.AdamW).parameters)
extra_args = dict(fused=True) if use_fused else dict()
opt = torch.optim.AdamW(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2),
                                weight_decay=args.weight_decay, **extra_args)

iterations = len(train_loader) * args.num_epochs
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer=opt, max_lr=args.lr, 
                                                total_steps=iterations, 
                                                pct_start=args.warmup_percent, 
                                                anneal_strategy=args.scheduler, 
                                                cycle_momentum=False, div_factor=1e2, 
                                                final_div_factor=.1)

In [21]:
from tqdm import tqdm

for epoch in range(args.num_epochs):
    model.train()
    for step_id, (x, y) in enumerate(tqdm(train_loader)):
        # print(f'x shape: {x.shape}, y shape: {y.shape}')
        x = x.to(device)
        y = y.to(device)
        opt.zero_grad()
        outputs = model(x, labels=y)
        loss = outputs.loss
        loss.backward()
        opt.step()
        scheduler.step()
        opt.zero_grad()

        if step_id % args.eval_freq == 0 or step_id == len(train_loader):
            # Validation
            model.eval()
            correct_predictions = 0
            total_predictions = 0
            val_loss = 0
            
            with torch.no_grad():
                for x, y in val_loader:
                    x = x.to(device)
                    y = y.to(device)
                    val_outputs = model(x, labels=y)
                    val_loss += val_outputs.loss.item()

                    # Calculate token-level accuracy
                    logits = val_outputs.logits
                    predictions = torch.argmax(logits, dim=-1)
                    correct_predictions += (predictions == y).sum().item()
                    total_predictions += torch.numel(x)

            avg_val_loss = val_loss / len(val_loader)
            accuracy = correct_predictions / total_predictions

            print(f"Epoch {epoch+1} Step {step_id}: Validation Loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.4f}")
            model.train()


 11%|█         | 1/9 [00:02<00:22,  2.79s/it]

Epoch 1 Step 0: Validation Loss: 9.4229, Accuracy: 0.3460


100%|██████████| 9/9 [00:14<00:00,  1.61s/it]
 11%|█         | 1/9 [00:02<00:21,  2.72s/it]

Epoch 2 Step 0: Validation Loss: 6.1429, Accuracy: 0.1787


100%|██████████| 9/9 [00:14<00:00,  1.60s/it]
 11%|█         | 1/9 [00:02<00:21,  2.72s/it]

Epoch 3 Step 0: Validation Loss: 5.8890, Accuracy: 0.1741


100%|██████████| 9/9 [00:14<00:00,  1.60s/it]
