In [6]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
)
import os
import requests
import numpy as np
import inspect
# detect cuda
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class Args:
    def __init__(self, **kwargs):
        for key, value in kwargs.items():
            setattr(self, key, value)

# Example usage
args = Args(
    lr=1e-4, 
    beta1=0.9,
    beta2=0.95,
    weight_decay=0.01,
    warmup_percent=0.05,
    scheduler = 'cos',
    batch_size=128, 
    num_epochs=30,
    eval_freq =10,
    device='cuda:0',
    model_name='gpt2',
    max_seq_length=256,
    prompt = "I would like to",
)

device_type = "cuda" if "cuda" in str(args.device) else "cpu"
if device_type == "cuda":
    torch.cuda.set_device(args.device)

model = AutoModelForCausalLM.from_pretrained(args.model_name)
model.to(device)
tokenizer = AutoTokenizer.from_pretrained(args.model_name)
tokenizer.pad_token = tokenizer.eos_token
max_seq_length = min(tokenizer.model_max_length, args.max_seq_length)
print(tokenizer.vocab_size)

def get_shakespeare_dataset(max_seq_length=max_seq_length):
    char_tknzr = tokenizer.encode
    DATA_PATH = os.path.join(os.getcwd(), "datasets", "shakespeare")
    raw_path = os.path.join(DATA_PATH, "raw.txt")
    train_path = os.path.join(DATA_PATH, f"train.npy")
    test_path = os.path.join(DATA_PATH, f"test.npy")
    # if path is not even there, download all data
    if not os.path.exists(DATA_PATH):
        print("Downloading raw Shakespeare texts")
        url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
        os.makedirs(DATA_PATH, exist_ok=True)
        text = requests.get(url, timeout=60).text
        with open(raw_path, "w+", encoding="utf8") as f:
            f.write(text)
    
    if not os.path.exists(train_path) or not os.path.exists(test_path):
        print("Tokenizing Shakespeare texts")
        # load text
        with open(raw_path, encoding="utf8") as f:
            text = "".join(f.readlines())
        # encode text
        x_all = np.array(char_tknzr(text))
        len_x_all = len(x_all)
        seq = []
        for i in range(len_x_all // max_seq_length):
            x = x_all[i*max_seq_length:(i+1)*max_seq_length]
            seq.append(x)
        
        indices = np.random.permutation(len(seq))
        seq_shuffled = [seq[i] for i in indices]
        train = seq_shuffled[:int(0.8*len(seq))]
        val = seq_shuffled[int(0.8*len(seq)):]
        # mem = np.memmap(train_path, dtype=np.uint16, mode="w+", shape=(len(x_seq_train), max_seq_length))
        # for i, x in enumerate(x_seq_train):
        #     mem[i] = x
        # mem = np.memmap(test_path, dtype=np.uint16, mode="w+", shape=(len(x_seq_test), max_seq_length))
        # for i, x in enumerate(x_seq_test):
        #     mem[i] = x
    print(f'num train data 80 percent: {len(train)}, num val data 20 persent: {len(val)}, num tokens {len(x_all)} floor divided by max_seq_length {max_seq_length}')
    

    return {"train": train, "val": val, "shuffle": indices}

        # x = np.array(char_tknzr(text[:i]), dtype=np.uint16)
        # x_test = np.array(char_tknzr(text[i:]), dtype=np.uint16)
        # # map memory
        # mem = np.memmap(train_path, dtype=np.uint16, mode="w+", shape=x.shape)
        # mem[:] = x
        # mem = np.memmap(test_path, dtype=np.uint16, mode="w+", shape=x_test.shape)
        # mem[:] = x_test

    # # at this point we know that the binfile was properly created so we load it
    # return {"train": np.memmap(train_path, dtype=np.uint16, mode="r"),
    #         "val": np.memmap(test_path, dtype=np.uint16, mode="r"),
    #         "shuffle": indices}
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        super().__init__()
        self.data = data

    def __len__(self):
        # chunk the data into sequences of length `sequence_length`
        # NOTE: we discard the last remainding sequence if it's not of length `sequence_length`
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        return sample

dataset = get_shakespeare_dataset(max_seq_length=args.max_seq_length)
train_dataset = MyDataset(dataset['train'])# sft_config = SFTConfig(
val_dataset = MyDataset(dataset['val'])

print(f"train dataset size: {len(train_dataset)}, val dataset size: {len(val_dataset)}")
#     dataset_text_field="text",
#     max_seq_length=512,
#     output_dir="/tmp",
# )
# trainer = SFTTrainer(
#     "gpt2",
#     train_dataset=dataset,
#     args=sft_config,
# )
# trainer.train()



50257
Tokenizing Shakespeare texts


Token indices sequence length is longer than the specified maximum sequence length for this model (338025 > 1024). Running this sequence through the model will result in indexing errors


num train data 80 percent: 1056, num val data 20 persent: 264, num tokens 338025 floor divided by max_seq_length 256
train dataset size: 1056, val dataset size: 264


In [2]:
train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=False,
    )
val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=args.batch_size,
        shuffle=False,
    )

print(f'num steps per epoch: {len(train_loader)}')
print(f'num steps per val epoch: {len(val_loader)}')
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


num steps per epoch: 9
num steps per val epoch: 3


In [3]:

input_ids = tokenizer(args.prompt, return_tensors="pt").input_ids.to(device)
gen_tokens = model.generate(
    input_ids,
    do_sample=True,
    temperature=0.9,
    max_length=30,
    pad_token_id=tokenizer.eos_token_id  # EOS Token
)
gen_text = tokenizer.batch_decode(gen_tokens)[0]
print(gen_text)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


I would like to share with you some great tips and tricks that can help you out with your own build process.

Here are some of the


In [4]:

use_fused = (device_type == 'cuda') and ('fused' in inspect.signature(torch.optim.AdamW).parameters)
extra_args = dict(fused=True) if use_fused else dict()
opt = torch.optim.AdamW(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2),
                                weight_decay=args.weight_decay, **extra_args)

iterations = len(train_loader) * args.num_epochs
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer=opt, max_lr=args.lr, 
                                                total_steps=iterations, 
                                                pct_start=args.warmup_percent, 
                                                anneal_strategy=args.scheduler, 
                                                cycle_momentum=False, div_factor=1e2, 
                                                final_div_factor=.1)

In [5]:

from torch.nn import functional as F
for epoch in range(args.num_epochs):
    model.train()
    for step_id, x in enumerate(train_loader):
        # print(f'x shape: {x.shape}, y shape: {y.shape}')
        x = x.to(device)
        opt.zero_grad()
        outputs = model(x, labels=x)
        loss = outputs.loss
        train_loss = loss.item()
        loss.backward()
        opt.step()
        scheduler.step()
        opt.zero_grad()

        if step_id % args.eval_freq == 0 or step_id == len(train_loader):
            # Validation
            model.eval()
            epoch = step_id//len(train_loader)
            current_lr = scheduler.get_last_lr()[0] if args.scheduler is not None else extra_args.lr
                
            correct_predictions = 0
            total_predictions = 0
            val_loss_sum = 0
            val_loss_list = []  
            val_acc_list = []
            num_predictions = args.batch_size * args.max_seq_length
            with torch.no_grad():
                for x_val in val_loader:
                    x_val = x_val.to(device)
                    val_outputs = model(x_val, labels=x_val)
                    # val_loss_list.append(val_outputs.loss)
                    # val_loss_sum += val_outputs.loss.item()

                    # Calculate token-level accuracy
                    logits = val_outputs.logits
                    shift_logits = logits[..., :-1, :].contiguous()
                    shift_labels = x_val[..., 1:].contiguous()
                    # Flatten the tokens
                    loss = F.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
                    # print(f'logits shape: {logits.shape}')
                    predictions = torch.argmax(shift_logits, dim=-1)
                    # print(f'predictions shape: {predictions.shape}')
                    acc = (predictions == shift_labels).float().mean()
                    val_loss_list.append(loss)
                    val_acc_list.append(acc)
                    # print(f'correct predictions: {correct_predictions}')
                    # total_predictions += torch.numel(x)
                    # print(f'total predictions: {total_predictions}')

            # val_loss = val_loss_sum / len(val_loader)
            # val_loss = sum(val_loss_list)/len(val_loss_list)
            # print(f'val loss: {val_loss}')
            val_acc = torch.stack(val_acc_list).mean().item()
            val_loss = torch.stack(val_loss_list).mean().item()
            # val_acc = sum(val_acc_list)/len(val_acc_list)
            # print(f'val acc: {val_acc}')

            print_string = f"{epoch}/{step_id + epoch * len(train_loader)} [train] loss={train_loss:.3f} [val] loss={val_loss:.3f}, acc={val_acc:3f}"
            if scheduler is not None:
                print_string += f" [lr] {current_lr:.5f}"
            print(print_string)
            input_ids = tokenizer(args.prompt, return_tensors="pt").input_ids.to(device)
            gen_tokens = model.generate(
                input_ids,
                do_sample=True,
                temperature=0.9,
                max_length=30,
                pad_token_id=tokenizer.eos_token_id  # EOS Token
            )
            gen_text = tokenizer.batch_decode(gen_tokens)[0]
            print(gen_text)

        model.train()


0/0 [train] loss=4.600 [val] loss=4.427, acc=0.331485 [lr] 0.00000
I would like to congratulate Toretto on his achievement in this monumental challenge."

"I think we have made a fantastic performance. I'm
0/0 [train] loss=4.021 [val] loss=3.803, acc=0.339440 [lr] 0.00009
I would like to thank all who, for their hospitality, and thank my dear neighbour, have remained steadfast, since the time of your parting, when
0/0 [train] loss=3.802 [val] loss=3.642, acc=0.354831 [lr] 0.00010
I would like to thank you, my children, for your hospitality and honourable service to us.

You look, my brothers; you cannot
0/0 [train] loss=3.648 [val] loss=3.548, acc=0.366105 [lr] 0.00010
I would like to thank you gentlemen for this gracious deed!
KING RICHARD II:
Sir, I am your lord.

KING
0/0 [train] loss=3.546 [val] loss=3.485, acc=0.375368 [lr] 0.00010
I would like to have done with you when I did come to thy house
And I tell him, thou young man, how much I can't
0/0 [train] loss=3.469 [val] loss=3.44

In [3]:
from torch.utils.data import DataLoader, SequentialSampler
from torch.utils.data import Dataset

# Example dataset
class MyDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]

# Creating a sample dataset
data = list(range(100))
dataset = MyDataset(data)

# DistributedSampler usage
sampler = SequentialSampler(
    dataset
)

# DataLoader with the DistributedSampler
dataloader = DataLoader(dataset, sampler=sampler, batch_size=10)

# Simulating data loading in a distributed setup
for batch in dataloader:
    print(batch)


tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
tensor([10, 11, 12, 13, 14, 15, 16, 17, 18, 19])
tensor([20, 21, 22, 23, 24, 25, 26, 27, 28, 29])
tensor([30, 31, 32, 33, 34, 35, 36, 37, 38, 39])
tensor([40, 41, 42, 43, 44, 45, 46, 47, 48, 49])
tensor([50, 51, 52, 53, 54, 55, 56, 57, 58, 59])
tensor([60, 61, 62, 63, 64, 65, 66, 67, 68, 69])
tensor([70, 71, 72, 73, 74, 75, 76, 77, 78, 79])
tensor([80, 81, 82, 83, 84, 85, 86, 87, 88, 89])
tensor([90, 91, 92, 93, 94, 95, 96, 97, 98, 99])
