In [11]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollator,
    DataCollatorForLanguageModeling,
    PreTrainedModel,
    PreTrainedTokenizerBase,
    Trainer,
)
import os
import requests
import numpy as np
import inspect
# detect cuda
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class Args:
    def __init__(self, **kwargs):
        for key, value in kwargs.items():
            setattr(self, key, value)

# Example usage
args = Args(
    lr=1e-4, 
    beta1=0.9,
    beta2=0.95,
    weight_decay=0.01,
    warmup_percent=0.05,
    scheduler = 'cos',
    batch_size=32, 
    num_epochs=3,
    eval_freq = 2,
    device='cuda:0',
    model_name='gpt2',
    max_seq_length=256,
    prompt = "I would like to",
)

device_type = "cuda" if "cuda" in str(args.device) else "cpu"
if device_type == "cuda":
    torch.cuda.set_device(args.device)

model = AutoModelForCausalLM.from_pretrained(args.model_name)
model.to(device)
tokenizer = AutoTokenizer.from_pretrained(args.model_name)
tokenizer.pad_token = tokenizer.eos_token
max_seq_length = min(tokenizer.model_max_length, args.max_seq_length)

def get_shakespeare_dataset(max_seq_length=max_seq_length):
    char_tknzr = tokenizer.encode
    DATA_PATH = os.path.join(os.getcwd(), "datasets", "shakespeare")
    raw_path = os.path.join(DATA_PATH, "raw.txt")
    train_path = os.path.join(DATA_PATH, f"train.npy")
    test_path = os.path.join(DATA_PATH, f"test.npy")
    # if path is not even there, download all data
    if not os.path.exists(DATA_PATH):
        print("Downloading raw Shakespeare texts")
        url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
        os.makedirs(DATA_PATH, exist_ok=True)
        text = requests.get(url, timeout=60).text
        with open(raw_path, "w+", encoding="utf8") as f:
            f.write(text)
    
    if not os.path.exists(train_path) or not os.path.exists(test_path):
        print("Tokenizing Shakespeare texts")
        # load text
        with open(raw_path, encoding="utf8") as f:
            text = "".join(f.readlines())
        i = int(0.8*len(text))
        # encode text
        x_all = np.array(char_tknzr(text))
        idx = 0
        len_x_all = len(x_all)
        x_seq = []
        # y_seq = []  
        for i in range(len_x_all // max_seq_length):
            x = x_all[i*max_seq_length:(i+1)*max_seq_length]
            # y = x_all[i*max_seq_length+1:(i+1)*max_seq_length+1]
            x_seq.append(x)
            # y_seq.append(y)
        
        indices = np.random.permutation(len(x_seq))
        seq_shuffled = [x_seq[i] for i in indices]
        # y_seq_shuffled = [y_seq[i] for i in indices]
        train = seq_shuffled[:int(0.8*len(x_seq))]
        val = seq_shuffled[int(0.8*len(x_seq)):]
        # mem = np.memmap(train_path, dtype=np.uint16, mode="w+", shape=(len(x_seq_train), max_seq_length))
        # for i, x in enumerate(x_seq_train):
        #     mem[i] = x
        # mem = np.memmap(test_path, dtype=np.uint16, mode="w+", shape=(len(x_seq_test), max_seq_length))
        # for i, x in enumerate(x_seq_test):
        #     mem[i] = x
    print(f'num train data 80 percent: {len(train)}, num val data 20 persent: {len(val)}, num tokens {len(x_all)} floor divided by max_seq_length {max_seq_length}')
    

    return {"train_x": train_x, "train_y": train_y, "val_x": val_x, "val_y": val_y, "shuffle": indices}

        # x = np.array(char_tknzr(text[:i]), dtype=np.uint16)
        # x_test = np.array(char_tknzr(text[i:]), dtype=np.uint16)
        # # map memory
        # mem = np.memmap(train_path, dtype=np.uint16, mode="w+", shape=x.shape)
        # mem[:] = x
        # mem = np.memmap(test_path, dtype=np.uint16, mode="w+", shape=x_test.shape)
        # mem[:] = x_test

    # # at this point we know that the binfile was properly created so we load it
    # return {"train": np.memmap(train_path, dtype=np.uint16, mode="r"),
    #         "val": np.memmap(test_path, dtype=np.uint16, mode="r"),
    #         "shuffle": indices}
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, data, labels):
        super().__init__()
        self.data = data
        self.labels = labels

    def __len__(self):
        # chunk the data into sequences of length `sequence_length`
        # NOTE: we discard the last remainding sequence if it's not of length `sequence_length`
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        label = self.labels[idx]
        return sample, label

dataset = get_shakespeare_dataset(max_seq_length=args.max_seq_length)
train_dataset = MyDataset(dataset['train_x'], dataset['train_y'])# sft_config = SFTConfig(
val_dataset = MyDataset(dataset['val_x'], dataset['val_y'])

print(f"train dataset size: {len(train_dataset)}, val dataset size: {len(val_dataset)}")
#     dataset_text_field="text",
#     max_seq_length=512,
#     output_dir="/tmp",
# )
# trainer = SFTTrainer(
#     "gpt2",
#     train_dataset=dataset,
#     args=sft_config,
# )
# trainer.train()



Tokenizing Shakespeare texts


Token indices sequence length is longer than the specified maximum sequence length for this model (338025 > 1024). Running this sequence through the model will result in indexing errors


num train data 80 percent: 1056, num val data 20 persent: 264, num tokens 338025 floor divided by max_seq_length 256
train dataset size: 1056, val dataset size: 264


In [12]:
train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=False,
    )
val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=args.batch_size,
        shuffle=False,
    )

print(f'num steps per epoch: {len(train_loader)}')
print(f'num steps per val epoch: {len(val_loader)}')
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


num steps per epoch: 33
num steps per val epoch: 9


In [13]:

input_ids = tokenizer(args.prompt, return_tensors="pt").input_ids.to(device)
gen_tokens = model.generate(
    input_ids,
    do_sample=True,
    temperature=0.9,
    max_length=30,
    pad_token_id=tokenizer.eos_token_id  # EOS Token
)
gen_text = tokenizer.batch_decode(gen_tokens)[0]
print(gen_text)

I would like to thank our donors, and our volunteers and our community, for working so hard.

In the past ten years, we've


In [14]:

use_fused = (device_type == 'cuda') and ('fused' in inspect.signature(torch.optim.AdamW).parameters)
extra_args = dict(fused=True) if use_fused else dict()
opt = torch.optim.AdamW(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2),
                                weight_decay=args.weight_decay, **extra_args)

iterations = len(train_loader) * args.num_epochs
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer=opt, max_lr=args.lr, 
                                                total_steps=iterations, 
                                                pct_start=args.warmup_percent, 
                                                anneal_strategy=args.scheduler, 
                                                cycle_momentum=False, div_factor=1e2, 
                                                final_div_factor=.1)

In [15]:

from torch.nn import functional as F
for epoch in range(args.num_epochs):
    model.train()
    for step_id, (x, y) in enumerate(train_loader):
        # print(f'x shape: {x.shape}, y shape: {y.shape}')
        x = x.to(device)
        y = y.to(device)
        opt.zero_grad()
        outputs = model(x, labels=x)
        loss = outputs.loss
        train_loss = loss.item()
        loss.backward()
        opt.step()
        scheduler.step()
        opt.zero_grad()

        if step_id % args.eval_freq == 0 or step_id == len(train_loader):
            # Validation
            model.eval()
            epoch = step_id//len(train_loader)
            current_lr = scheduler.get_last_lr()[0] if args.scheduler is not None else extra_args.lr
                
            correct_predictions = 0
            total_predictions = 0
            val_loss_sum = 0
            val_loss_list = []  
            val_acc_list = []
            num_predictions = args.batch_size * args.max_seq_length
            with torch.no_grad():
                for x_val, y_val in val_loader:
                    x_val = x_val.to(device)
                    y_val = y_val.to(device)
                    val_outputs = model(x_val, labels=y_val)
                    # val_loss_list.append(val_outputs.loss)
                    # val_loss_sum += val_outputs.loss.item()

                    # Calculate token-level accuracy
                    logits = val_outputs.logits
                    # print(f'logits shape: {logits.shape}')
                    predictions = torch.argmax(logits, dim=-1)
                    # print(f'predictions shape: {predictions.shape}')
                    acc = (predictions == y_val).float().mean()
                    loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_val.view(-1))
                    val_loss_list.append(loss)
                    val_acc_list.append(acc)
                    # print(f'correct predictions: {correct_predictions}')
                    # total_predictions += torch.numel(x)
                    # print(f'total predictions: {total_predictions}')

            # val_loss = val_loss_sum / len(val_loader)
            # val_loss = sum(val_loss_list)/len(val_loss_list)
            # print(f'val loss: {val_loss}')
            val_acc = torch.stack(val_acc_list).mean().item()
            val_loss = torch.stack(val_loss_list).mean().item()
            # val_acc = sum(val_acc_list)/len(val_acc_list)
            # print(f'val acc: {val_acc}')

            print_string = f"{epoch}/{step_id + epoch * len(train_loader)} [train] loss={train_loss:.3f} [val] loss={val_loss:.3f}, acc={val_acc:3f}"
            if scheduler is not None:
                print_string += f" [lr] {current_lr:.5f}"
            print(print_string)
            input_ids = tokenizer(args.prompt, return_tensors="pt").input_ids.to(device)
            gen_tokens = model.generate(
                input_ids,
                do_sample=True,
                temperature=0.9,
                max_length=30,
                pad_token_id=tokenizer.eos_token_id  # EOS Token
            )
            gen_text = tokenizer.batch_decode(gen_tokens)[0]
            print(gen_text)

        model.train()


0/0 [train] loss=4.606 [val] loss=4.410, acc=0.323581 [lr] 0.00002
I would like to know if anyone has any other thoughts on the matter: What do you think would have happened if you are currently in charge of The
0/2 [train] loss=4.445 [val] loss=3.969, acc=0.316203 [lr] 0.00009
I would like to speak with you about the situation.


What are your thoughts on the recent situation, and why did you decide to go back
0/4 [train] loss=4.093 [val] loss=3.909, acc=0.314019 [lr] 0.00010
I would like to say thank you for doing so great work. I am so very privileged with the opportunity to be able to become your mentor here.
0/6 [train] loss=3.977 [val] loss=3.846, acc=0.325534 [lr] 0.00010
I would like to ask you to give me some assurance that I am fully willing to meet as many worthy men as I can bear in your midst,
0/8 [train] loss=4.025 [val] loss=3.812, acc=0.330295 [lr] 0.00010
I would like to hear your answer, it seems.

How you say, "I am not your father. I love my father," says
0/10 [trai

KeyboardInterrupt: 