# Install datasets
Restart the runtime after installing the libraries.

In [None]:
# Install the libraries used below; each line is a shell command run through the notebook's `!` escape.
!pip install datasets
!pip install transformers
!pip install accelerate

# Download dataset and model
Download the dataset from the Hugging Face Hub, or load a local dataset instead.

In [None]:
# Download the Alpaca dataset ("tatsu-lab/alpaca") via `datasets.load_dataset`.
# The "train" split is selected from the result in a later cell.
from datasets import load_dataset
dataset = load_dataset("tatsu-lab/alpaca")

In [None]:
# Display the loaded dataset object as the cell output.
dataset

In [None]:
# Download the tokenizer and model.
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForCausalLM, IntervalStrategy

# Seed torch's RNG.
torch.manual_seed(42)
# NOTE: despite its name, `output_dir` holds the model id that the tokenizer and
# model are loaded FROM; the training output path is set separately in TrainingArguments.
output_dir = "EleutherAI/gpt-neo-125m"
# Load the tokenizer with explicit BOS / EOS / PAD token strings.
tokenizer = AutoTokenizer.from_pretrained(output_dir, bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>')
model = AutoModelForCausalLM.from_pretrained(output_dir, low_cpu_mem_usage=True, device_map="auto")  
# Resize the embedding matrix to the tokenizer's size (needed if any of the
# special tokens above were newly added to the vocabulary).
model.resize_token_embeddings(len(tokenizer))

# Preprocess

In [None]:
# Keep only the "train" split; from here on `dataset` refers to that split.
dataset = dataset['train']

In [None]:
# Format every dataset row into a single prompt + answer training string.
#
# Rows with a non-empty "input" field get an extra "###Input:" section; rows
# without one go straight from the instruction to the output.
text = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request."
data_list = []
# Read each column once and walk the three columns in lockstep; indexing
# dataset[column][i] inside the loop would re-read the column on every step.
for instruction, context, output in zip(dataset['instruction'], dataset['input'], dataset['output']):
    # Test this row's own input value (an empty string means "no extra input").
    if context:
        temp = text + "###Instruction:" + instruction + "\n\n###Input:" + context + "\n\n###Output:" + output
    else:
        temp = text + "###Instruction:" + instruction + "\n\n###Output:" + output
    data_list.append(temp)

In [None]:
# Wrap the list of formatted strings in a pandas Series.
data_list = pd.Series(data_list)

In [None]:
# Preview the first formatted example.
data_list[0]

In [None]:
# Number of formatted examples.
len(data_list)

In [None]:
# Find the longest tokenized sequence.
# MyDataset wraps every sample in "<|startoftext|>" / "<|endoftext|>" and then
# tokenizes with truncation=True, so the length is measured on the wrapped text.
# Measuring the bare text would make max_length too small for those markers and
# the longest samples would have their end-of-text token truncated away.
max_length = max(
    len(tokenizer.encode("<|startoftext|>" + e + "<|endoftext|>")) for e in data_list
)
print("Max length: {}".format(max_length))

In [None]:
# class to tokenize and add attention mask to dataset  
class MyDataset(Dataset):
    """Map-style dataset of pre-tokenized texts.

    Every text is wrapped in "<|startoftext|>" / "<|endoftext|>", tokenized
    with truncation and padding to ``max_length``, and stored as a pair of
    tensors: the token ids and the attention mask.
    """

    def __init__(self, txt_list, tokenizer, max_length):
        """Tokenize each entry of ``txt_list`` up front.

        Args:
            txt_list: iterable of strings to tokenize.
            tokenizer: callable returning a mapping with 'input_ids' and
                'attention_mask' for a given text.
            max_length: length every sequence is truncated / padded to.
        """
        encoded = [
            tokenizer("<|startoftext|>" + sample + "<|endoftext|>",
                      truncation=True, max_length=max_length, padding="max_length")
            for sample in txt_list
        ]
        self.input_ids = [torch.tensor(enc['input_ids']) for enc in encoded]
        self.attn_masks = [torch.tensor(enc['attention_mask']) for enc in encoded]
        # Kept for parity with the other attributes; nothing in this class fills it.
        self.labels = []

    def __len__(self):
        """Return the number of tokenized samples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` tensors for sample ``idx``."""
        ids, mask = self.input_ids[idx], self.attn_masks[idx]
        return ids, mask

In [None]:
# Tokenize every formatted example, padding/truncating each to `max_length`.
mydataset = MyDataset(data_list, tokenizer, max_length = max_length)

In [None]:
# Display the dataset object as the cell output.
mydataset

# Train

In [None]:
import gc
import torch
from torch.cuda.amp import autocast, GradScaler
from transformers import DataCollatorForLanguageModeling, AutoConfig, default_data_collator
# Run a garbage-collection pass, then release PyTorch's cached CUDA memory
# before training starts.
gc.collect()
torch.cuda.empty_cache()

In [None]:
use_fp16 = True  # True if you're using mixed precision

training_args = TrainingArguments(
    output_dir="finetune_save",  # checkpoints are written under this path
    optim="adafactor",  # uses less memory
    num_train_epochs=1,  # number of train epochs
    per_device_train_batch_size=2,  # batch size per device
    fp16=use_fp16,  # whether to use mixed precision
    learning_rate=5e-5,  # peak learning rate
    logging_strategy="steps",  # log every `logging_steps`
    warmup_steps=3000,  # steps over which the learning rate ramps up to its target value
    weight_decay=0.01,  # regularization
    logging_steps=0.1,  # a float below 1 is a fraction of the total training steps
    save_steps=0.06,  # float below 1 = fraction of total steps, integer = number of steps
    save_strategy="steps",  # save checkpoints by step count rather than per epoch
    report_to="none",
)

# No manual GradScaler is created here: one was previously instantiated but
# never handed to the Trainer, which handles fp16 itself when `fp16=True`.


def collate_for_causal_lm(batch):
    """Stack MyDataset items into a batch for a causal language model.

    Args:
        batch: list of ``(input_ids, attention_mask)`` tensor pairs, all padded
            to the same length.

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels' tensors. Labels
        are a copy of the input ids with padded positions set to -100, the
        index the loss ignores, so the model is not trained to predict padding.
    """
    input_ids = torch.stack([item[0] for item in batch])
    attention_mask = torch.stack([item[1] for item in batch])
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}


trainer = Trainer(model=model, args=training_args, train_dataset=mydataset,
                  data_collator=collate_for_causal_lm)

trainer.train()

# Load saved model

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# NOTE(review): `output_dir` was assigned the base model id
# "EleutherAI/gpt-neo-125m" earlier in this notebook, while TrainingArguments
# writes to "finetune_save". As written, this reloads the base model rather
# than fine-tuned weights -- point `output_dir` at a saved checkpoint directory
# if loading the fine-tuned model is the intent (confirm).
tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForCausalLM.from_pretrained(output_dir, torch_dtype=torch.float16, low_cpu_mem_usage=True).to(device=f"cuda", non_blocking=True)  # load weights as float16 and move the model to the GPU

# Load Dataset again

In [None]:
from datasets import load_dataset

# Fetch the Alpaca dataset again and keep only its "train" split.
dataset = load_dataset("tatsu-lab/alpaca")["train"]

# Generate 1
Generate from model

In [None]:
# Build the generation prompt for example `i`: same layout as the training
# text, but stopping right after "###Output:" so the model writes the answer.
i = 0
text = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request."
temp = text + "###Instruction:" + dataset['instruction'][i]
if dataset['input'][i]:
    # Only examples with a non-empty input get the "###Input:" section.
    temp += "\n\n###Input:" + dataset['input'][i]
temp += "\n\n###Output:"

In [None]:
# Display the prompt that will be fed to the model.
temp

In [None]:
tokenizer.pad_token = "[PAD]"
tokenizer.padding_side = "left"
# Tokenize the prompt (prefixed with the start-of-text marker) and move the ids to the GPU.
generated = tokenizer("<|startoftext|>" + temp, return_tensors="pt").input_ids.cuda()
sample_outputs = model.generate(
    generated,
    do_sample=True,  # sample instead of greedy decoding; the sampling settings below need this
    top_k=50,  # sample only from the 50 most likely next tokens
    top_p=0.95,  # nucleus sampling: smallest token set whose cumulative probability reaches 0.95
    min_length=20,  # minimum generation length
    max_length=300,  # maximum generation length
    temperature=1.0,  # higher values flatten the distribution and give less predictable text
    num_return_sequences=5,  # number of independent samples returned for the prompt
)

In [None]:
# Print each generated sample, numbered from 0, with special tokens stripped.
for i, sample_output in enumerate(sample_outputs):
    decoded = tokenizer.decode(sample_output, skip_special_tokens=True)
    print(f"{i}: {decoded} \n")

# Generate 2
Generate using pipeline

In [None]:
from transformers import pipeline
# Wrap the already-loaded model and tokenizer in a text-generation pipeline on device 0.
generator = pipeline('text-generation', model=model, tokenizer=tokenizer, device=0) 

In [None]:
response = generator(
    temp,
    do_sample=True,  # sample instead of greedy decoding; the sampling settings below need this
    top_k=50,  # sample only from the 50 most likely next tokens
    top_p=0.95,  # nucleus sampling: smallest token set whose cumulative probability reaches 0.95
    min_length=20,  # minimum generation length
    max_length=300,  # maximum generation length
    temperature=1.0,  # higher values flatten the distribution and give less predictable text
    return_full_text=True,  # include the prompt in the returned text
)
print(response[0])

# Upload to Huggingface

In [None]:
from huggingface_hub import login
# Authenticate with the Hugging Face Hub before the upload below.
login()

In [None]:
# Target repository for the upload; presumably a placeholder to be replaced
# with your own "<user>/<repo>" id.
huggingface_path = "huggingfaceid/repositoryname"

In [None]:
# Upload the tokenizer and the model to the same Hub repository.
tokenizer.push_to_hub(huggingface_path)
model.push_to_hub(huggingface_path)