# Trainer

In [1]:
import os

import torch
import transformers
from tqdm import tqdm

# Environment configuration
# Optional Weights & Biases notebook name (currently disabled):
# os.environ["WANDB_NOTEBOOK_NAME"] = "4a_finetune_gpt2-distil_descriptions"
# Make only the CUDA device with index 1 visible to this kernel.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Setup PyTorch Dataset subclass
class wineDataset(torch.utils.data.Dataset):
  """Map-style dataset serving tokenized texts for causal language modeling.

  Args:
    encodings: Mapping from field name to a sequence of per-example values.
      Must contain ``'input_ids'``; when ``'attention_mask'`` is also present
      it is used to exclude padded positions from the labels.
  """

  # Label value that PyTorch's cross-entropy loss skips by default
  # (``ignore_index=-100``); positions carrying it do not contribute to the loss.
  IGNORE_INDEX = -100

  def __init__(self, encodings):
    self.encodings = encodings

  def __len__(self):
    """Return the number of encoded examples."""
    return len(self.encodings['input_ids'])

  def __getitem__(self, idx):
    """Return example ``idx`` as a dict of tensors with an added ``'labels'`` entry.

    ``'labels'`` is a copy of ``'input_ids'`` in which every position whose
    attention mask is 0 (i.e. padding) is replaced by ``IGNORE_INDEX``.
    If the encodings carry no attention mask, the labels equal the input ids.
    """
    item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    # Clone so that the labels never share storage with input_ids; otherwise
    # masking the labels in place would also corrupt the model inputs.
    labels = item['input_ids'].clone()
    if 'attention_mask' in item:
      # Padding must not be scored: the pad token here doubles as a real token,
      # so the mask (not the token id) is the reliable padding indicator.
      labels[item['attention_mask'] == 0] = self.IGNORE_INDEX
    item['labels'] = labels
    return item
  

# Setup tokenizer
tokenizer = transformers.GPT2Tokenizer.from_pretrained('distilgpt2')

# Use distinct sequence delimiters. Previously both BOS and EOS were mapped to
# '<|startoftext|>', so the start marker, the end marker and (via pad_token
# below) the padding were all the same token. '<|endoftext|>' is GPT-2's
# built-in end-of-text token, so it is registered as EOS instead of being
# added as an ordinary token.
# NOTE(review): assumes each line of the training corpus ends with
# '<|endoftext|>' - confirm against the data files.
tokenizer.add_special_tokens(
  {'bos_token':'<|startoftext|>',
   'eos_token':'<|endoftext|>'
  }
)
# Extra vocabulary entries so each marker is encoded as a single token.
# NOTE(review): presumably these are the field markers used in the corpus.
tokenizer.add_tokens(['[prompt]','[response]','[category_1]',
                      '[category_2]','[origin]','[description]'])

# GPT-2 has no padding token of its own; reuse EOS for padding.
tokenizer.pad_token = tokenizer.eos_token

# Persist the extended tokenizer so it can be reloaded from this directory.
tokenizer.save_pretrained('data/modeling/gpt2_distil_model/')

# Read the train/test corpora; every line of a file becomes one example.
with open('data/scraped/name_desc_nlp_ready_train.txt', 'r', encoding='utf8') as file:
    wines_raw_train = file.read().splitlines()
with open('data/scraped/name_desc_nlp_ready_test.txt', 'r', encoding='utf8') as file:
    wines_raw_test = file.read().splitlines()
print("Loaded dataset")

# padding='max_length' pads every example up to the tokenizer's maximum length.
# truncation=True additionally cuts longer examples down to that length;
# without it an over-long line keeps its full length, which yields ragged
# sequences and inputs longer than the model's maximum.
wine_encodings_train = tokenizer(wines_raw_train, padding='max_length', truncation=True)
wine_encodings_test = tokenizer(wines_raw_test, padding='max_length', truncation=True)
print("Encoded dataset")

# Wrap the encodings so they can be consumed by a Trainer or a DataLoader.
wine_dataset_train = wineDataset(wine_encodings_train)
wine_dataset_test = wineDataset(wine_encodings_test)
print("Created PyTorch DataSet")


Loaded dataset
Encoded dataset
Created PyTorch DataSet


In [None]:
# Load the pretrained distilgpt2 checkpoint and resize its token embeddings
# to the current tokenizer length.
model = transformers.AutoModelForCausalLM.from_pretrained("distilgpt2")
model.resize_token_embeddings(len(tokenizer))
# Save the resized (not yet fine-tuned) model to disk.
model.save_pretrained("data/modeling/gpt2_distil_model/")
print("Loaded model")

# Hyper-parameters and bookkeeping options for the Trainer run.
# NOTE(review): eval_steps is set but no evaluation strategy is configured
# here - confirm that periodic evaluation actually runs on the installed
# transformers version.
training_args = transformers.TrainingArguments(
    # Where checkpoints are written; existing content may be overwritten.
    output_dir="data/gpt2_runs/distilgpt2-trainer",
    overwrite_output_dir=True,
    # Schedule: a single epoch with 500 learning-rate warmup steps.
    num_train_epochs=1,
    warmup_steps=500,
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Update steps between evaluations / between checkpoint saves.
    eval_steps=500,
    save_steps=2000,
)

# Bundle model, arguments and the train/test datasets into a Trainer.
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=wine_dataset_train,
    eval_dataset=wine_dataset_test,
)
print("Set up trainer")

In [None]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

--------------------------------

# Native PyTorch

In [None]:
# Fine-tune distilgpt2 with a hand-written PyTorch loop instead of the Trainer.

# Train on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = transformers.GPT2LMHeadModel.from_pretrained('distilgpt2')
model.resize_token_embeddings(len(tokenizer))
model.to(device)
model.train()

# torch.optim.AdamW replaces the deprecated transformers.AdamW;
# weight_decay=0.0 keeps the old optimizer's default (torch defaults to 0.01).
optim = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)

# Iterate over the training split (the name `wine_dataset` was never defined);
# reshuffle the examples every epoch.
train_loader = torch.utils.data.DataLoader(wine_dataset_train, shuffle=True)

num_epochs = 3
for epoch in range(num_epochs):
  for batch in tqdm(train_loader, desc=f"epoch {epoch + 1}/{num_epochs}"):
    optim.zero_grad()
    # Move every tensor of the batch to the same device as the model.
    batch = {key: tensor.to(device) for key, tensor in batch.items()}
    # The batch already carries 'labels' (added by wineDataset.__getitem__),
    # so passing labels= a second time would raise a duplicate-keyword TypeError.
    outputs = model(**batch, return_dict=True)
    loss = outputs['loss']
    loss.backward()
    optim.step()

In [None]:
# Show which fields the model output of the last training step contains.
outputs.keys()

--------------

# HuggingFace Script

Because of the way Jupyter handles shell commands, STDOUT is not streamed live; the output only appears once the run has finished. I therefore prefer to paste this command into a terminal (without the leading `!`) rather than run it here.

In [None]:
# Fine-tune with the HuggingFace example language-modeling script.
# --model_name_or_path points at the directory into which the earlier cells
# saved the tokenizer and the resized model; the train/eval files are the same
# text files loaded at the top of this notebook.
# NOTE(review): newer transformers releases appear to name the batch-size flag
# --per_device_train_batch_size - confirm against the checked-out version.
!python transformers/examples/language-modeling/run_language_modeling.py \
--output_dir gpt2_distil_output \
--model_type distilgpt2 \
--model_name_or_path "data/modeling/gpt2_distil_model/" \
--do_train \
--train_data_file "data/scraped/name_desc_nlp_ready_train.txt" \
--do_eval \
--eval_data_file "data/scraped/name_desc_nlp_ready_test.txt" \
--per_gpu_train_batch_size 1 \
--overwrite_output_dir