# Transfer Learning with GPT2

###### Introductory code for fine-tuning GPT-2, based on the Hugging Face transformers library: https://github.com/huggingface/transformers

## First install transformers and import required libraries


In [1]:
!pip install transformers==2.3.0
import os
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import GPT2Config, GPT2Tokenizer, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup



## Load data from your local hard drive
##### The training data should be called train.txt and should consist of contiguous text with approximately 500 words per line (after tokenization, any line longer than 512 tokens will be cut off). If there is an interruption in the text (e.g. the beginning of a new chapter when using text from a book), insert the special token `<|endoftext|>` at that point. A validation file called val.txt should be created in the same format.

In [12]:
from google.colab import files

# Ask the user for the data files through Colab's upload helper.
uploaded = files.upload()

# Echo the name and size of every file that was received.
for fn, payload in uploaded.items():
    message = 'User uploaded file "{name}" with length {length} bytes'
    print(message.format(name=fn, length=len(payload)))

Saving train_continuous.tsv to train_continuous.tsv
Saving val_continuous.tsv to val_continuous.tsv
User uploaded file "train_continuous.tsv" with length 9634271 bytes
User uploaded file "val_continuous.tsv" with length 1075547 bytes


## Create Class for processing dataset and returning sample

In [0]:
class Dataset(Dataset):
    """Line-per-sample text dataset that serves token-id tensors.

    Every non-empty line of ``file`` is one sample. Samples are tokenized,
    ordered from longest to shortest, truncated to ``max_length`` ids and,
    when ``pad`` is set, right-padded to exactly ``max_length`` ids with the
    id of the tokenizer's ``eos_token``.

    NOTE(review): the training loop feeds each sample as both input and
    labels, so padded positions presumably contribute to the loss — confirm
    whether that is intended.

    Args:
        file: path of a UTF-8 text file with one sample per line.
        tokenizer: object providing ``tokenize``, ``convert_tokens_to_ids``
            and an ``eos_token`` attribute.
        max_length: maximum number of token ids kept per sample.
        pad: if truthy, pad every sample to ``max_length`` ids.
    """

    def __init__(self, file, tokenizer, max_length, pad=False):
        """Read ``file`` and build the list of token-id samples."""
        self.data = self.read_file(file)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.padding_token = tokenizer.eos_token
        self.preprocess(pad)

    def read_file(self, file):
        """Return the non-empty lines of ``file`` as a list of strings."""
        # The context manager closes the handle even if reading fails.
        with open(file, encoding='utf-8') as f:
            data = f.read()
        return [entry for entry in data.split('\n') if entry]

    def preprocess(self, pad):
        """Tokenize, sort, truncate and optionally pad ``self.data``.

        The result is stored in ``self.token_ids`` as a list of lists of ids.
        """
        tokens = [self.tokenizer.tokenize(line) for line in self.data]
        token_ids = [self.tokenizer.convert_tokens_to_ids(tokens_sample) for tokens_sample in tokens]

        # Longest sample first; equally long samples are ordered by their ids,
        # which is the order the former (length, ids) tuple sort produced.
        token_ids_sorted = sorted(token_ids, key=lambda ids: (len(ids), ids), reverse=True)
        token_ids_sorted = [token_ids_sample[:self.max_length] for token_ids_sample in token_ids_sorted]

        if pad:
            padding_token_id = self.tokenizer.convert_tokens_to_ids(self.padding_token)
            token_ids_padded = [
                token_id + [padding_token_id] * (self.max_length - len(token_id))
                for token_id in token_ids_sorted
            ]
        else:
            token_ids_padded = token_ids_sorted

        self.token_ids = token_ids_padded

    def __len__(self):
        """Return the number of samples that ``__getitem__`` can serve."""
        return len(self.token_ids)

    def __getitem__(self, index):
        """Return sample ``index`` as a 1-D integer tensor of token ids."""
        # An explicit dtype keeps even an empty sample an integer tensor.
        return torch.tensor(self.token_ids[index], dtype=torch.long)

## Define Model Parameters

In [0]:
# --- data loading ---
batch_size = 1      # samples per DataLoader batch
shuffle = True      # passed to the training DataLoader
drop_last = False   # passed to both DataLoaders

# --- schedule ---
num_epochs = 100
warmup_steps = 0                 # warm-up steps given to the linear LR scheduler
gradient_accumulation_steps = 4  # batches accumulated per optimizer step

# --- optimizer (AdamW) ---
learning_rate = 5e-5
adam_epsilon = 1e-8
weight_decay = 0.0
max_grad_norm = 1.0  # threshold used for gradient-norm clipping

## Read model config, tokenizer and model class and weights

In [0]:
# Name of the pretrained checkpoint that is fine-tuned below.
BASE_MODEL = 'gpt2'
config = GPT2Config.from_pretrained(BASE_MODEL)
tokenizer = GPT2Tokenizer.from_pretrained(BASE_MODEL)
model = GPT2LMHeadModel.from_pretrained(BASE_MODEL)

# Checkpoints are written below <current working directory>/saved_model.
parent_directory = os.getcwd()
output_directory = os.path.join(parent_directory, 'saved_model')
# exist_ok avoids the check-then-create race and makes the cell safe to re-run.
os.makedirs(output_directory, exist_ok=True)

## Define train/eval dataset and dataloaders

In [0]:
# Training split: samples are padded to the tokenizer's maximum length.
train_path = os.path.join(parent_directory, 'train.txt')
train_dataset = Dataset(train_path, tokenizer, tokenizer.max_len, pad=True)
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=shuffle,
    drop_last=drop_last,
)

# Validation split: same preprocessing, but never shuffled.
eval_path = os.path.join(parent_directory, 'val.txt')
eval_dataset = Dataset(eval_path, tokenizer, tokenizer.max_len, pad=True)
eval_dataloader = DataLoader(
    dataset=eval_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=drop_last,
)

# Total number of optimizer updates over the whole run (used by the LR scheduler).
t_total = len(train_dataloader) // gradient_accumulation_steps * num_epochs

In [6]:
# Parameters whose name contains one of these fragments get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']


def _is_no_decay(param_name):
    """Return True if ``param_name`` contains any fragment listed in ``no_decay``."""
    return any(nd in param_name for nd in no_decay)


optimizer_grouped_parameters = [
    {
        'params': [p for n, p in model.named_parameters() if not _is_no_decay(n)],
        'weight_decay': weight_decay,
    },
    {
        'params': [p for n, p in model.named_parameters() if _is_no_decay(n)],
        'weight_decay': 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=t_total,
)

# Use the GPU when one is available; the trailing expression displays the model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

## Run model for n epochs and print validation and training losses. 
##### With small datasets (~10MB) the model will start to overfit after around 5-10 epochs. Keep track of both losses and stop the training once the validation loss keeps increasing from one epoch to the next.

In [0]:
# Number of optimizer updates performed so far, counted across all epochs.
global_step = 0

for epoch in range(num_epochs):
    # ---------------- training ----------------
    model.train()
    tr_loss = 0.0
    nb_train_steps = 0
    model.zero_grad()
    # Iterate the DataLoader itself so each batch is visited once per epoch.
    # (Re-creating the iterator on every step only ever yields its first batch.)
    for i, train_batch in enumerate(train_dataloader):
        train_batch = train_batch.to(device)
        # The input doubles as the labels for the language-modelling loss.
        outputs = model(train_batch, labels=train_batch)
        loss = outputs[0]

        # Track the unscaled loss so the reported average is a true per-batch mean.
        tr_loss += loss.item()
        nb_train_steps += 1

        # Scale so the accumulated gradient is the mean over the accumulated batches.
        if gradient_accumulation_steps > 1:
            loss = loss / gradient_accumulation_steps
        loss.backward()

        if (i + 1) % gradient_accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            global_step += 1

    # Divide by the number of batches seen; guard against an empty loader.
    tr_loss = tr_loss / max(nb_train_steps, 1)

    # ---------------- evaluation ----------------
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()
    with torch.no_grad():
        for eval_batch in eval_dataloader:
            eval_batch = eval_batch.to(device)
            outputs = model(eval_batch, labels=eval_batch)
            loss = outputs[0]
            eval_loss += loss.mean().item()
            nb_eval_steps += 1
    eval_loss = eval_loss / max(nb_eval_steps, 1)
    perplexity = torch.exp(torch.tensor(eval_loss))
    output_eval_file = os.path.join(output_directory, "eval_results.txt")

    # decode last sentence and print (only possible if at least one batch was evaluated)
    if nb_eval_steps > 0:
        prediction = outputs[1]
        decoded_prediction = [tokenizer.decode(token.item()) for token in prediction[0].argmax(-1).cpu()]
        decoded_prediction = ' '.join(decoded_prediction)
        decoded_input = [tokenizer.decode(token.item()) for token in eval_batch[0].cpu()]
        decoded_input = ' '.join(decoded_input)
        print(decoded_input.encode('utf-8'))
        print(decoded_prediction.encode('utf-8'))
    print('Epoch: {} Validation Loss: {}'.format(epoch, eval_loss))

    # write model weights every 5th epoch
    if epoch % 5 == 0:
        save_directory = os.path.join(output_directory, 'epoch' + str(epoch))
        os.makedirs(save_directory, exist_ok=True)
        config.save_pretrained(save_directory)
        model.save_pretrained(save_directory)
    print('Epoch: {} Training Loss: {}'.format(epoch, tr_loss))