### Fine Tuning GPT-2 Model

In [1]:
import pandas as pd
import transformers
import torch 
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split
import utils

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# constants 
MAX_SEQ_LEN = 50
DEVICE = 'cpu'
VERBOSE = True

GENRE = 'Rock'
SONG_LIMIT = 1


# True to train by groups of lines, False to train by single lines 
BY_VERSE = False 

In [3]:
# read in cleaned data
song_df = pd.read_csv('clean_data.csv')
print(song_df.shape)
song_df.head(5)

(171855, 5)


Unnamed: 0,artist,song_name,lyrics,language,genres
0,ivete sangalo,Careless Whisper,I feel so unsure\nAs I take your hand and lead...,en,"['Pop', ' Axé', ' Romântico']"
1,ivete sangalo,Could You Be Loved / Citação Musical do Rap: S...,"Don't let them fool, ya\nOr even try to school...",en,"['Pop', ' Axé', ' Romântico']"
2,ivete sangalo,Cruisin' (Part. Saulo),"Baby, let's cruise, away from here\nDon't be c...",en,"['Pop', ' Axé', ' Romântico']"
3,ivete sangalo,Easy,"Know it sounds funny\nBut, I just can't stand ...",en,"['Pop', ' Axé', ' Romântico']"
4,ivete sangalo,For Your Babies (The Voice cover),You've got that look again\nThe one I hoped I ...,en,"['Pop', ' Axé', ' Romântico']"


In [4]:
# get song lyrics of the given genre 
song_lyrics = utils.get_lyrics_in_genre(song_df, GENRE, verbose=VERBOSE, by_verse=BY_VERSE, song_limit=SONG_LIMIT)

Selected 1 / 86991 in the genre Rock
Total sequences: 12


#### Fine Tuning GPT-2 Model Using Song Lyrics Data

In [5]:
# defining Dataset Class
class Dset(torch.utils.data.Dataset):
     """A custom dataset"""
     def __init__(self, data: list[list[int]]):
         self.data = []
         for d in data:
             input_ids = torch.tensor(d, dtype=torch.int64)
             attention_mask = torch.ones(len(d), dtype=torch.int64)
             self.data.append({'input_ids': input_ids,
                  'attention_mask': attention_mask, 'labels': input_ids})
 
     def __len__(self):
         return len(self.data)
 
     def __getitem__(self, idx: int):
         return self.data[idx]


In [6]:
def get_model_tokenizer():
    # get model
    pipe = transformers.pipeline(task='text-generation', model='gpt2', device=DEVICE)
    model = transformers.GPT2LMHeadModel.from_pretrained('gpt2')

    # set model configurations
    config = transformers.GPT2Config.from_pretrained('gpt2')
    config.do_sample = True
    config.max_length = MAX_SEQ_LEN

    # get tokenizer
    tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2')

    # add padding token to tokenizer
    tokenizer.add_special_tokens({'pad_token': "[PAD]"})

    model.config.pad_token_id = tokenizer.pad_token_id
    model = transformers.GPT2LMHeadModel.from_pretrained('gpt2',
                                                        config=model.config)
    model.resize_token_embeddings(len(tokenizer))

    return model, tokenizer

In [7]:
def train_model(model, dset_train, dset_val):
    
    # set training arguments
    training_args = transformers.TrainingArguments(
     output_dir="gpt2-lyrics-model_save/training_args",
     learning_rate=1e-3,
     per_device_train_batch_size=4, #TODO try changing to 20
     per_device_eval_batch_size=4, #TODO try changing to 20
     num_train_epochs=1,
     evaluation_strategy='epoch',
     save_strategy='no',
    )

    # train model
    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        train_dataset=dset_train,
        eval_dataset=dset_val,
    )

    trainer.train()

    return model

In [17]:
def generate_texts(model, tokenizer, n_texts):
    encoded_texts = []
    texts = []
    pad_token_id = 50257 # TODO generalize this

    for _ in range(n_texts):

        inputs = tokenizer(" ", return_tensors="pt")
        encoded_output = model.generate(**inputs, pad_token_id=pad_token_id)
        text_output = tokenizer.batch_decode(encoded_output)

        encoded_texts.append(encoded_output)
        texts.append(text_output[0])
    
    return encoded_texts, texts


In [9]:
# get model and tokenizer
model, tokenizer = get_model_tokenizer()

In [10]:
# create training, valdation, and testing data 
train_data, val_data = train_test_split(song_lyrics, train_size=0.8)

# encode data
train_encodings = [tokenizer(text=x, return_tensors='pt', padding='max_length', max_length=MAX_SEQ_LEN, truncation=True) for x in train_data]
train_encodings = [enc['input_ids'].tolist()[0] for enc in train_encodings]

val_encodings = [tokenizer(text=x, return_tensors='pt', padding='max_length', max_length=MAX_SEQ_LEN, truncation=True) for x in val_data]
val_encodings = [enc['input_ids'].tolist()[0] for enc in val_encodings]

In [11]:
# create training and valdation datasets
dset_train = Dset(train_encodings)
dset_val = Dset(val_encodings)

In [12]:
# fine tune the model
model = train_model(model, tokenizer, dset_train, dset_val)

                                             
100%|██████████| 3/3 [00:18<00:00,  6.03s/it]

{'eval_loss': 5.33721399307251, 'eval_runtime': 0.5971, 'eval_samples_per_second': 5.025, 'eval_steps_per_second': 1.675, 'epoch': 1.0}
{'train_runtime': 18.0906, 'train_samples_per_second': 0.497, 'train_steps_per_second': 0.166, 'train_loss': 21.97089131673177, 'epoch': 1.0}





In [18]:
# generate 3 lyrics
encoded_gen_texts, gen_texts = generate_texts(model, tokenizer, 3)

Compute Perplexity

In [30]:
def compute_perplexity(model, tokenizer, test_text):
    
    encodings = tokenizer("\n\n".join(test_text), return_tensors="pt")

    max_length = model.config.n_positions
    stride = 512
    seq_len = MAX_SEQ_LEN

    nlls = []
    prev_end_loc = 0
    for begin_loc in tqdm(range(0, seq_len, stride)):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(DEVICE)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)

            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).mean())
    # convert to scalar
    ppl = ppl.numpy().tolist()

    return ppl

In [33]:
# compute perplexity of generated lyrics
ppl = compute_perplexity(model, tokenizer, gen_texts)
ppl

  0%|          | 0/1 [00:00<?, ?it/s]


9566.6787109375