### Fine Tuning GPT-2 Model

In [1]:
import pandas as pd
import transformers
import torch 
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split
import utils

In [2]:
# constants 
MAX_SEQ_LEN = 10
DEVICE = 'cpu'
VERBOSE = True

GENRE = 'Pop'
SONG_LIMIT = 5


# True to train by groups of lines, False to train by single lines 
BY_VERSE = False 

In [3]:
# read in cleaned data
song_df = pd.read_csv('clean_data.csv')
print(song_df.shape)
song_df.head(5)

(163020, 5)


Unnamed: 0,artist,song_name,lyrics,language,genres
0,ivete sangalo,Careless Whisper,I feel so unsure\nAs I take your hand and lead...,en,"['Pop', ' Axé', ' Romântico']"
1,ivete sangalo,Could You Be Loved / Citação Musical do Rap: S...,"Don't let them fool, ya\nOr even try to school...",en,"['Pop', ' Axé', ' Romântico']"
2,ivete sangalo,Cruisin' (Part. Saulo),"Baby, let's cruise, away from here\nDon't be c...",en,"['Pop', ' Axé', ' Romântico']"
3,ivete sangalo,Easy,"Know it sounds funny\nBut, I just can't stand ...",en,"['Pop', ' Axé', ' Romântico']"
4,ivete sangalo,For Your Babies (The Voice cover),You've got that look again\nThe one I hoped I ...,en,"['Pop', ' Axé', ' Romântico']"


In [4]:
# get song lyrics of the given genre 
song_lyrics = utils.get_lyrics_in_genre(song_df, GENRE, verbose=VERBOSE, by_verse=BY_VERSE, song_limit=SONG_LIMIT)

Selected 5 / 47162 in the genre Pop
Total sequences: 138


#### Fine Tuning GPT-2 Model Using Song Lyrics Data

In [35]:
# defining Dataset Class
class Dset(torch.utils.data.Dataset):
     """A custom dataset"""
     def __init__(self, data: list[list[int]]):
         self.data = []
         for d in data:
             input_ids = torch.tensor(d, dtype=torch.int64)
             attention_mask = torch.ones(len(d), dtype=torch.int64)
             self.data.append({'input_ids': input_ids,
                  'attention_mask': attention_mask, 'labels': input_ids})
 
     def __len__(self):
         return len(self.data)
 
     def __getitem__(self, idx: int):
         return self.data[idx]


In [40]:
def get_model_tokenizer():
    # get model
    # pipe = transformers.pipeline(task='text-generation', model='gpt2', device=DEVICE)
    model = transformers.GPT2LMHeadModel.from_pretrained('gpt2')

    # set model configurations
    config = transformers.GPT2Config.from_pretrained('gpt2')
    config.do_sample = True
    config.max_length = MAX_SEQ_LEN

    # get tokenizer
    tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2')

    # add padding token to tokenizer
    tokenizer.add_special_tokens({'pad_token': "[PAD]"})

    model.config.pad_token_id = tokenizer.pad_token_id
    model = transformers.GPT2LMHeadModel.from_pretrained('gpt2',
                                                        config=model.config)
    model.resize_token_embeddings(len(tokenizer))

    return model, tokenizer

In [29]:
tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50257: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [37]:
def train_model(model, dset_train, dset_val):
    
    # set training arguments
    training_args = transformers.TrainingArguments(
     output_dir="gpt2-lyrics-model_save/training_args",
     learning_rate=1e-3,
     per_device_train_batch_size=4, #TODO try changing to 20
     per_device_eval_batch_size=4, #TODO try changing to 20
     num_train_epochs=1,
     evaluation_strategy='epoch',
     save_strategy='no',
    )

    # train model
    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        train_dataset=dset_train,
        eval_dataset=dset_val,
    )

    trainer.train()

    

    return model

In [38]:
def generate_texts(model, tokenizer, n_texts):
    encoded_texts = []
    texts = []

    for _ in range(n_texts):
        input_str = np.random.choice(list(tokenizer.get_vocab().keys()))
        print(input_str)

        # input_ids = tokenizer(input_str, return_tensors="pt").input_ids
        encoded_output = model.generate(remove_invalid_values=True, max_length=10, do_sample=True).numpy().tolist()[0]
        text_output = tokenizer.batch_decode(encoded_output, skip_special_tokens=True)
        print(text_output[1:])

        encoded_texts.append(encoded_output[1:])
        texts.append(text_output[1:])
    
    return encoded_texts, texts


In [39]:
# generate 3 lyrics
encoded_gen_texts, gen_texts = generate_texts(model, tokenizer, 3)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


['ups', ' and', ' now', ' i', ' can', ' dance', ' with', ' a', ' pol']


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


['w', 'elcome', ' to', ' the', ' freak', 'show', ')', '', '']
['w', 'elcome', ' to', ' the', ' freak', 'show', ')', '', '']


In [48]:
from transformers import pipeline
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
generated_text = text_generator(" ", max_length=100, num_return_sequences=1)
print(generated_text[0]['generated_text'])

  it freakshow i my freakshow andshow my janesand you have a you've've toshow's the freakshowshow i was that you baby you you is the life and but your freakshow at can i like we a said like your it is freakshow, and my freakshow is where you freakshow freakshow at the likeshow it freakshow's i'm de's where you are it and i in you is you mak it's a you's a loveshow


In [None]:
train_data[100]

"i don't touch i just leave it up to tock and dem"

In [106]:
t = tokenizer(text=train_data[200], return_tensors='pt', padding='max_length', max_length=MAX_SEQ_LEN, truncation=True)
t['input_ids']

tensor([[ 1350,   257,  4692,  1110,   287,  5968,  1674,   484, 13972,   319]])

In [107]:
tokenizer.get_vocab().keys()



In [108]:
{k:v for k,v in zip(tokenizer.get_vocab().keys(), tokenizer.get_vocab().values()) if v in[72,  836,  470, 3638, 1312,  655, 2666,  340,  510,  284]}

{'Ġto': 284,
 'Ġleave': 2666,
 'Ġup': 510,
 "'t": 470,
 'Ġdon': 836,
 'i': 72,
 'Ġit': 340,
 'Ġtouch': 3638,
 'Ġjust': 655,
 'Ġi': 1312}

In [41]:
# get model and tokenizer
model, tokenizer = get_model_tokenizer()

In [42]:
# create training, valdation, and testing data 
train_data, val_data = train_test_split(song_lyrics, train_size=0.8)

# encode data
train_encodings = [tokenizer(text=x, return_tensors='pt', padding='max_length', max_length=MAX_SEQ_LEN, truncation=True) for x in train_data]
train_encodings = [enc['input_ids'].tolist()[0] for enc in train_encodings]

val_encodings = [tokenizer(text=x, return_tensors='pt', padding='max_length', max_length=MAX_SEQ_LEN, truncation=True) for x in val_data]
val_encodings = [enc['input_ids'].tolist()[0] for enc in val_encodings]

In [43]:
# create training and valdation datasets
dset_train = Dset(train_encodings)
dset_val = Dset(val_encodings)

In [44]:
# fine tune the model
model = train_model(model, dset_train, dset_val)

  0%|          | 0/28 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 4.02815580368042, 'eval_runtime': 0.7569, 'eval_samples_per_second': 36.991, 'eval_steps_per_second': 9.248, 'epoch': 1.0}
{'train_runtime': 22.3741, 'train_samples_per_second': 4.916, 'train_steps_per_second': 1.251, 'train_loss': 9.222093854631696, 'epoch': 1.0}


In [52]:
# generate 3 lyrics
encoded_gen_texts, gen_texts = generate_texts(model, tokenizer, 3)
print(''.join(gen_texts[0]))

[' de', "'", 'and', 'show', 'show', ' you', ' it', "'s", ' you']
[')', 'i', "'ve", ' but', ')', ' freak', 'show', ',', ' but']
['and', '(', 'w', 'elcome', ' to', ' a', ' the', ' freak', 'show']
 de'andshowshow you it's you


In [24]:
trained_comma_model

Compute Perplexity

In [14]:
def compute_perplexity(model, tokenizer, test_text):
    
    encodings = tokenizer("\n\n".join(test_text), return_tensors="pt")

    max_length = model.config.n_positions
    stride = 512
    seq_len = MAX_SEQ_LEN

    nlls = []
    prev_end_loc = 0
    for begin_loc in tqdm(range(0, seq_len, stride)):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(DEVICE)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)

            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).mean())
    # convert to scalar
    ppl = ppl.numpy().tolist()

    return ppl

In [54]:
# compute perplexity of generated lyrics
ppl = compute_perplexity(model, tokenizer, val_data)
ppl

Token indices sequence length is longer than the specified maximum sequence length for this model (3348 > 1024). Running this sequence through the model will result in indexing errors
  0%|          | 0/1 [00:00<?, ?it/s]


515.04296875