### Fine Tuning GPT-2 Model

In [None]:
import pandas as pd
from gpt2_utils import Dset 
from gpt2_utils import get_model_tokenizer, train_model, generate_texts, compute_perplexity, load_model

Set notebook variables

In [None]:
# constants
# Sequence length handed to get_model_tokenizer, used as the tokenizer's
# padding/truncation max_length, and passed to compute_perplexity.
MAX_SEQ_LEN = 10
# Device string passed to compute_perplexity.
DEVICE = 'cpu'
# NOTE(review): not referenced by any cell visible in this notebook export.
VERBOSE = True

# Selects which '<genre>_{train,val,test}.csv' files are loaded; only
# 'country' and 'metal' are accepted (anything else raises ValueError).
# Also forwarded to train_model.
GENRE = 'metal'

# Name of this trained model, will be used for filename when saving the model
# NOTE(review): the name spells out "50_epoch" and "0.0001_lr", which duplicate
# the epochs/lr values hard-coded in the train_model call -- keep them in sync.
MODEL_INSTANCE_NAME = 'all_songs_50_epoch_0.0001_lr'

Read in train, validation, and test data

In [None]:
# read in cleaned data
# Per-genre (train, val, test) CSV files; the files are read without a header
# row, so every line of a file becomes one row.
_GENRE_FILES = {
    'country': ('country_train.csv', 'country_val.csv', 'country_test.csv'),
    'metal': ('metal_train.csv', 'metal_val.csv', 'metal_test.csv'),
}

if GENRE not in _GENRE_FILES:
    raise ValueError('Incorrect genre given.')

# Each split ends up as a list of rows (one inner list per CSV line).
train_lines, val_lines, test_lines = (
    pd.read_csv(path, header=None).values.tolist()
    for path in _GENRE_FILES[GENRE]
)

In [None]:
# report how many rows were loaded for each split
for _label, _rows in (
    ('train lines :', train_lines),
    ('val lines : ', val_lines),
    ('test lines : ', test_lines),
):
    print(_label, len(_rows))

Fine Tuning GPT-2 Model

In [None]:
# get model and tokenizer
# get_model_tokenizer comes from gpt2_utils and returns a (model, tokenizer)
# pair. MAX_SEQ_LEN is presumably used to configure the maximum sequence
# length -- TODO confirm against gpt2_utils.
model, tokenizer = get_model_tokenizer(MAX_SEQ_LEN)

In [None]:
# Keep only the first 100 rows of every split (presumably to keep this run
# small -- the row counts printed earlier refer to the full files).
train_lines, val_lines, test_lines = (
    rows[:100] for rows in (train_lines, val_lines, test_lines)
)

In [None]:
# encode data
def _encode_lines(lines):
    """Tokenize every entry of *lines* into a list of token ids.

    Each entry is passed to the module-level ``tokenizer`` with padding and
    truncation to ``MAX_SEQ_LEN``; the resulting ``input_ids`` tensor is
    converted to nested lists and only its first row is kept.

    Assumes each entry is a one-item row as produced by
    ``read_csv(...).values.tolist()`` -- if a row held several strings, only
    the first encoded sequence would be retained (same as before).

    Returns:
        A list with one list of token ids per entry of *lines*.
    """
    return [
        tokenizer(
            text=line,
            return_tensors='pt',
            padding='max_length',
            max_length=MAX_SEQ_LEN,
            truncation=True,
        )['input_ids'].tolist()[0]
        for line in lines
    ]


# One shared helper replaces three copy-pasted tokenization passes, and the
# intermediate list of encoding objects is no longer materialized.
train_encodings = _encode_lines(train_lines)
val_encodings = _encode_lines(val_lines)
test_encodings = _encode_lines(test_lines)

In [None]:
# create training, validation, and testing datasets
# Dset (from gpt2_utils) wraps each split's list of token-id lists.
dset_train, dset_val, dset_test = (
    Dset(encodings)
    for encodings in (train_encodings, val_encodings, test_encodings)
)

In [None]:
# fine tune the model
# train_model (from gpt2_utils) receives the train and validation datasets
# plus the genre and instance name; its return value replaces `model`.
# NOTE(review): epochs/lr are also spelled out in MODEL_INSTANCE_NAME -- keep
# both in sync. Whether `batches` is a batch size or a batch count is not
# visible here; confirm against gpt2_utils.
model = train_model(
    model,
    dset_train,
    dset_val,
    GENRE,
    MODEL_INSTANCE_NAME,
    batches=20,
    epochs=50,
    lr=0.0001,
)

In [None]:
# generate lyrics
# generate_texts is imported from gpt2_utils; the third argument (15) is
# presumably the number of texts to generate -- TODO confirm against gpt2_utils.
gen_texts = generate_texts(model, tokenizer, 15)
for text in gen_texts:
    # Each item is joined with no separator before printing, so it is expected
    # to be a string or an iterable of string pieces.
    print(''.join(text))

Compute Perplexity

In [None]:
# compute perplexity of the fine-tuned model on the test lines
import numpy as np

# test_lines is a list of rows; collapse it into one flat list of entries
# before handing it to compute_perplexity (from gpt2_utils).
test_lines_flt = np.array(test_lines).ravel().tolist()
ppl = compute_perplexity(model, tokenizer, test_lines_flt, MAX_SEQ_LEN, DEVICE)
# bare expression so the notebook displays the value
ppl