### Fine Tuning GPT-2 Model

In [None]:
import pandas as pd
import math
import numpy as np
from gpt2_utils import Dset 
from gpt2_utils import get_model_tokenizer, train_model, generate_texts, load_model, compute_perplexity

Set notebook variables

In [None]:
# Notebook-wide constants.
# MAX_SEQ_LEN is passed to get_model_tokenizer, used as the tokenizer's
# max_length when encoding, and passed to compute_perplexity.
MAX_SEQ_LEN = 10
# NOTE(review): DEVICE and VERBOSE are not referenced by any cell visible
# here - confirm whether they are still needed.
DEVICE = 'cpu'
VERBOSE = True

# Selects which data/<genre>_{train,val,test}.csv files are read;
# only 'country' and 'metal' are accepted by the loading cell.
GENRE = 'country'

# Name of this trained model, will be used for filename when saving the model
MODEL_INSTANCE_NAME = 'foo'

Read in train, validation, and test data

In [None]:
# Read in the cleaned train/validation/test splits for the selected genre.
if GENRE not in ('country', 'metal'):
    raise ValueError('Incorrect genre given.')


def _read_lines(split):
    """Read data/<GENRE>_<split>.csv (no header row) as a list of row lists."""
    return pd.read_csv(f'data/{GENRE}_{split}.csv', header=None).values.tolist()


train_lines = _read_lines('train')
val_lines = _read_lines('val')
test_lines = _read_lines('test')

In [None]:
# Report how many lines each split contains.
# Labels are kept verbatim, including their uneven spacing after the colon.
for label, lines in (
    ('train lines :', train_lines),
    ('val lines : ', val_lines),
    ('test lines : ', test_lines),
):
    print(label, len(lines))

In [None]:
# Keep only the first quarter (rounded up) of the training and validation
# lines; the test lines are left untouched.
train_end, val_end = (
    math.ceil(len(lines) / 4) for lines in (train_lines, val_lines)
)
train_lines, val_lines = train_lines[:train_end], val_lines[:val_end]

Fine Tuning GPT-2 Model

In [None]:
# get model and tokenizer
# MAX_SEQ_LEN is forwarded to the helper; the same value is used as the
# tokenizer's max_length when the data is encoded.
model, tokenizer = get_model_tokenizer(MAX_SEQ_LEN)

In [None]:
# encode data
def _encode_lines(lines):
    """Tokenize each entry of *lines* into a fixed-length list of token ids.

    Every entry is handed to the tokenizer unchanged, padded/truncated to
    MAX_SEQ_LEN, and only the first row of the returned ``input_ids`` is kept.
    Entries are converted one at a time, so the raw tokenizer outputs for a
    whole split are never held in memory together.

    NOTE(review): each entry is a row read from the CSV (a list of cells); if
    a row had more than one cell, presumably only the first cell's ids would
    survive the ``[0]`` - confirm the data files are single-column.
    """
    return [
        tokenizer(
            text=entry,
            return_tensors='tf',
            padding='max_length',
            max_length=MAX_SEQ_LEN,
            truncation=True,
        )['input_ids'].numpy().tolist()[0]
        for entry in lines
    ]


train_encodings = _encode_lines(train_lines)
val_encodings = _encode_lines(val_lines)
test_encodings = _encode_lines(test_lines)

In [None]:
# Wrap the encoded training, validation, and testing splits in Dset objects.
dset_train, dset_val, dset_test = map(
    Dset, (train_encodings, val_encodings, test_encodings)
)

In [None]:
# NOTE: only uncomment below if you want to fine tune a model. It may take a long time to run.
# # fine tune the model
# model = train_model(model, dset_train, dset_val, GENRE, MODEL_INSTANCE_NAME, batches=100, epochs=1, lr=0.001)

In [None]:
# Generate lyrics with the current model and print each one on its own line.
# NOTE(review): 15 is presumably the number of texts to generate - confirm
# against generate_texts.
gen_texts = generate_texts(model, tokenizer, 15)
for pieces in gen_texts:
    lyric = ''.join(pieces)
    print(lyric)

Generate Text from a Loaded Model

In [None]:
# Load a previously saved model and generate lyrics with it.
# NOTE(review): the path is hard-coded to a 'metal' model regardless of
# GENRE - confirm this is intended.
saved_model_path = "gpt2_trained_models/metal/100_lines_10_epoch"
loaded_model = load_model(saved_model_path)

# NOTE(review): the last argument looks like an output file for the generated
# texts - confirm against generate_texts.
gen_texts = generate_texts(loaded_model, tokenizer, 2, "generated_txts/foo.txt")
for pieces in gen_texts:
    lyric = ''.join(pieces)
    print(lyric)

Compute Perplexity

In [None]:
# Compute perplexity on the test data.
# The rows of test_lines are flattened into a single flat list first.
# NOTE(review): compute_perplexity is given the path of a saved 'metal' model
# regardless of GENRE - confirm this is intended.
test_lines_flt = np.ravel(test_lines).tolist()
ppl = compute_perplexity(
    'gpt2_trained_models/metal/100_lines_10_epoch',
    tokenizer,
    test_lines_flt,
    MAX_SEQ_LEN,
)
ppl