### Fine Tuning GPT-2 Model

In [13]:
import pandas as pd
import math
from gpt2_utils import Dset 
from gpt2_utils import get_model_tokenizer, train_model, generate_texts, compute_perplexity, load_model

Set notebook variables

In [14]:
# --- run configuration -------------------------------------------------------

# Lyric corpus to fine-tune on; the data-loading cell accepts 'country' or 'metal'.
GENRE = 'country'

# Name of this trained model, will be used for filename when saving the model
MODEL_INSTANCE_NAME = 'foo'

# Token length every lyric line is padded / truncated to when it is encoded.
MAX_SEQ_LEN = 10

# Device string handed to compute_perplexity.
DEVICE = 'cpu'

# Verbosity switch (not read by any cell visible in this notebook section).
VERBOSE = True

Read in train, validation, and test data

In [15]:
# read in cleaned data
def _read_lines(csv_path):
    """Load a header-less CSV and return its rows as a list of lists."""
    return pd.read_csv(csv_path, header=None).values.tolist()


# pick the (train, val, test) files that belong to the configured genre
if GENRE == 'country':
    split_paths = (
        'data/country_train.csv',
        'data/country_val.csv',
        'data/country_test.csv',
    )
elif GENRE == 'metal':
    split_paths = (
        'data/metal_train.csv',
        'data/metal_val.csv',
        'data/metal_test.csv',
    )
else:
    raise ValueError('Incorrect genre given.')

train_lines, val_lines, test_lines = map(_read_lines, split_paths)

In [16]:
# report how many lyric lines each split holds
for label, split in (
    ('train lines :', train_lines),
    ('val lines : ', val_lines),
    ('test lines : ', test_lines),
):
    print(label, len(split))

train lines : 149771
val lines :  18610
test lines :  19108


In [17]:
# Keep only the first quarter of the training lines (rounded up), presumably to
# shorten fine-tuning.
# NOTE: train_lines is rebound here, so re-running this cell without reloading
# the data shrinks the training set by another factor of four.
train_end = math.ceil(len(train_lines) / 4)
train_lines = train_lines[:train_end]

# The validation split is left at full size; the equivalent subsampling is kept
# below, disabled, for reference.
#val_end = math.ceil(len(val_lines)/4)
#val_lines = val_lines[0:val_end]

Fine Tuning GPT-2 Model

In [18]:
# get model and tokenizer
# get_model_tokenizer comes from gpt2_utils. MAX_SEQ_LEN is passed in, presumably
# so the tokenizer is set up for the same length used when encoding below — TODO confirm.
model, tokenizer = get_model_tokenizer(MAX_SEQ_LEN)

All TF 2.0 model weights were used when initializing GPT2LMHeadModel.

Some weights of GPT2LMHeadModel were not initialized from the TF 2.0 model and are newly initialized: ['lm_head.weight', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# encode data
def _encode_lines(lines):
    """Tokenise every entry of ``lines`` and return the token ids as plain lists.

    Each entry is padded or truncated to MAX_SEQ_LEN tokens. The tokenizer is
    asked for TensorFlow tensors; the first row of the resulting ``input_ids``
    is converted back to a Python list.
    """
    token_rows = []
    for line in lines:
        encoded = tokenizer(
            text=line,
            return_tensors='tf',
            padding='max_length',
            max_length=MAX_SEQ_LEN,
            truncation=True,
        )
        token_rows.append(encoded['input_ids'].numpy().tolist()[0])
    return token_rows


train_encodings = _encode_lines(train_lines)
val_encodings = _encode_lines(val_lines)
test_encodings = _encode_lines(test_lines)

In [20]:
# wrap the encoded training, validation, and testing splits in Dset objects
dset_train, dset_val, dset_test = map(
    Dset, (train_encodings, val_encodings, test_encodings)
)

In [None]:
# fine tune the model
# One epoch on dset_train with dset_val passed alongside (the recorded output shows an
# evaluation pass). GENRE and MODEL_INSTANCE_NAME are passed through, presumably to
# build the path the trained model is saved under — confirm in gpt2_utils.
# NOTE(review): the recorded run shows 375 steps for the ~37k subsampled training lines,
# so `batches=100` looks like a per-step batch size rather than a batch count — confirm.
# `model` is rebound to whatever train_model returns.
model = train_model(model, dset_train, dset_val, GENRE, MODEL_INSTANCE_NAME, batches=100, epochs=1, lr=0.001)

  0%|          | 0/375 [00:00<?, ?it/s]

  0%|          | 0/47 [00:00<?, ?it/s]

{'eval_loss': 4.997827529907227, 'eval_runtime': 72.765, 'eval_samples_per_second': 63.946, 'eval_steps_per_second': 0.646, 'epoch': 1.0}
{'train_runtime': 1491.3113, 'train_samples_per_second': 25.107, 'train_steps_per_second': 0.251, 'train_loss': 4.483689453125, 'epoch': 1.0}


In [10]:
# sample 15 lyric lines from the model fine-tuned above and print one per row;
# each generated text is a list of string pieces that are glued back together
gen_texts = generate_texts(model, tokenizer, 15)
for pieces in gen_texts:
    print(''.join(pieces))



 and we've gonna see
 it's in one
 like a little woman
 me than we're walking over my lips
 but all in heaven you're walking in the
 to lose me more i've been good on
 what you're all the sun
 the little to make your life.
 to buy you
 in fire like you can me
, just where they might make you hold me
 for you? not a few more i've

 in town and just all in home
 in the door


In [21]:
# Reload a previously saved checkpoint rather than using the model trained above.
# NOTE: both paths in this cell are hard-coded to "country" and do not follow GENRE.
loaded_model = load_model("gpt2_trained_models/country/gpt2_final_model_epoch1_half")

# generate_texts signature, kept for reference:
#   generate_texts(model: transformers.GPT2LMHeadModel,
#                  tokenizer: transformers.GPT2Tokenizer,
#                  n_texts: int, file_path=None) -> List[List[str]]
# The fourth positional argument below is file_path.
gen_texts = generate_texts(loaded_model, tokenizer, 100, "generated_txts/gpt2_country.txt")
for pieces in gen_texts:
    print(''.join(pieces))

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.





 you been just a different

, the man that i could get back
 to make it together, take a lot of
 us the things that said, we're right
: the music
 just where the way in the day
, the love is gone
 the angels can make a big wind

, not,
, and things about the whole sound of day
 in a hobo
 were we were one

's up
 the love are the things we're a shame
 the greatest folks the party is one and some
 all the moon and all he loves me
's gone on the wall
 to lose her
 and friends so hard i'm in my arms
 by
 to me too lonely
 him to be in my hands
 the suns, they're a good girl
 about us to be free
 to be a-home song,

 and it still takes your eyes


, if she'll be the same
 for once

, it out the storm.
 for a little and his song
 your lips
 the rest, for a perfect man

 my pride
 in our pride out for

 are all the way we go together


, as much happy day in his life
 with people to love you
s just just been the whole thing
, you still love them, i was a
 and the people ever left to me
 i

In [22]:
from tqdm import tqdm
import numpy as np
import tensorflow as tf
import torch
# Sliding-window perplexity of the reloaded model on the test split.
#
# Differences from a naive "mean of window losses":
#   * each window's mean loss is weighted by the number of tokens it actually
#     scored, so a short final window does not count as much as a full one;
#   * every window loss is reduced to a Python float straight away, so the
#     result does not depend on whether the TF model returns a per-token or
#     an already averaged loss;
#   * windows with no scored target (e.g. a trailing one-token window) are
#     skipped instead of contributing NaN;
#   * no torch.no_grad() — the model is a TensorFlow model, and TF only
#     records gradients inside a tf.GradientTape;
#   * `loaded_model` is used directly, so the freshly trained `model` from
#     the fine-tuning cell is no longer overwritten.
test_data = np.array(test_lines).flatten().tolist()

# begin func
# Tokenise the whole test split as one long stream. The tokenizer warns that
# the stream exceeds the model's maximum length; only windows of at most
# `max_length` tokens are ever fed to the model below.
encodings = tokenizer("\n\n".join(test_data), return_tensors="tf")

max_length = MAX_SEQ_LEN   # window size: same length the model was fine-tuned with
stride = max_length        # non-overlapping windows
seq_len = encodings.input_ids.shape[1]

nll_sum = 0.0        # sum over windows of (mean NLL * scored tokens)
n_scored_total = 0   # total number of tokens that contributed to the loss
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc]

    # Labels are a writable copy of the inputs; anything before the last
    # trg_len positions is context only and is masked out with -100.
    target_ids_np = np.array(input_ids.numpy())
    target_ids_np[:, :-trg_len] = -100

    # The model shifts labels left by one internally, so the label in column 0
    # is never scored; count only the remaining unmasked labels.
    n_scored = int(np.count_nonzero(target_ids_np[:, 1:] != -100))
    prev_end_loc = end_loc

    if n_scored > 0:
        outputs = loaded_model(input_ids, labels=tf.convert_to_tensor(target_ids_np))
        window_nll = float(tf.reduce_mean(outputs.loss))
        nll_sum += window_nll * n_scored
        n_scored_total += n_scored

    if end_loc == seq_len:
        break

# exp of the token-weighted mean negative log-likelihood
ppl = math.exp(nll_sum / n_scored_total) if n_scored_total else float('nan')

ppl


Token indices sequence length is longer than the specified maximum sequence length for this model (196059 > 1024). Running this sequence through the model will result in indexing errors
  7%|▋         | 1355/19606 [06:10<1:31:39,  3.32it/s]

In [72]:
# scratch check of the window size used by the perplexity loop
# NOTE(review): the recorded output (1024) does not match the 10 assigned in the
# perplexity cell and the execution count is out of order — the output appears stale.
max_length

1024

In [48]:
# scratch check: the window tensor left over from the last iteration of the perplexity loop
input_ids

<tf.Tensor: shape=(1, 10), dtype=int32, numpy=
array([[40252, 11752,  5156,  1560,   262,   582,   379,   262,  7846,
         1302]])>

In [None]:
# scratch check: last and first token id of the most recent window.
# A TF tensor cannot be indexed with a Python list, so do the fancy indexing on
# its NumPy view instead.
input_ids.numpy()[:, [-1, 0]]

Compute Perplexity

In [None]:
# compute perplexity of the reloaded model on the test lines
# (this scores test_lines, not the generated lyrics)
import numpy as np
#test_lines_first = test_lines[:math.ceil(len(test_lines)/4)]
# flatten the rows loaded from the CSV into one flat list
test_lines_flt = np.array(test_lines).flatten().tolist()
ppl = compute_perplexity(loaded_model, tokenizer, test_lines_flt, MAX_SEQ_LEN, DEVICE)
ppl