In [28]:
import pandas as pd
import transformers
import torch 
import math
from itertools import chain

In [29]:
GENRE = 'Rock'
# use all songs
N_SONGS = 15

In [30]:
# read in  data
df = pd.read_csv('clean_data.csv')
# filter data by genre
genre_df = df[df.genres.apply(lambda x: GENRE in x)]

# save lyrics
lyrics = genre_df.lyrics.values
# split lyrics by line
lines = [song.split('\n') for song in lyrics[0:N_SONGS]]
lines = list(chain.from_iterable(lines))
# remove empty lines
lines = [line for line in lines if len(line)>0]


In [31]:
# name model
MODEL_NAME = 'gpt2'
pipe = transformers.pipeline(task='text-generation', model=MODEL_NAME, device='cpu')

In [106]:
# set model 
model = transformers.GPT2LMHeadModel.from_pretrained(MODEL_NAME)

# set model configurations
config = transformers.GPT2Config.from_pretrained(MODEL_NAME)
config.do_sample = True
config.max_length = 50

In [121]:
# set tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)

# add padding token to tokenizer
tokenizer.add_special_tokens({'pad_token': "[PAD]"})

# encode training data
enc_tokens = [tokenizer(text=line, return_tensors='pt', padding='max_length', max_length=50, truncation=True) for line in lines]
enc_tokens = [enc['input_ids'].tolist()[0] for enc in enc_tokens]

In [115]:
model = transformers.GPT2LMHeadModel.from_pretrained(MODEL_NAME,
                                                      config=config)

In [97]:
class MyDset(torch.utils.data.Dataset):
     """A custom dataset"""
     def __init__(self, data: list[list[int]]):
         self.data = []
         for d in data:
             input_ids = torch.tensor(d, dtype=torch.int64)
             attention_mask = torch.ones(len(d), dtype=torch.int64)
             self.data.append({'input_ids': input_ids,
                  'attention_mask': attention_mask, 'labels': input_ids})
 
     def __len__(self):
         return len(self.data)
 
     def __getitem__(self, idx: int):
         return self.data[idx]


In [98]:
training_args = transformers.TrainingArguments(
     output_dir="idiot_save/",
     learning_rate=1e-3,
     per_device_train_batch_size=4, #TODO try changing to 20
     per_device_eval_batch_size=4, #TODO try changing to 20
     num_train_epochs=1,
     evaluation_strategy='epoch',
     save_strategy='no',
 )

In [99]:
# create training, valdation, and testing data intervals
END1 = math.ceil(len(enc_tokens)*0.8)
END2 = END1 + math.ceil(len(enc_tokens)*0.1)

# create training, valdation, and testing data
dset_train = MyDset(enc_tokens[0:END1])
dset_val = MyDset(enc_tokens[END1:END2])
dset_test = MyDset(enc_tokens[END2:])

In [100]:
# train model
trainer = transformers.Trainer(
     model=model,
     args=training_args,
     train_dataset=dset_train,
     eval_dataset=dset_val,
 )

trainer.train()

  0%|          | 0/88 [00:00<?, ?it/s]

IndexError: index out of range in self

In [88]:
inputs = tokenizer("", return_tensors="pt")
generation_output = model.generate(**inputs, pad_token_id=50256)

IndexError: index -1 is out of bounds for dimension 1 with size 0

In [72]:
tokenizer.batch_decode(generation_output)

NameError: name 'generation_output' is not defined

In [None]:
lines

['Giant steps are what you take',
 'Walking on the moon',
 "I hope my legs don't break",
 'Walking on the moon',
 'We could walk forever',
 'Walking on the moon',
 'We could live together',
 'Walking on, walking on the moon',
 'Walking back from your house',
 'Walking on the moon',
 'Walking back from your house',
 'Walking on the moon',
 'Feet they hardly touch the ground',
 'Walking on the moon',
 "My feet don't hardly make no sound",
 'Walking on, walking on the moon',
 'Some may say',
 "I'm wishing my days away",
 'No way',
 "And if it's the price I pay",
 'Some say',
 "Tomorrow's another day",
 'You stay',
 'I may as well play',
 'Giant steps are what you take',
 'Walking on the moon',
 "I hope my legs don't break",
 'Walking on the moon',
 'We could walk forever',
 'Walking on the moon',
 'We could be together',
 'Walking on, walking on the moon',
 'Some may say',
 "I'm wishing my days away",
 'No way',
 "And if it's the price I pay",
 'Some say',
 "Tomorrow's another day",
 'You

In [None]:
generation_output

tensor([[  220,   460,   470,  1037,    13, 50256]])