In [6]:
# Related dataset (not the one downloaded below): https://www.kaggle.com/datasets/markkorvin/large-metal-lyrics-archive-228k-songs

In [9]:
!pip install kaggle



In [10]:
!kaggle datasets download neisse/scrapped-lyrics-from-6-genres.zip

Traceback (most recent call last):
  File "/home/alxmke/.pyenv/versions/3.10.9/bin/kaggle", line 5, in <module>
    from kaggle.cli import main
  File "/home/alxmke/.pyenv/versions/3.10.9/lib/python3.10/site-packages/kaggle/__init__.py", line 23, in <module>
    api.authenticate()
  File "/home/alxmke/.pyenv/versions/3.10.9/lib/python3.10/site-packages/kaggle/api/kaggle_api_extended.py", line 164, in authenticate
    raise IOError('Could not find {}. Make sure it\'s located in'
OSError: Could not find kaggle.json. Make sure it's located in /home/alxmke/.kaggle. Or use the environment method.


In [26]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv

### Prepare data
# Fixed seed so the held-out test split is identical on every run.
TEST_SPLIT_SEED = 42

# Keep English-language songs only.
lyrics = pd.read_csv('lyrics-data.csv')
lyrics = lyrics[lyrics['language'] == 'en']

# Keep artists whose genre is 'Rock' and whose popularity is above 5.
artists = pd.read_csv('artists-data.csv')
artists = artists[(artists['Genres'].isin(['Rock'])) & (artists['Popularity'] > 5)]

# Attach artist name and genre to each song through the artist link, then drop
# the link/language columns that are no longer needed.
df = lyrics.merge(
    artists[['Artist', 'Genres', 'Link']],
    left_on='ALink',
    right_on='Link',
    how='inner',
)
df = df.drop(columns=['ALink', 'SLink', 'language', 'Link'])

# Drop the songs with lyrics too long (after more than 1024 tokens, does not work).
# Length is measured in space-separated words; a missing lyric yields NaN here,
# which fails the comparison and is therefore dropped instead of raising.
word_counts = df['Lyric'].str.split(' ').str.len()
df = df[word_counts < 350]

# Create a very small test set to compare generated text with the reality.
test_set = df.sample(n=200, random_state=TEST_SPLIT_SEED)
df = df.loc[~df.index.isin(test_set.index)]

# Reset the indexes (the previous index is kept as an 'index' column).
test_set = test_set.reset_index()
df = df.reset_index()

# For the test set only, keep the last 20 words in a new column, then remove
# them from the original column so it can serve as the generation prompt.
lyric_words = test_set['Lyric'].str.split()
test_set['True_end_lyrics'] = lyric_words.str[-20:].apply(' '.join)
test_set['Lyric'] = lyric_words.str[:-20].apply(' '.join)
print(test_set['Lyric'])
print(test_set['True_end_lyrics'])

0      (As heard on the ziggy farwell concert lp.) Wh...
1      Strange infatuation seems to grace the evening...
2      That always end up point blank, shot between t...
3      She had an horror of rooms she was tired you c...
4      I went out walking the other day Seen a little...
                             ...                        
195    Men walkin' 'long the railroad tracks Goin' so...
196    Now you're running with the right crowd Well y...
197    I'm waiting for my man Twenty-six dollars in m...
198    What is this land of America, so many travel t...
199    It was on the day the cowboys were abandoned f...
Name: Lyric, Length: 200, dtype: object
0      yeah (Oh, oh, oh, oh) here she comes (Oh, oh, ...
1      I Fall Without you, I'm nothing Without you, I...
2      to fight Point blank they must have shot you i...
3      light know what I mean (Chorus) (twice) (Choru...
4      peace of mind That's why I'll keep searching t...
                             ...                

In [27]:
class SongLyrics(Dataset):
    """Tokenized song lyrics for GPT-2 fine-tuning.

    Every item is a 1-D tensor of token ids encoding the text
    ``<|control_code|>`` + lyric + ``<|endoftext|>``.

    Args:
        control_code: Short string placed between ``<|`` and ``|>`` at the start
            of every sample. It is interpolated with an f-string, so it must be
            a plain string: passing a pandas Series would embed the Series'
            printed representation in every training sample.
        truncate: When true, only the first 20000 lyrics are used.
        gpt2_type: Name of the pretrained GPT-2 tokenizer to load.
        max_length: Maximum number of *characters* kept from each lyric before
            tokenization (this is a character slice, not a token limit).
        lyrics: Optional iterable of lyric strings. Defaults to the notebook's
            global ``df['Lyric']`` column.
    """

    def __init__(self, control_code, truncate=False, gpt2_type="gpt2", max_length=1024, lyrics=None):
        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)

        if lyrics is None:
            lyrics = df['Lyric']
        rows = list(lyrics)
        if truncate:
            # Cut before tokenizing so discarded rows are never encoded.
            rows = rows[:20000]

        self.lyrics = [
            torch.tensor(
                self.tokenizer.encode(f"<|{control_code}|>{row[:max_length]}<|endoftext|>")
            )
            for row in rows
        ]
        self.lyrics_count = len(self.lyrics)

    def __len__(self):
        """Return the number of tokenized lyrics."""
        return self.lyrics_count

    def __getitem__(self, item):
        """Return the token-id tensor stored at position ``item``."""
        return self.lyrics[item]

# The control code is a short tag, not the lyric column itself.
dataset = SongLyrics("lyrics", truncate=True, gpt2_type="gpt2")

In [28]:
# Load the pretrained GPT-2 tokenizer and language-model head from the 'gpt2' checkpoint.
# `model` is the network passed to train() below; `tokenizer` and `model` are
# both read as globals by the text-generation cell.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

#Accumulated batch size (since GPT2 is so big)
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    """Try to merge ``new_tensor`` into the sequence packed so far.

    Both tensors are indexed as 2-D (rows, sequence). Returns a triple
    ``(tensor, fitted, leftover)``:

    * nothing packed yet -> ``(new_tensor, True, None)``
    * combined length exceeds ``max_seq_len`` -> ``(packed_tensor, False, new_tensor)``
    * otherwise -> ``(new_tensor followed by packed_tensor minus its first
      column, True, None)``
    """
    if packed_tensor is None:
        return new_tensor, True, None

    combined_len = new_tensor.size(1) + packed_tensor.size(1)
    if combined_len <= max_seq_len:
        # The new tensor goes in front; the first column of the existing pack is dropped.
        merged = torch.cat((new_tensor, packed_tensor[:, 1:]), dim=1)
        return merged, True, None

    return packed_tensor, False, new_tensor

In [29]:
def train(
    dataset,
    model,
    tokenizer,
    batch_size=16,
    epochs=5,
    lr=2e-5,
    max_seq_len=400,
    warmup_steps=200,
    gpt2_type="gpt2",
    output_dir=".",
    output_prefix="wreckgar",
    test_mode=False,
    save_model_on_epoch=False,
):
    """Fine-tune ``model`` on ``dataset`` using sequence packing and gradient accumulation.

    Samples are drawn one at a time and merged with ``pack_tensor`` until the
    next sample would push the packed sequence past ``max_seq_len``; the packed
    sequence is then used for one forward/backward pass with the inputs as
    labels. A sample that did not fit is carried over to start the next pack.
    The optimizer steps once every ``batch_size`` passes, and once more at the
    end of each epoch if gradients are still pending.

    Args:
        dataset: Dataset yielding 1-D token-id tensors.
        model: Language model called as ``model(input_ids, labels=input_ids)``
            whose first output is the loss.
        tokenizer: Unused; accepted for backward compatibility.
        batch_size: Number of forward/backward passes accumulated per optimizer
            step. Gradients are summed over the window, not averaged.
        epochs: Number of passes over ``dataset``.
        lr: Peak learning rate for AdamW.
        max_seq_len: Length limit handed to ``pack_tensor``.
        warmup_steps: Optimizer steps of linear learning-rate warmup.
        gpt2_type: Unused; accepted for backward compatibility.
        output_dir: Directory for per-epoch checkpoints.
        output_prefix: File-name prefix for per-epoch checkpoints.
        test_mode: Unused; accepted for backward compatibility.
        save_model_on_epoch: When true, save ``model.state_dict()`` after each epoch.

    Returns:
        The trained model, left on the device used for training.
    """
    import os

    # Use the GPU when present, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.train()

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    entries_per_epoch = len(train_dataloader)

    # Every pass consumes at least one sample, so this bounds the number of
    # optimizer steps. The scheduler needs a positive total: with a negative
    # value the linear schedule evaluates to 0 as soon as warmup ends.
    steps_per_epoch = (entries_per_epoch + batch_size - 1) // batch_size
    total_steps = max(1, steps_per_epoch * epochs)

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
    )

    def apply_update():
        # One optimizer/scheduler step, then clear the accumulated gradients.
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    loss = 0
    for epoch in range(epochs):

        print(f"Training epoch {epoch}")
        print(loss)

        input_tensor = None
        pending_passes = 0  # backward passes since the last optimizer step

        for idx, entry in tqdm(enumerate(train_dataloader)):
            input_tensor, carry_on, remainder = pack_tensor(entry, input_tensor, max_seq_len)
            is_last_entry = idx == entries_per_epoch - 1

            # Keep packing while there is room and more samples are coming.
            if carry_on and not is_last_entry:
                continue

            ready = [input_tensor]
            if is_last_entry and remainder is not None:
                # Nothing follows the final sample, so train on it right away.
                ready.append(remainder)
                remainder = None

            for packed in ready:
                packed = packed.to(device)
                outputs = model(packed, labels=packed)
                loss = outputs[0]
                loss.backward()

                pending_passes += 1
                if pending_passes == batch_size:
                    apply_update()
                    pending_passes = 0

            # The sample that did not fit starts the next pack instead of being dropped.
            input_tensor = remainder

        # Apply gradients from an incomplete accumulation window.
        if pending_passes:
            apply_update()

        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )
    return model

In [30]:
# Fine-tune with the default hyper-parameters; `model` is rebound to the result.
model = train(dataset=dataset, model=model, tokenizer=tokenizer)



Training epoch 0
0


705it [00:31, 22.14it/s]


Training epoch 1
tensor(3.2764, device='cuda:0', grad_fn=<NllLossBackward0>)


705it [00:30, 22.81it/s]


Training epoch 2
tensor(2.6462, device='cuda:0', grad_fn=<NllLossBackward0>)


705it [00:32, 21.45it/s]


Training epoch 3
tensor(2.6374, device='cuda:0', grad_fn=<NllLossBackward0>)


705it [00:31, 22.64it/s]


Training epoch 4
tensor(1.6928, device='cuda:0', grad_fn=<NllLossBackward0>)


705it [00:31, 22.45it/s]


In [31]:
# Print the repr of the model's bound `generate` method; the repr of a bound
# method includes the repr of the object it is bound to, i.e. the model layout.
print(model.generate)

<bound method GenerationMixin.generate of GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)>


In [32]:
def generate(
    model,
    tokenizer,
    prompt,
    entry_count=10,
    entry_length=30, #maximum number of tokens sampled per entry
    top_p=0.8,
    temperature=1.,
):
    """Sample ``entry_count`` continuations of ``prompt`` with nucleus (top-p) sampling.

    Args:
        model: Language model; ``model(input_ids)[0]`` must be the logits of
            shape (batch, sequence, vocabulary).
        tokenizer: Object providing ``encode(str) -> list[int]`` and
            ``decode(list[int]) -> str``.
        prompt: Text to continue. If it encodes to no tokens, generation is
            seeded with the end-of-text token instead of failing.
        entry_count: Number of independent continuations to produce.
        entry_length: Maximum number of tokens sampled per continuation.
        top_p: Cumulative-probability threshold of the nucleus filter.
        temperature: Divisor applied to the logits; values <= 0 are treated as 1.0.

    Returns:
        A list of ``entry_count`` strings, each the decoded prompt plus the
        sampled tokens. Sampling stops at the first end-of-text token; if the
        length limit is reached first, the literal ``<|endoftext|>`` is appended.
    """
    model.eval()
    generated_list = []

    filter_value = -float("Inf")

    # Build inputs on the model's own device so CPU and GPU models both work.
    device = next(model.parameters()).device
    eos_ids = tokenizer.encode("<|endoftext|>")  # encoded once, not on every step
    prompt_ids = tokenizer.encode(prompt) or list(eos_ids)
    scale = temperature if temperature > 0 else 1.0

    with torch.no_grad():

        for entry_idx in trange(entry_count):

            entry_finished = False
            generated = torch.tensor(prompt_ids, dtype=torch.long, device=device).unsqueeze(0)

            for i in range(entry_length):
                # No labels: only the logits are needed, so no loss is computed.
                logits = model(generated)[0]
                logits = logits[:, -1, :] / scale

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                # Mark tokens past the top_p mass; shifting right by one keeps
                # the first token that crosses the threshold, and the most
                # likely token is always kept.
                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                    ..., :-1
                ].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)

                if next_token.item() in eos_ids:
                    entry_finished = True
                    break

            output_text = tokenizer.decode(generated[0].tolist())
            if not entry_finished:
                output_text = f"{output_text}<|endoftext|>"
            generated_list.append(output_text)

    return generated_list

#Function to generate multiple sentences. Test data should be a dataframe
def text_generation(test_data):
    """Generate one continuation for every prompt in ``test_data['Lyric']``.

    Uses the notebook-level ``model`` (moved to the CPU) and ``tokenizer``.
    Rows are visited in positional order, so the DataFrame's index labels do
    not matter.

    Args:
        test_data: DataFrame with a 'Lyric' column of prompt strings.

    Returns:
        A list with one element per row; each element is the list returned by
        ``generate(..., entry_count=1)`` for that row's prompt.
    """
    cpu_model = model.to('cpu')  # moved once instead of once per row
    generated_lyrics = []
    for prompt in test_data['Lyric']:
        generated_lyrics.append(generate(cpu_model, tokenizer, prompt, entry_count=1))
    return generated_lyrics

# Generate one continuation per test-set prompt.
generated_lyrics = text_generation(test_data=test_set)


100%|██████████| 1/1 [00:08<00:00,  8.67s/it]
100%|██████████| 1/1 [00:04<00:00,  4.39s/it]
100%|██████████| 1/1 [00:08<00:00,  8.52s/it]
100%|██████████| 1/1 [00:04<00:00,  4.44s/it]
100%|██████████| 1/1 [00:04<00:00,  4.59s/it]
100%|██████████| 1/1 [00:11<00:00, 11.48s/it]
100%|██████████| 1/1 [00:06<00:00,  6.00s/it]
100%|██████████| 1/1 [00:10<00:00, 10.34s/it]
100%|██████████| 1/1 [00:03<00:00,  3.44s/it]
100%|██████████| 1/1 [00:05<00:00,  5.80s/it]
100%|██████████| 1/1 [00:03<00:00,  3.05s/it]
100%|██████████| 1/1 [00:02<00:00,  2.20s/it]
100%|██████████| 1/1 [00:06<00:00,  6.26s/it]
100%|██████████| 1/1 [00:08<00:00,  8.25s/it]
100%|██████████| 1/1 [00:01<00:00,  1.58s/it]
100%|██████████| 1/1 [00:03<00:00,  3.80s/it]
100%|██████████| 1/1 [00:05<00:00,  5.95s/it]
100%|██████████| 1/1 [00:09<00:00,  9.62s/it]
100%|██████████| 1/1 [00:05<00:00,  5.88s/it]
100%|██████████| 1/1 [00:08<00:00,  8.27s/it]
100%|██████████| 1/1 [00:10<00:00, 10.65s/it]
100%|██████████| 1/1 [00:06<00:00,

In [34]:
#Loop to keep only generated text and add it as a new column in the dataframe
my_generations = []

for prompt, entries in zip(test_set['Lyric'], generated_lyrics):
    # The last 30 words of the prompt mark where the generated part begins.
    anchor = ' '.join(prompt.split()[-30:])
    full_text = ' '.join(entries)
    # Keep what follows the last occurrence of the anchor. An empty anchor
    # (empty prompt) cannot be a separator -- str.split('') raises ValueError --
    # so the whole text is kept in that case.
    continuation = full_text.split(anchor)[-1] if anchor else full_text
    my_generations.append(continuation)

test_set['Generated_lyrics'] = my_generations


#Finish the sentences when there is a point, remove after that
final = []

for text in test_set['Generated_lyrics']:
    tail = text.split('.')[-1]  # everything after the last '.'
    # Slice the tail off the end only; str.replace would also delete earlier
    # occurrences of the same fragment inside the text.
    # NOTE: a text containing no '.' is its own tail and becomes ''.
    final.append(text[:len(text) - len(tail)])

test_set['Generated_lyrics'] = final

In [41]:
# Print the generated text of every entry. The loop variable is not named
# `lyrics`, so it does not overwrite the `lyrics` DataFrame created in the
# data-preparation cell.
for entries in generated_lyrics:
    print(f"lyric: {entries[0]}")

lyric: (As heard on the ziggy farwell concert lp.) White light - White light gonna drive me out of my brain White light - White light gonna make me feel so insane White heat - White heat shapin' them down to my toes White light - White light's got it now, goodness knows White light - White light gonna drive me out of my mind White light - White light's surely gonna make me blind White heat - White heat shaping way down to my toes White light - White light could kill me now, goodness knows Oh, Oh, White light Oh, Oh, White light Oh, Oh, White heat Oh, Oh, White heat White light - White light gonna drive me out of my brain White light - White light gonna make me feel so insane White heat - White heat shapin' them down to my toes White light - White light's got it now, goodness knows White light - White light gonna drive me out of my mind White light - White light's surely gonna make me blind White heat - White heat shaping way down to my toes White light - White light could kill me now, 