In [120]:
!pip install kaggle



In [121]:
!kaggle datasets download neisse/scrapped-lyrics-from-6-genres.zip

Traceback (most recent call last):
  File "/home/alxmke/.pyenv/versions/3.10.9/bin/kaggle", line 5, in <module>
    from kaggle.cli import main
  File "/home/alxmke/.pyenv/versions/3.10.9/lib/python3.10/site-packages/kaggle/__init__.py", line 23, in <module>
    api.authenticate()
  File "/home/alxmke/.pyenv/versions/3.10.9/lib/python3.10/site-packages/kaggle/api/kaggle_api_extended.py", line 164, in authenticate
    raise IOError('Could not find {}. Make sure it\'s located in'
OSError: Could not find kaggle.json. Make sure it's located in /home/alxmke/.kaggle. Or use the environment method.


In [122]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv

### Prepare data
lyrics = pd.read_csv('lyrics-data.csv')
lyrics = lyrics[lyrics['language']=='en']

#Only keep popular artists, with genre Rock/Pop and popularity high enough
artists = pd.read_csv('artists-data.csv')
artists = artists[(artists['Genres'].isin(['Rock'])) & (artists['Popularity']>5)]
df = lyrics.merge(
    artists[['Artist', 'Genres', 'Link']],
    left_on='ALink',
    right_on='Link',
    how='inner',
)
df = df.drop(columns=['ALink','SLink','language','Link'])

#Drop the songs with lyrics too long (after more than 1024 tokens, does not work)
df = df[df['Lyric'].apply(lambda x: len(x.split(' ')) < 350)]

#Create a very small test set to compare generated text with the reality
test_set = df.sample(n = 200)
df = df.loc[~df.index.isin(test_set.index)]

#Reset the indexes
test_set = test_set.reset_index()
df = df.reset_index()

#For the test set only, keep last 20 words in a new column, then remove them from original column
end_len = 20
test_set['True_end_lyrics'] = test_set['Lyric'].str.split().str[-end_len:].apply(' '.join)
test_set['Lyric'] = test_set['Lyric'].str.split().str[:-end_len].apply(' '.join)
print(test_set['Lyric'])
print(test_set['True_end_lyrics'])

0      Yeah girl, now won't you come on out tonight L...
1      Just the twinkling lights of heaven Two reflec...
2      Now you're running with the right crowd Well y...
3      Downtown hipsters drinking up the drug line Do...
4      Remember when the music Came from wooden boxes...
                             ...                        
195    New angels of promise (do do do do) We despair...
196    When people call us revolutionary They're just...
197    Sitting here wishing on a cement floor Just wi...
198    Turn and turn again Turn and turn again I shak...
199    You're like a sailor with a girl on every shor...
Name: Lyric, Length: 200, dtype: object
0      Do do hum do do do Do do hum do do do Whoaaa y...
1      Well so do I I love how she moves me It makes ...
2      night fire is down in the street yearning Oh o...
3      his name If he didn't exist, it'd all go on ju...
4      hum the melody We'd be safe within the sound S...
                             ...                

In [123]:
class SongLyrics(Dataset):  
    def __init__(self, control_code, truncate=False, gpt2_type="gpt2", max_length=1024):

        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.lyrics = []

        for row in df['Lyric']:
          self.lyrics.append(torch.tensor(
                self.tokenizer.encode(f"<|{control_code}|>{row[:max_length]}<|endoftext|>")
            ))               
        if truncate:
            self.lyrics = self.lyrics[:20000]
        self.lyrics_count = len(self.lyrics)
        
    def __len__(self):
        return self.lyrics_count

    def __getitem__(self, item):
        return self.lyrics[item]
    
dataset = SongLyrics(df['Lyric'], truncate=True, gpt2_type="gpt2")

In [124]:
#Get the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

#Accumulated batch size (since GPT2 is so big)
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    if packed_tensor is None:
        return new_tensor, True, None
    if new_tensor.size()[1] + packed_tensor.size()[1] > max_seq_len:
        return packed_tensor, False, new_tensor
    else:
        packed_tensor = torch.cat([new_tensor, packed_tensor[:, 1:]], dim=1)
        return packed_tensor, True, None

In [125]:
def train(
    dataset,
    model,
    tokenizer,
    batch_size=16,
    epochs=5,
    lr=2e-5,
    max_seq_len=400,
    warmup_steps=200,
    gpt2_type="gpt2",
    output_dir=".",
    output_prefix="wreckgar",
    test_mode=False,
    save_model_on_epoch=False,
):
    import os
    acc_steps = 100
    device=torch.device("cuda")
    model = model.cuda()
    model.train()

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=-1
    )

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    loss=0
    accumulating_batch_count = 0
    input_tensor = None

    for epoch in range(epochs):

        print(f"Training epoch {epoch}")
        print(loss)
        for idx, entry in tqdm(enumerate(train_dataloader)):
            (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, 768)

            if carry_on and idx != len(train_dataloader) - 1:
                continue

            input_tensor = input_tensor.to(device)
            outputs = model(input_tensor, labels=input_tensor)
            loss = outputs[0]
            loss.backward()

            if (accumulating_batch_count % batch_size) == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()

            accumulating_batch_count += 1
            input_tensor = None
        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )
    return model

In [126]:
model = train(
    dataset,
    model,
    tokenizer,
)



Training epoch 0
0


705it [00:28, 24.56it/s]


Training epoch 1
tensor(2.8919, device='cuda:0', grad_fn=<NllLossBackward0>)


705it [00:30, 22.88it/s]


Training epoch 2
tensor(3.5535, device='cuda:0', grad_fn=<NllLossBackward0>)


705it [00:30, 23.39it/s]


Training epoch 3
tensor(2.6932, device='cuda:0', grad_fn=<NllLossBackward0>)


705it [00:31, 22.60it/s]


Training epoch 4
tensor(1.7914, device='cuda:0', grad_fn=<NllLossBackward0>)


705it [00:31, 22.73it/s]


In [127]:
print(model.generate)

<bound method GenerationMixin.generate of GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)>


In [128]:
def generate(
    model,
    tokenizer,
    prompt,
    entry_count=10,
    entry_length=30, #maximum number of words
    top_p=0.8,
    temperature=1.,
):
    model.eval()
    generated_num = 0
    generated_list = []

    filter_value = -float("Inf")

    with torch.no_grad():

        for entry_idx in trange(entry_count):

            entry_finished = False
            generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)

            for i in range(entry_length):
                outputs = model(generated, labels=generated)
                loss, logits = outputs[:2]
                logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                    ..., :-1
                ].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)

                if next_token in tokenizer.encode("<|endoftext|>"):
                    entry_finished = True

                if entry_finished:

                    generated_num = generated_num + 1

                    output_list = list(generated.squeeze().numpy())
                    output_text = tokenizer.decode(output_list)
                    generated_list.append(output_text)
                    break
            
            if not entry_finished:
              output_list = list(generated.squeeze().numpy())
              output_text = f"{tokenizer.decode(output_list)}<|endoftext|>" 
              generated_list.append(output_text)
                
    return generated_list

#Function to generate multiple sentences. Test data should be a dataframe
def text_generation(test_data):
  generated_lyrics = []
  for i in range(len(test_data)):
    x = generate(model.to('cpu'), tokenizer, test_data['Lyric'][i], entry_count=1)
    generated_lyrics.append(x)
  return generated_lyrics

#Run the functions to generate the lyrics
generated_lyrics = text_generation(test_set)


100%|██████████| 1/1 [00:07<00:00,  7.20s/it]
100%|██████████| 1/1 [00:03<00:00,  3.00s/it]
100%|██████████| 1/1 [00:11<00:00, 11.03s/it]
100%|██████████| 1/1 [00:09<00:00,  9.43s/it]
100%|██████████| 1/1 [00:06<00:00,  6.40s/it]
100%|██████████| 1/1 [00:05<00:00,  5.18s/it]
100%|██████████| 1/1 [00:05<00:00,  5.68s/it]
100%|██████████| 1/1 [00:05<00:00,  5.67s/it]
100%|██████████| 1/1 [00:05<00:00,  5.45s/it]
100%|██████████| 1/1 [00:08<00:00,  8.12s/it]
100%|██████████| 1/1 [00:05<00:00,  5.22s/it]
100%|██████████| 1/1 [00:04<00:00,  4.71s/it]
100%|██████████| 1/1 [00:05<00:00,  5.22s/it]
100%|██████████| 1/1 [00:08<00:00,  8.12s/it]
100%|██████████| 1/1 [00:04<00:00,  4.99s/it]
100%|██████████| 1/1 [00:05<00:00,  5.09s/it]
100%|██████████| 1/1 [00:02<00:00,  2.49s/it]
100%|██████████| 1/1 [00:02<00:00,  2.59s/it]
100%|██████████| 1/1 [00:04<00:00,  4.09s/it]
100%|██████████| 1/1 [00:08<00:00,  8.70s/it]
100%|██████████| 1/1 [00:06<00:00,  6.97s/it]
100%|██████████| 1/1 [00:05<00:00,

In [171]:
generations=[]
lyrics = test_set['Lyric']
for i in range(len(generated_lyrics)):
    generated = generated_lyrics[i][0]
    original = lyrics[i]
    new = generated[len(original):].split("<")[0].split(".")[0].strip()
    generations.append(new)
    print(f"generated ({len(generated)}): {generated}")
    print(f"original ({len(original)}):  {original}")
    print(f"new ({len(new)}): {new}")
    print()

test_set['Generated_lyrics'] = generations

generated (1129): Yeah girl, now won't you come on out tonight Little girl, now where the stars will shine real bright Yeah I gotta get that feeling Yeah I wanna get that feeling Whoa back again Yeah back again Tonight, yeah there's something here in the air Tonight, I ain't got money but I don't care I gotta get that feeling Yeah I wanna get that feeling And before the night is through You gonna get that feeling too Hold me in your arms and baby take everything Let the light shine, and if we dare to be Yeah yeah yeah yeah yeah Da da da da da da da Da da da da da da da Da da da da da da da Tonight, well there's something in the air And tonight, yeah we ain't got money but I don't care Yeah we gotta get that feeling Yeah we gotta get that feeling Whoa back again Whoaaa Say say say say Do do hum do do do Do do hum do do do Do do hum do do do Do do hum do do do Do do hum do do do Do do hum do do do Do do hum do do do Yeah yeah yeah yeah yeah Do do hum do do do Do do hum do do do Do do hum

In [193]:
#Using BLEU score to compare the real sentences with the generated ones
import statistics
from nltk.translate.bleu_score import sentence_bleu

scores=[]

for i in range(len(test_set)):
  reference = test_set['True_end_lyrics'][i].split()
  candidate = test_set['Generated_lyrics'][i].split(".")[0].split()
  scores.append(
    sentence_bleu(
      [reference],
      candidate,
    )
  )

avg_bleu = statistics.mean(scores)
print(f"avg_bleu: {avg_bleu}")

avg_bleu: 0.12658270017072024
