In [1]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv

  from .autonotebook import tqdm as notebook_tqdm


# Preprocess Dataset

In [2]:
# only want to use english songs

lyrics = pd.read_csv('lyrics-data.csv')
lyrics = lyrics[lyrics['language']=='en']

In [3]:
# only want to keep rap songs

artists = pd.read_csv('artists-data.csv')
artists = artists[(artists['Genres'].isin(['Rap']))]
music_df = lyrics.merge(artists[['Artist', 'Genres', 'Link']], left_on='ALink', right_on='Link', how='inner')
music_df = music_df.drop(columns=['ALink','SLink','Link'])

In [4]:
music_df.head()

Unnamed: 0,SName,Lyric,language,Artist,Genres
0,Killing Me Softly With His Song,Strumming my pain with his fingers\nSinging my...,en,Fugees,Rap
1,How Many Mics,Intro: Wyclef Jean\nPick up your microphones\n...,en,Fugees,Rap
2,Ready Or Not,"Ready or not, here I come, you can't hide\nGon...",en,Fugees,Rap
3,Vocab (LP Version),Chorus\nYou got the vocab\nI got the vocab\nYo...,en,Fugees,Rap
4,Zealots,"CLEF]\nAnother MC lose his life tonight, lord\...",en,Fugees,Rap


In [5]:
music_df.shape

(2012, 5)

In [6]:
# want to remove songs that are too long; token limit

music_df = music_df[music_df['Lyric'].apply(lambda x: len(x.split(' ')) < 350)]

In [7]:
music_df.shape

(517, 5)

In [8]:
#Create a very small test set to compare generated text with the reality
test_set = music_df.sample(n = 50)
music_df = music_df.loc[~music_df.index.isin(test_set.index)]

#Reset the indexes
test_set = test_set.reset_index()
music_df = music_df.reset_index()

#For the test set only, keep last 20 words in a new column, then remove them from original column
test_set['True_end_lyrics'] = test_set['Lyric'].str.split().str[-20:].apply(' '.join)
test_set['Lyric'] = test_set['Lyric'].str.split().str[:-20].apply(' '.join)

In [9]:
test_set = test_set.drop(columns=["index"])

# Tokenize Lyrics

In [10]:
def tokenizer(df, truncate=False, gpt2_type="gpt2", max_length=1024):

    tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
    lyrics = []

    for row in music_df['Lyric']:
        lyrics.append(torch.tensor(
            tokenizer.encode(f"<|{df}|>{row[:max_length]}<|endoftext|>")
        ))  
        
    if truncate:
        lyrics = lyrics[:20000]
    
    return lyrics

In [11]:
tokenized_lyrics = tokenizer(music_df["Lyric"], truncate=True, gpt2_type="gpt2")

In [12]:
len(tokenized_lyrics)

467

# Train

In [13]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

Downloading: 100%|██████████████████████████████████████████████████████████████████| 548M/548M [00:25<00:00, 21.4MB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [15]:
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    if packed_tensor is None:
        return new_tensor, True, None
    if new_tensor.size()[1] + packed_tensor.size()[1] > max_seq_len:
        return packed_tensor, False, new_tensor
    else:
        packed_tensor = torch.cat([new_tensor, packed_tensor[:, 1:]], dim=1)
        return packed_tensor, True, None

In [22]:
def train(
    dataset, 
    model, 
    tokenizer,
    batch_size=16, 
    epochs=5, 
    lr=2e-5,
    max_seq_len=400, 
    warmup_steps=200,
    gpt2_type="gpt2", 
    output_dir=".", 
    output_prefix="wreckgar",
    test_mode=False,
    save_model_on_epoch=False,
):
    
    acc_steps = 100
#     device=torch.device("cuda")
#     model = model.cuda()
    model.train()

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=-1
    )

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    loss=0
    accumulating_batch_count = 0
    input_tensor = None

    for epoch in range(epochs):

        print(f"Training epoch {epoch}")
        print(loss)
        for idx, entry in tqdm(enumerate(train_dataloader)):
            (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, 768)

            if carry_on and idx != len(train_dataloader) - 1:
                continue

#             input_tensor = input_tensor.to(device)
            outputs = model(input_tensor, labels=input_tensor)
            loss = outputs[0]
            loss.backward()

            if (accumulating_batch_count % batch_size) == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()

            accumulating_batch_count += 1
            input_tensor = None
        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )
            
    return model

In [23]:
model = train(tokenized_lyrics, model, tokenizer)

Training epoch 0
0


467it [28:45,  3.70s/it]


Training epoch 1
tensor(3.9118, grad_fn=<NllLossBackward0>)


467it [29:46,  3.82s/it]


Training epoch 2
tensor(3.8541, grad_fn=<NllLossBackward0>)


467it [28:48,  3.70s/it]


Training epoch 3
tensor(3.7502, grad_fn=<NllLossBackward0>)


467it [28:57,  3.72s/it]


Training epoch 4
tensor(3.2721, grad_fn=<NllLossBackward0>)


467it [37:03,  4.76s/it]


In [163]:
def generate(
    model,
    tokenizer,
    prompt,
    entry_count=50,
    entry_length=100, # maximum number of words
    top_p=0.8,
    temperature=1.
):
    model.eval()
    generated_num = 0
    generated_list = []

    filter_value = -float("Inf")

    with torch.no_grad():
        for entry_idx in trange(entry_count):

            entry_finished = False
            generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
#             print(generated.size())
#             generated_tokens = torch.empty((1,0))

            for i in range(entry_length):
                outputs = model(generated, labels=generated)
                loss, logits = outputs[:2]
                logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                    ..., :-1
                ].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)
#                 generated = next_token
#                 print(f"generated: {generated_tokens}")

                if next_token in tokenizer.encode("<|endoftext|>"):
                    entry_finished = True

                if entry_finished:
                    generated_num = generated_num + 1

                    output_list = list(generated.squeeze().numpy())
                    output_text = tokenizer.decode(output_list)
                    print(generated_list)
                    generated_list.append(output_text)
                    break
            
            if not entry_finished:
                output_list = list(generated.squeeze().numpy())
                output_text = f"{tokenizer.decode(output_list)}" 
                print(generated_list)
                generated_list.append(output_text)
                
    return generated_list
#     return output_text



In [164]:
# # Function to generate multiple sentences. Test data should be a dataframe
# def text_generation(test_data):
#     generated_lyrics = []
#     for i in range(len(test_data)):
#         x = generate(model, tokenizer, test_data['Lyric'][i], entry_count=1)
#         generated_lyrics.append(x)
#     return generated_lyrics

# # Run the functions to generate the lyrics
# generated_lyrics = text_generation(test_set)

In [165]:
# # Loop to keep only generated text and add it as a new column in the dataframe
# my_generations = []

# for i in range(len(generated_lyrics)):
# #     a = test_set['Lyric'][i].split()[-30:] #Get the matching string we want (30 words)
# #     b = ' '.join(a)
#     c = generated_lyrics[i] #Get all that comes after the matching string
#     my_generations.append(c.split(b)[-1])

# test_set['Generated_lyrics'] = my_generations


# #Finish the sentences when there is a point, remove after that
# final = []

# for i in range(len(test_set)):
#     to_remove = test_set['Generated_lyrics'][i].split('.')[-1]
#     final.append(test_set['Generated_lyrics'][i].replace(to_remove,''))

# test_set['Generated_lyrics'] = final

In [166]:
# test_set

In [167]:
prompt = "Write me a rap about a boy named fazal who was raised in the streets without a family and without money."

In [168]:
new_lyric = generate(model, tokenizer, prompt, entry_count=1)

  0%|                                                                                            | 0/1 [00:00<?, ?it/s]

torch.Size([1, 24])
generated: tensor([[198.]])
generated: tensor([[198., 314.]])
generated: tensor([[198., 314., 198.]])
generated: tensor([[198., 314., 198., 679.]])
generated: tensor([[198., 314., 198., 679., 314.]])
generated: tensor([[198., 314., 198., 679., 314., 314.]])
generated: tensor([[198., 314., 198., 679., 314., 314., 198.]])
generated: tensor([[ 198.,  314.,  198.,  679.,  314.,  314.,  198., 4525.]])
generated: tensor([[ 198.,  314.,  198.,  679.,  314.,  314.,  198., 4525.,  314.]])
generated: tensor([[ 198.,  314.,  198.,  679.,  314.,  314.,  198., 4525.,  314.,  679.]])
generated: tensor([[  198.,   314.,   198.,   679.,   314.,   314.,   198.,  4525.,   314.,
           679., 14026.]])
generated: tensor([[  198.,   314.,   198.,   679.,   314.,   314.,   198.,  4525.,   314.,
           679., 14026.,   198.]])
generated: tensor([[  198.,   314.,   198.,   679.,   314.,   314.,   198.,  4525.,   314.,
           679., 14026.,   198.,  2011.]])
generated: tensor([[  

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.84s/it]

generated: tensor([[  198.,   314.,   198.,   679.,   314.,   314.,   198.,  4525.,   314.,
           679., 14026.,   198.,  2011.,   887.,   198.,   198.,   628.,   198.,
           679.,   921.,   314.,   632.,  1320., 50256.]])
[]





In [169]:
new_lyric

'\n I\n He I I\n Like I He Tell\n My But\n\n\n\n\n He You I It That<|endoftext|>'