In [1]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv

  from .autonotebook import tqdm as notebook_tqdm


# Preprocess Dataset

In [2]:
# Read the raw lyrics table and keep only the rows whose language is English.

lyrics = (
    pd.read_csv('lyrics-data.csv')
    .loc[lambda frame: frame['language'] == 'en']
)

In [4]:
# Restrict to artists whose 'Genres' value is exactly 'Rap', then attach the
# artist details to each English lyric by matching the lyric's 'ALink' to the
# artist's 'Link'. The link columns are dropped once the join is done.

artists = (
    pd.read_csv('artists-data.csv')
    .loc[lambda frame: frame['Genres'].eq('Rap')]
)
music_df = (
    lyrics
    .merge(
        artists[['Artist', 'Genres', 'Link']],
        how='inner',
        left_on='ALink',
        right_on='Link',
    )
    .drop(columns=['ALink', 'SLink', 'Link'])
)

In [5]:
# Preview the first rows of the merged English / Rap lyrics table.
music_df.head()

Unnamed: 0,SName,Lyric,language,Artist,Genres
0,Killing Me Softly With His Song,Strumming my pain with his fingers\nSinging my...,en,Fugees,Rap
1,How Many Mics,Intro: Wyclef Jean\nPick up your microphones\n...,en,Fugees,Rap
2,Ready Or Not,"Ready or not, here I come, you can't hide\nGon...",en,Fugees,Rap
3,Vocab (LP Version),Chorus\nYou got the vocab\nI got the vocab\nYo...,en,Fugees,Rap
4,Zealots,"CLEF]\nAnother MC lose his life tonight, lord\...",en,Fugees,Rap


In [6]:
# (rows, columns) of the table after the language and genre filters.
music_df.shape

(2012, 5)

In [7]:
# Discard songs that are too long for the token limit: a lyric is kept only
# when splitting it on single spaces yields fewer than 350 pieces.
# NOTE: only ' ' separates pieces here, so newlines do not count as word breaks.

music_df = music_df.loc[
    music_df['Lyric'].map(lambda lyric: lyric.count(' ') + 1 < 350)
]

In [8]:
# (rows, columns) remaining once the long songs have been filtered out.
music_df.shape

(517, 5)

In [9]:
# Hold out 200 songs as a test set to compare generated text with the reality.
# A fixed random_state keeps the train/test split identical across kernel
# restarts; without it every run samples a different set of songs.
test_set = music_df.sample(n=200, random_state=42)
music_df = music_df.drop(index=test_set.index)

# Reset the indexes (reset_index() keeps the old labels in an 'index' column)
test_set = test_set.reset_index()
music_df = music_df.reset_index()

# For the test set only, keep the last 20 whitespace-separated words in a new
# column, then remove them from the original column. The lyric is split once
# and both columns are derived from that same word list.
lyric_words = test_set['Lyric'].str.split()
test_set['True_end_lyrics'] = lyric_words.str[-20:].apply(' '.join)
test_set['Lyric'] = lyric_words.str[:-20].apply(' '.join)

In [13]:
# reset_index() left the previous row labels in an "index" column; remove it.
test_set = test_set.drop("index", axis="columns")

# Tokenize Lyrics

In [29]:
def tokenizer(df, truncate=False, gpt2_type="gpt2", max_length=1024, control_code=None):
    """Encode song lyrics into GPT-2 token-id tensors.

    Parameters
    ----------
    df : pandas.DataFrame or iterable of str
        The lyrics to encode. A DataFrame is read through its ``'Lyric'``
        column; anything else (a Series, a list, ...) is iterated directly.
    truncate : bool, default False
        When True, keep only the first 20000 encoded lyrics.
    gpt2_type : str, default "gpt2"
        Name handed to ``GPT2Tokenizer.from_pretrained``.
    max_length : int, default 1024
        Maximum number of *characters* (not tokens) kept from each lyric
        before it is encoded.
    control_code : str, optional
        When given, every lyric is prefixed with ``<|control_code|>``.
        By default no prefix is added.

    Returns
    -------
    list of torch.Tensor
        One tensor per lyric, built from the ids returned by ``encode`` for
        the (character-truncated) lyric followed by ``<|endoftext|>``.
    """
    # Named differently from the function so the local does not shadow it.
    gpt2_tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)

    # Encode the data that was passed in, not the notebook-global `music_df`.
    lyric_texts = df["Lyric"] if isinstance(df, pd.DataFrame) else df

    # The prefix is an explicit, optional tag. Interpolating `df` itself would
    # paste the repr of the whole lyrics collection into every example.
    prefix = "" if control_code is None else f"<|{control_code}|>"

    lyrics = [
        torch.tensor(
            gpt2_tokenizer.encode(f"{prefix}{text[:max_length]}<|endoftext|>")
        )
        for text in lyric_texts
    ]

    if truncate:
        # Cap the number of encoded lyrics.
        lyrics = lyrics[:20000]

    return lyrics

In [30]:
# Encode the training lyrics; truncate=True keeps at most the first 20000 entries.
tokenized_lyrics = tokenizer(
    music_df["Lyric"],
    truncate=True,
    gpt2_type="gpt2",
)