In [1]:
import os
import pandas as pd
import sklearn.model_selection as ms
import re
import contractions
import nltk

from nltk.tokenize import word_tokenize  
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# Create the folders that receive the train/test CSV files.
# exist_ok=True turns the call into a no-op when the directory is already
# there, so no separate os.path.exists check is needed and there is no gap
# between "check" and "create" in which another process could create it.
os.makedirs('data/train', exist_ok=True)
os.makedirs('data/test', exist_ok=True)

In [3]:
def basic_process(text):
    """Normalise one raw lyrics string into lowercase words separated by single spaces.

    Steps, in order:
      1. drop bracketed annotations such as "[Verse 1: Drake]";
      2. drop double quotes and commas;
      3. drop repetition markers ("2x") and every remaining digit;
      4. drop stand-alone section labels ("verse" / "chorus" in the three
         casings listed below);
      5. turn newlines into spaces;
      6. expand contractions word by word with ``contractions.fix``;
      7. drop every character that is neither alphanumeric nor a space;
      8. collapse runs of spaces, strip the ends and lowercase.

    Args:
        text: raw lyrics as a ``str``.

    Returns:
        The cleaned string, without leading or trailing spaces.
    """
    # Bracketed annotations. The match stops at the first closing bracket so
    # that lyrics sitting between two tags on the same line are kept (a greedy
    # '.*' would delete them too). The class also excludes the newline, so a
    # match never spans more than one line.
    text = re.sub(r'\[[^\]\n]*\]', '', text)

    # Double quotes and commas are deleted outright (not replaced by a space).
    text = re.sub(r'[",]', '', text)

    # Repetition markers ("2x") first, then any digits that are left.
    text = re.sub(r'\dx', '', text)
    text = re.sub(r'\d+', '', text)

    # Section labels. The word boundaries keep words that merely contain a
    # label ("universe", "reverse") intact; digits are already gone at this
    # point, so "Verse1" is still treated as a stand-alone label.
    text = re.sub(r'\b(?:VERSE|Verse|verse|CHORUS|Chorus|chorus)\b', '', text)

    # deleting new line
    text = text.replace('\n', ' ')

    # expanding contractions, one whitespace-separated word at a time
    text = ' '.join(contractions.fix(word) for word in text.split())

    # deleting all special characters
    text = ''.join(c for c in text if c.isalnum() or c == ' ')

    # removing repeated spaces; stripping punctuation above can leave a space
    # at either end, which is removed here as well
    text = re.sub(' +', ' ', text).strip()

    # lowering text
    return text.lower()

In [4]:
def nltk_process(text):
    """Tokenise ``text`` and return the lemmas of its non-stop-word tokens.

    Tokens come from ``word_tokenize``. A token that appears in NLTK's English
    stop-word list is skipped (the comparison is an exact, case-sensitive set
    lookup); every other token is passed through
    ``WordNetLemmatizer.lemmatize`` with no part-of-speech argument.

    Returns:
        A list of lemmas, in token order.
    """
    english_stop_words = set(stopwords.words('english'))
    wordnet = WordNetLemmatizer()

    # tokenization -> stop-word filtering -> lemmatization, in one pass
    return [
        wordnet.lemmatize(token)
        for token in word_tokenize(text)
        if token not in english_stop_words
    ]

In [5]:
def process_lyrics_basic(lyrics):
    """Apply ``basic_process`` to every text in ``lyrics`` and return the results as a list."""
    return list(map(basic_process, lyrics))

In [6]:
# nltk doesn't work with Parallel, so the texts are handled one after another
def process_lyrics(lyrics):
    """Apply ``nltk_process`` to every text in ``lyrics`` and return the token lists as a list."""
    return list(map(nltk_process, lyrics))

In [7]:
def split_and_save(df, name):
    """Split ``df`` into train and test parts and write each part to a CSV file.

    The split keeps 70% of the rows for training, is stratified on
    ``df.genre`` and uses the fixed seed ``random_state=7``. The parts are
    written without the index column to ``data/train/{name}.csv`` and
    ``data/test/{name}.csv``; this function does not create those folders.

    Args:
        df: frame to split; must expose a ``genre`` column.
        name: file name (without extension) used for both CSV files.
    """
    parts = ms.train_test_split(
        df,
        train_size=0.7,
        random_state=7,
        stratify=df.genre,
    )
    destinations = (f'data/train/{name}.csv', f'data/test/{name}.csv')

    # train part first, then test part - same order as train_test_split returns them
    for part, path in zip(parts, destinations):
        part.to_csv(path, index=False)

# Our own dataset

In [8]:
# Load the raw CSV (column names are supplied here), drop exact duplicate rows
# and write the de-duplicated rows back over the same raw file, again without
# index or header.
d = pd.read_csv('data/raw/dataset2.csv', names=['lyrics', 'genre']).drop_duplicates()
d.to_csv('data/raw/dataset2.csv', index=False, header=False)

In [9]:
# Put the label column first; the bare `d` renders the frame as the cell output.
d = d.loc[:, ['genre', 'lyrics']]
d

Unnamed: 0,genre,lyrics
0,country,"[Intro]\n(Heads Carolina, tails California)\n\..."
1,rock,[Verse 1]\nYou've been my muse for a long time...
2,rock,[Verse 1]\nI'm gonna fight 'em off\nA seven na...
3,hip-hop,"[Part I]\n\n[Verse 1: Drake]\nYeah, ayy\nHop i..."
4,country,[Verse 1]\nWe used to chase that Chattanooga f...
...,...,...
4088,country,[Verse 1]\nMr. Weatherman\nWhat is your foreca...
4089,metal,[Verse 1]\nI meant to come back to put out bli...
4090,hip-hop,"[Intro: Louis Prima]\nDown the chimney, he wil..."
4091,rock,"[Verse 1]\nI ain't rich, but I damn sure wanna..."


In [10]:
# Number of songs per genre, to check how balanced the classes are.
d['genre'].value_counts()

country    896
metal      887
pop        815
rock       767
hip-hop    727
Name: genre, dtype: int64

In [11]:
# Replace the raw lyrics with their cleaned form, then display the frame.
d['lyrics'] = process_lyrics_basic(d['lyrics'])
d

Unnamed: 0,genre,lyrics
0,country,heads carolina tails california i was out with...
1,rock,you have been my muse for a long time you get ...
2,rock,i am going to fight them off a seven nation ar...
3,hip-hop,yeah ayy hop in that bitch and i start the v s...
4,country,we used to chase that chattanooga freight coup...
...,...,...
4088,country,mr weatherman what is your forecast i need a m...
4089,metal,i meant to come back to put out bliss but the ...
4090,hip-hop,down the chimney he will come with his great b...
4091,rock,i are not rich but i damn sure want to be work...


In [12]:
# Save the cleaned (not yet tokenised) lyrics as the 'dataset2' train/test pair.
split_and_save(df=d, name='dataset2')

In [13]:
# Add a column holding, for each song, the list of lemmatised non-stop-word tokens.
d['tokens'] = process_lyrics(d['lyrics'])
d

Unnamed: 0,genre,lyrics,tokens
0,country,heads carolina tails california i was out with...,"[head, carolina, tail, california, boy, catchi..."
1,rock,you have been my muse for a long time you get ...,"[muse, long, time, get, every, dark, night, al..."
2,rock,i am going to fight them off a seven nation ar...,"[going, fight, seven, nation, army, could, hol..."
3,hip-hop,yeah ayy hop in that bitch and i start the v s...,"[yeah, ayy, hop, bitch, start, v, snake, grass..."
4,country,we used to chase that chattanooga freight coup...,"[used, chase, chattanooga, freight, couple, ki..."
...,...,...,...
4088,country,mr weatherman what is your forecast i need a m...,"[mr, weatherman, forecast, need, major, change..."
4089,metal,i meant to come back to put out bliss but the ...,"[meant, come, back, put, bliss, style, crumbli..."
4090,hip-hop,down the chimney he will come with his great b...,"[chimney, come, great, big, smile, find, even,..."
4091,rock,i are not rich but i damn sure want to be work...,"[rich, damn, sure, want, working, like, dog, d..."


In [14]:
# Save the frame again, now including the tokens column, as 'dataset2_proc'.
# NOTE(review): the token lists are written through to_csv, so they presumably
# come back as plain strings when the CSV is read - confirm how consumers parse them.
split_and_save(df=d, name='dataset2_proc')