In [1]:
import os
import pandas as pd
import sklearn.model_selection as ms
import re
import contractions
import nltk

from nltk.tokenize import word_tokenize  
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# Make sure the folders that receive the train/test CSV files exist.
# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs('data/train', exist_ok=True)
os.makedirs('data/test', exist_ok=True)

In [3]:
def basic_process(text):
    """Normalize one raw lyrics string into lowercase, space-separated words.

    Steps, in order:
      1. drop double quotes and commas;
      2. drop bracketed section tags such as ``[Verse 1]``;
      3. drop repeat markers (``2x``) and any remaining digits;
      4. drop the standalone words "verse" / "chorus" (any casing);
      5. turn newlines into spaces and expand contractions word by word;
      6. keep only alphanumeric characters and spaces, collapse repeated
         spaces, trim both ends and lowercase.

    Args:
        text: raw lyrics as a ``str``.

    Returns:
        The cleaned lyrics as a single lowercase ``str``.
    """
    # Quotes and commas go first so they do not stay glued to a word
    # when contractions are expanded below.
    text = re.sub(r'[",]', '', text)

    # Non-greedy match: with two tags on the same line only the tags are
    # removed, not the lyrics between them.
    text = re.sub(r'\[.*?\]', '', text)

    # Repeat markers ('2x') and then every remaining number. Digits are removed
    # before the marker words so that e.g. 'Verse1' is reduced to 'Verse' first.
    text = re.sub(r'\dx', '', text)
    text = re.sub(r'\d+', '', text)

    # Word boundaries keep real words such as 'universe' intact.
    text = re.sub(r'\b(?:verse|chorus)\b', '', text, flags=re.IGNORECASE)

    # deleting new line
    text = text.replace('\n', ' ')

    # expanding contractions
    text = ' '.join(contractions.fix(word) for word in text.split())

    # deleting all special characters
    text = ''.join(c for c in text if c.isalnum() or c == ' ')

    # removing repeated spaces (and the ones left at the ends by deleted symbols)
    text = re.sub(' +', ' ', text).strip()

    # lowering text
    return text.lower()

In [4]:
def nltk_process(text):
    """Tokenize ``text``, drop English stop words and lemmatize the rest.

    Args:
        text: a string to tokenize (the notebook passes already-cleaned lyrics).

    Returns:
        A list with the lemma of every token that is not in NLTK's
        English stop-word list, in the original token order.
    """
    english_stops = set(stopwords.words('english'))
    wnl = WordNetLemmatizer()

    # tokenization -> stop-word filter -> lemmatization, in a single pass
    return [
        wnl.lemmatize(token)
        for token in word_tokenize(text)
        if token not in english_stops
    ]

In [5]:
def process_lyrics_basic(lyrics):
    """Apply ``basic_process`` to every text in ``lyrics`` and return the results as a list."""
    return list(map(basic_process, lyrics))

In [6]:
# Plain sequential loop on purpose: nltk doesn't work with Parallel
def process_lyrics(lyrics):
    """Apply ``nltk_process`` to every text in ``lyrics``; returns one token list per text."""
    tokenized = []
    for text in lyrics:
        tokenized.append(nltk_process(text))
    return tokenized

In [7]:
def split_and_save(df, name):
    """Split ``df`` 70/30 into train and test parts and write both as CSV.

    The split is stratified on ``df.genre`` and uses a fixed random state (7).
    The parts are written to ``data/train/<name>.csv`` and
    ``data/test/<name>.csv`` without the index column.

    Args:
        df: frame to split; must expose a ``genre`` column.
        name: base file name (without extension) used for both CSV files.
    """
    train_part, test_part = ms.train_test_split(
        df,
        train_size=0.7,
        random_state=7,
        stratify=df.genre,
    )
    outputs = (
        (train_part, f'data/train/{name}.csv'),
        (test_part, f'data/test/{name}.csv'),
    )
    for part, path in outputs:
        part.to_csv(path, index=False)

# Our own dataset

In [8]:
# Load the header-less raw file (columns: lyrics, genre) and drop exact duplicate rows.
d = pd.read_csv('data/raw/dataset.csv', names=['lyrics', 'genre']).drop_duplicates()
# The de-duplicated rows are written back over the raw file, again without a header line.
d.to_csv('data/raw/dataset.csv', index=False, header=False)

In [9]:
# Reorder the columns to (genre, lyrics). The explicit .copy() makes `d` an
# independent frame, so the column assignments in later cells do not trigger
# pandas' SettingWithCopyWarning.
d = d[['genre', 'lyrics']].copy()
d

Unnamed: 0,genre,lyrics
0,rock,"[Intro]\nGo!\n\n[Verse 1]\nSo one, two, three\..."
1,pop,[Chorus]\nBecause you know I'm all about that ...
2,pop,[Verse 1]\nI took a pill in Ibiza to show Avic...
3,alternative,Ring My Bell - Anita Ward\nBrianstorm - Arctic...
4,alternative,[Intro]\nShalalala la la la\nUh huh\n\n[Verse ...
...,...,...
623,alternative,[Verse 1]\nI can't escape this hell\nSo many t...
624,rock,[Verse 1]\nThe lights go out and I can't be sa...
625,pop,[Verse 1]\nThis what happen when I think 'bout...
626,rock,"[Verse 1]\nCome up to meet you, tell you I'm s..."


In [10]:
# Number of songs per genre.
d['genre'].value_counts()

pop            166
rock           157
hip-hop        156
alternative    149
Name: genre, dtype: int64

In [11]:
# Replace the raw lyrics with their cleaned, lowercase version.
d['lyrics'] = process_lyrics_basic(d['lyrics'])
d

Unnamed: 0,genre,lyrics
0,rock,go so one two three take my hand and come with...
1,pop,because you know i am all about that bass bout...
2,pop,i took a pill in ibiza to show avicii i was co...
3,alternative,ring my bell anita ward brianstorm arctic monk...
4,alternative,shalalala la la la uh huh i was down at the ne...
...,...,...
623,alternative,i cannot escape this hell so many times i have...
624,rock,the lights go out and i cannot be saved tides ...
625,pop,this what happen when i think bout you i get i...
626,rock,come up to meet you tell you i am sorry you do...


In [12]:
# Save the train/test split of the cleaned (not yet tokenized) lyrics.
split_and_save(df=d, name='dataset')

In [13]:
# Add one token list per song: stop words removed, remaining words lemmatized.
d['tokens'] = process_lyrics(d['lyrics'])
d

Unnamed: 0,genre,lyrics,tokens
0,rock,go so one two three take my hand and come with...,"[go, one, two, three, take, hand, come, look, ..."
1,pop,because you know i am all about that bass bout...,"[know, bass, bout, bass, treble, bout, bass, b..."
2,pop,i took a pill in ibiza to show avicii i was co...,"[took, pill, ibiza, show, avicii, cool, finall..."
3,alternative,ring my bell anita ward brianstorm arctic monk...,"[ring, bell, anita, ward, brianstorm, arctic, ..."
4,alternative,shalalala la la la uh huh i was down at the ne...,"[shalalala, la, la, la, uh, huh, new, amsterda..."
...,...,...,...
623,alternative,i cannot escape this hell so many times i have...,"[escape, hell, many, time, tried, still, caged..."
624,rock,the lights go out and i cannot be saved tides ...,"[light, go, saved, tide, tried, swim, brought,..."
625,pop,this what happen when i think bout you i get i...,"[happen, think, bout, get, feeling, yeah, star..."
626,rock,come up to meet you tell you i am sorry you do...,"[come, meet, tell, sorry, know, lovely, find, ..."


In [14]:
# Save the train/test split of the frame that now also carries the `tokens` column.
# NOTE(review): `tokens` holds Python lists; presumably they end up in the CSV as
# their string repr — confirm that downstream readers parse them back.
split_and_save(df=d, name='dataset_proc')