In [1]:
import os
import pandas as pd
import sklearn.model_selection as ms
import re
import contractions
import nltk

from joblib import Parallel, delayed
from nltk.tokenize import word_tokenize  
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# Parameters

min_obs = 5000

In [3]:
# Creation of necessary directories

if not os.path.exists('data/train'):
    os.makedirs('data/train')
if not os.path.exists('data/test'):
    os.makedirs('data/test')

In [4]:
def basic_process(text):
    '''
    Preprocess lyrics of the song.
    Parameters:
        text (str): Lyrics of the song.
    Returns:
        str: Preprocessed lyrics of the song.
    '''
    
    # we are deleting special strings ('verse', 'chorus', '2x', etc.), strings in brackets and numbers
    delete_str = ['"', ',',
                'VERSE', 'Verse', 'verse', 
                'CHORUS', 'Chorus', 'chorus',
                '\[.*\]', r'\dx', r'\d+'
                ]
    for ds in delete_str:
        text = re.sub(ds, '', text)
    
    # deleting new line
    text = re.sub('\n', ' ', text)
    
    # expanding contractions
    expanded_words = [] 
    for word in text.split():
        expanded_words.append(contractions.fix(word))  
    text = ' '.join(expanded_words)
    
    # deleting all special characters
    text = ''.join(c for c in text if c.isalnum() or c == ' ')
    
    # removing repeated spaces
    text = re.sub(' +', ' ', text)
    
    # lowering text
    text = text.lower()
    
    return text

In [5]:
def nltk_process(text):
    '''
    Preprocess and tokenize lyrics of the song.
    Parameters:
        text (str): Lyrics of the song.
    Returns:
        list: Preprocessed and tokenized lyrics of the song.
    '''
    
    nltk_stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    
    result = []
    
    # tokenization
    tokens = word_tokenize(text)
    for word in tokens:
        
        # deleting stop words
        if word not in nltk_stop_words:
            
            # lemmatization
            lemma = lemmatizer.lemmatize(word)
            
            result.append(lemma)
    
    return result

In [6]:
def process_lyrics_basic(lyrics):
    '''
    Preprocess lyrics.
    Parameters:
        lyrics (Series): Lyrics of the data.
    Returns:
        Series: Preprocessed lyrics of the data.
    '''
    return Parallel(n_jobs=-1, backend='threading')(delayed(basic_process)(text) for text in lyrics)

In [7]:
# nltk doesn't work with Parallel
def process_lyrics(lyrics):
    '''
    Preprocess and tokenize lyrics.
    Parameters:
        lyrics (Series): Lyrics of the data.
    Returns:
        list: Preprocessed and tokenized lyrics of the data.
    '''
    return [nltk_process(text) for text in lyrics]

In [8]:
def split_and_save(df, name):
    '''
    Split dataset into training and testing data and save it to files.
    Parameters:
        df (DataFrame): Dataset.
        name (str): Name of the dataset.
    '''
    df_train, df_test = ms.train_test_split(df, train_size=0.7, random_state=7, stratify=df.genre)
    df_train.to_csv(f'data/train/{name}.csv', index=False)
    df_test.to_csv(f'data/test/{name}.csv', index=False)

# MetroLyrics

source of data: https://github.com/hiteshyalamanchili/SongGenreClassification/blob/master/dataset/english_cleaned_lyrics.zip

In [9]:
# Reading data from CSV file

d = pd.read_csv('data/raw/english_cleaned_lyrics.csv', index_col=0)

In [10]:
# Taking only necessary columns

d = d[['genre', 'lyrics']]
d

Unnamed: 0,genre,lyrics
0,Pop,Oh baby how you doing You know I'm gonna cut r...
1,Pop,playin everything so easy it's like you seem s...
2,Pop,If you search For tenderness It isn't hard to ...
3,Pop,Oh oh oh I oh oh oh I If I wrote a book about ...
4,Pop,Party the people the people the party it's pop...
...,...,...
362232,Country,I gotta say Boy after only just a couple of da...
362233,Country,I helped you find her diamond ring You made me...
362234,Country,Look at the couple in the corner booth Looks a...
362235,Country,When I fly off this mortal earth And I'm measu...


In [11]:
# Deleting observations from 'Other' genre and replacing genre 'Folk' with 'Country' label

d = d[d.genre != 'Other']
d = d.replace('Folk', 'Country')

In [12]:
# Imbalanced dataset

d.genre.value_counts()

Rock          100053
Pop            34137
Hip-Hop        22654
Metal          21210
Country        15847
Jazz            7310
Electronic      6942
R&B             3336
Indie           2935
Name: genre, dtype: int64

In [13]:
# Basic lyrics preprocessing

d.lyrics = process_lyrics_basic(d.lyrics)
d

Unnamed: 0,genre,lyrics
0,Pop,oh baby how you doing you know i am going to c...
1,Pop,playin everything so easy it is like you seem ...
2,Pop,if you search for tenderness it is not hard to...
3,Pop,oh oh oh i oh oh oh i if i wrote a book about ...
4,Pop,party the people the people the party it is po...
...,...,...
362232,Country,i got to say boy after only just a couple of d...
362233,Country,i helped you find her diamond ring you made me...
362234,Country,look at the couple in the corner booth looks a...
362235,Country,when i fly off this mortal earth and i am meas...


In [14]:
# Splitting and saving 'metrolyrics' dataset

split_and_save(d, 'metrolyrics')

In [15]:
# Lyrics preprocessing and tokenization

d['tokens'] = process_lyrics(d.lyrics)
d

Unnamed: 0,genre,lyrics,tokens
0,Pop,oh baby how you doing you know i am going to c...,"[oh, baby, know, going, cut, right, chase, wom..."
1,Pop,playin everything so easy it is like you seem ...,"[playin, everything, easy, like, seem, sure, s..."
2,Pop,if you search for tenderness it is not hard to...,"[search, tenderness, hard, find, love, need, l..."
3,Pop,oh oh oh i oh oh oh i if i wrote a book about ...,"[oh, oh, oh, oh, oh, oh, wrote, book, stand, t..."
4,Pop,party the people the people the party it is po...,"[party, people, people, party, popping, sittin..."
...,...,...,...
362232,Country,i got to say boy after only just a couple of d...,"[got, say, boy, couple, date, hand, outright, ..."
362233,Country,i helped you find her diamond ring you made me...,"[helped, find, diamond, ring, made, try, every..."
362234,Country,look at the couple in the corner booth looks a...,"[look, couple, corner, booth, look, lot, like,..."
362235,Country,when i fly off this mortal earth and i am meas...,"[fly, mortal, earth, measured, depth, girth, f..."


In [16]:
# Splitting and saving 'metrolyrics' dataset with tokenized lyrics

split_and_save(d, 'metrolyrics_proc')

# Song lyrics from 79 musical genres

source of data: https://www.kaggle.com/datasets/neisse/scrapped-lyrics-from-6-genres

In [17]:
# Reading artists data from CSV file

d1 = pd.read_csv('data/raw/artists-data.csv')
d1

Unnamed: 0,Artist,Genres,Songs,Popularity,Link
0,Ivete Sangalo,Pop; Axé; Romântico,313.0,4.4,/ivete-sangalo/
1,Chiclete com Banana,Axé,268.0,3.8,/chiclete-com-banana/
2,Banda Eva,Axé; Romântico; Reggae,215.0,2.3,/banda-eva/
3,É O Tchan,Axé,129.0,1.6,/e-o-tchan/
4,Claudia Leitte,Pop; Axé; Romântico,167.0,1.5,/claudia-leitte/
...,...,...,...,...,...
4163,Miriam Makeba,World Music; Black Music; Blues,17.0,0.0,/miriam-makeba/
4164,Freddie Aguilar,World Music,61.0,0.0,/freddie-aguilar/
4165,Amadou & Mariam,World Music,14.0,0.0,/amadou-mariam/
4166,Magic System,World Music; Gospel/Religioso,16.0,0.0,/magic-system/


In [18]:
# Leaving only one genre for every artist

d1.Genres = d1.Genres.str.replace(',', ';')
d1.Genres = d1.Genres.str.split(r'\s*;\s*').str[0]
d1

Unnamed: 0,Artist,Genres,Songs,Popularity,Link
0,Ivete Sangalo,Pop,313.0,4.4,/ivete-sangalo/
1,Chiclete com Banana,Axé,268.0,3.8,/chiclete-com-banana/
2,Banda Eva,Axé,215.0,2.3,/banda-eva/
3,É O Tchan,Axé,129.0,1.6,/e-o-tchan/
4,Claudia Leitte,Pop,167.0,1.5,/claudia-leitte/
...,...,...,...,...,...
4163,Miriam Makeba,World Music,17.0,0.0,/miriam-makeba/
4164,Freddie Aguilar,World Music,61.0,0.0,/freddie-aguilar/
4165,Amadou & Mariam,World Music,14.0,0.0,/amadou-mariam/
4166,Magic System,World Music,16.0,0.0,/magic-system/


In [19]:
# Leaving only necessary columns

d1 = d1[['Genres', 'Link']]
d1

Unnamed: 0,Genres,Link
0,Pop,/ivete-sangalo/
1,Axé,/chiclete-com-banana/
2,Axé,/banda-eva/
3,Axé,/e-o-tchan/
4,Pop,/claudia-leitte/
...,...,...
4163,World Music,/miriam-makeba/
4164,World Music,/freddie-aguilar/
4165,World Music,/amadou-mariam/
4166,World Music,/magic-system/


In [20]:
# All different artists' genres

d1.Genres.value_counts()

Gospel/Religioso    464
Rock                246
Pop                 236
Sertanejo           220
Indie               175
                   ... 
Metal                 2
Lo-fi                 2
Electro Swing         2
Piseiro               1
Urban                 1
Name: Genres, Length: 79, dtype: int64

In [21]:
# Reading lyrics data from CSV file

d2 = pd.read_csv('data/raw/lyrics-data.csv')
d2

Unnamed: 0,ALink,SName,SLink,Lyric,language
0,/ivete-sangalo/,Arerê,/ivete-sangalo/arere.html,"Tudo o que eu quero nessa vida,\nToda vida, é\...",pt
1,/ivete-sangalo/,Se Eu Não Te Amasse Tanto Assim,/ivete-sangalo/se-eu-nao-te-amasse-tanto-assim...,Meu coração\nSem direção\nVoando só por voar\n...,pt
2,/ivete-sangalo/,Céu da Boca,/ivete-sangalo/chupa-toda.html,É de babaixá!\nÉ de balacubaca!\nÉ de babaixá!...,pt
3,/ivete-sangalo/,Quando A Chuva Passar,/ivete-sangalo/quando-a-chuva-passar.html,Quando a chuva passar\n\nPra quê falar\nSe voc...,pt
4,/ivete-sangalo/,Sorte Grande,/ivete-sangalo/sorte-grande.html,A minha sorte grande foi você cair do céu\nMin...,pt
...,...,...,...,...,...
379926,/clegg-johnny/,The Waiting,/clegg-johnny/the-waiting.html,Chorus\nHere we stand waiting on the plain\nDa...,en
379927,/clegg-johnny/,Too Early For The Sky,/clegg-johnny/too-early-for-the-sky.html,I nearly disappeared into the mouth of a croco...,en
379928,/clegg-johnny/,Warsaw 1943 (I Never Betrayed The Revolution),/clegg-johnny/warsaw-1943-i-never-betrayed-the...,"Amambuka, amambuka azothengisa izwe lakithi, i...",en
379929,/clegg-johnny/,When The System Has Fallen,/clegg-johnny/when-the-system-has-fallen.html,Sweat in the heat for days on end\nwaiting for...,en


In [22]:
# Leaving only English lyrics

d2 = d2[d2.language == 'en']
d2

Unnamed: 0,ALink,SName,SLink,Lyric,language
69,/ivete-sangalo/,Careless Whisper,/ivete-sangalo/careless-whisper.html,I feel so unsure\nAs I take your hand and lead...,en
86,/ivete-sangalo/,Could You Be Loved / Citação Musical do Rap: S...,/ivete-sangalo/could-you-be-loved-citacao-musi...,"Don't let them fool, ya\nOr even try to school...",en
88,/ivete-sangalo/,Cruisin' (Part. Saulo),/ivete-sangalo/cruisin-part-saulo.html,"Baby, let's cruise, away from here\nDon't be c...",en
111,/ivete-sangalo/,Easy,/ivete-sangalo/easy.html,"Know it sounds funny\nBut, I just can't stand ...",en
140,/ivete-sangalo/,For Your Babies (The Voice cover),/ivete-sangalo/for-your-babies-the-voice-cover...,You've got that look again\nThe one I hoped I ...,en
...,...,...,...,...,...
379926,/clegg-johnny/,The Waiting,/clegg-johnny/the-waiting.html,Chorus\nHere we stand waiting on the plain\nDa...,en
379927,/clegg-johnny/,Too Early For The Sky,/clegg-johnny/too-early-for-the-sky.html,I nearly disappeared into the mouth of a croco...,en
379928,/clegg-johnny/,Warsaw 1943 (I Never Betrayed The Revolution),/clegg-johnny/warsaw-1943-i-never-betrayed-the...,"Amambuka, amambuka azothengisa izwe lakithi, i...",en
379929,/clegg-johnny/,When The System Has Fallen,/clegg-johnny/when-the-system-has-fallen.html,Sweat in the heat for days on end\nwaiting for...,en


In [23]:
# Leaving only necessary columns and renaming them

d2 = d2[['ALink', 'Lyric', 'SName']]
d2 = d2.rename(columns={'ALink': 'Link', 'SName': 'title'})
d2

Unnamed: 0,Link,Lyric,title
69,/ivete-sangalo/,I feel so unsure\nAs I take your hand and lead...,Careless Whisper
86,/ivete-sangalo/,"Don't let them fool, ya\nOr even try to school...",Could You Be Loved / Citação Musical do Rap: S...
88,/ivete-sangalo/,"Baby, let's cruise, away from here\nDon't be c...",Cruisin' (Part. Saulo)
111,/ivete-sangalo/,"Know it sounds funny\nBut, I just can't stand ...",Easy
140,/ivete-sangalo/,You've got that look again\nThe one I hoped I ...,For Your Babies (The Voice cover)
...,...,...,...
379926,/clegg-johnny/,Chorus\nHere we stand waiting on the plain\nDa...,The Waiting
379927,/clegg-johnny/,I nearly disappeared into the mouth of a croco...,Too Early For The Sky
379928,/clegg-johnny/,"Amambuka, amambuka azothengisa izwe lakithi, i...",Warsaw 1943 (I Never Betrayed The Revolution)
379929,/clegg-johnny/,Sweat in the heat for days on end\nwaiting for...,When The System Has Fallen


In [24]:
# Merging artists and lyrics data into one dataset

d3 = pd.merge(d1, d2, on='Link')
d3 = d3[['Genres', 'Lyric', 'title']]
d3 = d3.rename(columns={'Genres': 'genre', 'Lyric': 'lyrics'})

In [25]:
# Basic lyrics preprocessing

d3.lyrics = process_lyrics_basic(d3.lyrics)

In [26]:
# Basic title preprocessing

for row in d3.iterrows():
    d3.title[row[0]] = basic_process(str(d3.title[row[0]]))
d3

Unnamed: 0,genre,lyrics,title
0,Pop,i feel so unsure as i take your hand and lead ...,careless whisper
1,Pop,do not let them fool ya or even try to school ...,could you be loved citação musical do rap se l...
2,Pop,baby let us cruise away from here do not be co...,cruisin part saulo
3,Pop,know it sounds funny but i just cannot stand t...,easy
4,Pop,you have got that look again the one i hoped i...,for your babies the voice cover
...,...,...,...
191382,World Music,here we stand waiting on the plain darkness ha...,the waiting
191383,World Music,i nearly disappeared into the mouth of a croco...,too early for the sky
191384,World Music,amambuka amambuka azothengisa izwe lakithi izw...,warsaw i never betrayed the revolution
191385,World Music,sweat in the heat for days on end waiting for ...,when the system has fallen


In [27]:
# Imbalanced dataset

vc = d3.genre.value_counts()
vc

Rock             25177
Pop              13759
Heavy Metal      13496
Indie            12998
Rap               9589
                 ...  
Electro Swing        6
Jovem Guarda         6
Forró                3
Lo-fi                1
Regional             1
Name: genre, Length: 73, dtype: int64

In [28]:
# Genres with least observations

vc[vc < min_obs]

Hard Rock        4632
Soul Music       4518
Dance            4252
Punk Rock        4157
Folk             4055
                 ... 
Electro Swing       6
Jovem Guarda        6
Forró               3
Lo-fi               1
Regional            1
Name: genre, Length: 62, dtype: int64

In [29]:
# Leaving only most common genres and replacing some labels

d3 = d3[d3.genre.isin(vc[vc >= min_obs].index.tolist())]
d3 = d3[d3.genre != 'Pop/Rock']
d3 = d3.replace('Heavy Metal', 'Metal')
d3 = d3.replace('Rock Alternativo', 'Alternative Rock')
d3 = d3.replace('Hip Hop', 'Hip-Hop')

In [30]:
# Resulting genres

vc = d3.genre.value_counts()
vc

Rock                25177
Pop                 13759
Metal               13496
Indie               12998
Rap                  9589
Hip-Hop              8412
Country              7377
Alternative Rock     5555
R&B                  5309
Gospel/Religioso     5017
Name: genre, dtype: int64

In [31]:
# Splitting and saving 'musicalgenres' dataset

split_and_save(d3, 'musicalgenres')

In [32]:
# Splitting and saving 'musicalgenres' dataset with only 5 genres

split_and_save(d3[d3.genre.isin(['Rock', 'Pop', 'Metal', 'Hip-Hop', 'Country'])], 'small_musicalgenres')

In [33]:
# Preprocessing and tokenization of lyrics and titles

d3['tokens'] = process_lyrics(d3.lyrics)
d3['tokens_title'] = process_lyrics(d3.title)
d3

Unnamed: 0,genre,lyrics,title,tokens,tokens_title
0,Pop,i feel so unsure as i take your hand and lead ...,careless whisper,"[feel, unsure, take, hand, lead, dance, floor,...","[careless, whisper]"
1,Pop,do not let them fool ya or even try to school ...,could you be loved citação musical do rap se l...,"[let, fool, ya, even, try, school, ya, oh, got...","[could, loved, citação, musical, rap, se, ligue]"
2,Pop,baby let us cruise away from here do not be co...,cruisin part saulo,"[baby, let, u, cruise, away, confused, way, cl...","[cruisin, part, saulo]"
3,Pop,know it sounds funny but i just cannot stand t...,easy,"[know, sound, funny, stand, pain, girl, leavin...",[easy]
4,Pop,you have got that look again the one i hoped i...,for your babies the voice cover,"[got, look, one, hoped, lad, face, beaming, sm...","[baby, voice, cover]"
...,...,...,...,...,...
182627,Alternative Rock,words guitar i got it words guitar i like it w...,words and guitar,"[word, guitar, got, word, guitar, like, way, w...","[word, guitar]"
182628,Alternative Rock,i got your letter today i read the things you ...,write me back fucker,"[got, letter, today, read, thing, say, thing, ...","[write, back, fucker]"
182629,Alternative Rock,you are not it you are not it you are the hott...,you are not it,"[hottest, band, around, biggest, dick, town, m...",[]
182630,Alternative Rock,you are no rock n roll fun like a party that i...,you are no rock n roll fun,"[rock, n, roll, fun, like, party, begun, walk,...","[rock, n, roll, fun]"


In [34]:
# Splitting and saving 'musicalgenres' dataset with tokenized lyrics and titles

split_and_save(d3, 'musicalgenres_proc')

In [35]:
# Splitting and saving 'musicalgenres' dataset with tokenized lyrics and titles and 5 genres only

split_and_save(d3[d3.genre.isin(['Rock', 'Pop', 'Metal', 'Hip-Hop', 'Country'])], 'small_musicalgenres_proc')

In [36]:
# Leaving only necessary columns

d3 = d3[['genre', 'lyrics', 'tokens']]

In [37]:
# Merging 'musicalgenres' and 'metrolyrics' datasets

d4 = pd.concat([d, d3])
d4

Unnamed: 0,genre,lyrics,tokens
0,Pop,oh baby how you doing you know i am going to c...,"[oh, baby, know, going, cut, right, chase, wom..."
1,Pop,playin everything so easy it is like you seem ...,"[playin, everything, easy, like, seem, sure, s..."
2,Pop,if you search for tenderness it is not hard to...,"[search, tenderness, hard, find, love, need, l..."
3,Pop,oh oh oh i oh oh oh i if i wrote a book about ...,"[oh, oh, oh, oh, oh, oh, wrote, book, stand, t..."
4,Pop,party the people the people the party it is po...,"[party, people, people, party, popping, sittin..."
...,...,...,...
182627,Alternative Rock,words guitar i got it words guitar i like it w...,"[word, guitar, got, word, guitar, like, way, w..."
182628,Alternative Rock,i got your letter today i read the things you ...,"[got, letter, today, read, thing, say, thing, ..."
182629,Alternative Rock,you are not it you are not it you are the hott...,"[hottest, band, around, biggest, dick, town, m..."
182630,Alternative Rock,you are no rock n roll fun like a party that i...,"[rock, n, roll, fun, like, party, begun, walk,..."


In [38]:
# Resulting genres

vc = d4.genre.value_counts()
vc

Rock                125230
Pop                  47896
Metal                34706
Hip-Hop              31066
Country              23224
Indie                15933
Rap                   9589
R&B                   8645
Jazz                  7310
Electronic            6942
Alternative Rock      5555
Gospel/Religioso      5017
Name: genre, dtype: int64

In [39]:
# Splitting and saving merged dataset

split_and_save(d4, 'full')

In [40]:
# Leaving only most common genres

d4 = d4[d4.genre.isin(vc[vc >= 20000].index.tolist())]

In [41]:
# Resulting genres

d4.genre.value_counts()

Rock       125230
Pop         47896
Metal       34706
Hip-Hop     31066
Country     23224
Name: genre, dtype: int64

In [42]:
# Splitting and saving merged dataset with only 5 genres

split_and_save(d4, 'small')

In [43]:
X = d4[['lyrics', 'tokens']]
y = d4.genre

In [44]:
# Creating the balanced dataset by dropping redundant observations

under_sampler = RandomUnderSampler(random_state=7)
d5, y_res = under_sampler.fit_resample(X, y)

In [45]:
d5['genre'] = y_res
d5

Unnamed: 0,lyrics,tokens,genre
0,when the last breath of life is gone from my b...,"[last, breath, life, gone, body, lip, cold, se...",Country
1,i wonder if you have had the time it takes to ...,"[wonder, time, take, think, sort, feel, ings, ...",Country
2,i still recall the morning that i met you stan...,"[still, recall, morning, met, standing, front,...",Country
3,mornings closing in on me how can i face anoth...,"[morning, closing, face, another, dawn, life, ...",Country
4,the old man told his story about the years gon...,"[old, man, told, story, year, gone, played, ho...",Country
...,...,...,...
116115,and even after we all collapse in laughter and...,"[even, collapse, laughter, perfume, pass, hand...",Rock
116116,i cannot read sin for what it is like hold ins...,"[read, sin, like, hold, inside, silence, odds,...",Rock
116117,destroyed by m t v i hate to bite the hand tha...,"[destroyed, v, hate, bite, hand, feed, much, i...",Rock
116118,i heard a man who had no lungs he took me in a...,"[heard, man, lung, took, made, lunch, told, wo...",Rock


In [46]:
# Resulting numbers of observations

d5.genre.value_counts()

Country    23224
Hip-Hop    23224
Metal      23224
Pop        23224
Rock       23224
Name: genre, dtype: int64

In [47]:
# Splitting and saving the merged, balanced dataset with only 5 genres

split_and_save(d5, 'small_balanced')