In [1]:
import os
import pandas as pd
import sklearn.model_selection as ms

import re
from joblib import Parallel, delayed
import contractions
import nltk
from nltk.tokenize import word_tokenize  
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from imblearn.under_sampling import RandomUnderSampler

In [2]:
min_obs = 5000

In [3]:
if not os.path.exists('data/train'):
    os.makedirs('data/train')
if not os.path.exists('data/val'):
    os.makedirs('data/val')
if not os.path.exists('data/test'):
    os.makedirs('data/test')

In [4]:
# we are deleting special strings ("verse", "chorus", "2x" etc.), 
# strings in brackets and numbers
delete_str = ['"', ',', 
              'VERSE', 'Verse', 'verse', 
              'CHORUS', 'Chorus', 'chorus',
              '\[.*\]', r'\dx', r'\d+'
             ]

def basic_process(text):
    # deleting above strings
    for ds in delete_str:
        text = re.sub(ds, '', text)
    # deleting new line
    text = re.sub('\n', ' ', text)
    # expanding contractions
    expanded_words = [] 
    for word in text.split():
        expanded_words.append(contractions.fix(word))  
    text = ' '.join(expanded_words)
    # deleting all special characters
    text = ''.join(c for c in text if c.isalnum() or c == " ")
    # removing repeated spaces
    text = re.sub(' +', ' ', text)
    # lowering text
    text = text.lower()
    return text

In [5]:
def nltk_process(text):
    nltk_stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    # tokenization
    tokens = word_tokenize(text)
    result = []
    for word in tokens:
        # deleting stop words
        if word not in nltk_stop_words:
            # lemmatization
            lemma = lemmatizer.lemmatize(word)
            result.append(lemma)
    return result

In [6]:
# # Unused spacy version: (it deletes much more words as stopwords than nltk)
# pip install spacy
# python -m spacy downloady en
# import spacy
# nlp = spacy.load('en_core_web_sm')
# def spacy_process(text):
#     # tokenization
#     doc = nlp(text)
#     result = []
#     for token in doc:
#         # lemmatization
#         lemma = token.lemma_
#         # deleting stop words
#         lexeme = nlp.vocab[lemma]
#         if lexeme.is_stop == False:
#             result.append(lemma.lower())
#     return result

In [7]:
def process_lyrics_basic(lyrics):
    return Parallel(n_jobs=-1, backend='threading')(delayed(basic_process)(text) for text in lyrics)
# nltk doesn't work with Parallel in contradiciton to spacy
def process_lyrics(lyrics):
    return [nltk_process(text) for text in lyrics]

In [8]:
def split_and_save(df, name):
    df_train, df_test = ms.train_test_split(df, train_size=0.7, random_state=7, stratify=df.genre)
    df_train.to_csv(f'data/train/{name}.csv', index=False)
    dF_test.to_csv(f'data/test/{name}.csv', index=False)


# MetroLyrics

source of data: https://github.com/hiteshyalamanchili/SongGenreClassification/blob/master/dataset/english_cleaned_lyrics.zip

In [9]:
d = pd.read_csv('data/raw/english_cleaned_lyrics.csv', index_col=0)

In [10]:
d = d[['genre', 'lyrics']]
d

Unnamed: 0,genre,lyrics
0,Pop,Oh baby how you doing You know I'm gonna cut r...
1,Pop,playin everything so easy it's like you seem s...
2,Pop,If you search For tenderness It isn't hard to ...
3,Pop,Oh oh oh I oh oh oh I If I wrote a book about ...
4,Pop,Party the people the people the party it's pop...
...,...,...
362232,Country,I gotta say Boy after only just a couple of da...
362233,Country,I helped you find her diamond ring You made me...
362234,Country,Look at the couple in the corner booth Looks a...
362235,Country,When I fly off this mortal earth And I'm measu...


In [11]:
d = d[d.genre != "Other"]
d = d.replace("Folk", "Country")

In [12]:
vc = d.genre.value_counts()
vc

Rock          100053
Pop            34137
Hip-Hop        22654
Metal          21210
Country        15847
Jazz            7310
Electronic      6942
R&B             3336
Indie           2935
Name: genre, dtype: int64

In [13]:
processed_lyrics = process_lyrics_basic(d.lyrics)
d['lyrics'] = processed_lyrics
d

Unnamed: 0,genre,lyrics
0,Pop,oh baby how you doing you know i am going to c...
1,Pop,playin everything so easy it is like you seem ...
2,Pop,if you search for tenderness it is not hard to...
3,Pop,oh oh oh i oh oh oh i if i wrote a book about ...
4,Pop,party the people the people the party it is po...
...,...,...
362232,Country,i got to say boy after only just a couple of d...
362233,Country,i helped you find her diamond ring you made me...
362234,Country,look at the couple in the corner booth looks a...
362235,Country,when i fly off this mortal earth and i am meas...


In [16]:
split_and_save(d, "metrolyrics")

In [17]:
processed_lyrics = process_lyrics(d.lyrics)
d['tokens'] = processed_lyrics
d

Unnamed: 0,genre,lyrics,tokens
0,Pop,oh baby how you doing you know i am going to c...,"[oh, baby, know, going, cut, right, chase, wom..."
1,Pop,playin everything so easy it is like you seem ...,"[playin, everything, easy, like, seem, sure, s..."
2,Pop,if you search for tenderness it is not hard to...,"[search, tenderness, hard, find, love, need, l..."
3,Pop,oh oh oh i oh oh oh i if i wrote a book about ...,"[oh, oh, oh, oh, oh, oh, wrote, book, stand, t..."
4,Pop,party the people the people the party it is po...,"[party, people, people, party, popping, sittin..."
...,...,...,...
362232,Country,i got to say boy after only just a couple of d...,"[got, say, boy, couple, date, hand, outright, ..."
362233,Country,i helped you find her diamond ring you made me...,"[helped, find, diamond, ring, made, try, every..."
362234,Country,look at the couple in the corner booth looks a...,"[look, couple, corner, booth, look, lot, like,..."
362235,Country,when i fly off this mortal earth and i am meas...,"[fly, mortal, earth, measured, depth, girth, f..."


In [18]:
split_and_save(d, "metrolyrics_proc")

# Song lyrics from 79 musical genres

source of data: https://www.kaggle.com/datasets/neisse/scrapped-lyrics-from-6-genres

In [19]:
d1 = pd.read_csv('data/raw/artists-data.csv')
d1

Unnamed: 0,Artist,Genres,Songs,Popularity,Link
0,Ivete Sangalo,Pop; Axé; Romântico,313.0,4.4,/ivete-sangalo/
1,Chiclete com Banana,Axé,268.0,3.8,/chiclete-com-banana/
2,Banda Eva,Axé; Romântico; Reggae,215.0,2.3,/banda-eva/
3,É O Tchan,Axé,129.0,1.6,/e-o-tchan/
4,Claudia Leitte,Pop; Axé; Romântico,167.0,1.5,/claudia-leitte/
...,...,...,...,...,...
4163,Miriam Makeba,World Music; Black Music; Blues,17.0,0.0,/miriam-makeba/
4164,Freddie Aguilar,World Music,61.0,0.0,/freddie-aguilar/
4165,Amadou & Mariam,World Music,14.0,0.0,/amadou-mariam/
4166,Magic System,World Music; Gospel/Religioso,16.0,0.0,/magic-system/


In [20]:
d1.Genres = d1.Genres.str.replace(',', ';')
d1.Genres = d1.Genres.str.split(r'\s*;\s*').str[0]
d1

Unnamed: 0,Artist,Genres,Songs,Popularity,Link
0,Ivete Sangalo,Pop,313.0,4.4,/ivete-sangalo/
1,Chiclete com Banana,Axé,268.0,3.8,/chiclete-com-banana/
2,Banda Eva,Axé,215.0,2.3,/banda-eva/
3,É O Tchan,Axé,129.0,1.6,/e-o-tchan/
4,Claudia Leitte,Pop,167.0,1.5,/claudia-leitte/
...,...,...,...,...,...
4163,Miriam Makeba,World Music,17.0,0.0,/miriam-makeba/
4164,Freddie Aguilar,World Music,61.0,0.0,/freddie-aguilar/
4165,Amadou & Mariam,World Music,14.0,0.0,/amadou-mariam/
4166,Magic System,World Music,16.0,0.0,/magic-system/


In [21]:
d1 = d1[['Genres', 'Link']]
d1

Unnamed: 0,Genres,Link
0,Pop,/ivete-sangalo/
1,Axé,/chiclete-com-banana/
2,Axé,/banda-eva/
3,Axé,/e-o-tchan/
4,Pop,/claudia-leitte/
...,...,...
4163,World Music,/miriam-makeba/
4164,World Music,/freddie-aguilar/
4165,World Music,/amadou-mariam/
4166,World Music,/magic-system/


In [24]:
d1.Genres.value_counts()

Gospel/Religioso    464
Rock                246
Pop                 236
Sertanejo           220
Indie               175
                   ... 
Metal                 2
Lo-fi                 2
Electro Swing         2
Piseiro               1
Urban                 1
Name: Genres, Length: 79, dtype: int64

In [25]:
d2 = pd.read_csv('data/raw/lyrics-data.csv')
d2

Unnamed: 0,ALink,SName,SLink,Lyric,language
0,/ivete-sangalo/,Arerê,/ivete-sangalo/arere.html,"Tudo o que eu quero nessa vida,\nToda vida, é\...",pt
1,/ivete-sangalo/,Se Eu Não Te Amasse Tanto Assim,/ivete-sangalo/se-eu-nao-te-amasse-tanto-assim...,Meu coração\nSem direção\nVoando só por voar\n...,pt
2,/ivete-sangalo/,Céu da Boca,/ivete-sangalo/chupa-toda.html,É de babaixá!\nÉ de balacubaca!\nÉ de babaixá!...,pt
3,/ivete-sangalo/,Quando A Chuva Passar,/ivete-sangalo/quando-a-chuva-passar.html,Quando a chuva passar\n\nPra quê falar\nSe voc...,pt
4,/ivete-sangalo/,Sorte Grande,/ivete-sangalo/sorte-grande.html,A minha sorte grande foi você cair do céu\nMin...,pt
...,...,...,...,...,...
379926,/clegg-johnny/,The Waiting,/clegg-johnny/the-waiting.html,Chorus\nHere we stand waiting on the plain\nDa...,en
379927,/clegg-johnny/,Too Early For The Sky,/clegg-johnny/too-early-for-the-sky.html,I nearly disappeared into the mouth of a croco...,en
379928,/clegg-johnny/,Warsaw 1943 (I Never Betrayed The Revolution),/clegg-johnny/warsaw-1943-i-never-betrayed-the...,"Amambuka, amambuka azothengisa izwe lakithi, i...",en
379929,/clegg-johnny/,When The System Has Fallen,/clegg-johnny/when-the-system-has-fallen.html,Sweat in the heat for days on end\nwaiting for...,en


In [26]:
d2 = d2[d2.language == 'en']
d2

Unnamed: 0,ALink,SName,SLink,Lyric,language
69,/ivete-sangalo/,Careless Whisper,/ivete-sangalo/careless-whisper.html,I feel so unsure\nAs I take your hand and lead...,en
86,/ivete-sangalo/,Could You Be Loved / Citação Musical do Rap: S...,/ivete-sangalo/could-you-be-loved-citacao-musi...,"Don't let them fool, ya\nOr even try to school...",en
88,/ivete-sangalo/,Cruisin' (Part. Saulo),/ivete-sangalo/cruisin-part-saulo.html,"Baby, let's cruise, away from here\nDon't be c...",en
111,/ivete-sangalo/,Easy,/ivete-sangalo/easy.html,"Know it sounds funny\nBut, I just can't stand ...",en
140,/ivete-sangalo/,For Your Babies (The Voice cover),/ivete-sangalo/for-your-babies-the-voice-cover...,You've got that look again\nThe one I hoped I ...,en
...,...,...,...,...,...
379926,/clegg-johnny/,The Waiting,/clegg-johnny/the-waiting.html,Chorus\nHere we stand waiting on the plain\nDa...,en
379927,/clegg-johnny/,Too Early For The Sky,/clegg-johnny/too-early-for-the-sky.html,I nearly disappeared into the mouth of a croco...,en
379928,/clegg-johnny/,Warsaw 1943 (I Never Betrayed The Revolution),/clegg-johnny/warsaw-1943-i-never-betrayed-the...,"Amambuka, amambuka azothengisa izwe lakithi, i...",en
379929,/clegg-johnny/,When The System Has Fallen,/clegg-johnny/when-the-system-has-fallen.html,Sweat in the heat for days on end\nwaiting for...,en


In [27]:
d2 = d2[['ALink', 'Lyric']]
d2 = d2.rename(columns={'ALink': 'Link'})
d2

Unnamed: 0,Link,Lyric
69,/ivete-sangalo/,I feel so unsure\nAs I take your hand and lead...
86,/ivete-sangalo/,"Don't let them fool, ya\nOr even try to school..."
88,/ivete-sangalo/,"Baby, let's cruise, away from here\nDon't be c..."
111,/ivete-sangalo/,"Know it sounds funny\nBut, I just can't stand ..."
140,/ivete-sangalo/,You've got that look again\nThe one I hoped I ...
...,...,...
379926,/clegg-johnny/,Chorus\nHere we stand waiting on the plain\nDa...
379927,/clegg-johnny/,I nearly disappeared into the mouth of a croco...
379928,/clegg-johnny/,"Amambuka, amambuka azothengisa izwe lakithi, i..."
379929,/clegg-johnny/,Sweat in the heat for days on end\nwaiting for...


In [28]:
d3 = pd.merge(d1, d2, on='Link')
d3 = d3[['Genres', 'Lyric']]
d3 = d3.rename(columns={'Genres': 'genre', 'Lyric': 'lyrics'})

In [29]:
processed_lyrics = process_lyrics_basic(d3.lyrics)
d3['lyrics'] = processed_lyrics
d3

Unnamed: 0,genre,lyrics
0,Pop,i feel so unsure as i take your hand and lead ...
1,Pop,do not let them fool ya or even try to school ...
2,Pop,baby let us cruise away from here do not be co...
3,Pop,know it sounds funny but i just cannot stand t...
4,Pop,you have got that look again the one i hoped i...
...,...,...
191382,World Music,here we stand waiting on the plain darkness ha...
191383,World Music,i nearly disappeared into the mouth of a croco...
191384,World Music,amambuka amambuka azothengisa izwe lakithi izw...
191385,World Music,sweat in the heat for days on end waiting for ...


In [30]:
vc = d3.genre.value_counts()
vc

Rock             25177
Pop              13759
Heavy Metal      13496
Indie            12998
Rap               9589
                 ...  
Electro Swing        6
Jovem Guarda         6
Forró                3
Lo-fi                1
Regional             1
Name: genre, Length: 73, dtype: int64

In [31]:
vc[vc < min_obs]

Hard Rock        4632
Soul Music       4518
Dance            4252
Punk Rock        4157
Folk             4055
                 ... 
Electro Swing       6
Jovem Guarda        6
Forró               3
Lo-fi               1
Regional            1
Name: genre, Length: 62, dtype: int64

In [32]:
d3 = d3[d3.genre.isin(vc[vc >= min_obs].index.tolist())]
d3 = d3[d3.genre != "Pop/Rock"]
d3 = d3.replace("Heavy Metal", "Metal")
d3 = d3.replace("Rock Alternativo", "Alternative Rock")

In [33]:
vc = d3.genre.value_counts()
vc

Rock                25177
Pop                 13759
Metal               13496
Indie               12998
Rap                  9589
Hip Hop              8412
Country              7377
Alternative Rock     5555
R&B                  5309
Gospel/Religioso     5017
Name: genre, dtype: int64

In [34]:
split_and_save(d3, "musicalgenres")

In [35]:
processed_lyrics = process_lyrics(d3.lyrics)
d3['tokens'] = processed_lyrics
d3

Unnamed: 0,genre,lyrics,tokens
0,Pop,i feel so unsure as i take your hand and lead ...,"[feel, unsure, take, hand, lead, dance, floor,..."
1,Pop,do not let them fool ya or even try to school ...,"[let, fool, ya, even, try, school, ya, oh, got..."
2,Pop,baby let us cruise away from here do not be co...,"[baby, let, u, cruise, away, confused, way, cl..."
3,Pop,know it sounds funny but i just cannot stand t...,"[know, sound, funny, stand, pain, girl, leavin..."
4,Pop,you have got that look again the one i hoped i...,"[got, look, one, hoped, lad, face, beaming, sm..."
...,...,...,...
182627,Alternative Rock,words guitar i got it words guitar i like it w...,"[word, guitar, got, word, guitar, like, way, w..."
182628,Alternative Rock,i got your letter today i read the things you ...,"[got, letter, today, read, thing, say, thing, ..."
182629,Alternative Rock,you are not it you are not it you are the hott...,"[hottest, band, around, biggest, dick, town, m..."
182630,Alternative Rock,you are no rock n roll fun like a party that i...,"[rock, n, roll, fun, like, party, begun, walk,..."


In [36]:
split_and_save(d3, "musicalgenres_proc")

In [37]:
d4 = pd.concat([d, d3])
d4

Unnamed: 0,genre,lyrics,tokens
0,Pop,oh baby how you doing you know i am going to c...,"[oh, baby, know, going, cut, right, chase, wom..."
1,Pop,playin everything so easy it is like you seem ...,"[playin, everything, easy, like, seem, sure, s..."
2,Pop,if you search for tenderness it is not hard to...,"[search, tenderness, hard, find, love, need, l..."
3,Pop,oh oh oh i oh oh oh i if i wrote a book about ...,"[oh, oh, oh, oh, oh, oh, wrote, book, stand, t..."
4,Pop,party the people the people the party it is po...,"[party, people, people, party, popping, sittin..."
...,...,...,...
182627,Alternative Rock,words guitar i got it words guitar i like it w...,"[word, guitar, got, word, guitar, like, way, w..."
182628,Alternative Rock,i got your letter today i read the things you ...,"[got, letter, today, read, thing, say, thing, ..."
182629,Alternative Rock,you are not it you are not it you are the hott...,"[hottest, band, around, biggest, dick, town, m..."
182630,Alternative Rock,you are no rock n roll fun like a party that i...,"[rock, n, roll, fun, like, party, begun, walk,..."


In [38]:
vc = d4.genre.value_counts()
vc

Rock                125230
Pop                  47896
Metal                34706
Country              23224
Hip-Hop              22654
Indie                15933
Rap                   9589
R&B                   8645
Hip Hop               8412
Jazz                  7310
Electronic            6942
Alternative Rock      5555
Gospel/Religioso      5017
Name: genre, dtype: int64

In [39]:
split_and_save(d4, "full")

In [40]:
d4 = d4[d4.genre.isin(vc[vc >= 20000].index.tolist())]

In [47]:
vc = d4.genre.value_counts()
vc

Rock       125230
Pop         47896
Metal       34706
Country     23224
Hip-Hop     22654
Name: genre, dtype: int64

In [41]:
split_and_save(d4, "small")

In [42]:
X = d4[["lyrics","tokens"]]
y = d4.genre

In [43]:
under_sampler = RandomUnderSampler(random_state=7)
d5, y_res = under_sampler.fit_resample(X, y)

In [45]:
d5['genre'] = y_res

In [48]:
d5

Unnamed: 0,lyrics,tokens,genre
0,one sages have come from far following one bri...,"[one, sage, come, far, following, one, bright,...",Country
1,you will remember me when the west winds moves...,"[remember, west, wind, move, among, field, bar...",Country
2,one day the train was passing i caught it comi...,"[one, day, train, passing, caught, comin, look...",Country
3,it is been a long long time writers sammy cahn...,"[long, long, time, writer, sammy, cahn, jule, ...",Country
4,shortcut to part of bill monroe lyrics bill mo...,"[shortcut, part, bill, monroe, lyric, bill, mo...",Country
...,...,...,...
113265,sanctify me clean out my closet take away anyt...,"[sanctify, clean, closet, take, away, anything...",Rock
113266,stare out my window until the light fades the ...,"[stare, window, light, fade, darkness, overcom...",Rock
113267,oh heartbeat city here we come and happy days ...,"[oh, heartbeat, city, come, happy, day, count,...",Rock
113268,what would we do if love were dead what would ...,"[would, love, dead, would, say, silly, thing, ...",Rock


In [49]:
vc = d5.genre.value_counts()
vc

Country    22654
Hip-Hop    22654
Metal      22654
Pop        22654
Rock       22654
Name: genre, dtype: int64

In [46]:
split_and_save(d5, "small_balanced")