In [1]:
import os
import pandas as pd
import sklearn.model_selection as ms

In [2]:
# Create the output directories for the three dataset splits.
# exist_ok=True makes the cell idempotent and avoids the check-then-create
# race of a separate os.path.exists() test.
for split_dir in ('data/train', 'data/val', 'data/test'):
    os.makedirs(split_dir, exist_ok=True)

# MetroLyrics

Source of data: https://github.com/hiteshyalamanchili/SongGenreClassification/blob/master/dataset/english_cleaned_lyrics.zip (the notebook reads the extracted file from `data/raw/english_cleaned_lyrics.csv`).

In [3]:
# Load the cleaned MetroLyrics dump; the first CSV column becomes the index.
metrolyrics_csv = 'data/raw/english_cleaned_lyrics.csv'
d = pd.read_csv(metrolyrics_csv, index_col=0)
d

Unnamed: 0,index,song,year,artist,genre,lyrics
0,0,ego-remix,2009,beyonce-knowles,Pop,Oh baby how you doing You know I'm gonna cut r...
1,1,then-tell-me,2009,beyonce-knowles,Pop,playin everything so easy it's like you seem s...
2,2,honesty,2009,beyonce-knowles,Pop,If you search For tenderness It isn't hard to ...
3,3,you-are-my-rock,2009,beyonce-knowles,Pop,Oh oh oh I oh oh oh I If I wrote a book about ...
4,4,black-culture,2009,beyonce-knowles,Pop,Party the people the people the party it's pop...
...,...,...,...,...,...,...
362232,362232,who-am-i-drinking-tonight,2012,edens-edge,Country,I gotta say Boy after only just a couple of da...
362233,362233,liar,2012,edens-edge,Country,I helped you find her diamond ring You made me...
362234,362234,last-supper,2012,edens-edge,Country,Look at the couple in the corner booth Looks a...
362235,362235,christ-alone-live-in-studio,2012,edens-edge,Country,When I fly off this mortal earth And I'm measu...


In [4]:
# Number of songs per genre label.
d['genre'].value_counts()

Rock          100053
Pop            34137
Hip-Hop        22654
Metal          21210
Country        14158
Jazz            7310
Electronic      6942
Other           3786
R&B             3336
Indie           2935
Folk            1689
Name: genre, dtype: int64

In [5]:
# Drop double quotes (plain literal match).
d['lyrics'] = d['lyrics'].str.replace('"', '', regex=False)

# Drop verse/chorus section markers, but only when the marker is not embedded
# in a longer alphabetic word. A bare substring replace would also strip the
# middle of ordinary words (e.g. "universe" -> "uni", "reverse" -> "re").
# Markers followed by digits or punctuation ("Verse1", "Chorus:") still match.
section_marker = r'(?<![A-Za-z])(?:VERSE|verse|Verse|CHORUS|Chorus|chorus)(?![A-Za-z])'
d['lyrics'] = d['lyrics'].str.replace(section_marker, '', regex=True)

In [6]:
# Keep only the label and the text columns.
d = d.loc[:, ['genre', 'lyrics']]
d

Unnamed: 0,genre,lyrics
0,Pop,Oh baby how you doing You know I'm gonna cut r...
1,Pop,playin everything so easy it's like you seem s...
2,Pop,If you search For tenderness It isn't hard to ...
3,Pop,Oh oh oh I oh oh oh I If I wrote a book about ...
4,Pop,Party the people the people the party it's pop...
...,...,...
362232,Country,I gotta say Boy after only just a couple of da...
362233,Country,I helped you find her diamond ring You made me...
362234,Country,Look at the couple in the corner booth Looks a...
362235,Country,When I fly off this mortal earth And I'm measu...


In [7]:
# 70% of the rows go to train; the remaining 30% is divided 33% / 67% into
# validation and test. Both splits are stratified on genre with a fixed seed.
d_train, d_val_test = ms.train_test_split(
    d,
    train_size=0.7,
    random_state=7,
    stratify=d['genre'],
)
d_val, d_test = ms.train_test_split(
    d_val_test,
    train_size=0.33,
    random_state=7,
    stratify=d_val_test['genre'],
)

In [8]:
# Persist each split without the index column.
for split_frame, out_path in (
    (d_train, 'data/train/metrolyrics.csv'),
    (d_val, 'data/val/metrolyrics.csv'),
    (d_test, 'data/test/metrolyrics.csv'),
):
    split_frame.to_csv(out_path, index=False)

# Song lyrics from 79 musical genres

Source of data: https://www.kaggle.com/datasets/neisse/scrapped-lyrics-from-6-genres (the notebook reads `artists-data.csv` and `lyrics-data.csv` from `data/raw/`).

In [9]:
# Load the artist metadata table.
artists_csv = 'data/raw/artists-data.csv'
d1 = pd.read_csv(artists_csv)
d1

Unnamed: 0,Artist,Genres,Songs,Popularity,Link
0,Ivete Sangalo,Pop; Axé; Romântico,313.0,4.4,/ivete-sangalo/
1,Chiclete com Banana,Axé,268.0,3.8,/chiclete-com-banana/
2,Banda Eva,Axé; Romântico; Reggae,215.0,2.3,/banda-eva/
3,É O Tchan,Axé,129.0,1.6,/e-o-tchan/
4,Claudia Leitte,Pop; Axé; Romântico,167.0,1.5,/claudia-leitte/
...,...,...,...,...,...
4163,Miriam Makeba,World Music; Black Music; Blues,17.0,0.0,/miriam-makeba/
4164,Freddie Aguilar,World Music,61.0,0.0,/freddie-aguilar/
4165,Amadou & Mariam,World Music,14.0,0.0,/amadou-mariam/
4166,Magic System,World Music; Gospel/Religioso,16.0,0.0,/magic-system/


In [10]:
# Reduce each artist's genre list to its first entry. Commas are normalised
# to semicolons, then the string is split on ';' with optional whitespace.
first_genre = (
    d1['Genres']
    .str.replace(',', ';')
    .str.split(r'\s*;\s*')
    .str[0]
)
d1['Genres'] = first_genre
d1

Unnamed: 0,Artist,Genres,Songs,Popularity,Link
0,Ivete Sangalo,Pop,313.0,4.4,/ivete-sangalo/
1,Chiclete com Banana,Axé,268.0,3.8,/chiclete-com-banana/
2,Banda Eva,Axé,215.0,2.3,/banda-eva/
3,É O Tchan,Axé,129.0,1.6,/e-o-tchan/
4,Claudia Leitte,Pop,167.0,1.5,/claudia-leitte/
...,...,...,...,...,...
4163,Miriam Makeba,World Music,17.0,0.0,/miriam-makeba/
4164,Freddie Aguilar,World Music,61.0,0.0,/freddie-aguilar/
4165,Amadou & Mariam,World Music,14.0,0.0,/amadou-mariam/
4166,Magic System,World Music,16.0,0.0,/magic-system/


In [11]:
# Only the genre label and the artist link (the join key) are needed.
d1 = d1.loc[:, ['Genres', 'Link']]
d1

Unnamed: 0,Genres,Link
0,Pop,/ivete-sangalo/
1,Axé,/chiclete-com-banana/
2,Axé,/banda-eva/
3,Axé,/e-o-tchan/
4,Pop,/claudia-leitte/
...,...,...
4163,World Music,/miriam-makeba/
4164,World Music,/freddie-aguilar/
4165,World Music,/amadou-mariam/
4166,World Music,/magic-system/


In [12]:
# Number of artists per (first-listed) genre.
d1['Genres'].value_counts()

Gospel/Religioso    464
Rock                246
Pop                 236
Sertanejo           220
Indie               175
                   ... 
Metal                 2
Lo-fi                 2
Electro Swing         2
Piseiro               1
Urban                 1
Name: Genres, Length: 79, dtype: int64

In [13]:
# Load the per-song lyrics table.
lyrics_csv = 'data/raw/lyrics-data.csv'
d2 = pd.read_csv(lyrics_csv)
d2

Unnamed: 0,ALink,SName,SLink,Lyric,language
0,/ivete-sangalo/,Arerê,/ivete-sangalo/arere.html,"Tudo o que eu quero nessa vida,\nToda vida, é\...",pt
1,/ivete-sangalo/,Se Eu Não Te Amasse Tanto Assim,/ivete-sangalo/se-eu-nao-te-amasse-tanto-assim...,Meu coração\nSem direção\nVoando só por voar\n...,pt
2,/ivete-sangalo/,Céu da Boca,/ivete-sangalo/chupa-toda.html,É de babaixá!\nÉ de balacubaca!\nÉ de babaixá!...,pt
3,/ivete-sangalo/,Quando A Chuva Passar,/ivete-sangalo/quando-a-chuva-passar.html,Quando a chuva passar\n\nPra quê falar\nSe voc...,pt
4,/ivete-sangalo/,Sorte Grande,/ivete-sangalo/sorte-grande.html,A minha sorte grande foi você cair do céu\nMin...,pt
...,...,...,...,...,...
379926,/clegg-johnny/,The Waiting,/clegg-johnny/the-waiting.html,Chorus\nHere we stand waiting on the plain\nDa...,en
379927,/clegg-johnny/,Too Early For The Sky,/clegg-johnny/too-early-for-the-sky.html,I nearly disappeared into the mouth of a croco...,en
379928,/clegg-johnny/,Warsaw 1943 (I Never Betrayed The Revolution),/clegg-johnny/warsaw-1943-i-never-betrayed-the...,"Amambuka, amambuka azothengisa izwe lakithi, i...",en
379929,/clegg-johnny/,When The System Has Fallen,/clegg-johnny/when-the-system-has-fallen.html,Sweat in the heat for days on end\nwaiting for...,en


In [14]:
# Clean the raw lyric text. Order matters: bracketed tags are removed while
# the text still contains line breaks, so a tag never spans several lines.
lyric_patterns = [
    '"',
    # Bracketed annotations such as "[Chorus]". The negated character class
    # limits each match to a single tag; a greedy '.*' would delete everything
    # from the first '[' to the last ']' on a line, including real lyrics
    # sitting between two tags. Raw string avoids invalid escape sequences.
    r'\[[^\]\n]*\]',
    ',',
    'VERSE',
    'Verse',
    'CHORUS',
    'Chorus',
    'chorus',
]
for pattern in lyric_patterns:
    d2['Lyric'] = d2['Lyric'].str.replace(pattern, '', regex=True)

# Flatten every lyric onto one line (literal newline -> single space).
d2['Lyric'] = d2['Lyric'].str.replace('\n', ' ', regex=False)

In [15]:
# Keep only the rows whose language tag is English.
is_english = d2['language'].eq('en')
d2 = d2[is_english]
d2

Unnamed: 0,ALink,SName,SLink,Lyric,language
69,/ivete-sangalo/,Careless Whisper,/ivete-sangalo/careless-whisper.html,I feel so unsure As I take your hand and lead ...,en
86,/ivete-sangalo/,Could You Be Loved / Citação Musical do Rap: S...,/ivete-sangalo/could-you-be-loved-citacao-musi...,Don't let them fool ya Or even try to school y...,en
88,/ivete-sangalo/,Cruisin' (Part. Saulo),/ivete-sangalo/cruisin-part-saulo.html,Baby let's cruise away from here Don't be conf...,en
111,/ivete-sangalo/,Easy,/ivete-sangalo/easy.html,Know it sounds funny But I just can't stand th...,en
140,/ivete-sangalo/,For Your Babies (The Voice cover),/ivete-sangalo/for-your-babies-the-voice-cover...,You've got that look again The one I hoped I h...,en
...,...,...,...,...,...
379926,/clegg-johnny/,The Waiting,/clegg-johnny/the-waiting.html,Here we stand waiting on the plain Darkness h...,en
379927,/clegg-johnny/,Too Early For The Sky,/clegg-johnny/too-early-for-the-sky.html,I nearly disappeared into the mouth of a croco...,en
379928,/clegg-johnny/,Warsaw 1943 (I Never Betrayed The Revolution),/clegg-johnny/warsaw-1943-i-never-betrayed-the...,Amambuka amambuka azothengisa izwe lakithi izw...,en
379929,/clegg-johnny/,When The System Has Fallen,/clegg-johnny/when-the-system-has-fallen.html,Sweat in the heat for days on end waiting for ...,en


In [16]:
# Keep the artist link and the lyric text; rename the link column so it
# matches the join key used in the artist table.
d2 = d2.loc[:, ['ALink', 'Lyric']].rename(columns={'ALink': 'Link'})
d2

Unnamed: 0,Link,Lyric
69,/ivete-sangalo/,I feel so unsure As I take your hand and lead ...
86,/ivete-sangalo/,Don't let them fool ya Or even try to school y...
88,/ivete-sangalo/,Baby let's cruise away from here Don't be conf...
111,/ivete-sangalo/,Know it sounds funny But I just can't stand th...
140,/ivete-sangalo/,You've got that look again The one I hoped I h...
...,...,...
379926,/clegg-johnny/,Here we stand waiting on the plain Darkness h...
379927,/clegg-johnny/,I nearly disappeared into the mouth of a croco...
379928,/clegg-johnny/,Amambuka amambuka azothengisa izwe lakithi izw...
379929,/clegg-johnny/,Sweat in the heat for days on end waiting for ...


In [17]:
# Attach each lyric to its artist's genre through the shared 'Link' key.
# The artist side is deduplicated first: if a link occurred more than once in
# d1, the inner join would emit every lyric once per occurrence, and those
# copies could then land in different train/val/test splits.
artist_genres = d1.drop_duplicates(subset='Link')
d3 = pd.merge(artist_genres, d2, on='Link')
d3 = d3[['Genres', 'Lyric']]
d3

Unnamed: 0,Genres,Lyric
0,Pop,I feel so unsure As I take your hand and lead ...
1,Pop,Don't let them fool ya Or even try to school y...
2,Pop,Baby let's cruise away from here Don't be conf...
3,Pop,Know it sounds funny But I just can't stand th...
4,Pop,You've got that look again The one I hoped I h...
...,...,...
191382,World Music,Here we stand waiting on the plain Darkness h...
191383,World Music,I nearly disappeared into the mouth of a croco...
191384,World Music,Amambuka amambuka azothengisa izwe lakithi izw...
191385,World Music,Sweat in the heat for days on end waiting for ...


In [18]:
# Lyrics per genre after the join; reused below to filter rare genres.
vc = d3['Genres'].value_counts()
vc

Rock             25177
Pop              13759
Heavy Metal      13496
Indie            12998
Rap               9589
                 ...  
Electro Swing        6
Jovem Guarda         6
Forró                3
Lo-fi                1
Regional             1
Name: Genres, Length: 73, dtype: int64

In [19]:
# Genres with fewer than 1000 lyrics.
vc.loc[vc.lt(1000)]

Tecnopop          994
House             933
Power-Pop         840
Surf Music        799
Classic Rock      753
Emocore           747
New Age           735
Industrial        723
Piano Rock        723
K-Pop/K-Rock      550
Psicodelia        541
Ska               479
Infantil          475
Funk              360
Clássico          356
Trance            355
MPB               329
Chillout          228
World Music       220
Instrumental      192
Tropical House    165
Metal             148
Bossa Nova        114
Post-Rock         106
Trap               99
Sertanejo          44
Samba              34
Reggaeton          32
Axé                14
Fado               13
Funk Carioca        9
Kizomba             7
Electro Swing       6
Jovem Guarda        6
Forró               3
Lo-fi               1
Regional            1
Name: Genres, dtype: int64

In [20]:
# Keep only the lyrics whose genre has at least 1000 examples.
is_frequent = vc.ge(1000)
frequent_genres = list(vc[is_frequent].index)
d3 = d3[d3['Genres'].isin(frequent_genres)]
d3

Unnamed: 0,Genres,Lyric
0,Pop,I feel so unsure As I take your hand and lead ...
1,Pop,Don't let them fool ya Or even try to school y...
2,Pop,Baby let's cruise away from here Don't be conf...
3,Pop,Know it sounds funny But I just can't stand th...
4,Pop,You've got that look again The one I hoped I h...
...,...,...
191128,Trip-Hop,I had you there in the palm of my hand Saw tha...
191129,Trip-Hop,We’re all searching Time’s unfolding Trying to...
191130,Trip-Hop,Ther’s a clear blue sky outside A clear blue s...
191131,Trip-Hop,I’m looking at the people Walking up and down ...


In [21]:
# Use the same column names as the MetroLyrics frame.
column_names = {'Genres': 'genre', 'Lyric': 'lyrics'}
d3 = d3.rename(columns=column_names)

In [22]:
# 70% of the rows go to train; the remaining 30% is divided 33% / 67% into
# validation and test. Both splits are stratified on genre with a fixed seed.
d3_train, d3_val_test = ms.train_test_split(
    d3,
    train_size=0.7,
    random_state=7,
    stratify=d3['genre'],
)
d3_val, d3_test = ms.train_test_split(
    d3_val_test,
    train_size=0.33,
    random_state=7,
    stratify=d3_val_test['genre'],
)

In [23]:
# Persist each split without the index column.
for split_frame, out_path in (
    (d3_train, 'data/train/musicalgenres.csv'),
    (d3_val, 'data/val/musicalgenres.csv'),
    (d3_test, 'data/test/musicalgenres.csv'),
):
    split_frame.to_csv(out_path, index=False)