In [332]:
import pandas as pd

In [333]:
df = pd.read_csv('lyrics_tagged.csv')

In [334]:
# Delete songs with empty tags

In [335]:
df = df[df['tag'].notnull()]

In [336]:
# Now we should understand - is our dataset imbalanced?

In [337]:
df.groupby('tag').size()

tag
60s                   21
6ix9ine                1
80s                  537
acoustic             180
alternative            1
alternative rock     198
blues                 60
britpop              122
cardi b                1
chillout               1
chris brown            1
classic rock        1489
country             1546
dance                 31
dancehall              1
ed sheeran             2
edm                    1
electronic           166
female vocalists     770
folk                 596
funk                 536
hindi                  2
hip hop                4
hiphop              2486
house                  2
indie                250
indie rock           191
jazz                1862
katy perry             1
kendrick lamar         1
nu metal              15
oldies                 1
pop                 6371
pop rock             125
rap                 1118
reggae               155
reggaeton              1
rnb                 1127
rock                2005
rockabilly           

In [338]:
# Yes, it's imbalanced. And also we see that we can fuse different genres into one. 

In [339]:
df = df.replace({'alternative rock': 'rock', 'classic rock': 'rock', 'indie rock': 'rock', 'pop rock': 'rock', 'rockabilly': 'rock', 'hip-hop': 'hip hop', 'hiphop': 'hip hop', 'rap': 'hip hop'})

In [340]:
# Check it

In [341]:
df.groupby('tag').size()

tag
60s                   21
6ix9ine                1
80s                  537
acoustic             180
alternative            1
blues                 60
britpop              122
cardi b                1
chillout               1
chris brown            1
country             1546
dance                 31
dancehall              1
ed sheeran             2
edm                    1
electronic           166
female vocalists     770
folk                 596
funk                 536
hindi                  2
hip hop             3608
house                  2
indie                250
jazz                1862
katy perry             1
kendrick lamar         1
nu metal              15
oldies                 1
pop                 6371
reggae               155
reggaeton              1
rnb                 1127
rock                4012
seen live              3
selena gomez           1
singersongwriter     157
soul                 926
soundtrack            28
swing                 71
the weeknd           

In [342]:
# I have decided to concentrate only on three genres - rock, hip hop, and pop

In [343]:
df = df[df['tag'].isin(['rock', 'pop', 'hip hop'])]

In [344]:
df.groupby('tag').size()

tag
hip hop    3608
pop        6371
rock       4012
dtype: int64

In [345]:
# But we still have a lot of pop-songs. Let's balance it

In [346]:
pop_songs = df[df['tag'] == 'pop'].sample(3800)

In [347]:
df = pd.concat([pop_songs, df[df['tag'] != 'pop']], axis=0)

In [348]:
df.groupby('tag').size()

tag
hip hop    3608
pop        3800
rock       4012
dtype: int64

In [349]:
# Now we should encode our genres

In [350]:
from sklearn import preprocessing

In [351]:
le = preprocessing.LabelEncoder()

In [352]:
le.fit(df['tag'])

LabelEncoder()

In [353]:
labels = le.transform(df['tag'])

In [354]:
# And than we split our data to train-data and test-data

In [355]:
from sklearn.model_selection import train_test_split

In [356]:
X_train, X_test, y_train, y_test = train_test_split(df['lyrics'], labels, test_size=0.2)

In [357]:
# Train TF-IDF Vectorizer and tranform lyrics to vectors

In [358]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [359]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 3))

In [360]:
tfidf_vectorizer.fit(X_train)

TfidfVectorizer(ngram_range=(2, 3))

In [361]:
X_data =  tfidf_vectorizer.transform(X_train)

In [362]:
# I have chosen simple KNeighbors Classifier for testing my idea. Just because It's easy and fast. 

In [363]:
from sklearn.neighbors import KNeighborsClassifier 

In [364]:
knn = KNeighborsClassifier(n_neighbors=80)

In [365]:
knn.fit(X_data, y_train)

KNeighborsClassifier(n_neighbors=80)

In [366]:
# Now we can test our classifier

In [367]:
X_test_data = tfidf_vectorizer.transform(X_test)

In [368]:
y_pred = knn.predict(X_test_data)

In [369]:
from sklearn.metrics import accuracy_score

In [370]:
accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)

0.6077057793345009

In [371]:
# 0.61 isn't a perfect result, but it's a good start point

In [372]:
# Let's try our trained model on examples

In [373]:
def predict_song(text):
    vector = tfidf_vectorizer.transform([text])
    predict = knn.predict(vector)
    return le.inverse_transform([predict])[0]

In [374]:
examples = X_test[:30]
predicted_labels = [predict_song(song_text) for song_text in X_test[:30]]

  return f(**kwargs)


In [375]:
real_labels = le.inverse_transform(y_test[:30])

In [376]:
results = []
for real, predicted in zip(real_labels, predicted_labels):
    results.append(f'Real genre: {real}, Predicted genre: {predicted}')

In [377]:
# It's interesting if we know the artist and song name

In [378]:
def find_source(lyrics):
    current_song = df[df['lyrics'] == lyrics]
    return f"{current_song['artist'].values[0]} - {current_song['song_name'].values[0]}"

In [379]:
# Let's print results

In [380]:
for text, result in zip(examples, results):
    print(find_source(text))
    print(f'{text[:300]}, \n\n {"-" * 50}\n\n{result} \n\n {"-" * 50}\n\n')

Chicago - Anyway You Want


Anyway you want me
It's alright with me
Anything you want
It's alright with me
Know that I love you
Know that I need you
Anyway you want
It's alright with me
Baby let me love you
Do what I want to do
Anyway you want
It's alright with me
Feels so good
Know that I love you
Know that I need you
Anyw, 

 --------------------------------------------------

Real genre: rock, Predicted genre: rock 

 --------------------------------------------------


Doja Cat - Grind On Me


(It's so thick, you know this)

I'm high as fuck
Light that dutch
Mocking when I'm walking
Ya, they bite that much
And they stalking my circumference, make your pipe stand up
Like a mic I adjust it I don't hide that stuff
Niggas in the kitchen with that white, doing bumps
I do not need that in my, 

 --------------------------------------------------

Real genre: pop, Predicted genre: rock 

 --------------------------------------------------


Eric Clapton - Call Me The Breeze


They call

In [None]:
# Let's try other lyrics

In [381]:
text = """
Stand against our enemy
Fire
The blessing touch of your despair
Despair and loneliness
You'll rise up, before you fall again
Atrocious
The bird of prey is haunting us

"Deus"
Conquer your fears
"Deus ex machina"
"Deus"
Surpass your weaknesses
"Deus ex machina"
"Deus"
Withstand the fall and stand up again
"Deus ex machina"
Provoking, disturbing, resisting
Surviving the ordeal
As a father-head of all of us
Your flame will forge our hearts, even in oblivion!
"Deus"
"Deus ex machina"
Resurrection brings our hopes to life
And the lies and fears dissolve
We are strong

Guide us / "Deus"
Your fire in our hearts / "Deus ex machina"
Guide us / "Deus"
You are bringing fear to gods / "Deus ex machina"
Guide us / "Deus"
Your fire in our hearts / "Deus ex machina"
Guide us / "Deus"
The children of Prometheus / "Deus ex machina"
"Guide us"
"Deus ex machina"

Prometheus...
As a giant, standing tall
In a shadow close the blackness
Prometheus
As your breed, we are bound to fall
But our light will scare the darkness
"""

In [382]:
predict_song(text)

'rock'

In [None]:
# It's Septicflesh's song "Prometheus". We don't have 'metal' genre in our dataset, but, of course, "rock" 
# is closer to the truth than "pop" or "hip hop"