# Predicting Genre Using Lyrics

Here we use the same Spotify dataset from Kaggle but we have scraped the lyrics for a random 5000 of them. We will use this data to see if we can predict track genre based on lyrics better than with song attributes.

In [1]:
import pandas as pd
import numpy as np

In [2]:
lyrics = pd.read_csv('song_lyrics.csv', index_col=0)
songs = pd.read_csv('spotify.csv', index_col=0)

In [4]:
data = lyrics.merge(songs, on='track_name', how='inner')
cols = ['popularity', 'track_name', 'song_id', 'track_id', 'lyrics', 'track_genre']
data = data[cols].groupby(by='song_id', as_index=False).max()

In [5]:
data

Unnamed: 0,song_id,popularity,track_name,track_id,lyrics,track_genre
0,43,62,Juicy,2AP7m2dBb8ULTx4Gc1rdMc,"\nYeah, this album is dedicated\nTo all the te...",hardcore
1,237,82,It Was A Good Day,2qOm7ukLyHUXWyR4ZWLwxA,\nBreak 'em\nYeah\nYeah\nYeah\nUh\nJust wakin'...,funk
2,442,62,Mathematics,3gRlmtdCyNoKiyozn2pqc9,"\nBucka-bucka-bucka-bucka-bucka-bucka, haha!\n...",hardcore
3,508,0,Lightfall,4j4GpPOvUoZSz1ozjPyMqp,"\nUh, I've been up for four days, gettin' mone...",iranian
4,544,70,Can't C Me,7rUchbZxrhF29Q0vYjKEU0,\nThe blind stares of a million pairs of eyes\...,funk
...,...,...,...,...,...,...
3867,8612099,2,Told You I Could Drink,6XECmlbKVHK2Oc4MS0nm7g,Hot 30 Weekend Countdown Lyrics\n30. Jessie J...,country
3868,8617375,0,Good as Hell - Two Stacks Remix,18Mf7Xqt0DEL8Uo8P40BQk,\nRepertorio Álbumes Traducidos al Español\nLo...,hip-hop
3869,8620675,33,Can't Take This,3H5eCyts4yAaxhoZzde5Kh,"\nIntro :\nYou Know\nAye Shoutout Detroit, all...",club
3870,8622772,45,It's Not Too Late,3j3xA2JFjRvV9MVF2mSBc5,Met Gala Lyrics\nHey\nWizop\nOffset\nWoo\nMil...,dubstep


## Data Cleaning

### Removing songs without lyrics

In [6]:
data = data[~data['lyrics'].str.contains('This song is an instrumental')]
data = data[~data['lyrics'].str.contains('This music does not contain words')]

### Removing non-English songs

In [7]:
from langdetect import detect

def detectSong(text):
    try:
        return detect(text)
    except:
        return 'NA'

In [8]:
data = data[data['lyrics'].apply(detectSong) == 'en']

### Keeping only the top 20 most prevalent genres

In [9]:
genre_freq = data['track_genre'].value_counts().reset_index()
genres = genre_freq.iloc[:20, 0]

In [10]:
data = data[data['track_genre'].isin(genres)]

## Text Cleaning

In [12]:
text = data['lyrics']
text

9       \nOld man, look at my life\nI'm a lot like you...
10      \nI read the news today—oh, boy\nAbout a lucky...
12      \nI see a red door and I want it painted black...
13      \nWell, you can tell by the way I use my walk\...
20      \nOh yeah, yeah, yeah\nNow if there's a smile ...
                              ...                        
3850    Christmas Is Here  Lyrics\nChristmas time is h...
3851    \nYeah\nI know what darkness means\nThe isolat...
3860    The World Is Yours To Take  Lyrics\n" ft. Lil ...
3863    Lucky You  Lyrics\nAnd seriously\nHow lucky ar...
3867    Hot 30 Weekend Countdown  Lyrics\n30. Jessie J...
Name: lyrics, Length: 1589, dtype: object

### Lowercase

In [13]:
text_lower = text.str.lower()

### Remove punctuation

In [14]:
from string import punctuation

def remove_punctuation(document):

    no_punct = ''.join([character for character in document if character not in punctuation])
    
    return no_punct

In [15]:
text_no_punct = text_lower.apply(remove_punctuation)
text_no_punct

9       \nold man look at my life\nim a lot like you w...
10      \ni read the news today—oh boy\nabout a lucky ...
12      \ni see a red door and i want it painted black...
13      \nwell you can tell by the way i use my walk\n...
20      \noh yeah yeah yeah\nnow if theres a smile on ...
                              ...                        
3850    christmas is here  lyrics\nchristmas time is h...
3851    \nyeah\ni know what darkness means\nthe isolat...
3860    the world is yours to take  lyrics\n ft lil ba...
3863    lucky you  lyrics\nand seriously\nhow lucky ar...
3867    hot 30 weekend countdown  lyrics\n30 jessie ja...
Name: lyrics, Length: 1589, dtype: object

### Tokenize words

In [16]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [17]:
from nltk.tokenize import word_tokenize

text_tokenized = text_no_punct.apply(word_tokenize)
text_tokenized.head()

9     [old, man, look, at, my, life, im, a, lot, lik...
10    [i, read, the, news, today—oh, boy, about, a, ...
12    [i, see, a, red, door, and, i, want, it, paint...
13    [well, you, can, tell, by, the, way, i, use, m...
20    [oh, yeah, yeah, yeah, now, if, theres, a, smi...
Name: lyrics, dtype: object

### Removing stop-words

In [18]:
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def remove_stopwords(document):
    
    words = [word for word in document if not word in stop_words]
    
    return words

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
text_no_stop = text_tokenized.apply(remove_stopwords)
text_no_stop

9       [old, man, look, life, im, lot, like, old, man...
10      [read, news, today—oh, boy, lucky, man, made, ...
12      [see, red, door, want, painted, black, colours...
13      [well, tell, way, use, walk, im, womans, man, ...
20      [oh, yeah, yeah, yeah, theres, smile, face, tr...
                              ...                        
3850    [christmas, lyrics, christmas, time, happiness...
3851    [yeah, know, darkness, means, isolation, sting...
3860    [world, take, lyrics, ft, lil, baby, welcome, ...
3863    [lucky, lyrics, seriously, lucky, earths, 45, ...
3867    [hot, 30, weekend, countdown, lyrics, 30, jess...
Name: lyrics, Length: 1589, dtype: object

### Lemmatize words

In [20]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [21]:
from nltk.stem import WordNetLemmatizer
  
lemmatizer = WordNetLemmatizer()

def lemm(document):
    
    lemmatized_document = [lemmatizer.lemmatize(word) for word in document]
    
    return lemmatized_document

In [58]:
text_lemmed = text_no_stop.apply(lemm)
text_lemmed

9       [old, man, look, life, im, lot, like, old, man...
10      [read, news, today—oh, boy, lucky, man, made, ...
12      [see, red, door, want, painted, black, colour,...
13      [well, tell, way, use, walk, im, woman, man, t...
20      [oh, yeah, yeah, yeah, there, smile, face, try...
                              ...                        
3850    [christmas, lyric, christmas, time, happiness,...
3851    [yeah, know, darkness, mean, isolation, sting,...
3860    [world, take, lyric, ft, lil, baby, welcome, l...
3863    [lucky, lyric, seriously, lucky, earth, 45, bi...
3867    [hot, 30, weekend, countdown, lyric, 30, jessi...
Name: lyrics, Length: 1589, dtype: object

## Document-Term Matrix

In [59]:
from nltk.tokenize.treebank import TreebankWordDetokenizer

text_detokenized = text_lemmed.apply(TreebankWordDetokenizer().detokenize)
text_detokenized

9       old man look life im lot like old man look lif...
10      read news today—oh boy lucky man made grade th...
12      see red door want painted black colour anymore...
13      well tell way use walk im woman man time talk ...
20      oh yeah yeah yeah there smile face tryin ’ foo...
                              ...                        
3850    christmas lyric christmas time happiness cheer...
3851    yeah know darkness mean isolation sting echo b...
3860    world take lyric ft lil baby welcome life ther...
3863    lucky lyric seriously lucky earth 45 billion y...
3867    hot 30 weekend countdown lyric 30 jessie james...
Name: lyrics, Length: 1589, dtype: object

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

countvec = CountVectorizer()

sparse_dtm = countvec.fit_transform(text_detokenized)
sparse_dtm

<1589x77555 sparse matrix of type '<class 'numpy.int64'>'
	with 436265 stored elements in Compressed Sparse Row format>

In [25]:
countvec2 = CountVectorizer(min_df=0.005)
sparse_dtm2 = countvec2.fit_transform(text_detokenized)

dtm2 = pd.DataFrame(sparse_dtm2.toarray(), columns=countvec2.get_feature_names(), index=data['track_id'])
dtm2.sum().sort_values(ascending=False) 



one         8364
like        7688
know        5304
time        5269
would       4724
            ... 
foresaw        8
forbids        8
disabled       8
noel           8
oklahoma       8
Length: 9939, dtype: int64

## Modeling

In [26]:
from sklearn.model_selection import train_test_split

y = data['track_genre']
X = dtm2

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=88)
X_train.shape, X_test.shape

((1112, 9939), (477, 9939))

In [27]:
print(y_train.value_counts())
print(y_test.value_counts())

world-music          87
soul                 73
sad                  70
rock                 68
honky-tonk           64
power-pop            63
rockabilly           59
metalcore            59
bluegrass            56
grindcore            55
synth-pop            52
psych-rock           50
black-metal          48
songwriter           46
progressive-house    45
industrial           44
garage               44
show-tunes           43
metal                43
country              43
Name: track_genre, dtype: int64
world-music          37
soul                 31
sad                  30
rock                 29
power-pop            27
honky-tonk           27
metalcore            25
rockabilly           25
bluegrass            24
grindcore            23
synth-pop            22
psych-rock           22
black-metal          20
songwriter           20
progressive-house    20
show-tunes           19
metal                19
industrial           19
country              19
garage               19
Name: tr

In [33]:
# Baseline accuracy
print('ACC', 48/(len(y_test)))

ACC 0.10062893081761007


### Decision Tree

In [44]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier(random_state=88)
dtc.fit(X_train, y_train)

In [50]:
from sklearn.metrics import accuracy_score

y_pred = dtc.predict(X_test)
print ("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.15723270440251572


### Decision Tree with CV

In [56]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

grid_values = {'ccp_alpha': np.linspace(0, 0.1, 101)}

dtc = DecisionTreeClassifier(random_state=88)
dtc_cv = GridSearchCV(dtc, param_grid=grid_values, cv=10).fit(X_train, y_train)

In [57]:
y_pred = dtc_cv.predict(X_test)
print ("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.17819706498951782


### Random Forest

In [47]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=88)
rf.fit(X_train, y_train)

In [49]:
y_pred = rf.predict(X_test)
print ("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.2578616352201258


### Gradient Boosting

In [54]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)

In [55]:
y_pred = gbc.predict(X_test)
print ("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.18448637316561844


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=a34ba4df-bfd5-4f60-9d20-5960e78e84da' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>