In [1]:
import pandas as pd
import numpy as np

In [24]:
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
from sklearn.model_selection import cross_val_score

## 1. Preparing data for feature engineering


In [3]:
dualipa_df = pd.read_csv("dualipa.csv", index_col=0)
dualipa_df.head()

Unnamed: 0,song_title,lyrics
0,un día remix,you know that sometimes i think about us now a...
1,future nostalgia,future you want a timeless song i wanna change...
2,that kind of woman,one look was enough enough for me the whole ro...
3,if it aint me,i bet were higher than the people on cloud 9 t...
4,levitating remix,woohoo if you wanna run away with me i know a ...


In [4]:
edsheeran_df = pd.read_csv("edsheeran.csv", index_col=0)
edsheeran_df.head()

Unnamed: 0,song_title,lyrics
0,eraser,i was born inside a small town i lost that sta...
1,bibia be ye ye,bibia be ye ye i lost my shoes last night i do...
2,save myself,i gave all my oxygen to people that could brea...
3,nancy mulligan,i was twenty four years old when i met the wom...
4,perfect duet,i found a love for me oh darling just dive rig...


In [5]:
print(dualipa_df.shape, edsheeran_df.shape)

(60, 2) (88, 2)


In [6]:
# drop remix songs as the lyrics are more or less the same as the original

print(dualipa_df[dualipa_df['song_title'].str.contains("remix")])
print(edsheeran_df[edsheeran_df['song_title'].str.contains("remix")])

                          song_title  \
0                       un día remix   
4                   levitating remix   
16                   new rules remix   
17          that kind of woman remix   
18              break my heart remix   
19            kiss and make up remix   
20           hallucinate remix remix   
21                    physical remix   
22            love is religion remix   
23                 hallucinate remix   
24  break my heart cosmic girl remix   
25                  love again remix   
26             don t start now remix   
27           boys will be boys remix   
28               pretty please remix   
29            future nostalgia remix   
30                        cool remix   
31                 good in bed remix   
50              blow your mind remix   
56              dont start now remix   

                                               lyrics  
0   you know that sometimes i think about us now a...  
4   woohoo if you wanna run away with me i know

In [7]:
# dropping remixes
dualipa_remix = dualipa_df[dualipa_df['song_title'].str.contains("remix")]
dualipa_df.drop(dualipa_remix.index, inplace=True)
print(dualipa_df.shape)

edsheeran_remix = edsheeran_df[edsheeran_df['song_title'].str.contains("remix")]
edsheeran_df.drop(edsheeran_remix.index, inplace=True)
print(edsheeran_df.shape)

(40, 2)
(79, 2)


In [8]:
# Adding an artist label and combining both dataframes and shuffling

dualipa_df['label'] = 'dua_lipa'
edsheeran_df['label'] = 'ed_sheeran'

data = pd.concat([dualipa_df, edsheeran_df], ignore_index=True, axis=0)

data = data.sample(frac=1).reset_index(drop=True)

data.head()

Unnamed: 0,song_title,lyrics,label
0,dont,i met this girl late last year she said dont y...,ed_sheeran
1,cool,guess i never had a love like this hit me hard...,dua_lipa
2,1000 nights,oh i been on for a thousand nights new york to...,ed_sheeran
3,even my dad does sometimes,its alright to cry even my dad does sometimes ...,ed_sheeran
4,i dont want your money,ayy i waited for you all day i been away on th...,ed_sheeran


## 2. Adding text features

In [9]:
# adding simple features such as character count, word count and average word length

#adding character count
data['char_count'] = data['lyrics'].apply(len)

#adding word count
def word_counter(text):
    words = text.split()
    return (len(words))

data['word_count'] = data['lyrics'].apply(word_counter)

#adding average word length
def avg_word_len(text):
    words = text.split()
    words_lens = [len(word) for word in words]
    return (np.mean(words_lens))

data['avg_word_len'] = data['lyrics'].apply(avg_word_len)

In [10]:
data.head()

Unnamed: 0,song_title,lyrics,label,char_count,word_count,avg_word_len
0,dont,i met this girl late last year she said dont y...,ed_sheeran,2786,594,3.691919
1,cool,guess i never had a love like this hit me hard...,dua_lipa,1862,414,3.5
2,1000 nights,oh i been on for a thousand nights new york to...,ed_sheeran,3092,649,3.765794
3,even my dad does sometimes,its alright to cry even my dad does sometimes ...,ed_sheeran,747,159,3.704403
4,i dont want your money,ayy i waited for you all day i been away on th...,ed_sheeran,2033,449,3.530067


In [11]:
# tokenizing and lemmatizing text 
# Load the en_core_web_sm model
nlp = spacy.load('en_core_web_sm')

example = data.loc[data['song_title']=='physical', 'lyrics'].values[0]

# Create a Doc object
doc = nlp(example)

# Generate the tokens
tokens = [token.text for token in doc]
print(tokens)

# Generate tokens and pos tags
pos = [(token.text, token.pos_) for token in doc]
print(pos)

['common', 'love', 'is', 'nt', 'for', 'us', 'we', 'created', 'something', 'phenomenal', 'do', 'nt', 'you', 'agree', 'do', 'nt', 'you', 'agree', 'you', 'got', 'me', 'feeling', 'diamond', 'rich', 'nothing', 'on', 'this', 'planet', 'compares', 'to', 'it', 'do', 'nt', 'you', 'agree', 'dontyouagree', 'who', 'needs', 'togo', 'to', 'sleep', 'when', 'i', 'gotyou', 'next', 'to', 'me', 'all', 'night', 'ill', 'riot', 'with', 'you', 'i', 'know', 'you', 'got', 'my', 'back', 'and', 'you', 'know', 'i', 'got', 'you', 'so', 'come', 'on', 'come', 'on', 'come', 'on', 'lets', 'get', 'physical', 'lights', 'out', 'follow', 'the', 'noise', 'baby', 'keep', 'on', 'dancing', 'like', 'you', 'ai', 'nt', 'got', 'a', 'choice', 'so', 'come', 'on', 'come', 'on', 'come', 'on', 'lets', 'get', 'physical', 'adrenaline', 'keeps', 'on', 'rushing', 'in', 'love', 'the', 'simulation', 'were', 'dreaming', 'in', 'do', 'nt', 'you', 'agree', 'do', 'nt', 'you', 'agree', 'i', 'do', 'nt', 'wanna', 'live', 'another', 'life', 'cause',

In [12]:
# creating functions to return the numbers of different Part of Speech tags

def verb_counter(text, model=nlp):
    doc = model(text)
    pos = [token.pos_ for token in doc]
    return pos.count('VERB')

def noun_counter(text, model=nlp):
    doc = model(text)
    pos = [token.pos_ for token in doc]
    return pos.count('NOUN')

def adj_counter(text, model=nlp):
    doc = model(text)
    pos = [token.pos_ for token in doc]
    return pos.count('ADJ')

def pronoun_counter(text, model=nlp):
    doc = model(text)
    pos = [token.pos_ for token in doc]
    return pos.count('PRON')

data['verb_count'] = data['lyrics'].apply(verb_counter)
data['noun_count'] = data['lyrics'].apply(noun_counter)
data['adj_count'] = data['lyrics'].apply(adj_counter)
data['pronoun_count'] = data['lyrics'].apply(pronoun_counter)

data.head()

Unnamed: 0,song_title,lyrics,label,char_count,word_count,avg_word_len,verb_count,noun_count,adj_count,pronoun_count
0,dont,i met this girl late last year she said dont y...,ed_sheeran,2786,594,3.691919,107,96,18,116
1,cool,guess i never had a love like this hit me hard...,dua_lipa,1862,414,3.5,88,55,33,117
2,1000 nights,oh i been on for a thousand nights new york to...,ed_sheeran,3092,649,3.765794,91,147,40,90
3,even my dad does sometimes,its alright to cry even my dad does sometimes ...,ed_sheeran,747,159,3.704403,32,22,6,26
4,i dont want your money,ayy i waited for you all day i been away on th...,ed_sheeran,2033,449,3.530067,87,85,20,111


In [13]:
lemmas = [token.lemma_ for token in doc]
print(print(' '.join(lemmas)))

common love be nt for we we create something phenomenal do nt you agree do nt you agree you get I feel diamond rich nothing on this planet compare to it do nt you agree dontyouagree who need togo to sleep when I gotyou next to I all night ill riot with you I know you get my back and you know I get you so come on come on come on let get physical light out follow the noise baby keep on dance like you ai nt get a choice so come on come on come on let get physical adrenaline keep on rush in love the simulation be dream in do nt you agree do nt you agree I do nt wanna live another life cause this one pretty nice live it up who need to go to sleep when I get you next to I all night ill riot with you I know you get my back and you know I get you so come on come on come on let get physical light out follow the noise baby keep on dance like you ai nt get a choice so come on come on come on let get physical hold on just a little tight come on hold on tell I if you re ready come on baby keep on d

In [14]:
stopwords = spacy.lang.en.stop_words.STOP_WORDS

stopwords.update(['ill', 'nt', 're', 'ai', 've'])

clean_lemmas = [lemma for lemma in lemmas if lemma.isalpha() and lemma not in stopwords]
print(print(' '.join(clean_lemmas)))

common love create phenomenal agree agree I feel diamond rich planet compare agree dontyouagree need togo sleep I gotyou I night riot I know know I come come come let physical light follow noise baby dance like choice come come come let physical adrenaline rush love simulation dream agree agree I wanna live life cause pretty nice live need sleep I I night riot I know know I come come come let physical light follow noise baby dance like choice come come come let physical hold little tight come hold tell I ready come baby dance let physical hold little tight come hold tell I ready come baby dance let physical night riot I know know I come come come let physical light follow noise baby dance like choice come come come let physical let physical physical let physical come phy phy phy physical
None


In [15]:
# applying lemmatization on the whole data

def preprocess(text):
    # Create doc object
    doc = nlp(text)
    # Generate lemmas
    lemmas = [token.lemma_ for token in doc]
    # Remove stopwords and non-alphabetic characters
    clean_lemmas = [lemma for lemma in lemmas 
                    if lemma.isalpha() and lemma not in stopwords]
    
    return ' '.join(clean_lemmas)
  

data['lemm_lyrics'] = data['lyrics'].apply(preprocess)

data.head()

Unnamed: 0,song_title,lyrics,label,char_count,word_count,avg_word_len,verb_count,noun_count,adj_count,pronoun_count,lemm_lyrics
0,dont,i met this girl late last year she said dont y...,ed_sheeran,2786,594,3.691919,107,96,18,116,I meet girl late year worry I disappear I tell...
1,cool,guess i never had a love like this hit me hard...,dua_lipa,1862,414,3.5,88,55,33,117,guess I love like hit I hard I expect goddamn ...
2,1000 nights,oh i been on for a thousand nights new york to...,ed_sheeran,3092,649,3.765794,91,147,40,90,oh I thousand night new york london different ...
3,even my dad does sometimes,its alright to cry even my dad does sometimes ...,ed_sheeran,747,159,3.704403,32,22,6,26,alright cry dad wipe eye tear remind alive alr...
4,i dont want your money,ayy i waited for you all day i been away on th...,ed_sheeran,2033,449,3.530067,87,85,20,111,ayy I wait day I away road little today I m he...


## 3. Bag of Words

In [35]:
# split data to training and testing

X = data.drop(['label', 'lyrics'], axis=1)

y = data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.3, stratify=y)


In [36]:
vectorizer = CountVectorizer(stop_words='english')

# Fit and transform X_train
X_train_bow = vectorizer.fit_transform(X_train['lyrics'])

# Transform X_test
X_test_bow = vectorizer.transform(X_test['lyrics'])

# Print shape of X_train_bow and X_test_bow
print(X_train_bow.shape)
print(X_test_bow.shape)

(83, 2864)
(36, 2864)


In [37]:
# build a naiive bias model using only BoW vectorization of lemmatized lyrics
clf = MultinomialNB()

# Fit the classifier
clf.fit(X_train_bow, y_train)

# Measure the accuracy
accuracy = clf.score(X_test_bow, y_test)
print("The accuracy of the classifier on the test set is %.3f" % accuracy)

The accuracy of the classifier on the test set is 0.694


In [38]:
# we can do this also using pipeline and include the other features

preprocessor = ColumnTransformer(transformers=[
        ('text', CountVectorizer(stop_words='english'), 'lyrics')])
                                 
pipeline = Pipeline([
     ('preprocessor', preprocessor),
     ('clf', MultinomialNB())])

pipeline.fit(X_train, y_train)                               

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('text',
                                                  CountVectorizer(stop_words='english'),
                                                  'lyrics')])),
                ('clf', MultinomialNB())])

In [39]:
pipeline.score(X_test, y_test)

0.6944444444444444

In [40]:
y_pred = pipeline.predict(X_test)

print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

    dua_lipa       0.55      0.50      0.52        12
  ed_sheeran       0.76      0.79      0.78        24

    accuracy                           0.69        36
   macro avg       0.65      0.65      0.65        36
weighted avg       0.69      0.69      0.69        36

0.6944444444444444


In [41]:
print(confusion_matrix(y_test, y_pred))

[[ 6  6]
 [ 5 19]]


In [42]:
scores = cross_val_score(pipeline, X, y, cv=5, n_jobs=-1)
scores.mean()

0.7826086956521741

In [43]:
# we can also try with tfidf vectorizer instead of count vectorizer
preprocessor = ColumnTransformer(transformers=[
        ('text', TfidfVectorizer(stop_words='english'), 'lyrics')])
                                 
pipeline = Pipeline([
     ('preprocessor', preprocessor),
     ('clf', MultinomialNB())])

scores = cross_val_score(pipeline, X, y, cv=5, n_jobs=-1)
scores.mean()    

0.6637681159420289