In [1]:
import os
import re
import pandas as pd
import numpy as np
import torch

from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline, AutoTokenizer
from collections import defaultdict

In [2]:
pipe = pipeline("text-classification", model="SamLowe/roberta-base-go_emotions", top_k=4)
tokenizer = AutoTokenizer.from_pretrained("SamLowe/roberta-base-go_emotions")
path = os.path.join(os.getcwd(), "song_lyrics.csv")

# read 5000 and drop rap
df = pd.read_csv(path, nrows=5000)
df = df[df['tag']!='rap']

In [3]:
# clean non-alphanumeric characters and limit token length
def preprocess(text, max_tokens=512):

    # remove brackets [] or ()
    pattern = r"\[.+?\]"
    text = re.sub(pattern, '', text)
    pattern = r"\(.+?\)"
    text = re.sub(pattern, '', text)

    # Replace newline characters with full-stop
    text = re.sub(r'\n+', '.', text)

    
    tokens = tokenizer.tokenize(text)
    max_tokens -= 2 # for the start/end tokens
    limited_tokens = tokens[:max_tokens]
    assert len(limited_tokens) <= 512
    
    cleaned_text = tokenizer.convert_tokens_to_string(limited_tokens)
    
    return cleaned_text

# 
df['cleaned_lyrics'] = df['lyrics'].apply(preprocess)

# reset index
df = df.reset_index(drop=True)

Token indices sequence length is longer than the specified maximum sequence length for this model (865 > 512). Running this sequence through the model will result in indexing errors


In [4]:
# process emotions with pipeline
df['emotions'] = df['cleaned_lyrics'].apply(pipe)
# output is [[list of list]] -> remove outer layer
df['emotions'] = df['emotions'].apply(lambda x: x[0])


In [5]:
# remove 'neutral' if in top 3,
# otherwise, only keep the top 3

def keep_three_emotions(emotions):
    for emotion in emotions:
        if emotion['label'] == 'neutral':
            emotions.remove(emotion)
    if len(emotions) > 3:
        emotions.pop()
    return emotions

df['emotions'] = df['emotions'].apply(keep_three_emotions)

In [6]:
# cosine similarity
possible_emotions = ['admiration', 'amusement', 'anger', 'annoyance', 'approval',
                     'caring', 'confusion', 'curiosity', 'desire', 'disappointment',
                     'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear',
                     'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism',
                     'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise']

def vectorize_emotions(emotions):
    vector = np.zeros(27)
    for emotion in emotions:
        label = emotion['label']
        score = emotion['score']
        index = possible_emotions.index(label)
        vector[index] = score
    return vector

test = [{'label': 'sadness', 'score': 0.7652305364608765},
        {'label': 'disappointment', 'score': 0.13722138106822968},
        {'label': 'admiration', 'score': 0.05047781765460968}]


df['emotion_vector'] = df['emotions'].apply(vectorize_emotions)


In [7]:
def calculate_cosine_similarity(vector1, vector2):
    # (27, ) -> (1, 27)
    vector1 = vector1.reshape(1, -1)
    vector2 = vector2.reshape(1, -1)

    similarity = cosine_similarity(vector1, vector2)
    return similarity[0][0]

In [17]:
df.iloc[80:90]

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language,cleaned_lyrics,emotions,emotion_vector
80,7 Days DJ Premier Remix,rb,Craig David,2001,30459,"{""Yasiin Bey""}",[Intro: Craig David]\nOh no! Look at who they ...,1460,en,en,en,.Oh no! Look at who they let in the back door....,"[{'label': 'love', 'score': 0.3754351735115051...","[0.0, 0.0, 0.0, 0.0, 0.15897437930107117, 0.0,..."
81,Buffalo Bills,misc,E. E. Cummings,2010,9142,{},Buffalo Bill's\ndefunct\n who used to\n ...,1462,en,en,en,Buffalo Bill's.defunct. who used to. ...,"[{'label': 'curiosity', 'score': 0.69202506542...","[0.5669746994972229, 0.0, 0.0, 0.0, 0.04568846..."
82,Song 2,rock,Blur,1997,289635,{},[Intro]\nWoo-hoo\nWoo-hoo\nWoo-hoo\nWoo-hoo\n\...,1463,en,en,en,.Woo-hoo.Woo-hoo.Woo-hoo.Woo-hoo.I got my head...,"[{'label': 'joy', 'score': 0.689399242401123},...","[0.0, 0.0, 0.0, 0.0, 0.13548609614372253, 0.0,..."
83,Ego-Tripping there may be a reason,misc,Nikki Giovanni,2010,149574,{},I was born in the Congo\nI walked to the Ferti...,1468,en,en,en,I was born in the Congo.I walked to the Fertil...,"[{'label': 'admiration', 'score': 0.7476054430...","[0.7476054430007935, 0.0, 0.0, 0.0, 0.09654973..."
84,Voodoo Child Slight Return,rock,The Jimi Hendrix Experience,1968,131105,{},"[Instrumental Intro]\n\n[Verse 1]\nWell, I sta...",1469,en,en,en,".Well, I stand up next to a mountain.And I cho...","[{'label': 'amusement', 'score': 0.65913641452...","[0.0, 0.6591364145278931, 0.0, 0.0404175408184..."
85,Sweet Child O’ Mine,rock,Guns N' Roses,1987,715596,"{""Guns N\\' Roses""}",[Instrumental Intro/Guitar Riff]\n\n[Verse 1]\...,1470,en,en,en,.She's got a smile that it seems to me.Reminds...,"[{'label': 'love', 'score': 0.894432783126831}...","[0.2166137397289276, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
86,Sams Town,rock,The Killers,2006,50008,{},[Verse 1]\nNobody ever had a dream 'round here...,1480,en,en,en,.Nobody ever had a dream 'round here.But I don...,"[{'label': 'fear', 'score': 0.2615561783313751...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.15734085..."
87,Wild Horses,rock,The Rolling Stones,1971,229150,{},[Verse 1]\nChildhood living is easy to do\nThe...,1483,en,en,en,.Childhood living is easy to do.The things you...,"[{'label': 'sadness', 'score': 0.3253589272499...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
88,The Road Not Taken,misc,Robert Frost,1916,122918,{},Two roads diverged in a yellow wood\nAnd sorry...,1561,en,en,en,Two roads diverged in a yellow wood.And sorry ...,"[{'label': 'remorse', 'score': 0.5971220135688...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1348208338022..."
89,The Bad Touch,rock,Bloodhound Gang,1999,324007,{},"[Intro]\nHa-Ha! Well now, we call this the act...",1488,en,en,en,".Ha-Ha! Well now, we call this the act of mati...","[{'label': 'approval', 'score': 0.136673420667...","[0.0, 0.0, 0.0, 0.014483623206615448, 0.136673..."


In [21]:
def get_similar_songs(index, top_k=3):
    # get target vector
    input_vector = df.iloc[index]['emotion_vector']
    
    # exclude the target song
    vector_series = df['emotion_vector'].drop(index=index)
    
    result = vector_series.apply(lambda x: calculate_cosine_similarity(input_vector, x))
    ranked_result = result.sort_values(ascending=False)
    
    return ranked_result[:top_k]

similar_to_3 = get_similar_songs(3, top_k=5)
similar_to_3

111    0.996976
236    0.996200
362    0.993859
61     0.991528
11     0.989845
Name: emotion_vector, dtype: float64

In [22]:
print(df.iloc[3]['lyrics'])
print(df.iloc[3]['emotions'])


[Intro: Jay-Z] + (Aaliyah)
Sup Baby Girl (Ohhh)
Thought I had to talk to you again
Been a long time, missing you (Hey, ey, ey, ey)
Tim is missing you, Missy's missing you
Rashad is missing you, your Mom is missing you (N-n-n-n-no no no)
Your Pop is missing you, Dame is missing you
Damn, we missing you ...

[Verse 1: Jay-Z] + (Aaliyah)
All we listen to is all the different yous
Four page letters in addition to
Have you ever loved somebody who used to get the party poppin'
We used to party-hop, we used to be in the Hamptons, party a lot
We was The Breakfast Club, you was a part of the Roc
We used to make up special names for the food we ate
Remember cereal pie, one of your favorite plates? (I miss you)
Well, Dame told me tell you, he's doin' well
Due to the circumstances, it could've been Bellevue
But I ain't got to tell you, you looking over us
Our little angel, but you know what
[Hook: Jay-Z] + (Aaliyah)
Brooklyn's missing you, Detroit is missing you
New Orleans missing you, Philly's m

In [27]:
print(df.iloc[111]['lyrics'])
print(df.iloc[111]['emotions'])

[Verse 1]
Your day breaks, your mind aches
You find that all her words of kindness linger on
When she no longer needs you

[Verse 2]
She wakes up, she makes up
She takes her time and doesn't feel she has to hurry
She no longer needs you

[Bridge]
And in her eyes, you see nothing
No sign of love behind the tears
Cried for no one
A love that should have lasted years

[Verse 3]
You want her, you need her
And yet you don't believe her when she says her love is dead
You think she needs you
[French Horn Solo: Alan Civil]

[Bridge]
And in her eyes you see nothing
No sign of love behind the tears
Cried for no one
A love that should have lasted years

[Verse 4]
You stay home, she goes out
She says that long ago she knew someone
But now he's gone, she doesn't need him

[Verse 5]
Your day breaks, your mind aches
There will be times when all the things she said will fill your head
You won't forget her

[Bridge]
And in her eyes you see nothing
No sign of love behind the tears
Cried for no one
A lov

In [26]:
df.iloc[111]

title                                                    For No One
tag                                                             pop
artist                                                  The Beatles
year                                                           1966
views                                                        103896
features                                                         {}
lyrics            [Verse 1]\nYour day breaks, your mind aches\nY...
id                                                             1576
language_cld3                                                    en
language_ft                                                      en
language                                                         en
cleaned_lyrics    .Your day breaks, your mind aches.You find tha...
emotions          [{'label': 'sadness', 'score': 0.7760543227195...
emotion_vector    [0.0, 0.0, 0.0, 0.022029567509889603, 0.0, 0.0...
Name: 111, dtype: object

In [18]:
def get_similar_songs(index, top_k=3):
    # get target vector
    input_vector = df.iloc[index]['emotion_vector']
    
    # exclude the target song
    vector_series = df['emotion_vector'].drop(index=index)
    
    result = vector_series.apply(lambda x: calculate_cosine_similarity(input_vector, x))
    ranked_result = result.sort_values(ascending=False)
    
    return ranked_result[:top_k]

similar_to_3 = get_similar_songs(85, top_k=5)
similar_to_3

123    0.981211
275    0.980233
335    0.977343
207    0.975215
344    0.973209
Name: emotion_vector, dtype: float64

In [19]:
print(df.iloc[85]['lyrics'])
print(df.iloc[85]['emotions'])

[Instrumental Intro/Guitar Riff]

[Verse 1]
She's got a smile that it seems to me
Reminds me of childhood memories
Where everything was as fresh as the bright blue sky (Sky)
Now and then when I see her face
She takes me away to that special place
And if I stared too long I'd probably break down and cry

[Chorus]
Woah-oh-oh! Sweet child o' mine
Woah, oh-oh-oh! Sweet love of mine

[Post-Chorus Instrumental Break]

[Verse 2]
She's got eyes of the bluest skies
As if they thought of rain
I hate to look into those eyes and see an ounce of pain
Her hair reminds me of a warm, safe place
Where as a child I'd hide
And pray for the thunder and the rain to quietly pass me by
[Chorus]
Woah-oh-oh! Sweet child o' mine
Ooh, oh-oh-oh! Sweet love of mine

[Post-Chorus Instrumental Break]

[Chorus]
Oh yeah! Woah-oh-oh-oh! Sweet child o' mine
Ooh-oh, oh, oh! Sweet love of mine
Woah, oh-oh-oh! Sweet child o' mine, ooh yeah
Ooh! Sweet love of mine

[Guitar Solo]

[Outro]
Where do we go?
Where do we go now?


In [23]:
print(df.iloc[335]['lyrics'])
print(df.iloc[335]['emotions'])

[Intro]
I need you, boo
I gotta see you, boo
And the hearts all over the world tonight
Said the hearts all over the world tonight
And I need you, boo (Oh)
I gotta see you, boo (Hey)
And the hearts all over the world tonight
Said the hearts all over the world tonight (Uh, uh)

[Verse 1]
Hey, little mama, ooh, you're a stunner
Hot little figure, yes, you're a winner
And I'm so glad to be yours
You're a class all your own
And ooh, little cutie, when you talk to me
I swear the whole world stops, you're my sweetheart
And I'm so glad that you're mine
You are one of a kind and

[Pre-Chorus]
You mean to me what I mean to you
And together, baby, there is nothing we won't do
'Cause if I got you, I don't need money, I don't need cars
Girl, you're my all
[Chorus]
And oh, I'm into you
And girl, no one else would do
'Cause with every kiss and every hug
You make me fall in love
And now I know I can't be the only one
I bet there's hearts all over the world tonight
With the love of their life who feels

In [28]:
df.iloc[335]

title                                                      With You
tag                                                             pop
artist                                                  Chris Brown
year                                                           2007
views                                                        308054
features                                                         {}
lyrics            [Intro]\nI need you, boo\nI gotta see you, boo...
id                                                             4423
language_cld3                                                    en
language_ft                                                      en
language                                                         en
cleaned_lyrics    .I need you, boo.I gotta see you, boo.And the ...
emotions          [{'label': 'love', 'score': 0.8472604751586914...
emotion_vector    [0.2265433818101883, 0.0, 0.0, 0.0, 0.14656439...
Name: 335, dtype: object