In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from wordcloud import WordCloud
from collections import Counter

import sklearn
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from sklearn.decomposition import PCA

# **Emotions 🙂**

In this notebook, our goal is to assign emotions to the tracks present in our dataset. 

### **1.** Clean datasets

First, we retrieve the `tracks` dataset which has been preprocessed previously. Then, we load the dataset `NRC-Emotion-Lexicon-Wordlevel` which match a list of words to some emotions (fear, sadness, joy, ...). 

In [3]:
path = "data/tracks.csv"

# ---- Retrieve Tracks Dataset ----
tracks_df = pd.read_csv(path)
tracks_df.drop_duplicates(subset='track_id', inplace=True)

# Clean tags
tracks_df['tags'] = tracks_df.tags.apply(lambda x : np.array(x.replace('[', '')
                                             .replace(']', '')
                                             .replace('\'', '')
                                             .replace(',', '')
                                             .split()))

# Add an id to each track 
tracks_df['id'] = [i for i in range(len(tracks_df))]

tracks_init = tracks_df.copy()

# Explode on the list of tags to get only one tag per row 
tracks_df = tracks_df.explode('tags')
tracks_df

Unnamed: 0,track,artist,tags,arousal,dominance,track_id,genre,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,id
0,'Till I Collapse,Eminem,aggressive,5.273125,5.690625,4xkOaSrkexMciUUogZKVTS,rap,0.06220,0.548,0.847,0.0000,1,0.0816,-3.237,1,0.1860,171.447,4,0.1000,0
1,St. Anger,Metallica,aggressive,5.833000,5.427250,3fOc9x06lKJBhz435mInlH,metal,0.00131,0.249,0.949,0.0228,2,0.0953,-2.642,0,0.0678,185.252,4,0.4980,1
2,Speedin',Rick Ross,aggressive,5.870000,5.490000,3Y96xd4Ce0J47dcalLrEC8,rap,0.10900,0.668,0.787,0.0000,1,0.2100,-4.226,1,0.0429,100.059,4,0.4780,2
3,Bamboo Banga,M.I.A.,aggressive,5.537214,5.691357,6tqFC1DIOphJkCwrjVzPmg,hip-hop,0.04930,0.805,0.918,0.0000,9,0.0691,-4.554,1,0.2120,125.984,4,0.7130,3
3,Bamboo Banga,M.I.A.,fun,5.537214,5.691357,6tqFC1DIOphJkCwrjVzPmg,hip-hop,0.04930,0.805,0.918,0.0000,9,0.0691,-4.554,1,0.2120,125.984,4,0.7130,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61585,Secret,Quietdrive,transparent,3.450000,5.330000,2bRIsZ92JRKlvQOZlyR9CO,,0.03340,0.396,0.915,0.0000,0,0.1350,-5.126,0,0.1560,167.996,4,0.0734,60456
61586,The Last of the Rest Was the End,Medications,transparent,3.450000,5.330000,7o3Np7cho9cBCrNDokxzYC,,0.00433,0.244,0.866,0.0722,2,0.0782,-5.774,0,0.0674,144.844,3,0.3400,60457
61587,Lovechild,Daniel Lanois,transparent,4.405000,5.625000,4fVObxldDzxxRD6a5Eth9s,indie,0.90100,0.236,0.107,0.7480,10,0.1120,-20.091,0,0.0344,79.476,4,0.0720,60458
61588,Last Inhale,Tapage,transparent,3.341667,4.466667,5WxwRwUQ4R4L46VEm3213y,ambient,0.07180,0.542,0.909,0.8170,2,0.1250,-8.977,0,0.1230,160.011,4,0.0552,60459


In [4]:
path_nrc = "data/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"

#Add library NRC 
nrc_rows = []
with open(path_nrc, 'r') as f:
    for line in f:
        nrc_rows.append(line.split())

#Transform to DataFrame
df_nrc = pd.DataFrame(nrc_rows, columns = ['name', 'emotion', 'yes'])

#Retrieve names
names_nrc = df_nrc['name'].unique()

df_nrc

Unnamed: 0,name,emotion,yes
0,aback,anger,0
1,aback,anticipation,0
2,aback,disgust,0
3,aback,fear,0
4,aback,joy,0
...,...,...,...
141535,zoom,negative,0
141536,zoom,positive,0
141537,zoom,sadness,0
141538,zoom,surprise,0


Then, we filter tags to only keep the ones which appear more than 50 times in all the `tracks` dataset. 

In [5]:
#Retrieve tags as a list 
tags = tracks_df['tags'].values.tolist()

# Use Counter to count the frequency of each word
word_freq = dict(Counter(tags))

# Create a DataFrame from the dictionary
word_freq_df = pd.DataFrame.from_dict(word_freq, orient='index', columns=['count'])

#Only keep tags which appear more than 50 times
word_freq_df = word_freq_df.loc[word_freq_df['count']>=50]
word_freq_df

Unnamed: 0,count
aggressive,820
fun,917
sexy,908
energetic,873
angry,869
...,...
philosophical,170
rustic,100
pastoral,74
clinical,52


Finally, we keep only the tags which appear in the two datasets. 

In [6]:
#Retrieve list of common tags
common_tags = list(set(df_nrc['name'].tolist()) & set(word_freq_df.index.tolist()))

#Filter NRC dataset
df_nrc = df_nrc.loc[df_nrc['name'].isin(common_tags)]

#Filter tracks dataset
tracks_df = tracks_df[['id', 'tags']].loc[tracks_df['tags'].isin(common_tags)]

We merge the two dataframes. 

In [7]:
df_nrc = pd.pivot_table(data = df_nrc, index=["name"], columns= ["emotion"], values='yes')

df = tracks_df.merge(df_nrc, how='left', left_on='tags', right_on='name')

#Only keep columns we are interested in 
df = df[['id', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'negative',
       'positive', 'sadness', 'surprise', 'trust']]
df

Unnamed: 0,id,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust
0,0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,3,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,3,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
56193,60335,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
56194,60336,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
56195,60337,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
56196,60338,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


## **2.** Assign one emotion to each of the track

In this part, our goal is to assign one emotion to each track based on its tags. 

In [8]:
#First, we group by the name of the track and sum over all the "emotion" vectors 
#assigned to that track to get one vector per track. 
df = df.groupby('id').sum().astype(int).reset_index()
df

Unnamed: 0,id,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust
0,0,1,0,0,1,0,1,0,0,0,0
1,1,1,0,0,1,0,1,0,0,0,0
2,2,1,0,0,1,0,1,0,0,0,0
3,3,1,1,0,1,1,1,2,0,0,0
4,4,1,0,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
44515,60335,0,0,0,0,0,0,1,1,0,0
44516,60336,0,0,0,0,0,0,1,1,0,0
44517,60337,0,0,0,0,0,0,1,1,0,0
44518,60338,0,0,0,0,0,0,1,1,0,0


In [9]:
#Create a key which representing the values of the "emotion" vector associated to the track
df["key"] = df.apply(lambda x : str(x['joy']) + '.' + str(x['trust']) + '.' +
                                str(x['fear']) + '.' + str(x['surprise'])+ '.' +
                                str(x['sadness']) + '.' + str(x['disgust']) + '.' +
                                str(x['anger']) + '.' + str(x['anticipation']), axis=1)

#Groupby that specific key, so we get all tracks which to the same "emotion" vector
df = df.groupby('key').agg({'id': lambda x: list(x)})

df

Unnamed: 0_level_0,id
key,Unnamed: 1_level_1
0.0.0.0.0.0.0.0,"[1639, 1653, 1656, 1670, 1671, 1674, 1676, 167..."
0.0.0.0.0.0.1.0,"[1752, 1753, 1755, 1756, 1757, 1759, 1760, 176..."
0.0.0.0.0.0.2.0,"[1801, 1804, 1810]"
0.0.0.0.0.1.0.0,"[4099, 4889, 4955, 5088, 5182, 8907, 11562, 11..."
0.0.0.0.0.1.1.0,"[820, 821, 822, 823, 824, 825, 826, 828, 829, ..."
...,...
6.4.4.3.2.3.3.4,[395]
6.5.1.4.0.1.1.3,"[3441, 3465]"
7.6.2.4.0.1.1.4,"[3312, 3370]"
7.6.2.4.2.1.1.4,[3220]


We have 504 different groups/emotions, which is way too much for our project. Thus, we will simplify the keys and merge some groups together. 

First, we retrieve the index of the emotion which is the strongest (with the maximum count) for each key. We then set the values associated with these index to 1, and the rest to 0. 

For instance : 
- [1,2,1,0] --> [0,1,0,0] 
- [1,1,0,0] --> [1,1,0,0].

In [10]:
#Retrieve original keys
keys = df.index.values

#Create new keys
new_keys = []
for k in keys : 
  #Create an array
  vals = np.array(k.split('.')).astype(int)
  #Retrieve the index of max values
  index_max = np.where(vals==np.max(vals))

  #Add one if max value, else 0
  new = np.zeros((len(vals)), dtype=int)
  for i in index_max : 
    new[i] = 1
  
  #Transform to string
  new = ''.join(map(str, new))
  new_keys.append(new)

#Transform old keys to new keys
df['new_key'] = new_keys

#Reset index
df.index = range(len(df))
df

Unnamed: 0,id,new_key
0,"[1639, 1653, 1656, 1670, 1671, 1674, 1676, 167...",11111111
1,"[1752, 1753, 1755, 1756, 1757, 1759, 1760, 176...",00000010
2,"[1801, 1804, 1810]",00000010
3,"[4099, 4889, 4955, 5088, 5182, 8907, 11562, 11...",00000100
4,"[820, 821, 822, 823, 824, 825, 826, 828, 829, ...",00000110
...,...,...
499,[395],10000000
500,"[3441, 3465]",10000000
501,"[3312, 3370]",10000000
502,[3220],10000000


Finally, we group by the new keys to get a smaller number of groups. 

In [11]:
final = df.groupby('new_key').agg({'id': lambda x: sum(x, [])}).reset_index()
final

Unnamed: 0,new_key,id
0,00000001,"[4710, 14603, 19756, 2940, 4067, 5160, 9431, 9..."
1,00000010,"[1752, 1753, 1755, 1756, 1757, 1759, 1760, 176..."
2,00000100,"[4099, 4889, 4955, 5088, 5182, 8907, 11562, 11..."
3,00000110,"[820, 821, 822, 823, 824, 825, 826, 828, 829, ..."
4,00001000,"[1638, 1698, 1699, 1701, 1702, 1703, 1706, 174..."
...,...,...
120,11110111,"[1159, 2111, 2124, 2128, 2170, 2177, 2179, 220..."
121,11111001,"[5639, 5704, 12965, 13417]"
122,11111101,"[12674, 13413, 13633, 13698]"
123,11111110,"[3205, 3208, 3216, 3221, 3232, 3233, 3238, 324..."


We still have a large number of groups. Hence, we have decided to filter out some groups and keep only the ones we are able to explicitly assign an emotion (using Plutchik’s wheel of emotions). 

In [12]:
to_keep = \
['10000000',
'01000000',
'00100000',
'00010000',
'00001000',
'00000100',
'00000010',
'00000001',
'11000000',
'01100000',
'00110000',
'00011000',
'00001100',
'00000110',
'00000011',
'10000001']

emotions = [
    'anticipation',
    'anger',
    'disgust',
    'contempt',
    'sadness',
    'remorse',
    'surprise',
    'fear',
    'awe',
    'trust',
    'submission',
    'joy',
    'optimism',
    'love',
]

final = final.loc[final['new_key'].isin(to_keep)]

#Count the number of tracks per key
final['counts'] = final.id.apply(lambda x : len(x))

#Assign emotions using the Plutchik’s wheel of emotions
final['emotions'] = emotions

final

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final['counts'] = final.id.apply(lambda x : len(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final['emotions'] = emotions


Unnamed: 0,new_key,id,counts,emotions
0,1,"[4710, 14603, 19756, 2940, 4067, 5160, 9431, 9...",40,anticipation
1,10,"[1752, 1753, 1755, 1756, 1757, 1759, 1760, 176...",512,anger
2,100,"[4099, 4889, 4955, 5088, 5182, 8907, 11562, 11...",149,disgust
3,110,"[820, 821, 822, 823, 824, 825, 826, 828, 829, ...",751,contempt
4,1000,"[1638, 1698, 1699, 1701, 1702, 1703, 1706, 174...",3433,sadness
6,1100,"[1647, 2784, 6467, 6531, 8249, 8510, 8897, 891...",708,remorse
8,10000,"[9429, 9440, 9466, 22500]",4,surprise
10,100000,"[4036, 5212, 6309, 6311, 6342, 6346, 7709, 776...",834,fear
24,110000,"[3271, 3343, 3443, 3488, 3737, 3938, 3946, 3485]",8,awe
35,1000000,"[4069, 4972, 4974, 5355, 6428, 6614, 8846, 114...",705,trust


In [14]:
#To finish, we explode the dataset on the id and the merge with the initial dataframe 
final = final.explode('id')[['id', 'emotions']]
final

Unnamed: 0,id,emotions
0,4710,anticipation
0,14603,anticipation
0,19756,anticipation
0,2940,anticipation
0,4067,anticipation
...,...,...
85,11379,love
85,14011,love
85,14450,love
85,17337,love


In [15]:
tracks_final = final.merge(tracks_init, on='id', how='left')
tracks_final

Unnamed: 0,id,emotions,track,artist,tags,arousal,dominance,track_id,genre,acousticness,...,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,4710,anticipation,Waving My Arms In The Air [Take 1],Syd Barrett,"[ominous, fractured, insular, wry, eccentric, ...",3.620607,4.231495,0pMOAZz9GxlXi2fXkRr0nN,psychedelic rock,0.793000,...,0.1200,0.000000,2,0.1490,-17.179,1,0.0430,105.726,4,0.4620
1,14603,anticipation,Uni Iso,Alva Noto,"[intimate, nervous]",5.850000,5.533333,158gPbiLX3MUEptQOJgQES,glitch,0.000015,...,0.3390,0.910000,4,0.0513,-17.213,1,0.0528,127.642,4,0.0567
2,19756,anticipation,red moon,Kalafina,"[dramatic, spiritual, yearning, anxious]",4.597881,4.549661,7FH8PR1dYdBm0pT1FIeI7o,j-pop,0.086000,...,0.5760,0.000000,2,0.0862,-6.782,0,0.0378,166.013,3,0.0921
3,2940,anticipation,Nobody Loves You Like I Do,MakTub,"[harsh, urgent, yearning]",3.976000,4.889000,6C7NKesRR4mN3Dr4goQHlh,singer-songwriter,0.030000,...,0.4310,0.000006,5,0.1520,-8.650,1,0.0279,111.080,4,0.5050
4,4067,anticipation,Harem Scarem,Focus,"[manic, eerie, urgent, campy, passionate]",4.432000,3.902000,0QhfAl5OwRfnBfqLlV3b6N,progressive rock,0.022800,...,0.8040,0.865000,5,0.1140,-9.591,1,0.0342,88.484,4,0.6940
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12985,11379,love,Hunger For The Great Light,Dave Matthews Band,"[nocturnal, ambitious, carefree, freewheeling,...",4.018125,5.930000,2CqxMRNufuvCcraGit5Q2m,rock,0.104000,...,0.7050,0.000008,0,0.2930,-6.113,1,0.0496,119.973,4,0.1490
12986,14011,love,I Love You so Much It Hurts,Ray Charles,"[gentle, lush, playful, passionate, sensual, r...",4.216378,5.779331,03BwJtnWJj6WFps8aBEFLQ,soul,0.940000,...,0.1840,0.000005,2,0.3290,-16.608,0,0.0371,67.492,3,0.1600
12987,14450,love,The Closing of the Doors,Róisín Murphy,"[intimate, lush, stylish, playful, sensual, sw...",4.150000,5.922007,4nAJ7rNmLIqSz9sd1Uua02,piano,0.984000,...,0.0642,0.000006,3,0.1440,-14.807,1,0.0427,87.970,4,0.0677
12988,17337,love,Silent Cry,Feeder,"[soothing, passionate, reflective, powerful, s...",4.654326,6.402057,1xo2vBELyICpeBNhCz0RU1,rock,0.003550,...,0.8730,0.000319,10,0.1250,-4.108,1,0.0471,107.990,4,0.2650


In [17]:
#Store to csv file
tracks_final.to_csv('tracks_final.csv', index=False)

#Retrieve the spotify_ids of the tracks selected for the project and store to .txt
spotify_ids = tracks_final.track_id.values.tolist()

with open('spotify_id.txt', 'w') as file:
    for id in spotify_ids:
        file.write(id + '\n')