In [1]:
import ast
import os

import numpy as np
import pandas as pd
import spotipy
from sklearn.metrics.pairwise import cosine_similarity



In [3]:
# Sample emotion-classifier output: one {'label', 'score'} record per emotion.
# The name deliberately avoids `list`, which would shadow the built-in type
# for every later cell in the same kernel session.
emotion_scores = [{'label': 'disgust', 'score': 0.3320395350456238},
                  {'label': 'anger', 'score': 0.2978857457637787},
                  {'label': 'sadness', 'score': 0.2527250647544861},
                  {'label': 'neutral', 'score': 0.05638010799884796},
                  {'label': 'fear', 'score': 0.03310847282409668},
                  {'label': 'surprise', 'score': 0.017489802092313766},
                  {'label': 'joy', 'score': 0.010371341370046139}]

# Display the records ordered alphabetically by label (the input is left untouched).
sorted(emotion_scores, key=lambda record: record['label'])

[{'label': 'anger', 'score': 0.2978857457637787},
 {'label': 'disgust', 'score': 0.3320395350456238},
 {'label': 'fear', 'score': 0.03310847282409668},
 {'label': 'joy', 'score': 0.010371341370046139},
 {'label': 'neutral', 'score': 0.05638010799884796},
 {'label': 'sadness', 'score': 0.2527250647544861},
 {'label': 'surprise', 'score': 0.017489802092313766}]

# Load data

In [2]:
# Path of the raw 10k-song CSV (audio features plus lyrics).
DATA_URL = "~/code/federicoeramirez/jars/raw_data/data_lyrics_10k.csv"

# Compact dtype for every raw column, declared per dtype group and flattened
# into the {column: dtype} mapping that read_csv expects.
DTYPES_RAW_OPTIMIZED = {
    column: dtype
    for dtype, columns in {
        "int8": ("explicit", "key", "mode", "popularity"),
        "int16": ("Unnamed: 0", "year"),
        "int32": ("duration_ms",),
        "float32": (
            "valence",
            "acousticness",
            "danceability",
            "energy",
            "instrumentalness",
            "liveness",
            "loudness",
            "speechiness",
            "tempo",
        ),
        "O": ("artists", "id", "name", "release_date", "lyrics"),
    }.items()
    for column in columns
}

df = pd.read_csv(DATA_URL, dtype=DTYPES_RAW_OPTIMIZED)

In [3]:
# Preview the first three rows of the freshly loaded frame.
df.head(3)

Unnamed: 0.1,Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,...,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,lyrics
0,0,0.817,2013,0.0158,['Parmalee'],0.551,214933,0.863,0,3Bdqlr7jQLNhITAgcBGQBG,...,11,0.0843,-3.506,1,Close Your Eyes,50,2013-12-10,0.0322,143.951996,Well take a look at what's left in that sunset...
1,1,0.548,2003,0.00661,['JAY-Z'],0.494,234627,0.887,1,7sLpSWxQazJzDVG6YGzlVs,...,6,0.103,-4.297,0,99 Problems,61,2003-11-14,0.398,89.554001,"If you're havin' girl problems, I feel bad for..."
2,2,0.732,2014,0.0477,['Sam Hunt'],0.59,235507,0.94,0,3BuPop8SzLG2Q88TJcFAjp,...,9,0.379,-4.124,1,Raised On It,54,2014-10-27,0.0409,94.019997,Snapbacks and Levi jeans PBR and burnt CDs Run...


In [4]:
# Drop the redundant 'Unnamed: 0' column, then discard fully duplicated rows.
df = df.drop('Unnamed: 0', axis='columns').drop_duplicates()

# Show (rows, columns) after the cleanup.
df.shape

(10000, 20)

In [5]:
# Tracks lasting 10 minutes or longer are treated as podcasts and discarded.
MAX_DURATION_S = 600


def _format_artists(raw):
    """Turn a stringified artist list such as "['A', 'B']" into "A, B".

    The value is parsed as a Python literal so that apostrophes belonging to a
    name (e.g. ["Guns N' Roses"]) are preserved; stripping every quote
    character would delete them and leave stray brackets behind.

    Falls back to plain character stripping when the value is not a valid
    list literal.
    """
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return ', '.join(str(artist) for artist in parsed)
    return raw.replace("['", '').replace("'", '').replace("]", '')


# remove long duration (podcasts); copy so the columns added below are written
# to an independent frame instead of a slice of the unfiltered one
df['duration_s'] = df['duration_ms'] / 1000
df = df[df['duration_s'] < MAX_DURATION_S].copy()

# transform duration to minutes
df['duration_m'] = df['duration_s'] / 60

# cleanup artists column
df['artists'] = df['artists'].apply(_format_artists)

# create index column containing artists and song name
df['index'] = df['artists'] + ' - "' + df['name'] + '"'

# sort columns in alphabetical order
df = df.sort_index(axis=1)

# drop columns that are no longer needed
df = df.drop(columns=['artists', 'name', 'release_date', 'duration_ms', 'duration_s'])

In [6]:
# Preview the first three rows after cleaning (columns are now alphabetical).
df.head(3)

Unnamed: 0,acousticness,danceability,duration_m,energy,explicit,id,index,instrumentalness,key,liveness,loudness,lyrics,mode,popularity,speechiness,tempo,valence,year
0,0.0158,0.551,3.582217,0.863,0,3Bdqlr7jQLNhITAgcBGQBG,"Parmalee - ""Close Your Eyes""",0.0,11,0.0843,-3.506,Well take a look at what's left in that sunset...,1,50,0.0322,143.951996,0.817,2013
1,0.00661,0.494,3.91045,0.887,1,7sLpSWxQazJzDVG6YGzlVs,"JAY-Z - ""99 Problems""",0.0,6,0.103,-4.297,"If you're havin' girl problems, I feel bad for...",0,61,0.398,89.554001,0.548,2003
2,0.0477,0.59,3.925117,0.94,0,3BuPop8SzLG2Q88TJcFAjp,"Sam Hunt - ""Raised On It""",0.0,9,0.379,-4.124,Snapbacks and Levi jeans PBR and burnt CDs Run...,1,54,0.0409,94.019997,0.732,2014


In [7]:
#df.to_csv('../raw_data/data_lyrics_10k_sorted.csv', mode='a')

# Preprocess

In [8]:
# Feature frame for the vector space: rows are labelled with the
# 'artist - "title"' string and the non-feature columns are removed.
df_processed = df.set_index('index').drop(columns=['id', 'lyrics'])

In [9]:
# Preview the first three rows of the feature frame.
df_processed.head(3)

Unnamed: 0_level_0,acousticness,danceability,duration_m,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence,year
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
"Parmalee - ""Close Your Eyes""",0.0158,0.551,3.582217,0.863,0,0.0,11,0.0843,-3.506,1,50,0.0322,143.951996,0.817,2013
"JAY-Z - ""99 Problems""",0.00661,0.494,3.91045,0.887,1,0.0,6,0.103,-4.297,0,61,0.398,89.554001,0.548,2003
"Sam Hunt - ""Raised On It""",0.0477,0.59,3.925117,0.94,0,0.0,9,0.379,-4.124,1,54,0.0409,94.019997,0.732,2014


In [11]:
# Show (rows, feature columns) of the feature frame.
df_processed.shape

(9858, 15)

# Make Recommendation

In [12]:
# song to create playlist from
# (the hard-coded row position 6 serves as the seed track for this run)
song = df_processed.iloc[6]

In [13]:
# Represent the seed song as a single-row matrix and score every track against it.
# NOTE(review): the feature columns are on very different scales (e.g. year vs.
# acousticness), so the cosine similarity is dominated by the largest columns —
# consider scaling the features before comparing.
v1 = np.array(song).reshape(1, -1)
sim1 = cosine_similarity(df_processed, v1).reshape(-1)

# df_processed is built from df row by row, so the track ids are attached by
# position. Merging on the 'artist - "title"' label instead would multiply rows
# whenever two tracks share the same label.
assert df_processed.index.equals(pd.Index(df['index'])), 'df and df_processed are out of sync'

# create dataframe with top recommendation
recommendation_df = pd.DataFrame({
    'index': df_processed.index,
    'cosine_similarity': sim1,
    'id': df['id'].to_numpy(),
}).sort_values('cosine_similarity', ascending=False)

In [14]:
# Show the ten most similar tracks (the seed song itself ranks first).
recommendation_df.head(10)

Unnamed: 0,index,cosine_similarity,id
6,"Ray Price - ""A Girl in the Night""",1.0,3GBqKmJ62SJ943NQ9i1JuE
2662,"The Monkees - ""Poll""",0.999998,2Ika0knHzCDx3l4k75r48K
4973,"Al Martino - ""Rudolph, The Red-Nosed Reindeer ...",0.999998,0SaGRHNUlIN4bZqc0RVx4a
8446,"Kenny Dorham - ""If Ever I Would Leave You - Ru...",0.999998,2DPcK58j3Zms6ybVIKSIIY
9115,"Queen - ""Flash To The Rescue - Remastered 2011""",0.999998,5765ZVfLL96bvWlS1ge9iX
375,"Billie Holiday - ""If My Heart Could Only Talk""",0.999997,2Fqphno2uueRloqhs9nvpH
875,"Billie Holiday, Sy Oliver & His Orchestra - ""D...",0.999997,594VViGBPR7H4S7hWitk4K
6869,"Sam Cooke - ""Try A Little Love""",0.999996,3l9N7l4kH0HOBwiiARxzVg
7114,"Lata Mangeshkar - ""Main Kya Karoon Ram""",0.999996,38K0oVqAU6AlcrDZlYdIMe
5575,"Waylon Jennings - ""Love Of The Common People""",0.999996,3TemiwfgcJ6lzNN7f0urOC


# Search for a song outside our original dataset

In [15]:
# Credentials for the Spotify API come from the environment; `scope` names the
# permission requested for the user token.
cid = os.getenv('SPOTIFY_CLIENT_ID')
secret = os.getenv('SPOTIFY_CLIENT_SECRET')
redirect_uri = os.getenv('SPOTIPY_REDIRECT_URI')
scope = 'playlist-modify-private'
username = None

# Local, interactive user authentication only; a proper flow is still to be
# developed.
# NOTE(review): prompt_for_user_token appears to be deprecated in recent
# spotipy releases — confirm against the installed version.
spotify_token = spotipy.util.prompt_for_user_token(
    username=username,
    scope=scope,
    client_id=cid,
    client_secret=secret,
    redirect_uri=redirect_uri,
)

# Authenticated client used by the cells that follow.
sp = spotipy.Spotify(auth=spotify_token)

In [16]:
# input for testing purposes
# NOTE(review): this name shadows the built-in `input()`; it is kept because the
# search cell reads it — rename both together if this notebook is refactored.
input = "efecto amalia recuerdos"

In [17]:
# make Spotify API call to search for song info
# NOTE(review): `input` and `id` shadow Python built-ins; the names are kept
# because other cells read them.
search_result = sp.search(input, limit=1)

# stop with a clear message when the search finds nothing
tracks = search_result['tracks']['items']
if not tracks:
    raise ValueError(f'No Spotify track found for query: {input!r}')
track = tracks[0]

# get song id for search
id = track['id']

# get song features (the API can return None in place of a track's features)
song_features = sp.audio_features(id)[0]
if song_features is None:
    raise ValueError(f'No audio features available for track: {id}')

# create columns with necessary features
# year is converted to an integer so it is numeric like the dataset's `year` column
song_features['year'] = int(track['album']['release_date'][:4])
song_features['explicit'] = int(track['explicit'])
song_features['popularity'] = track['popularity']

# convert dict to series
song_features = pd.Series(song_features)

# cleanup: duration in minutes, then keep only the feature entries (sorted by name)
song_features['duration_m'] = (song_features['duration_ms'] / 1000) / 60
song_features = song_features.drop(['id', 'uri', 'track_href', 'analysis_url', 'type', 'duration_ms', 'time_signature']).sort_index()

# get artist name(s), joined with ', ' like the multi-artist labels in the dataset
song_artist = ', '.join(artist['name'] for artist in track['artists'])

# get track name
song_name = track['name']

# create index
song_index = song_artist + ' - "' + song_name + '"'

In [18]:
# Show the label, Spotify id and feature vector of the searched song, one per line.
print(song_index, id, song_features, sep='\n')

Efecto Amalia - "recuerdos"
51coTLX9YFZpjoxgmiG6aY
acousticness          0.0113
danceability           0.363
duration_m          2.915383
energy                 0.718
explicit                   0
instrumentalness       0.222
key                        6
liveness               0.108
loudness              -6.155
mode                       1
popularity                 2
speechiness           0.0298
tempo                105.293
valence                0.179
year                    2019
dtype: object


In [20]:
# `value in series` tests the index labels, not the stored values, so the ids
# are compared explicitly. Prints the id when the song is new to the dataset.
if not (df['id'] == id).any():
    print(id)
else:
    print('nope')

51coTLX9YFZpjoxgmiG6aY


In [21]:
# One-row frame whose columns are the feature names of the searched song.
new_df = song_features.to_frame().T

In [24]:
# Attach the label and Spotify id so the row has the same key columns as df.
new_df['index'] = song_index
new_df['id'] = id

# Preview df with the new song added as the last row (df itself is not modified).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
pd.concat([df, new_df], ignore_index=True)

Unnamed: 0,acousticness,danceability,duration_m,energy,explicit,id,index,instrumentalness,key,liveness,loudness,lyrics,mode,popularity,speechiness,tempo,valence,year
0,0.0158,0.551,3.582217,0.863,0,3Bdqlr7jQLNhITAgcBGQBG,"Parmalee - ""Close Your Eyes""",0.0,11,0.0843,-3.506,Well take a look at what's left in that sunset...,1,50,0.0322,143.951996,0.817,2013
1,0.00661,0.494,3.91045,0.887,1,7sLpSWxQazJzDVG6YGzlVs,"JAY-Z - ""99 Problems""",0.0,6,0.103,-4.297,"If you're havin' girl problems, I feel bad for...",0,61,0.398,89.554001,0.548,2003
2,0.0477,0.59,3.925117,0.94,0,3BuPop8SzLG2Q88TJcFAjp,"Sam Hunt - ""Raised On It""",0.0,9,0.379,-4.124,Snapbacks and Levi jeans PBR and burnt CDs Run...,1,54,0.0409,94.019997,0.732,2014
3,0.000473,0.34,4.815783,0.974,0,7EvjTEzuv7TWaIaWY63sWV,"Iron Maiden - ""Drifter - 2015 Remaster""",0.0928,0,0.373,-5.114,,1,29,0.106,101.276001,0.475,1981
4,0.994,0.41,3.294217,0.169,0,38PozVGXXoeO8dTEVzy74Y,"Markos Vamvakaris, Apostolos Xatzixristos - ""S...",0.901,2,0.113,-18.862,,1,0,0.0391,93.889999,0.55,1930
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9854,0.944,0.422,3.030883,0.477,0,0PkzQsgs6DeCAQPvx4iFgo,"M. K. Thyagaraja Bhagavathar - ""Vallalalai Paa...",0.606,3,0.134,-12.02,,0,0,0.037,89.672997,0.834,1943
9855,0.682,0.776,3.57295,0.139,0,6Hb2J7m0fhGiIU4Zx9Pk4C,"Ismael Rivera - ""Boriqueneando""",0.000003,2,0.0536,-22.629999,Borinquen! (Borinqueneando borinco Borinquenea...,1,24,0.081,128.395996,0.902,1975
9856,0.187,0.724,2.999783,0.666,0,2oziP5rlqR0kKHbGzIOL0b,"Marco Antonio Solís, Los Bukis - ""Corazon Limpio""",0.000322,4,0.247,-12.263,,1,38,0.0417,91.411003,0.896,1995
9857,0.00554,0.453,3.10245,0.911,0,2iptAVf9jZerlywLwiGhak,"Three Days Grace - ""Fallen Angel""",0.0,2,0.0627,-3.775,Late at night I could hear the crying I hear i...,0,64,0.0382,137.011002,0.218,2015


In [None]:
from jars.interface.main import get_recommendation

In [25]:
# Rank every track by the similarity scores in `sim1`.
# df_processed is built from df row by row, so the track ids are attached by
# position. Merging on the 'artist - "title"' label instead would multiply rows
# whenever two tracks share the same label.
assert df_processed.index.equals(pd.Index(df['index'])), 'df and df_processed are out of sync'

recommendation_df = pd.DataFrame({
    'index': df_processed.index,
    'cosine_similarity': sim1,
    'id': df['id'].to_numpy(),
}).sort_values('cosine_similarity', ascending=False)

# Keep the 15 best matches, keyed by their 'artist - "title"' label.
recommendation = recommendation_df[['index', 'id']].head(15).set_index('index')

# Nested mapping: {label: {'id': spotify_track_id}}
recommendation.T.to_dict()

{'Ray Price - "A Girl in the Night"': {'id': '3GBqKmJ62SJ943NQ9i1JuE'},
 'The Monkees - "Poll"': {'id': '2Ika0knHzCDx3l4k75r48K'},
 'Al Martino - "Rudolph, The Red-Nosed Reindeer - Remastered"': {'id': '0SaGRHNUlIN4bZqc0RVx4a'},
 'Kenny Dorham - "If Ever I Would Leave You - Rudy Van Gelder Edition/Digital Remaster/24 Bit Mastering/1998"': {'id': '2DPcK58j3Zms6ybVIKSIIY'},
 'Queen - "Flash To The Rescue - Remastered 2011"': {'id': '5765ZVfLL96bvWlS1ge9iX'},
 'Billie Holiday - "If My Heart Could Only Talk"': {'id': '2Fqphno2uueRloqhs9nvpH'},
 'Billie Holiday, Sy Oliver & His Orchestra - "Do Your Duty"': {'id': '594VViGBPR7H4S7hWitk4K'},
 'Sam Cooke - "Try A Little Love"': {'id': '3l9N7l4kH0HOBwiiARxzVg'},
 'Lata Mangeshkar - "Main Kya Karoon Ram"': {'id': '38K0oVqAU6AlcrDZlYdIMe'},
 'Waylon Jennings - "Love Of The Common People"': {'id': '3TemiwfgcJ6lzNN7f0urOC'},
 'Carmen McRae - "Baltimore Oriole"': {'id': '2xdyH2pPOwRA7B0Ws3RB5J'},
 'Jackie Gleason - "The Man I Love"': {'id': '3KREEXd