### Recommendation Engine

Overview: The idea behind a song recommendation system is to identify songs that a user may enjoy based on their listening preferences. In order to accomplish this task I will pull in my listening history and compare various song features from my listening history with songs I not in my listening history to recommend songs that are most like the ones I listen to.

Step 1: Combine my listening history with a random library of songs 
<br>Step 2: Preprocess the data so that each desired feature is considered
<br>Step 3: Create a cosine similarity matrix
<br>Step 4: Create a function to make recommendations given a selected song

### Import Libraries & Data

In [3]:
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity, cosine_distances
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [41]:
%store -r history
%store -r kaggle_df

In [9]:
history.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3624 entries, 0 to 3797
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   endTime            3624 non-null   object 
 1   artistName         3624 non-null   object 
 2   trackName          3624 non-null   object 
 3   msPlayed           3624 non-null   int64  
 4   count              3624 non-null   int64  
 5   artist_id          3624 non-null   object 
 6   genres             3624 non-null   object 
 7   artist_popularity  3624 non-null   float64
 8   followers          3624 non-null   float64
 9   trackID            3624 non-null   object 
 10  danceability       3624 non-null   float64
 11  energy             3624 non-null   float64
 12  key                3624 non-null   float64
 13  loudness           3624 non-null   float64
 14  mode               3624 non-null   float64
 15  speechiness        3624 non-null   float64
 16  acousticness       3624 

In [6]:
kaggle_df.head(1)

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954


In [42]:
historytemp = history.drop(columns = ['endTime', 'artistName', 'msPlayed', 'count', 'artist_id', 'genres', 'followers', 'trackID', 'duration_ms', 'time_signature'])
historytemp.head()

Unnamed: 0,trackName,artist_popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,Transform (feat. Charlotte Day Wilson),76.0,0.498,0.292,5.0,-10.656,1.0,0.031,0.511,1.9e-05,0.256,0.348,68.963
1,Coconut Water,42.0,0.724,0.432,11.0,-9.945,1.0,0.203,0.0876,3.2e-05,0.0828,0.696,84.992
2,Positions,42.0,0.639,0.535,1.0,-7.714,1.0,0.0596,0.489,0.00126,0.109,0.321,84.975
3,Essence (feat. Tems),76.0,0.849,0.707,0.0,-6.002,1.0,0.113,0.0266,9e-06,0.618,0.602,104.027
4,Finesse,48.0,0.795,0.486,5.0,-8.364,1.0,0.0475,0.134,2.6e-05,0.26,0.351,132.045


In [43]:
library = kaggle_df.drop(columns = ['year', 'artists', 'duration_ms', 'explicit', 'id', 'release_date'])
library.head()

Unnamed: 0,valence,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,name,popularity,speechiness,tempo
0,0.0594,0.982,0.279,0.211,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,0.0366,80.954
1,0.963,0.732,0.819,0.341,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,0.415,60.936
2,0.0394,0.961,0.328,0.166,0.913,3,0.101,-14.85,1,Gati Bali,5,0.0339,110.339
3,0.165,0.967,0.275,0.309,2.8e-05,5,0.381,-9.316,1,Danny Boy,3,0.0354,100.109
4,0.253,0.957,0.418,0.193,2e-06,3,0.229,-10.096,1,When Irish Eyes Are Smiling,2,0.038,101.665


In [16]:
print(library.shape, historytemp.shape)

(170653, 13) (3624, 13)


In [44]:
library = library[['name', 'popularity', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']]
library.rename(columns = {'name':'trackName', 'popularity': 'artist_popularity'}, inplace = True)
library.head()

Unnamed: 0,trackName,artist_popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,0.279,0.211,10,-20.096,1,0.0366,0.982,0.878,0.665,0.0594,80.954
1,Clancy Lowered the Boom,5,0.819,0.341,7,-12.441,1,0.415,0.732,0.0,0.16,0.963,60.936
2,Gati Bali,5,0.328,0.166,3,-14.85,1,0.0339,0.961,0.913,0.101,0.0394,110.339
3,Danny Boy,3,0.275,0.309,5,-9.316,1,0.0354,0.967,2.8e-05,0.381,0.165,100.109
4,When Irish Eyes Are Smiling,2,0.418,0.193,3,-10.096,1,0.038,0.957,2e-06,0.229,0.253,101.665


In [23]:
combined = pd.concat([library, historytemp], axis=0)
combined.drop_duplicates(inplace = True)
combined.shape

(171289, 13)

#### Feature Engineering 

Features
<br> danceability, energy, loudness, mode, speechiness, acousticness, instrumentalness, liveness, valence, genre, popularity, followers

1) Normalize values over 1
2) Get Dummies or TFDIF on the genre
3) Turn into single vector
4) Calculate Cosine Similarity
5) Generate Recommendations

In [7]:
features = ['danceability', 'energy', 'loudness', 'mode', 'speechiness', 'acousticness', 
            'instrumentalness', 'liveness', 'valence', 'popularity']

In [47]:
#Step 1 Normalize Features over 1
combined['artist_popularity'] = pd.DataFrame(preprocessing.normalize([combined['artist_popularity']]).T)
combined['loudness'] = pd.DataFrame(preprocessing.normalize([combined['loudness']]).T)
#normal_follow = preprocessing.normalize([combined['followers']])


#normal_feats = pd.DataFrame(normal_pop[0])
#normal_feats.rename(columns = {0:'popularity'}, inplace = True)
#normal_feats['follow'] = normal_follow[0]
#normal_feats['loudness'] = normal_loud[0]

In [24]:
#Step 2 TFDIF Genre Category
#history['genres'] = [''.join(x) for x in history['genres']]

#vectorizer = TfidfVectorizer()
#vectors = vectorizer.fit_transform(history['genres'])

#genre_df = pd.DataFrame(vectors.toarray())
#genre_df.reset_index(drop = True, inplace=True)
#genre_df.iloc[0]

In [25]:
#Step 3
#combined.reset_index(inplace = True)
#stream_feats = pd.concat([combined, genre_df], axis = 1)

In [26]:
#stream_feats.head(1)

In [13]:
#test = pd.concat([history[['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence']], genre_df, normal_feats], axis = 1)
#test.index = stream_feats['trackName']

In [52]:
combined.index = combined['trackName']
combined.drop(columns = ['trackName'], inplace = True)
combined.head()

KeyError: 'trackName'

### Get Recommendations

In [59]:
similarity = cosine_similarity(combined[0:10_000])

In [62]:
sim = pd.DataFrame(similarity, index = combined[0:10_000].index, columns = combined[0:10_000].index)

In [82]:
combined.loc[['BREAK MY SOUL']]

Unnamed: 0_level_0,artist_popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
trackName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
BREAK MY SOUL,0.002634,0.693,0.887,1.0,-0.002377,0.0,0.0795,0.0581,3e-06,0.27,0.864,114.942


In [88]:
combined.loc[['BREAK MY SOUL']].index

Index(['BREAK MY SOUL'], dtype='object', name='trackName')

In [89]:
similarity_bms = cosine_similarity(combined, combined.loc[['BREAK MY SOUL']])
sim = pd.DataFrame(similarity_bms, index = combined.index, columns = combined.loc[['BREAK MY SOUL']].index)

In [92]:
sim.sort_values(by = 'BREAK MY SOUL', ascending = False)

trackName,BREAK MY SOUL
trackName,Unnamed: 1_level_1
BREAK MY SOUL,1.000000e+00
La Temperatura (feat. Eli Palacios),9.999994e-01
Murder On The Dancefloor - Radio Edit,9.999992e-01
Cinderella,9.999990e-01
Born To Love (feat. SHELLS),9.999990e-01
...,...
Magic Window,2.690451e-05
Pause Track - Live,2.067528e-05
StaggerLee Has His Day at the Beach,2.067528e-05
Pause Track,2.067528e-05
