In [89]:
import pandas as pd 
import numpy as np

In [90]:
spotify_df = pd.read_csv('SpotifyFeatures.csv')

In [91]:
spotify_sample_df = spotify_df.sample(n= 20000, random_state= 42)

In [92]:
spotify_sample_df.head(5)

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
788,Country,A Thousand Horses,My Time's Comin',16zol4GvHyTER5irYODUk0,45,0.00192,0.327,194107,0.835,0.00015,C,0.167,-4.952,Major,0.0609,171.795,4/4,0.385
207109,Soundtrack,Mark Mothersbaugh,House Tour,6ac5gUfGTckpdGQCyWsdh2,25,0.932,0.253,102920,0.0798,0.568,C,0.0906,-18.512,Major,0.0439,110.931,4/4,0.0487
138644,Reggae,Unified Highway,We Can't Fall (Remix) [feat. J. Patz],09Yz6koF1Y15n1012t1UX6,19,0.0331,0.821,225437,0.737,0.0134,E,0.132,-6.295,Minor,0.212,137.968,4/4,0.787
37164,Electronic,Stooki Sound,Endz - Original Mix,3dzEZARDL4ZwICMKVta7Xn,29,0.00428,0.745,225400,0.772,0.114,E,0.0722,-3.949,Major,0.0904,133.113,4/4,0.17
174351,Comedy,Bill Hicks,I Love My Job (Live),39Z1G5384UgGa5vmW6WyxC,17,0.965,0.502,287973,0.804,9.6e-05,G,0.902,-9.935,Major,0.807,104.576,3/4,0.185


In [93]:
spotify_sample_df = spotify_sample_df.reset_index(drop= True)

In [94]:
spotify_sample_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   genre             20000 non-null  object 
 1   artist_name       20000 non-null  object 
 2   track_name        20000 non-null  object 
 3   track_id          20000 non-null  object 
 4   popularity        20000 non-null  int64  
 5   acousticness      20000 non-null  float64
 6   danceability      20000 non-null  float64
 7   duration_ms       20000 non-null  int64  
 8   energy            20000 non-null  float64
 9   instrumentalness  20000 non-null  float64
 10  key               20000 non-null  object 
 11  liveness          20000 non-null  float64
 12  loudness          20000 non-null  float64
 13  mode              20000 non-null  object 
 14  speechiness       20000 non-null  float64
 15  tempo             20000 non-null  float64
 16  time_signature    20000 non-null  object

In [95]:
categorical_features = ['genre', 'artist_name', 'track_name', 'key', 'mode']

In [96]:
X_features = list(set(spotify_sample_df.columns) - set(categorical_features))
X_features

['speechiness',
 'duration_ms',
 'track_id',
 'instrumentalness',
 'loudness',
 'energy',
 'acousticness',
 'danceability',
 'popularity',
 'time_signature',
 'valence',
 'liveness',
 'tempo']

In [97]:
X_features.remove('track_id')
X_features.remove('time_signature')
X_features

['speechiness',
 'duration_ms',
 'instrumentalness',
 'loudness',
 'energy',
 'acousticness',
 'danceability',
 'popularity',
 'valence',
 'liveness',
 'tempo']

In [98]:
from sklearn.preprocessing import StandardScaler
Scaler = StandardScaler()
spotify_sample_df[X_features] = Scaler.fit_transform(spotify_sample_df[X_features])

In [99]:
# Concatenating categorical features into a single feature
spotify_sample_df['combined_features'] = spotify_sample_df[categorical_features].apply(lambda x: ' '.join(x), 
                                                                                       axis=1)

In [100]:
# Computing cosine similarity between items
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(spotify_sample_df['combined_features'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [101]:
# Function to get recommendations
def get_recommendations(track_name, cosine_sim, data):
    idx = data.loc[data['track_name'] == track_name].index[0]
    similar_scores = list(enumerate(cosine_sim[idx]))
    similar_scores = sorted(similar_scores, key=lambda x: x[1], reverse=True)
    similar_scores = similar_scores[1:11]  # excluding the first entry as it is the same track

    recommended_tracks = [data.iloc[i[0]]['track_name'] for i in similar_scores]

    return recommended_tracks

In [102]:
track_name = 'House Tour'
recommendations = get_recommendations(track_name, cosine_sim, spotify_sample_df)
print(recommendations)

['Garmadon Attacks', 'Weird Things Happen', 'S P A C E', "The People's House", 'Le tour de force', "What's Not to Love", 'XO Tour Llif3', 'Un tour de manège', 'My House Your House', 'Prayer Changes Everything']
