In [2]:
import pandas as pd
import numpy as np
import plotly.express as px

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
# Read the raw track table; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer='dataset.csv')

In [3]:
# Discard the leading CSV column (everything from `track_id` onward is kept).
# Guarding on the current first column makes the cell safe to re-run: an
# unconditional positional drop would remove `track_id` on a second execution.
if df.columns[0] != 'track_id':
    df = df.drop(columns=df.columns[0])

In [4]:
# Show the column labels currently present in `df`.
df.columns

Index(['track_id', 'artists', 'album_name', 'track_name', 'popularity',
       'duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'time_signature', 'track_genre'],
      dtype='object')

In [5]:
# Remove fully duplicated rows and any row containing a missing value, then
# report the resulting (rows, columns) shape.
df = df.drop_duplicates().dropna()
df.shape

(113549, 20)

In [6]:
# `tracks` is a new frame derived from `df` without the identifier and
# duration columns; `df` itself is left unchanged.
tracks = df.drop(columns=['track_id', 'duration_ms'])
tracks.head()

Unnamed: 0,artists,album_name,track_name,popularity,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,Gen Hoshino,Comedy,Comedy,73,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,Chord Overstreet,Hold On,Hold On,82,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [7]:
from sklearn.cluster import KMeans

# Embed the non-text columns of the first 500 rows in 2-D with t-SNE and colour
# each point by its k-means cluster (k=3). Both estimators use a fixed seed.
sample_data = tracks.drop(
    columns=['artists', 'album_name', 'track_name', 'track_genre']
).iloc[:500]

# Cluster first, then embed, each on the same un-projected sample.
kmeans = KMeans(n_clusters=3, random_state=0)
cluster_labels = kmeans.fit_predict(sample_data)

model = TSNE(n_components=2, random_state=0)
tsne_data = model.fit_transform(sample_data)

tsne_df = pd.DataFrame(tsne_data, columns=['tsne_1', 'tsne_2']).assign(
    cluster=cluster_labels)

fig = px.scatter(
    tsne_df,
    x='tsne_1',
    y='tsne_2',
    color='cluster',
    color_continuous_scale='viridis',
    opacity=0.8,
    title='t-SNE Plot with Clusters',
)
fig.update_layout(
    plot_bgcolor='#9DB5B2',
    paper_bgcolor='rgb(17,17,17)',
    font_color='grey',
    width=700,
)
fig.show()

In [8]:
# Compare the number of distinct titles with the frame's shape to see how many
# rows share a track name.
tracks['track_name'].nunique(), tracks.shape

(73608, (113549, 18))

In [9]:
# Keep a single row per title: with the frame ordered from most to least
# popular, keep='first' retains the most popular entry for each track name.
# Titles are compared on their own, so same-named tracks by different artists
# collapse into one row.
tracks = (
    tracks
    .sort_values(by='popularity', ascending=False)
    .drop_duplicates(subset=['track_name'], keep='first')
)

In [10]:
# Same check as before the de-duplication: the distinct-title count and the
# row count can now be compared again.
tracks['track_name'].nunique(), tracks.shape

(73608, (73608, 18))

In [11]:
# Leave `mode` and `time_signature` out of the working frame.
# errors='ignore' keeps the cell re-runnable: `tracks` is rebound from itself,
# so a second execution would otherwise raise KeyError for the missing labels.
tracks = tracks.drop(columns=['mode', 'time_signature'], errors='ignore')

In [12]:
# Learn the genre vocabulary once, on the de-duplicated frame, so that every
# later transform() call maps genres onto the same token columns.
# NOTE(review): CountVectorizer's default tokenizer splits on punctuation and
# drops one-character tokens - if any genre label is hyphenated it will not be
# kept as a single token; confirm that is intended.
song_vectorizer = CountVectorizer().fit(tracks['track_genre'])
song_vectorizer

In [13]:
# Restrict the working set to the 5000 rows with the highest popularity.
tracks = tracks.sort_values(by='popularity', ascending=False).iloc[:5000]

In [14]:
def get_similarities(song_name, data, exclude_columns=('similarity_factor',)):
    """Score every row of ``data`` against the track called ``song_name``.

    The score of a row is the sum of two cosine similarities with the query
    row: one between the genre count vectors produced by ``song_vectorizer``
    and one between the numeric columns of the two rows.

    Both similarity terms are computed for the whole frame in one call each,
    instead of re-filtering ``data`` and re-running the vectorizer for every
    row. Each row is scored from its own values, so frames that contain
    repeated track names are also handled row by row.

    Parameters
    ----------
    song_name : str
        Exact ``track_name`` value of the query track.
    data : pandas.DataFrame
        Frame with ``track_name`` and ``track_genre`` columns plus the numeric
        feature columns.
    exclude_columns : iterable of str, optional
        Numeric columns left out of the feature comparison. Defaults to
        ``similarity_factor`` so that scores written back into ``data`` by an
        earlier recommendation are not treated as a feature.

    Returns
    -------
    list of float
        One score per row of ``data``, in row order.

    Raises
    ------
    ValueError
        If no row of ``data`` has ``track_name == song_name``.
    """
    is_query = (data['track_name'] == song_name).to_numpy()
    if not is_query.any():
        raise ValueError(f"{song_name!r} is not present in data['track_name']")
    # When several rows carry the title, the first one is the query.
    query_pos = int(np.flatnonzero(is_query)[0])

    numeric = (
        data.select_dtypes(include=np.number)
        .drop(columns=list(exclude_columns), errors='ignore')
        .to_numpy()
    )
    genre_counts = song_vectorizer.transform(data['track_genre'])

    # List-indexing keeps the query as a single-row 2-D input, which is what
    # cosine_similarity expects for both sparse and dense matrices.
    text_sim = cosine_similarity(genre_counts[[query_pos]], genre_counts)[0]
    num_sim = cosine_similarity(numeric[[query_pos]], numeric)[0]
    return (text_sim + num_sim).tolist()


In [15]:
from fuzzywuzzy import process


def recommend_songs(song_name, data=tracks, min_score=88, top_n=10):
    """Display the tracks in ``data`` most similar to ``song_name``.

    The title is fuzzy-matched against ``data['track_name']``. If the best
    match scores below ``min_score``, a short notice and up to five randomly
    sampled titles are printed instead. Otherwise every row is scored with
    ``get_similarities``, the matched track itself is removed, and the
    ``top_n`` best rows (ties broken by popularity) are displayed.

    ``data`` is not modified: scoring and sorting happen on a derived frame,
    so repeated calls always start from the same features and row order.

    Parameters
    ----------
    song_name : str
        Title typed by the user; it does not have to match exactly.
    data : pandas.DataFrame, optional
        Candidate tracks. Defaults to the ``tracks`` frame as it was bound
        when this function was defined.
    min_score : int, optional
        Minimum fuzzy-match score for the title to be accepted.
    top_n : int, optional
        Number of recommendations to display.

    Returns
    -------
    None
        Results are printed or displayed, not returned.
    """
    # Match against the frame that was passed in, not the global default.
    similar_strings = process.extract(song_name, data['track_name'], limit=1)
    if not similar_strings or similar_strings[0][1] < min_score:
        # Adjacent literals instead of a backslash continuation, which would
        # embed the source indentation in the printed message.
        print('This song is either not so popular or you '
              'have entered invalid_name.\n Some songs you may like:\n')

        # Cap the sample size so frames with fewer than five rows still work.
        for song in data.sample(n=min(5, len(data)))['track_name'].values:
            print(song)
        return

    extracted_song_name = similar_strings[0][0]

    # Discard a score column left behind by earlier runs so that it is never
    # used as an input feature.
    features = data.drop(columns=['similarity_factor'], errors='ignore')
    scored = features.assign(
        similarity_factor=get_similarities(extracted_song_name, features))

    # Remove the query by name rather than assuming it sorts into first place.
    ranked = scored[scored['track_name'] != extracted_song_name].sort_values(
        by=['similarity_factor', 'popularity'],
        ascending=[False, False])

    display(ranked[['track_name', 'artists', 'album_name', 'popularity',
                    'track_genre']].head(top_n))

In [16]:
# Example query: show recommendations for the title 'Hello' using the default
# `tracks` frame.
recommend_songs('Hello')


Unnamed: 0,track_name,artists,album_name,popularity,track_genre
11050,Chasing Pavements,Adele,19,74,british
11012,Only Love Can Hurt Like This - Slowed Down Ver...,Paloma Faith,Only Love Can Hurt Like This (Slowed Down Vers...,77,british
11014,Only Love Can Hurt Like This,Paloma Faith,A Perfect Contradiction (Outsiders' Expanded E...,87,british
11052,Hey Jude,The Beatles,Love,66,british
11319,She Loves You - Mono / Remastered,The Beatles,1 (Remastered),66,british
11361,Last Request,Paolo Nutini,These Streets,69,british
11068,parents,YUNGBLUD,weird!,70,british
11167,All My Loving - Remastered 2009,The Beatles,With The Beatles (Remastered),68,british
11113,Don't Let Me Down - Remastered 2009,The Beatles,The Beatles 1967 - 1970 (Remastered),68,british
11519,Keep Your Head Up,Ben Howard,Every Kingdom,67,british
