Based on: https://github.com/ashishrana160796/Online-Course-Recommendation-System

In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import sklearn.metrics
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.decomposition import TruncatedSVD
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import Normalizer
import pickle
import json

# Train - clustering

In [2]:
dataset_path = "datasets/tracks.csv"
dataset_test_path = "datasets/tracks_test.csv"

In [3]:
df = pd.read_csv(dataset_path)
df.shape

(500000, 9)

In [5]:
df = df.drop(columns=["Unnamed: 0", "MBID", "id"])
df = df.fillna(-1)

In [6]:
df.head(5)

Unnamed: 0,duration,playcount,name,artists,albums,tags
0,-1,4,000003+Music+Instructor/_/Dj%27s+Rock+Da+House...,0,-1.0,-1.0
1,-1,495,00-01/_/%D0%A2%D0%B5%D0%BA%D1%81%D1%82,1,-1.0,-1.0
2,-1,2,0005.+Overkill/_/Overkill,2,-1.0,-1.0
3,-1,2,000C+Tony+Dize/_/Ruleta+Rusa,3,-1.0,-1.0
4,-1,1,000+Oscarcito/_/Tumbay%E2%80%9A+(Lyrics),4,-1.0,-1.0


In [7]:
df['name'] = df['name'].replace({"\+": " ", "_": " ", "/": " ", "\%": " "}, regex=True)
df.head(5)

Unnamed: 0,duration,playcount,name,artists,albums,tags
0,-1,4,000003 Music Instructor Dj 27s Rock Da House...,0,-1.0,-1.0
1,-1,495,00-01 D0 A2 D0 B5 D0 BA D1 81 D1 82,1,-1.0,-1.0
2,-1,2,0005. Overkill Overkill,2,-1.0,-1.0
3,-1,2,000C Tony Dize Ruleta Rusa,3,-1.0,-1.0
4,-1,1,000 Oscarcito Tumbay E2 80 9A (Lyrics),4,-1.0,-1.0


In [7]:
# vectorizer = TfidfVectorizer(stop_words='english')
# X = vectorizer.fit_transform(course_df['name'])

In [8]:
true_k = 30

# usig SVD for LSA
# svd = TruncatedSVD(true_k)
# lsa = make_pipeline(svd, Normalizer(copy=False))
# X = lsa.fit_transform(X)
df = df.drop(columns=["name"])

model = KMeans(n_clusters=true_k, init='k-means++', max_iter=500, n_init=15)
model.fit(df)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=500,
       n_clusters=30, n_init=15, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [9]:
# Save machine learning model
filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))

# Recommend

In [10]:
def cluster_predict(row):
    prediction = model.predict(row)
    return prediction


def recommend(row):
    prediction_inp = cluster_predict(row)
    prediction_inp = int(prediction_inp)
    
    temp_df = df.loc[df['cluster_prediction'] == prediction_inp]
    temp_df = temp_df.sample(10)
    
    return list(temp_df['id'])

In [11]:
with open('finalized_model.sav', 'rb') as fid:
    model = pickle.load(fid)

In [12]:
df = pd.read_csv(dataset_test_path)
df = df.drop(columns=["Unnamed: 0", "MBID", "name"])
df = df.fillna(-1)

df['cluster_prediction'] = ""

In [13]:
df.head()

Unnamed: 0,id,duration,playcount,artists,albums,tags,cluster_prediction
0,492764,93000,163,58780,-1.0,47980.0,
1,492765,-1,9,58780,-1.0,-1.0,
2,492766,52000,9,58780,-1.0,230376.0,
3,492767,399000,9,58780,-1.0,47980.0,
4,492768,102000,9,58780,-1.0,47980.0,


In [14]:
# Cluster category for each live course
df['cluster_prediction'] = cluster_predict(df.drop(columns=['cluster_prediction', 'id']))

In [15]:
df.head(10)

Unnamed: 0,id,duration,playcount,artists,albums,tags,cluster_prediction
0,492764,93000,163,58780,-1.0,47980.0,19
1,492765,-1,9,58780,-1.0,-1.0,0
2,492766,52000,9,58780,-1.0,230376.0,24
3,492767,399000,9,58780,-1.0,47980.0,10
4,492768,102000,9,58780,-1.0,47980.0,19
5,492769,-1,3700,58780,-1.0,-1.0,0
6,492770,-1,2,58780,-1.0,-1.0,0
7,492771,41000,2,58780,-1.0,229779.0,24
8,492772,-1,66,58780,-1.0,-1.0,0
9,492773,315000,66,58780,-1.0,47980.0,28


In [16]:
data = [[[-1, 123, 15, -1, -1]], [[3711000,2,1732,-1.0,11702.0]]]

for song in data:
    recommendations = recommend(song)
    print(recommendations)

[533087, 665529, 665321, 665159, 665412, 847539, 831320, 664782, 498774, 664905]
[635791, 643034, 793899, 536774, 954545, 793925, 821857, 793903, 793916, 793905]


In [17]:
df.groupby('cluster_prediction')['id'].nunique()

cluster_prediction
0     321705
1          1
3          1
5      20112
7        185
8          3
9         32
10      6745
11     21321
12         2
14        64
16     17738
17      3028
18      2670
19     14220
21      4433
23        89
24      9400
25     19267
26     23606
27     11641
28     16005
29       758
Name: id, dtype: int64

In [18]:
df[df['cluster_prediction'] == 28].head(10)

Unnamed: 0,id,duration,playcount,artists,albums,tags,cluster_prediction
9,492773,315000,66,58780,-1.0,47980.0,28
488,493247,328000,6211,58816,30406.0,73582.0,28
562,493320,286000,2821,58816,30402.0,70618.0,28
583,493341,307000,261,58816,30401.0,34720.0,28
595,493353,342000,2621,58816,-1.0,117167.0,28
623,493380,343000,14499,58816,30401.0,12436.0,28
712,493464,294000,10,58816,-1.0,34720.0,28
719,493471,347000,1122,58816,30409.0,12039.0,28
745,493320,286000,3689,58816,30402.0,70618.0,28
806,493554,380000,535,58816,30401.0,12436.0,28


In [21]:
metrics_df = df.drop(columns=['cluster_prediction', 'id'])
labels = df['cluster_prediction'].tolist()
metrics = {}

metrics['calinski_harabasz'] = sklearn.metrics.calinski_harabasz_score(metrics_df, labels)  # more is better
metrics['davies_bouldin'] = sklearn.metrics.davies_bouldin_score(metrics_df, labels)  # less is better

In [25]:
# this cell takes a long time to compute (above 10 minutes) so it's skipped
# metrics['silhouette'] = sklearn.metrics.silhouette_score(metrics_df, labels, metric='euclidean')  # -1 (worst) to +1 (best)

In [26]:
metrics

{'calinski_harabasz': 3293217.8436075626, 'davies_bouldin': 0.8200523919592189}