In [49]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import sklearn
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
from collections import defaultdict

artist_list = [i.strip().split('\t') for i in open('out.txt', 'r', encoding="utf8").readlines()]
artist_df = pd.DataFrame(artist_list, columns = ['ArtistID', 'ArtistName'])
artist_df['ArtistID'] = artist_df['ArtistID'].apply(pd.to_numeric)

#print(artist_df.head())

rating = pd.read_csv('finalUserData.csv', sep=',', header=None, dtype=int)
rating.columns = ['UserID', 'ArtistID', 'Rating']
#print(rating.head())

rat = pd.merge(rating, artist_df, on='ArtistID')
columns = ['ArtistID', 'ArtistName', 'UserID', 'Rating']
rat.head()

rat = rat.dropna(axis=0, subset=['UserID'])
user_ratingCount = (rat.groupby(by = ['UserID'])['Rating'].
                     count().
                     reset_index().
                     rename(columns = {'Rating': 'totalUserRatingCount'})
                     [['UserID', 'totalUserRatingCount']]
                     )
user_ratingCount.head()

Unnamed: 0,UserID,totalUserRatingCount
0,1,522
1,2,501
2,3,689
3,4,557
4,5,710


In [50]:
rating_with_totalRatCount = rat.merge(user_ratingCount, left_on='UserID', right_on='UserID')
rating_with_totalRatCount.head()

Unnamed: 0,UserID,ArtistID,Rating,ArtistName,totalUserRatingCount
0,1,1000026,90,112,522
1,1,1000084,0,The 2 Live Crew,522
2,1,1000125,100,2Pac,522
3,1,1000152,0,3rd Bass,522
4,1,1000202,0,702,522


In [51]:
pd.set_option('display.float_format', lambda x: '%.3f' %x)
print(user_ratingCount['totalUserRatingCount'].describe())

count    200.000
mean    1147.085
std     1140.344
min      501.000
25%      594.250
50%      807.500
75%     1230.750
max     8808.000
Name: totalUserRatingCount, dtype: float64


In [52]:
print(user_ratingCount['totalUserRatingCount'].quantile(np.arange(0.9,1,0.01))),

0.900   1793.900
0.910   1905.880
0.920   2017.240
0.930   2073.310
0.940   2149.160
0.950   2366.700
0.960   2851.080
0.970   3622.620
0.980   5262.560
0.990   7627.620
Name: totalUserRatingCount, dtype: float64


(None,)

In [53]:
pop_threshold = 2000
rating_popular_user = rating_with_totalRatCount.query('totalUserRatingCount >= @pop_threshold')
rating_popular_user.head()

Unnamed: 0,UserID,ArtistID,Rating,ArtistName,totalUserRatingCount
7737,18,1000026,60,112,7623
7738,18,1000084,60,The 2 Live Crew,7623
7739,18,1000125,50,2Pac,7623
7740,18,1000202,60,702,7623
7741,18,1000247,60,A Tribe Called Quest,7623


In [54]:
list(rating_popular_user)
rating_pivot = rating_popular_user.pivot_table(index = 'UserID', columns='ArtistName', values='Rating').fillna(0)
rating_pivot.head()
rating_matrix = csr_matrix(rating_pivot.values)

In [55]:
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(rating_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [56]:
list_query_index  = []
for i in range (0, rating_pivot.shape[0]):
    list_query_index.append(i)

dict_analysis = dict()
for j in list_query_index:

    query_index = j #np.random.choice(rating_pivot.shape[0])
    
    distances, indices = model_knn.kneighbors(rating_pivot.iloc[query_index,:].values.reshape(1,-1), n_neighbors=4)

    for i in range (0, len(distances.flatten())):
        if i ==0:
            print()
            print('Similar user Recommendations for {0}:\n'.format(rating_pivot.index[query_index]))
            dict_analysis[rating_pivot.index[query_index]]=[]
        else:
            print('{0}: {1}, with distance of {2}:'.format(i, rating_pivot.index[indices.flatten()[i]], distances.flatten()[i]))
            dict_analysis[rating_pivot.index[query_index]].append(rating_pivot.index[indices.flatten()[i]])


Similar user Recommendations for 11:

1: 36, with distance of 0.5363391160393711:
2: 28, with distance of 0.5566433855826668:
3: 60, with distance of 0.5938401431120763:

Similar user Recommendations for 13:

1: 197, with distance of 0.6764928448179708:
2: 60, with distance of 0.7281161325135216:
3: 11, with distance of 0.750287803259059:

Similar user Recommendations for 18:

1: 168, with distance of 0.49701730199892047:
2: 60, with distance of 0.5093601940189851:
3: 28, with distance of 0.5226978791207193:

Similar user Recommendations for 28:

1: 60, with distance of 0.5009784616893814:
2: 18, with distance of 0.5226978791207193:
3: 11, with distance of 0.5566433855826668:

Similar user Recommendations for 36:

1: 11, with distance of 0.5363391160393711:
2: 28, with distance of 0.585399213776681:
3: 60, with distance of 0.6074408045784383:

Similar user Recommendations for 38:

1: 11, with distance of 0.7476118333698127:
2: 36, with distance of 0.7634263418049473:
3: 197, with dist

In [57]:
rating_pivot2 = rating_popular_user.pivot_table(index = 'ArtistName', columns='UserID', values='Rating').fillna(0)
rating_pivot2.head()

UserID,11,13,18,28,36,38,60,71,111,113,133,148,150,168,173,175,197
ArtistName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
!!!,0.0,0.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,70.0,50.0,0.0,0.0
'68 Comeback,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0
'Til Tuesday,50.0,30.0,50.0,80.0,20.0,0.0,0.0,10.0,0.0,0.0,0.0,100.0,0.0,100.0,30.0,0.0,0.0
(Hed) P.E.,0.0,0.0,60.0,0.0,0.0,0.0,90.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0
*NSYNC,50.0,0.0,60.0,60.0,20.0,0.0,0.0,50.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0


In [58]:
#print(dict_analysis)
numTotReco=0
accur=0
total_reco = len(dict_analysis.keys())
#print(total_reco)
for j in dict_analysis.keys():
    for k in dict_analysis[j]:
        if j in dict_analysis[k]:
            accur += 1
print("Accuracy of the kNN algorithm is: ", (accur/(2*total_reco))*100, "%")

Accuracy of the kNN algorithm is:  58.82352941176471 %


In [59]:
rating_pivot2.shape

(14991, 17)

In [60]:
X = rating_pivot2.values.T
X.shape

(17, 14991)

In [61]:
import sklearn
from sklearn.decomposition import TruncatedSVD

SVD = TruncatedSVD(n_components=5, random_state=13)
matrix = SVD.fit_transform(X)
matrix.shape

(17, 5)

In [62]:
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)
corr = np.corrcoef(matrix)
corr.shape

(17, 17)

In [63]:
user_ids = rating_pivot2.columns
user_ids_list = list(user_ids)
print(user_ids_list)
dict_analysis = defaultdict(list)
users = [11, 13, 18, 28, 36, 38, 60, 71, 111, 113, 133, 148, 150, 168, 173, 175, 197]

for i in users:
    us = user_ids_list.index(i)
    corr_user = corr[us]
    print("Similar users for ", i, "are: ", list(user_ids[(corr_user<1.0) & (corr_user>0.9)]))
    dict_analysis[i]=list(user_ids[(corr_user<1.0) & (corr_user>0.9)])

[11, 13, 18, 28, 36, 38, 60, 71, 111, 113, 133, 148, 150, 168, 173, 175, 197]
Similar users for  11 are:  [13, 28, 36, 38, 60, 71, 111, 148, 173, 175, 197]
Similar users for  13 are:  [11, 28, 36, 38, 60, 71, 111, 148, 173, 175, 197]
Similar users for  18 are:  [113]
Similar users for  28 are:  [11, 13, 36, 38, 60, 71, 111, 148, 173, 175, 197]
Similar users for  36 are:  [11, 13, 28, 38, 60, 71, 111, 148, 173, 175, 197]
Similar users for  38 are:  [11, 13, 28, 36, 60, 71, 111, 148, 173, 175, 197]
Similar users for  60 are:  [11, 13, 28, 36, 38, 60, 71, 111, 148, 173, 175, 197]
Similar users for  71 are:  [11, 13, 28, 36, 38, 60, 111, 148, 173, 175, 197]
Similar users for  111 are:  [11, 13, 28, 36, 38, 60, 71, 148, 173, 175, 197]
Similar users for  113 are:  [18, 113]
Similar users for  133 are:  []
Similar users for  148 are:  [11, 13, 28, 36, 38, 60, 71, 111, 173, 175, 197]
Similar users for  150 are:  [150]
Similar users for  168 are:  []
Similar users for  173 are:  [11, 13, 28, 36

In [64]:
#print(dict_analysis)
numTotReco=0
accur=0
total_reco = len(dict_analysis.keys())
#print(total_reco)
for j in dict_analysis.keys():
    if len(dict_analysis[j]) >= 2:
        accur+=1           
print("Accuracy of the SVD algorithm is: ", (accur/total_reco)*100, "%")

Accuracy of the SVD algorithm is:  76.47058823529412 %


In [None]:
#https://towardsdatascience.com/how-did-we-build-book-recommender-systems-in-an-hour-part-2-k-nearest-neighbors-and-matrix-c04b3c2ef55c