In [1]:
import sklearn
from surprise import Dataset
from surprise import evaluate
from surprise import KNNBasic
from surprise import Reader
import pandas as pd
import numpy as np
#from scipy.sparse import csr_matrix

In [2]:
triplet_data = pd.read_table('/Users/christinacampbell/downloads/10000.txt',header = None)
triplet_data.columns= ['user_id', 'song_id', 'listen_count']


In [3]:
popularity_threshold = 26
triplet_data = triplet_data.query('listen_count >= @popularity_threshold')
triplet_data.shape


(20509, 3)

In [4]:
#sort data by users
new_data=triplet_data.sort_values(by=['user_id']).reset_index(drop=True)

In [5]:
#shows the range of listen counts
print(new_data['listen_count'].describe())

count    20509.000000
mean        45.663026
std         37.053883
min         26.000000
25%         30.000000
50%         36.000000
75%         49.000000
max       2213.000000
Name: listen_count, dtype: float64


In [6]:
#check to see if you can limit the range of ratings
len(new_data.loc[new_data['listen_count']<=130])
#130 would allow for me to turn the ratings into 1-5 scale = easier to compare and will make the data more clustered

20066

In [7]:
#limit the data to only ratings less than 130
new_data = new_data[new_data.listen_count <=130]
#divide listen counts by 26 to get it into a 1-5 rating
new_data['listen_count'] = new_data['listen_count'].apply(lambda x: x/26)

In [8]:
#find new max and min
print(new_data['listen_count'].describe())

count    20066.000000
mean         1.615479
std          0.707456
min          1.000000
25%          1.115385
50%          1.346154
75%          1.846154
max          5.000000
Name: listen_count, dtype: float64


In [9]:
#drop users that only have 1 appearance
new_data['is_duplicate']=  new_data.duplicated(['user_id'], keep =False).values
new_data = new_data.query('is_duplicate == True')
new_data=new_data.drop(['is_duplicate'], axis=1).reset_index(drop=True)


In [10]:
#drop songs that only have 1 rating
new_data['song_dups'] = new_data.duplicated(['song_id'], keep =False).values
new_data= new_data.query('song_dups== True')
new_data = new_data.drop(['song_dups'], axis = 1).reset_index(drop=True)
new_data.shape

(12063, 3)

In [11]:
#limit data to have only songs that are listened to by more than 7 users
#helps increase the number of predictions that can be made because tightens the clusters
#doing this for users leads to worse performance because it limits the amount of comparable data
new_data = new_data.groupby('song_id').filter(lambda x : (x['song_id'].count()>=7).any())

In [12]:
new_data.shape

(6238, 3)

In [13]:
#remove 1 rating for each user in order to use it to evaluate the accuracy of my predictions
#I now have a rating to compare the given prediction with for a selection of the data
new_data['find_dup']= new_data.duplicated(['user_id'], keep='first').values
new_data['find_nondup'] = new_data.duplicated(['user_id'], keep=False).values
testing_ratings = new_data[(new_data.find_dup==False) & (new_data.find_nondup==True)]
new_data =new_data[(new_data.find_dup==True) | (new_data.find_nondup==False)]


In [14]:
len(testing_ratings)

1792

In [15]:
#remove the unnessecary columns and reset the index
new_data = new_data.drop(['find_dup','find_nondup'],axis=1).reset_index(drop=True)


In [16]:
#load the data into the surprise dataset
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(new_data[['user_id', 'song_id', 'listen_count']],reader = reader)
#create the train set
trainset = data.build_full_trainset()

In [17]:
#initialize the model
sim_options = {
    'name': 'cosine',
    'user_based': False
}
 
knn = KNNBasic(sim_options=sim_options)

In [18]:
#fit the trainset using the knn model
knn.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x1205d1cf8>

In [19]:
# create a list of predictions for each of the rows in the testing set I collected above. 
#This will be used to allow me to accurately see how successful the model is, because I have the given ratings by these users
from collections import defaultdict
predictions=[]

test_rates = testing_ratings.values
for i in range(len(test_rates)-1):
    predictions.append(knn.predict(test_rates[i][0],test_rates[i][1], test_rates[i][2]))
    

In [20]:
import math
#calculates average difference between the actual rating and the predicted rating
diff=[]
exact=[]
len(predictions)
for i in range(len(predictions)-1):
    #val= (predictions[i][2] - predictions[i][3])/2213
    val = abs(predictions[i][2] - predictions[i][3])
    diff.append(val)
    
avg_difference=sum(diff)/len(diff)
avg_difference

0.5566769704159487

In [23]:
exact=[]
#calculates the percent of possible predictions where the difference was less than the average difference
for i in range(len(predictions)-1):
    val= (predictions[i][2] - predictions[i][3])
    #val = (abs(Eval[i][2] - Eval[i][3]))
    if val<avg_difference:
        exact.append(val)
len(exact)/len(predictions)*100

82.41206030150754

In [24]:
#rmse for all predictions (including those that were impossible and therefore just given the mean score)
#interestingly, the rmse is better for all predictions, which shows assigning the mean rating proves to be successful
from surprise import accuracy
accuracy.rmse(predictions,verbose =True)

RMSE: 0.7896


0.7895896072791613

In [25]:
# a function to return the n top recommendation for each user based off of the predictions; 
#as of right now, it returns just the song id of each, but I plan to fix it to return the song name as well as the difference in the ratings
from collections import defaultdict
def top_predictions(prediction,n):
    top = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top[uid] = user_ratings[:n]

    return top

In [26]:
##originally was 54 million tests, had to find a way to limit the number of songs without ratings
#build_anti_testset(): makes a testset of all of the songs that arent rated by a user
#AKA every song that a user hasn't listened to will be added to the test set for it to calculate a prediction for it
testset = trainset.build_anti_testset()
    

In [27]:
len(testset)

1145978

In [28]:
#calculates all of the predictions
all_predictions = knn.test(testset)

In [29]:
#creates a list of all of the top predictions for each user
top = top_predictions(all_predictions,5)

In [30]:
#returns a list of all of the users and their predictions;
# I will makes this prettier!
for uid, user_ratings in top.items():
    print(uid, [iid for (iid, _) in user_ratings])

0031572620fa7f18487d3ea22935eb28410ecc4c ['SOLDJLX12A6D4F9A78']
0091e0326c4c034cc04be6454742912845740a1f ['SOQQAAQ12A67ADE34D']
00a443baf550f4bbdd974ba73720abf2759166f3 ['SOUNZHU12A8AE47481']
00bc2c7795aeec17772d5e3be3bb0bd23f1167b9 ['SOMHLHX12A58A7AA5B']
00d941401206a4b252886d059dc9cf37096a68ee ['SOAUWYT12A81C206F1']
00d94de9e6f82fa1527da8091d56d2cb3b8b30de ['SOTUARP12A8C13CB54']
010e23053b49bcdcf857fdd61b60884d53a67212 ['SOLCKBV12A8C13CEC1']
014175eede622c7e05150727c768c64036636f8a ['SOOJJCT12A6310E1C0']
0181c36f49953188d993fcb87d690edc58b97f56 ['SOSFPMR12A8C13D44A']
019203f26b0adde655119bfc43a1e59ee8eeb582 ['SOTCMDJ12A6D4F8528']
019d0d1c7a01f8736ba59a124160e5fc70666db7 ['SOSJSSU12A6D4F8F41']
01ab42d3aefa8162b1a76b00c49f51a069832a21 ['SOVDSJC12A58A7A271']
01c2c1cfbe9b647b7b0bf9a1171a1e11e07c89c6 ['SOWEHOM12A6BD4E09E']
0200cb00bc2bd1710d3d4f48205c9bda7078e8ef ['SOEGIYH12A6D4FC0E3']
0226968d8e0f4cba57377df983148a8e15ab68b5 ['SOIOZHO12AB017FE5E']
028467fe4ace77bcab679c6107cb8a39201bcbb0

7c235f4d5fce28ad62bfd2ca926b0b59676199b7 ['SOLRGVL12A8C143BC3']
7c3d863b6a93d309df6fee84519c887ff2445746 ['SORJICW12A8C13640D']
7c45383e890d452854208ac9106294fc14bdd165 ['SOLTAEJ12A8C13F793']
7c67aa0272105ba014c1a56c64e8b7dc4c8f0fb5 ['SOQQAAQ12A67ADE34D']
7c81e7e7d4348e03a594734108a2889a825ac704 ['SODGVGW12AC9075A8D']
7c930b2aacb4400fd604a81d504cb6f067e3bd58 ['SOIZFTE12AB0186842']
7c99668ad426369d6fe186ad9d5a8dadb6765668 ['SOSXLTC12AF72A7F54']
7ca6de912614693ddc9cf5f9f7a96e5a96e94a4c ['SOCNAXF12A6D4F9B34']
7cbe8c723ab07a441487e002592459254f70730d ['SOODRHW12A6310D8FF']
7cd122ef955df23b2cf2b7f9a19c8a4963992453 ['SOXLOQG12AF72A2D55']
7cf1d117bbc2c47436e72222176aef57af5a9291 ['SOPEFDW12AF72A1709']
7cfe3fb5408cd82311e8a3ab5b9738f6f719a45a ['SOBONKR12A58A7A7E0']
7d14a1e38f5bf170c25923c37c2a15229c7158fa ['SOOFYTN12A6D4F9B35']
7d3d5e39927bd780b5c6ba7a23e5117d237b4d7f ['SOCPIVV12A6D4F7B4C']
7d57adfbfa3bc61f8a0979cbf47b412db02d0229 ['SOTVLHC12A6701EA31']
7d6396eefd333a03e6cdba91e1028e9d9a4e41fc

e9902857d9dde3ec866cc971efcb6d254a8a583e ['SOBXHDL12A81C204C0']
e9ad79eaeab3edd75dc9988584fb1db84a6eb08b ['SOYRTVQ12AB018BD14']
e9cdfc759555806576079e8bd86bc0a6eae31c0a ['SOKOXWU12AF72AD1BC']
ea3bbb58048c88a9be1140b76d94312bd7a25e27 ['SOSXLTC12AF72A7F54']
ea76a4f34792c3d593eaa95672269b3edfa26721 ['SOUVTSM12AC468F6A7']
ea8a07f6710296b0214c0c265e12cbad32bd2cfb ['SOPPROJ12AB0184E18']
ea940768f1b65e3f599f7638ead0546366f272ee ['SOFRQTD12A81C233C0']
eac0b7cace5be71a7ae59da79a588f6998992a58 ['SOUNZHU12A8AE47481']
eaf4aa2f91a228dbed5b20948b25b8bad0963144 ['SOAUWYT12A81C206F1']
eb09e940bf5532b312c49e702e62267ebaf1e3a9 ['SOSCIZP12AB0181D2F']
eb434675de09cb2d3e5488a941e2cd7c9a23d47b ['SOMIHYA12AB018DD37']
eb53fd919425cd78761695bb1e845bddeb2f774f ['SOZCDWG12A6D4F81E1']
eb671bc38ee5d5d13b815fef98f1e9de44580c45 ['SOEGIYH12A6D4FC0E3']
ebaad5e02278ff8cce9617e1f171cfbf2ddaef1b ['SOIOESO12A6D4F621D']
ebb45f48a9a8beb435915af1de4f25da75bac81d ['SOIRUXQ12A8C133060']
ebe8d1d06db78a35c6693f75fec5af081372e5ff

In [31]:
#rmse of all predictions, including the impossible predictions

accuracy.rmse(all_predictions,verbose = True)

RMSE: 0.2316


0.23156620029398656

In [None]:
#notice that the accuracy is much better with more data and predictions