### This code is implmenting KNN Basic algorithm with the help scikit-surprise package. This algorithm is implemented on the basis of User-User collaborative filtering. 

In [1]:
import pandas as pd
#To build a recomendation model,using machine learning 
movies_df = pd.read_csv('movies.csv')
movies_df = movies_df.head(200000)
##Collabrative Filtering now 
ratings_df=pd.read_csv('ratings.csv')
ratings_df = ratings_df.head(200000)
ratings_df.drop('timestamp',axis=1)

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5
...,...,...,...
199995,1371,1261,5.0
199996,1371,1274,4.5
199997,1371,1291,5.0
199998,1371,1449,4.0


###### Merging the movies and ratings dataset 

In [2]:
df_combined = pd.merge(ratings_df, movies_df, on = 'movieId')

In [3]:
df_combined= df_combined.drop('timestamp',axis=1)

##### Creating training and testing dataset from combined dataset. Training dataset is 20 percent of the combined dataset

In [4]:
test_data_size = 40000  #20 percent
train_data = df_combined[:-test_data_size]
test_data = df_combined[-test_data_size:]

In [5]:

train_data= train_data[pd.notnull(train_data['rating'])]
train_data['userId'] = train_data ['userId'].astype(int)

###### Importing surprise package

In [6]:
from surprise import Reader, Dataset,KNNBasic

In [7]:
from surprise.model_selection import cross_validate

##### Creating the reader to load the dataset in the knn algorithm. Similarity option has been chosen as 'Pearson' but it can be changed to 'cosine'

In [8]:
reader = Reader()
# We can make knnbasic to implement cosine as well. just change 'pearson' to 'cosine'.
data_train = Dataset.load_from_df(train_data[['userId', 'movieId', 'rating']], reader)
trainset = data_train.build_full_trainset()
similarity_option = {'name': 'pearson'}
knn=KNNBasic(k=13, sim_options=similarity_option)
#algo.fit(train_data)

##### Training the knn algorithm on the trainset 

In [9]:
knn.fit(trainset)
testset = trainset.build_anti_testset()

predictions = knn.test(testset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


###### creating the seperate data for movie and title to help in predicting the movie for user

In [14]:
data_title = train_data[['movieId','title']]

In [15]:
data_title.set_index('movieId', inplace = True)

In [None]:
data_title

#### Making sure that we are getting the required result 

In [16]:
user = train_data[(train_data['userId'] == 220) & (train_data['rating'] == 4 )]
user = user.set_index('movieId')
user = user.drop('userId',axis =1)
user = user.drop('rating',axis =1)
user = user.drop('genres',axis =1)
user = user.merge(data_title,on = 'title',how='left')
print(user)

                             title
0      Seven (a.k.a. Se7en) (1995)
1      Seven (a.k.a. Se7en) (1995)
2      Seven (a.k.a. Se7en) (1995)
3      Seven (a.k.a. Se7en) (1995)
4      Seven (a.k.a. Se7en) (1995)
...                            ...
11558              Identity (2003)
11559              Identity (2003)
11560              Identity (2003)
11561              Identity (2003)
11562              Identity (2003)

[11563 rows x 1 columns]


##### creating a movie summary to get the mean value of rating with respect to movie id.

In [32]:
data_movie_summary = train_data.groupby('movieId')['rating'].agg(['count','mean']).drop_duplicates()
data_movie_summary

Unnamed: 0_level_0,count,mean
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,202,4.00495
2,128,3.085938
3,54,3.194444
5,51,2.990196
6,111,3.927928
7,53,3.424528
10,124,3.439516
11,91,3.664835
16,101,3.727723
17,74,3.702703


In [33]:
data_movie_summary.index = data_movie_summary.index.map(int)
movie_reference = round(data_movie_summary['count'].quantile(0.8),0)
print(movie_reference)

129.0


In [46]:
drop_movie_list = data_movie_summary[data_movie_summary['count'] < movie_reference].index
drop_movie_list

Int64Index([    2,     3,     5,     6,     7,    10,    11,    16,    17,
               19,
            ...
            49272, 51662, 54286, 55820, 56367, 58559, 59315, 60069, 68954,
            79132],
           dtype='int64', name='movieId', length=416)

#### Getting user summary by calculaing the mean rating of every user

In [35]:
data_cust_summary = train_data.groupby('userId')['rating'].agg(['count','mean'])
data_cust_summary

Unnamed: 0_level_0,count,mean
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,93,3.822581
3,114,4.350877
7,117,3.435897
11,166,4.394578
14,111,3.630631
24,238,3.634454
29,101,3.623762
31,54,3.425926
35,81,4.320988
53,67,3.895522


In [36]:
data_cust_summary.index = data_cust_summary.index.map(int)
cust_reference = round(data_cust_summary['count'].quantile(0.8),0)
print(cust_reference)
drop_cust_list = data_cust_summary[data_cust_summary['count'] < cust_reference].index

249.0


In [37]:
train_data = train_data[~train_data['movieId'].isin(drop_movie_list)]
train_data = train_data[~train_data['userId'].isin(drop_cust_list)]

### Main prediction method

In [56]:
def prediction( userid ):
    
  user = data_title.copy()
  user = user.reset_index()
  user = user[~user['movieId'].isin(drop_movie_list)]
  user['Estimate_Score'] = user['movieId'].apply(lambda x: knn.predict(userid, x).est)
  user = user.drop('movieId', axis = 1)
  user= user.sort_values('Estimate_Score', ascending=False)
  x=print(user.head(1))
  return

In [61]:
#Input any userId
prediction(116)

                                title  Estimate_Score
96752  CJ7 (Cheung Gong 7 hou) (2008)             5.0


### Can check the results here by putting the userid and movie name in.The result should be empty

In [63]:
pd.set_option('display.max_rows', None)
train_data[(train_data['userId'] == 116) & (train_data['title'] == 'CJ7 (Cheung Gong 7 hou) (2008)') ]

Unnamed: 0,userId,movieId,rating,title,genres


In [45]:
train_data

Unnamed: 0,userId,movieId,rating,title,genres
324,54,32,5.0,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
325,58,32,5.0,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
330,91,32,4.0,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
340,116,32,4.0,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
351,147,32,3.5,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
354,156,32,5.0,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
369,208,32,3.0,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
385,247,32,3.0,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
399,294,32,4.0,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
401,298,32,3.0,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller


### To get the RMSD value of the KNN algorithm with the chosen similarity layer

In [None]:
#Evaluating the KNNBasic model on test data
#Computing the RMSD for the chosen similarity 
result = cross_validate(knn, data_test
                        , measures=['RMSE'], cv=5, verbose=True)
tmp = pd.DataFrame.from_dict(result).mean(axis=0)
tmp.head()

### To get the recall value of the KNN algorithm with the chosen similarity layer

In [None]:
from collections import defaultdict
def recall_at_k(predictions, k, threshold=3.5):
   # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])
        # Recall@K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return recalls

In [None]:
from surprise.model_selection import KFold
data = Dataset.load_from_df(test_data[['userId', 'movieId', 'rating']], reader = reader )
kf = KFold(n_splits=5)

for trainset, testset in kf.split(data):
    predictions = knn.test(testset)
    recalls = recall_at_k(predictions, k=5, threshold=2)

    # Precision and recall can then be averaged over all users
    print(sum(rec for rec in recalls.values()) / len(recalls))