# KNN and SVD based Movie Recommendation System

In [50]:
import pandas as pd
import numpy as np
import os
import regex as re

#! pip install surprise
from surprise import Reader
from surprise import KNNWithMeans, get_dataset_dir

In [51]:
from surprise import Dataset

In [52]:
df = pd.read_csv('data1.csv',names = ['timestamp','userID','data_str'])
df.shape

(303671, 3)

In [53]:
df.columns

Index(['timestamp', 'userID', 'data_str'], dtype='object')

In [54]:
df1 = df.loc[ df['data_str'].str.contains('=')]
df1.reset_index(inplace=True)

In [55]:
df1['rating'] = df1['data_str'].apply(lambda x : x.split('=')[-1] ) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['rating'] = df1['data_str'].apply(lambda x : x.split('=')[-1] )


In [56]:
df1['movieID'] = df1['data_str'].apply(lambda x : re.findall( 'rate/' + "(.*)" + '=', x)[0] ) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['movieID'] = df1['data_str'].apply(lambda x : re.findall( 'rate/' + "(.*)" + '=', x)[0] )


In [57]:
df1.head()

Unnamed: 0,index,timestamp,userID,data_str,rating,movieID
0,83,2023-02-08T02:22:51,684782,GET /rate/big+hero+6+2014=4,4,big+hero+6+2014
1,187,2023-02-08T02:22:51,242281,GET /rate/the+exploding+girl+2009=4,4,the+exploding+girl+2009
2,255,2023-02-08T02:22:51,708816,GET /rate/jodhaa+akbar+2008=4,4,jodhaa+akbar+2008
3,280,2023-02-08T02:22:52,165725,GET /rate/the+sweet+hereafter+1997=5,5,the+sweet+hereafter+1997
4,1201,2023-02-08T02:22:53,327968,GET /rate/guardians+of+the+galaxy+2014=3,3,guardians+of+the+galaxy+2014


In [58]:
reader = Reader()

In [59]:
data = Dataset.load_from_df(df1[["userID", "movieID", "rating"]], reader)

In [60]:
from surprise import SVD
from surprise.model_selection import cross_validate
import time

In [61]:
# To use user-based cosine similarity
sim_options = {
    "name": "cosine",
    "user_based": True,
}

algo1 = KNNWithMeans(sim_options=sim_options)
algo2 = SVD()


In [62]:
start_time=time.time()
cross_validate(algo = algo1, data = data, measures=['RMSE','MAE'], return_train_measures=True)
end_time=time.time()-start_time
print(end_time)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
0.46486616134643555


In [63]:
start_time=time.time()
cross_validate(algo2, data, measures=['RMSE','MAE'], cv=5, verbose=True)
end_time=time.time()-start_time
print(end_time)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7482  0.7055  0.7649  0.7270  0.7316  0.7354  0.0201  
MAE (testset)     0.6200  0.5877  0.6234  0.5987  0.6030  0.6066  0.0134  
Fit time          0.03    0.03    0.03    0.04    0.03    0.03    0.00    
Test time         0.10    0.00    0.00    0.00    0.00    0.02    0.04    
0.30176806449890137


In [64]:
#print(type(algo1))
#print(type(algo2))
from surprise import accuracy
# from suprise import evaluate

In [65]:
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=0.25)
start_time_one=time.time()
algo1.fit(trainset)
end_time_one=time.time()-start_time
start_time_two=time.time()
algo2.fit(trainset)
end_time_two=time.time()-start_time
print(start_time_one)
print(start_time_two)

Computing the cosine similarity matrix...
Done computing similarity matrix.
1676858561.137407
1676858561.189383


In [66]:
start_time_one=time.time()
predictions1 = algo1.test(testset)
end_time_one=time.time()-start_time
start_time_two=time.time()
predictions2 = algo2.test(testset)
end_time_two=time.time()-start_time
print(start_time_one)
print(start_time_two)

1676858561.233613
1676858561.2395673


In [67]:
algo2.predict(uid=708816, iid='jodhaa+akbar+2008')

Prediction(uid=708816, iid='jodhaa+akbar+2008', r_ui=None, est=3.7613558221008274, details={'was_impossible': False})

In [73]:
print(predictions1)
print(predictions2)

[Prediction(uid=797121, iid='all+tomorrows+parties+2009', r_ui=4.0, est=3.722112211221122, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}), Prediction(uid=869238, iid='the+wild+child+1970', r_ui=3.0, est=3.722112211221122, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}), Prediction(uid=291814, iid='the+terminator+1984', r_ui=4.0, est=3.722112211221122, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}), Prediction(uid=534170, iid='a+chinese+ghost+story+1987', r_ui=4.0, est=3.722112211221122, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}), Prediction(uid=518367, iid='exit+2006', r_ui=3.0, est=3.722112211221122, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}), Prediction(uid=607924, iid='about+last+night+2014', r_ui=3.0, est=3.722112211221122, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}), Prediction(uid=642296, iid='the+s

In [69]:
import pickle

In [70]:
pickle.dump(algo1, open('knnmodel.pkl','wb'))
pickle.dump(algo2, open('svdmodel.pkl','wb'))

In [71]:
#to get accuracy we need the test set
ac1=accuracy.rmse(predictions1)
ac2=accuracy.rmse(predictions2)
print(ac1)
print(ac2)

RMSE: 0.7441
RMSE: 0.7345
0.7441289189598318
0.734546637628123


In [72]:
# print predictions
nn = algo1.get_neighbors(algo1.trainset.to_inner_uid(708816), k=20)
nn_raw = []
for x in nn:
  print(x, algo1.trainset.to_raw_uid(x))
  nn_raw.append(algo1.trainset.to_raw_uid(x))
df1[df1['userID'].isin(nn_raw) ]

0 39709
1 642288
2 936404
3 547063
4 303414
5 430256
6 568924
7 667787
8 5840
9 252302
10 315721
11 367018
12 84612
13 904687
14 155381
15 292322
16 129638
17 695951
18 900301
19 240573


Unnamed: 0,index,timestamp,userID,data_str,rating,movieID
100,16817,2023-02-08T02:23:22,129638,GET /rate/my+man+godfrey+1957=4,4,my+man+godfrey+1957
242,37008,2023-02-08T02:23:59,84612,GET /rate/the+romantic+englishwoman+1975=4,4,the+romantic+englishwoman+1975
259,39436,2023-02-08T02:24:04,155381,GET /rate/playing+for+keeps+2012=3,3,playing+for+keeps+2012
303,44944,2023-02-08T02:24:14,240573,GET /rate/raajneeti+2010=4,4,raajneeti+2010
621,92903,2023-02-08T02:25:42,430256,GET /rate/toy+story+1995=4,4,toy+story+1995
682,101846,2023-02-08T02:25:59,292322,GET /rate/a+walk+to+remember+2002=4,4,a+walk+to+remember+2002
698,103175,2023-02-08T02:26:01,547063,GET /rate/for+the+birds+2000=4,4,for+the+birds+2000
716,106023,2023-02-08T02:26:07,367018,GET /rate/shark+tale+2004=5,5,shark+tale+2004
798,119179,2023-02-08T02:26:31,900301,GET /rate/au+revoir+les+enfants+1987=5,5,au+revoir+les+enfants+1987
926,139124,2023-02-08T02:27:08,252302,GET /rate/vanishing+point+1971=3,3,vanishing+point+1971
