## Using surprise 
- Read data
- Build basic user or item based models
- Grid Search 
- Top N users or items

In [1]:
import os
import pandas as pd
import surprise

data_dir="E:\\Work\\Machine Learning Course\\Python\\Module 7 Reccomendation Engines\\Data"
os.chdir(data_dir)

In [2]:
## To read a file using surprise, one needs to make sure that data is in a specific format, there are two common ways to read a dataset, to be used in the library
# Reading from dataframe
# Reading from a text file

# Reading data from a dataframe
df=pd.read_csv("sample_data.csv")
df.head()

Unnamed: 0,user,rating,item
0,1,2,1
1,2,2,1
2,3,3,2
3,4,3,2
4,5,1,1


In [17]:
# Surprise expects data to have three columns, user, rating and item. The spelling of these columns names should be as given. If your dataframe has other columns or column names are differen, remember to change them before trying to reading them in surprise

# We will need to create a reader object before we can load our dataframe into surprise 
reader=surprise.dataset.Reader(line_format='user rating item',rating_scale=(1,5))

In [11]:
data=surprise.dataset.Dataset.load_from_df(df,reader=reader)

In [12]:
data.raw_ratings

[(1, 2, 1.0, None),
 (2, 2, 1.0, None),
 (3, 3, 2.0, None),
 (4, 3, 2.0, None),
 (5, 1, 1.0, None)]

In [13]:
# We can load the dataset from a text file as well, directly, just make sure the text file 
# has three columns named as user, rating and item
reader=surprise.dataset.Reader(line_format='user rating item',sep=",", 
                               rating_scale=(1,5),skip_lines=1)

In [14]:
data1=surprise.dataset.Dataset.load_from_file("sample_data.csv",reader=reader)

In [15]:
data1.raw_ratings

[('1', '1', 2.0, None),
 ('2', '1', 2.0, None),
 ('3', '2', 3.0, None),
 ('4', '2', 3.0, None),
 ('5', '1', 1.0, None)]

In [16]:
## Let's now, work with a slightly larger dataset and train memory based collaborative filtering models
data_dir="E:\Work\Machine Learning Course\Python\Module 7 Reccomendation Engines\Data\ml-latest-small"
os.chdir(data_dir)

In [18]:
mr=pd.read_csv("ratings.csv")
mr.head()
mr.drop('timestamp',axis=1,inplace=True)
mr.rename(columns={'userId':'user','movieId':'item','rating':'rating'},inplace=True)

In [19]:
# user, item, rating on scale of 1 to 5
reader=surprise.dataset.Reader(line_format='user item rating', rating_scale=(1,5))

In [20]:
mr_train=surprise.dataset.Dataset.load_from_df(mr,reader=reader)
mr_trainset=mr_train.build_full_trainset()

In [21]:
## Create a neighbourhood based user and item based collaborative filtering model
import surprise.prediction_algorithms.knns as knns
knnbasic=knns.KNNBasic(k=40,min_k=1,sims_options={'name':'cosine','user_based':True})

In [22]:
knnbasic.train(mr_trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [23]:
mr.head()

Unnamed: 0,user,item,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [24]:
knnbasic.predict(uid=1,iid=31,r_ui=2.5)

Prediction(uid=1, iid=31, r_ui=2.5, est=2.9185383364430808, details={'actual_k': 40, 'was_impossible': False})

In [25]:
## Lets build an item based collaborative filter
knnbasic=knns.KNNBasic(k=40,min_k=1,sims_options={'name':'cosine','user_based':False})

In [26]:
knnbasic.train(mr_trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [27]:
knnbasic.predict(uid=1,iid=31)

Prediction(uid=1, iid=31, r_ui=None, est=2.9185383364430808, details={'actual_k': 40, 'was_impossible': False})

In [28]:
## Collaborative filter with average effects
knnbasic=knns.KNNWithMeans(k=40,min_k=1,sims_options={'name':'pearson','user_based':False})
knnbasic.train(mr_trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [29]:
knnbasic.predict(uid=1,iid=31)

Prediction(uid=1, iid=31, r_ui=None, est=2.0926559856597908, details={'actual_k': 40, 'was_impossible': False})

In [30]:
## Instead of  using just one train set, we can split the data into parts
# and then evaluate the model performance out of sample
mr_train.split(n_folds=3)
surprise.evaluate(knns.KNNBasic(k=40,sims_options={'name':'cosine','user_based':False}),mr_train)

Evaluating RMSE, MAE of algorithm KNNBasic.

------------
Fold 1
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9733
MAE:  0.7487
------------
Fold 2
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9781
MAE:  0.7547
------------
Fold 3
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9877
MAE:  0.7581
------------
------------
Mean RMSE: 0.9797
Mean MAE : 0.7538
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'mae': [0.74872093689765096,
                             0.75465788233213871,
                             0.75810036421493732],
                            'rmse': [0.97327304641188805,
                             0.9781246039613456,
                             0.98767221181366405]})

In [31]:
## Build a collaborative filter model with average effects
surprise.evaluate(knns.KNNWithMeans(k=40,sims_options={'name':'cosine','user_based':False}),mr_train)

Evaluating RMSE, MAE of algorithm KNNWithMeans.

------------
Fold 1
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9238
MAE:  0.7072
------------
Fold 2
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9271
MAE:  0.7107
------------
Fold 3
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9331
MAE:  0.7117
------------
------------
Mean RMSE: 0.9280
Mean MAE : 0.7098
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'mae': [0.70718960177945001,
                             0.71067270304483987,
                             0.71166737941406921],
                            'rmse': [0.92383487450363266,
                             0.92711798698085512,
                             0.93307176562199878]})

In [32]:
## Doing Grid Search
param_grid = {'k': [10, 20],
              'sim_options': {'name': ['msd', 'cosine'],
                              'user_based': [False]}
              }

In [33]:
algo=knns.KNNWithMeans

In [34]:
grid_search = surprise.GridSearch(algo,param_grid=param_grid, measures=['RMSE', 'MAE'])

[{'k': 10, 'sim_options': {'name': 'msd', 'user_based': False}}, {'k': 10, 'sim_options': {'name': 'cosine', 'user_based': False}}, {'k': 20, 'sim_options': {'name': 'msd', 'user_based': False}}, {'k': 20, 'sim_options': {'name': 'cosine', 'user_based': False}}]


In [35]:
grid_search.evaluate(mr_train)

------------
Parameters combination 1 of 4
params:  {'k': 10, 'sim_options': {'name': 'msd', 'user_based': False}}
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
------------
Mean RMSE: 0.9496
Mean MAE : 0.7294
------------
------------
Parameters combination 2 of 4
params:  {'k': 10, 'sim_options': {'name': 'cosine', 'user_based': False}}
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
------------
Mean RMSE: 0.9618
Mean MAE : 0.7382
------------
------------
Parameters combination 3 of 4
params:  {'k': 20, 'sim_options': {'name': 'msd', 'user_based': False}}
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
------------
Mean RMSE: 0.9323
Mean MAE : 0.7148
------------
------------
Parameters combination 4 of 4
params:  {'k': 20, 'sim_options': {'name': 'cosine', 'user_based': False}}
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
----------

In [36]:
print(grid_search.best_params['RMSE'])
print(grid_search.best_params['MAE'])

{'k': 20, 'sim_options': {'name': 'msd', 'user_based': False}}
{'k': 20, 'sim_options': {'name': 'msd', 'user_based': False}}


In [37]:
print(grid_search.best_score['RMSE'])
print(grid_search.best_score['MAE'])

0.932325352684
0.714826941331


In [38]:
## Top 5 recommendations for an item
model=knns.KNNWithMeans(k=20,sim_options={'name': 'msd', 'user_based': False})
model.train(mr_trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [39]:
mr.head()

Unnamed: 0,user,item,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [40]:
mr_trainset.to_inner_iid(1061)

2

In [41]:
model.get_neighbors(mr_trainset.to_inner_iid(1029),5)

[133, 136, 252, 357, 503]

In [48]:
for i in [133, 136, 252, 357, 503]:
   print(mr_trainset.to_raw_iid(i))

27369
50068
2102
4718
26


In [42]:
## Top 5 recommendations for a user
model=knns.KNNWithMeans(k=20,sim_options={'name': 'msd', 'user_based': True})
model.train(mr_trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [43]:
mr.head()

Unnamed: 0,user,item,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [44]:
mr_trainset.to_inner_uid(1)
model.get_neighbors(mr_trainset.to_inner_uid(1),5)

[8, 25, 32, 37, 67]

In [50]:
for i in [8, 25, 32, 37, 67]:
    print(mr_trainset.to_raw_uid(i))

9
26
33
38
68
