# Amazon Recommender System
![amazon](../images/amazon.png)

This EDA will explore how a model is finalized for the analysis.

In [1]:
# import library
import pandas as pd

import warnings
warnings.filterwarnings(action='ignore')

In [3]:
df = pd.read_csv('http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Video_Games.csv', 
                 names=['item', 'user', 'rating', 'timestamp'])

In [56]:
df = df[['user', 'item', 'rating']]

In [60]:
df.rating = df.rating.astype('int32')

Datasets for X and y are separated. Then, train-test split has been done with test size = 0.33.

In [65]:
X = df[df.columns[:2]]
y = df[df.columns[-1]]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [66]:
from surprise import Reader, Dataset
# read in values as Surprise dataset 
reader = Reader()
# train data
# Loading the data again for the Surprise library
train = Dataset.load_from_df(pd.concat([X_train, y_train], axis = 1), reader)
# test data
# Loading the data again for the Surprise library
test = Dataset.load_from_df(pd.concat([X_test, y_test], axis = 1), reader)
# whole data for comparison
data = Dataset.load_from_df(df, reader)

In [70]:
dataset = data.build_full_trainset()
print('Number of users: ', dataset.n_users, '\n')
print('Number of items: ', dataset.n_items)

Number of users:  1540618 

Number of items:  71982


In [71]:
train_set = train.build_full_trainset()
print('Number of users: ', train_set.n_users, '\n')
print('Number of items: ', train_set.n_items)

Number of users:  1139782 

Number of items:  65479


In [72]:
test_set = test.build_full_trainset()
print('Number of users: ', test_set.n_users, '\n')
print('Number of items: ', test_set.n_items)

Number of users:  644499 

Number of items:  53651


In [73]:
# importing relevant libraries
import surprise
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD, NMF
from surprise.model_selection import GridSearchCV
import numpy as np

In [74]:
from surprise.prediction_algorithms import CoClustering, BaselineOnly, KNNBasic

In [75]:
svd = SVD(random_state = 0)
val_svd = cross_validate(svd, train, measures=['RMSE', 'MAE'], cv=3)

In [76]:
np.mean(val_svd['test_rmse'])

1.2980251758593095

In [37]:
nmf = NMF(random_state = 0)
val_nmf = cross_validate(nmf, train, measures=['RMSE', 'MAE'], cv=3)

In [38]:
np.mean(val_nmf['test_rmse'])

1.4331195775188146

In [41]:
knnb = KNNBasic()
val_knnb = cross_validate(knnb, train, measures=['RMSE', 'MAE'], cv=3)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


In [42]:
np.mean(val_knnb['test_rmse'])

1.390615066754609

In [44]:
cocl = CoClustering(random_state = 0)
val_cocl = cross_validate(cocl, train, measures=['RMSE', 'MAE'], cv=3)
np.mean(val_cocl['test_rmse'])

1.405233032282113

In [45]:
base = BaselineOnly()
val_base = cross_validate(base, train, measures=['RMSE', 'MAE'], cv=3)
np.mean(val_base['test_rmse'])

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


1.3046543554928207

### Summary of RMSE Values from Model Validation
- SVD - 1.298
- NMF - 1.433
- KNNBasic - 1.391
- CoClustering - 1.405
- BaselineOnly - 1.305

SVD is selected for the baseline model.

In [77]:
svd = SVD(random_state = 0).fit(train_set)

In [78]:
pred = []
for i in range(len(X_test)):
    pred.append(svd.predict(X_test.iloc[i].values[0], X_test.iloc[i].values[1])[3])

In [79]:
from sklearn.metrics import mean_squared_error
print(mean_squared_error(y_test, pred, squared=False))

1.282665736278365


- RMSE for SVD using test data is 1.283.

In [48]:
from sklearn import preprocessing
le_item = preprocessing.LabelEncoder()
item_trans = le_item.fit_transform(X.item)
X['item'] = pd.DataFrame(item_trans)

le_user = preprocessing.LabelEncoder()
user_trans = le_user.fit_transform(X.user)
X['user'] = pd.DataFrame(user_trans)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [49]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

lr = LinearRegression()
rfr = RandomForestRegressor(random_state=0)
abr = AdaBoostRegressor(random_state=0)
gbr = GradientBoostingRegressor(random_state=0)

In [50]:
lr.fit(X_train, y_train)
rfr.fit(X_train, y_train)
abr.fit(X_train, y_train)
gbr.fit(X_train, y_train)

rating_pred_lr = lr.predict(X_test)
rating_pred_rfr = rfr.predict(X_test)
rating_pred_abr = abr.predict(X_test)
rating_pred_gbr = gbr.predict(X_test)

print(mean_squared_error(y_test, rating_pred_lr, squared=False))
print(mean_squared_error(y_test, rating_pred_rfr, squared=False))
print(mean_squared_error(y_test, rating_pred_abr, squared=False))
print(mean_squared_error(y_test, rating_pred_gbr, squared=False))

1.4064298478133768
1.494283731720911
1.4050249592110386
1.395876911454813


- Regressors from sklearn library are used to get RMSE using test data, but no RMSE is better than the RMSE using SVD.

### Gridsearch to reduce RMSE

In [69]:
params = {'n_factors': [90, 100, 110],
          'reg_all': [0.07, 0.08, 0.09], 
          'n_epochs': [110, 120, 130, 140, 150]}
gs_model = GridSearchCV(SVD, param_grid=params, cv=3, measures=["rmse"], n_jobs = -1)
gs_model.fit(train)

In [75]:
params = {'n_factors': [100],
          'reg_all': [0.06, 0.07, 0.08], 
          'n_epochs': [150, 160, 170]}
gs_model = GridSearchCV(SVD, param_grid=params, cv=3, measures=["rmse"], n_jobs = -1)
gs_model.fit(train)

In [76]:
gs_model.best_score

{'rmse': 1.2851814251783829}

- RMSE from model validation is lowered from 1.298.

In [77]:
gs_model.best_params

{'rmse': {'n_factors': 100, 'reg_all': 0.07, 'n_epochs': 150}}