In [18]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<a id="section-four"></a>

## Importing Necessary Libraries

#### Here I am trying to use SURPRISE library which is like scikit library for recommendation algorithms


In [19]:
from surprise.model_selection import train_test_split
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import NormalPredictor,KNNBasic,KNNWithMeans,KNNWithZScore,KNNBaseline,SVD,BaselineOnly,SVDpp,NMF,SlopeOne,CoClustering
from surprise.accuracy import rmse
from surprise import accuracy


In [20]:
users = pd.read_csv('BX-Users.csv', sep='\";\"', names=['User-ID', 'Location', 'Age'], encoding='latin-1', skiprows=1)
books = pd.read_csv('BX-Books.csv', sep='\";\"', names=['ISBN', 'Book-Title' ,'Book-Author','Year-Of-Publication', 'Publisher', 'Image-Url-S', 'Image-Url-M', 'Image-Url-L'], encoding='latin-1', skiprows=1)
ratings = pd.read_csv('BX-Book-Ratings.csv', sep='\";\"', names=['User-ID', 'ISBN', 'Book-Rating'], encoding='latin-1', skiprows=1)

  users = pd.read_csv('BX-Users.csv', sep='\";\"', names=['User-ID', 'Location', 'Age'], encoding='latin-1', skiprows=1)
  books = pd.read_csv('BX-Books.csv', sep='\";\"', names=['ISBN', 'Book-Title' ,'Book-Author','Year-Of-Publication', 'Publisher', 'Image-Url-S', 'Image-Url-M', 'Image-Url-L'], encoding='latin-1', skiprows=1)
  ratings = pd.read_csv('BX-Book-Ratings.csv', sep='\";\"', names=['User-ID', 'ISBN', 'Book-Rating'], encoding='latin-1', skiprows=1)


## Data Cleaning

* Replacing NULL values
* Removing Unnecessary characters

In [21]:
users['User-ID'] = users['User-ID'].str.replace("\"","")
users['Location'] = users['Location'].str.replace("\";NULL","")
users['Age'] = users['Age'].fillna("0")
users['Age'] = users['Age'].str.replace("\"","")
books['ISBN'] = books['ISBN'].str.replace("\"","")
books['Book-Title'] = books['Book-Title'].str.replace("\"","")
ratings['User-ID'] = ratings['User-ID'].str.replace("\"","")
ratings['Book-Rating'] = ratings['Book-Rating'].str.replace("\"","").astype(int)

<a id="section-four-one"></a>

## Candidate Generation

**This is the first stage of the Recommender Systems. Not all books and users are taken as quality books and users. There will be few stringent and lenient users.**

Stringent Users: They are insensitive towards ratings, they won't rate higher ratings and mostly give medium ratings for books

Lenient Users: They are very sensitive towards ratings, they will rate higher ratings as 9, 10 always for most of the books

### Normalization of users ratings is required

In [22]:
# Quality books having atleast 5 reviews

quality_ratings = ratings[ratings['Book-Rating']!=0]
quality_book = quality_ratings['ISBN'].value_counts().rename_axis('ISBN').reset_index(name = 'Count')
quality_book = quality_book[quality_book['Count']>5]['ISBN'].to_list()
quality_ratings = quality_ratings[quality_ratings['ISBN'].isin(quality_book)]
quality_ratings

Unnamed: 0,User-ID,ISBN,Book-Rating
8,276744,038550120X,7
16,276747,0060517794,9
19,276747,0671537458,9
20,276747,0679776818,8
28,276754,0684867621,8
...,...,...,...
1149743,276688,0836218655,10
1149744,276688,0836236688,10
1149761,276704,0345386108,6
1149771,276704,0743211383,7


In [23]:
# Quality Users making atleast 5 reviews

quality_user = quality_ratings['User-ID'].value_counts().rename_axis('User-ID').reset_index(name = 'Count')
quality_user = quality_user[quality_user['Count']>5]['User-ID'].to_list()
quality_ratings = quality_ratings[quality_ratings['User-ID'].isin(quality_user)]
quality_ratings

Unnamed: 0,User-ID,ISBN,Book-Rating
174,276847,3257200552,5
177,276847,3404148576,8
179,276847,3423071516,10
182,276847,3426029553,8
186,276847,3442413508,10
...,...,...,...
1149715,276688,0553575104,6
1149738,276688,0688156134,8
1149739,276688,0743202694,10
1149743,276688,0836218655,10


<a id="section-four-two"></a>
## Gaussian Normalization

* All ratings are normalized as gaussian distribution 
* Gaussian Ratings are scaled on (0-5) Rating scale



\begin{equation*}
R_{norm}^{u_i}(b) = \frac{R_b - R_{mean}^{u_i}}{\sqrt{\sum_{j} (R_{b_j} - R_{mean}^{u_i})^2}}
\end{equation*}

In [24]:
# Normalizing the Ratings

mean_rating_user = quality_ratings.groupby('User-ID')['Book-Rating'].mean().reset_index(name='Mean-Rating-User')
mean_data = pd.merge(quality_ratings, mean_rating_user, on='User-ID')
mean_data['Diff'] = mean_data['Book-Rating'] - mean_data['Mean-Rating-User']
mean_data['Square'] = (mean_data['Diff'])**2
norm_data = mean_data.groupby('User-ID')['Square'].sum().reset_index(name='Mean-Square')
norm_data['Root-Mean-Square'] = np.sqrt(norm_data['Mean-Square'])
mean_data = pd.merge(norm_data, mean_data, on='User-ID')
mean_data['Norm-Rating'] = mean_data['Diff']/(mean_data['Root-Mean-Square'])  
mean_data['Norm-Rating'] = mean_data['Norm-Rating'].fillna(0)
max_rating = mean_data.sort_values('Norm-Rating')['Norm-Rating'].to_list()[-1]
min_rating = mean_data.sort_values('Norm-Rating')['Norm-Rating'].to_list()[0]
mean_data['Norm-Rating'] = 5*(mean_data['Norm-Rating'] - min_rating)/(max_rating-min_rating)
mean_data['Norm-Rating'] = np.ceil(mean_data['Norm-Rating']).astype(int)
norm_ratings = mean_data[['User-ID','ISBN','Norm-Rating']]
mean_data.sort_values('Norm-Rating')

Unnamed: 0,User-ID,Mean-Square,Root-Mean-Square,ISBN,Book-Rating,Mean-Rating-User,Diff,Square,Norm-Rating
14735,124989,0.956522,0.978019,0061092096,9,9.956522,-0.956522,0.914934,0
54488,213760,56.769231,7.534536,0671693816,3,7.692308,-4.692308,22.017751,1
20494,136011,27.714286,5.264436,055328942X,3,7.428571,-4.428571,19.612245,1
47555,198114,53.875000,7.339959,037541200X,1,6.625000,-5.625000,31.640625,1
31142,160541,102.000000,10.099505,0749324791,2,8.000000,-6.000000,36.000000,1
...,...,...,...,...,...,...,...,...,...
98100,5815,21.333333,4.618802,0316693251,10,7.333333,2.666667,7.111111,5
98082,58067,12.000000,3.464102,0062512668,10,8.000000,2.000000,4.000000,5
76394,259430,29.666667,5.446712,0679781587,10,5.833333,4.166667,17.361111,5
76499,259734,15.428571,3.927922,0441003257,10,7.714286,2.285714,5.224490,5


In [25]:

reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(norm_ratings[['User-ID', 'ISBN', 'Norm-Rating']], reader)

<a id="section-five"></a>
# Machine Learning and Matrix Factorization Models 


Performing Cross validation and checking RMSE of all Machine Learning and Matrix Factorization algorithms available in surprise library


In [26]:
benchmark = []
for algorithm in [SVD(), 
                  SVDpp(), 
                  SlopeOne(), 
                  NMF(), 
                  NormalPredictor(), 
                  KNNBaseline(), 
                  KNNBasic(), 
                  KNNWithMeans(),
                  KNNWithZScore(), 
                  BaselineOnly(),
                  CoClustering()]:
    
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)

KeyboardInterrupt: 

#### We can observe that Baseline ML algorithm and SVD based Matrix Factorization has last RMSE.

> 📌  0.62 of RMSE says that predicted rating may have an error of 0.62

In [10]:
surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
surprise_results

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BaselineOnly,0.623371,0.467819,0.390958
SVD,0.637696,8.036514,0.470202
SVDpp,0.643012,166.14944,5.73675
KNNBasic,0.651345,2.242541,1.585632
KNNBaseline,0.656722,2.943204,1.889429
KNNWithMeans,0.718302,2.145302,1.89065
KNNWithZScore,0.730655,2.64488,2.090928
SlopeOne,0.74351,10.125962,2.916771
NMF,0.755212,9.955564,0.479167
CoClustering,0.765919,3.583364,0.327724


<a id="section-five-one"></a>
# Machine Learning based Model

### BaselineOnly

Algorithm predicting the baseline estimate for given user and item.

\begin{equation*}
b_{ui}=μ+b_u+b_i
\end{equation*}

If user u is unknown, then the bias b<sub>u</sub> is assumed to be zero. The same applies for item i with b<sub>i</sub>.

using SGD: Stocahstic Gradient Descent to minimize the loss with regularization parameter 0.5

In [11]:
# Baseline

train_set, test_set = train_test_split(data, test_size=0.25)
algo = BaselineOnly(bsl_options={'method': 'sgd','learning_rate': .00005, 'n_epochs':30, 'reg':0.5})
fit = algo.fit(train_set)
pred = fit.test(test_set)
accuracy.rmse(pred)

Estimating biases using sgd...
RMSE: 0.6175


0.6175033501537819

<a id="section-five-two"></a>
# Matrix Factorization Method

### SVD

The famous SVD algorithm, as popularized by Simon Funk during the Netflix Prize. When baselines are not used, This is equivalent to Probabilistic Matrix.

The prediction r<sup>ui</sup> is set as:
\begin{equation*}
r^{ui}=μ+b_u+b_i+q_i^Tp_u
\end{equation*}


To estimate all the unknown, we minimize the following regularized squared error:

\begin{equation*}
\sum_{r_{ui}∈R_{train}}(r_{ui}−r^{ui})^2+λ(b^2_i+b^2_u+||q_i||^2+||p_u||^2)
\end{equation*}


In [12]:
# SVD 

algo = SVD(reg_bi = 0.5, lr_bi=0.005)
fit = algo.fit(train_set)
pred = fit.test(test_set)
accuracy.rmse(pred)

RMSE: 0.6377


0.6376726463613793

In [13]:
recommend = algo.trainset
users_norm = list(set(norm_ratings['User-ID'].to_list()))
books_norm = list(set(norm_ratings['ISBN'].to_list()))
norm_ratings['User-ID'].unique()

array(['100009', '100053', '100066', ..., '99738', '99955', '99996'],
      dtype=object)

In [17]:
pred_users = [user for user in users_norm if recommend.knows_user(recommend.to_inner_uid(user))]
pred_books = []
for book in books_norm:
    try:
        if recommend.knows_item(recommend.to_inner_iid(book)):
            pred_books.append(book)
    except:
        pass
    

ValueError: User 172048 is not part of the trainset.

In [28]:
pred_users[:5]

['221732', '178797', '4795', '30408', '116006']

<a id="section-six"></a>
# Recommendation Evaluation

In [29]:
def recommend_books(user_id, count):
    result=[]
    for b in pred_books:
        result.append([b,algo.predict(user_id,b,r_ui=4).est])
    recom = pd.DataFrame(result, columns=['ISBN','Rating'])
    merge = pd.merge(recom,books, on='ISBN' )
    return merge.sort_values('Rating', ascending=False).head(count)

In [30]:
recommendation = recommend_books('36938', 5)

<a id="section-seven"></a>
# Scoring 

After candidate generation, another model scores and ranks the generated candidates to select the set of items to display. The recommendation system may have multiple candidate generators that use different sources, such as the following:

* User features that account for personalization.
* geographic information into account.
* Popular or trending items.


Here scoring is done based on published year

In [16]:
scoring = recommendation.sort_values('Year-Of-Publication')
view = "".join(["<span><img src='"+a+"'></span>" for a in scoring['Image-Url-M'].to_list()])
scoring[['Book-Title']]

NameError: name 'recommendation' is not defined

In [32]:
view

"<span><img src='http://images.amazon.com/images/P/0395353009.01.MZZZZZZZ.jpg'></span><span><img src='http://images.amazon.com/images/P/0312954468.01.MZZZZZZZ.jpg'></span><span><img src='http://images.amazon.com/images/P/0486284735.01.MZZZZZZZ.jpg'></span><span><img src='http://images.amazon.com/images/P/0446522252.01.MZZZZZZZ.jpg'></span><span><img src='http://images.amazon.com/images/P/0312872402.01.MZZZZZZZ.jpg'></span>"

<center><h1>My Top 5 Recommendations</h1></center>
<span><img src='http://images.amazon.com/images/P/0446310786.01.MZZZZZZZ.jpg'></span><span><img src='http://images.amazon.com/images/P/059035342X.01.MZZZZZZZ.jpg'></span><span><img src='http://images.amazon.com/images/P/0316666343.01.MZZZZZZZ.jpg'></span><span><img src='http://images.amazon.com/images/P/0385504209.01.MZZZZZZZ.jpg'></span><span><img src='http://images.amazon.com/images/P/0142001740.01.MZZZZZZZ.jpg'></span>

<a id="section-eight"></a>
# Neural Net Model

Will be updated !!