In [None]:
"""
Domain
    Retail

focus
    Optimize Book RENT

Business challenge/requirement
    BookRent is the largest online and offline book rental chain in India. The Company 
    charges a fixed fee per month plus rental per book. So, the company makes more 
    money when users rent more books. 
    You as an ML expert have to model a recommendation engine so that user gets 
    recommendation of books based on the behavior of similar users. This will ensure 
    that users are renting books based on their taste.
    The company is still unprofitable and is looking to improve both revenue and profit.

Key issues
    As of now a lot of users return the books and do not take the new rental. The right
    recommendation will entice a users to rent more books

Considerations
    NONE

Data volume
    - Approx 1 M records - file BX-Book-Ratings.csv and 2 more. But only 10K records 
    will be used

Fields in Data
    • user_id: Unique Id of the User
    • isbn: International Standard Book Number is a unique numeric commercial 
        book identifier
    • rating: the rating given by the user

Additional information
    - NA

Business benefits
    Increase in both top line and bottom line as more rentals per user means more 
    revenue and more profit
"""

In [144]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error

In [130]:
RATINGS_CSV_PATH = r'D:\CourseWork\data-science-python-certification-course\Assignments\11 Association Rules Mining and Recommendation Systems\Case Study\resources\BX-Book-Ratings.csv'
BOOKS_CSV_PATH = r'D:\CourseWork\data-science-python-certification-course\Assignments\11 Association Rules Mining and Recommendation Systems\Case Study\resources\BX-Books.csv'
USERS_CSV_PATH = r'D:\CourseWork\data-science-python-certification-course\Assignments\11 Association Rules Mining and Recommendation Systems\Case Study\resources\BX-Users.csv'

In [131]:

df_ratings = pd.read_csv(RATINGS_CSV_PATH, encoding="latin1")
df_ratings.sort_values(["user_id", "isbn"], inplace=True)
df_ratings = df_ratings.head(10000)
df_ratings.reset_index()
df_ratings.head()

Unnamed: 0,user_id,isbn,rating
9561,2,195153448,0
9562,7,34542252,0
9571,8,074322678X,5
9574,8,080652121X,0
9576,8,1552041778,5


In [132]:
le = LabelEncoder()
df_ratings["isbn"] = le.fit_transform(df_ratings["isbn"])
df_ratings.head()

Unnamed: 0,user_id,isbn,rating
9561,2,1467,0
9562,7,2490,0
9571,8,443,5
9574,8,475,0
9576,8,877,5


In [133]:
books_df = pd.read_csv(BOOKS_CSV_PATH, nrows=10000, encoding="latin1")
books_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   isbn                 10000 non-null  object
 1   book_title           10000 non-null  object
 2   book_author          10000 non-null  object
 3   year_of_publication  10000 non-null  object
 4   publisher            10000 non-null  object
dtypes: object(5)
memory usage: 390.8+ KB


In [134]:
users_df = pd.read_csv(USERS_CSV_PATH, nrows=10000, encoding="latin1")
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   user_id   10000 non-null  int64  
 1   Location  10000 non-null  object 
 2   Age       6273 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 234.5+ KB


In [136]:
n_users = df_ratings["user_id"].unique().shape[0]
n_books = df_ratings["isbn"].unique().shape[0]
data_matrix = np.zeros((n_users, n_books))
for line in df_ratings.head().itertuples():
    #print(line)
    data_matrix[line[1]-1, line[2]-1] = line[3]

In [138]:
user_similarity = pairwise_distances(data_matrix, metric='cosine')
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

In [139]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #We use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'book':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [140]:
user_prediction = predict(data_matrix, user_similarity, type='user')
item_prediction = predict(data_matrix, item_similarity, type='book')

In [148]:
#root mean square error
def rmse(pred, test):
    pred = pred[test.nonzero()].flatten()
    test = test[test.nonzero()].flatten()
    return mean_squared_error(pred, test, squared=False)

In [149]:
rmse(user_prediction, data_matrix)

4.9988560970029745

In [150]:
rmse(item_prediction, data_matrix)

5.0