In [1]:
#importing the required libraries
#!pip install scikit-surprise
import pandas as pd
import numpy as np
import matplotlib as plt
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import sigmoid_kernel
from surprise import SVD,Reader, Dataset
from surprise.model_selection import KFold
from surprise.model_selection import train_test_split
from sklearn.metrics.pairwise import linear_kernel
                                                                                                                                                                                                                                               
#importing packages to ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
#Reading the datasets
ratings = pd.read_csv("./Data/Hybrid_Filtering/ratings.csv")
avg_ratings = pd.read_csv("./Data/Hybrid_Filtering/AverageRatings.csv")
books= pd.read_csv("./Data/Hybrid_Filtering/FinalData.csv")
rating_count = pd.read_csv("./Data/Hybrid_Filtering/RatingsCount.csv")

In [3]:
#Renaming the columns

avg_ratings = avg_ratings.rename(columns={'rating':'avg_rating'})
rating_count = rating_count.rename(columns={'rating':'rating_count'})

In [4]:
ratings['user_id'].nunique()

53417

In [5]:
ratings['user_id'].duplicated().sum()

3080127

In [6]:
#Extract unique users and ratings more than 
ratings = ratings.drop_duplicates('user_id')

##ratings.shape

In [7]:
ratings

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,260,5
6,4,70,4
35,8,479,4
81,9,397,3
...,...,...,...
3131500,47704,515,5
3131641,51115,47,5
3132908,27329,262,3
3133424,33111,979,2


In [8]:
####Merging ratings with books
ratings_with_book = books.merge(ratings, on = 'book_id')
ratings_with_book

Unnamed: 0,book_id,authors,title,Genres,user_id,rating
0,1,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",SciFi;Drama,5621,4
1,1,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",SciFi;Drama,3131,5
2,1,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",SciFi;Drama,6183,5
3,1,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",SciFi;Drama,8109,5
4,1,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",SciFi;Drama,6383,5
...,...,...,...,...,...,...
53412,999,Judy Blume,"Tales of a Fourth Grade Nothing (Fudge, #1)",Kids,27423,5
53413,999,Judy Blume,"Tales of a Fourth Grade Nothing (Fudge, #1)",Kids,41171,5
53414,999,Judy Blume,"Tales of a Fourth Grade Nothing (Fudge, #1)",Kids,42916,5
53415,999,Judy Blume,"Tales of a Fourth Grade Nothing (Fudge, #1)",Kids,44534,3


In [9]:
#Now merging the ratings_with_book with rating_count

books_df = ratings_with_book.merge(rating_count, on = 'book_id')
books_df

Unnamed: 0,book_id,authors,title,Genres,user_id,rating,rating_count
0,1,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",SciFi;Drama,5621,4,22806
1,1,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",SciFi;Drama,3131,5,22806
2,1,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",SciFi;Drama,6183,5,22806
3,1,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",SciFi;Drama,8109,5,22806
4,1,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",SciFi;Drama,6383,5,22806
...,...,...,...,...,...,...,...
53412,999,Judy Blume,"Tales of a Fourth Grade Nothing (Fudge, #1)",Kids,27423,5,1348
53413,999,Judy Blume,"Tales of a Fourth Grade Nothing (Fudge, #1)",Kids,41171,5,1348
53414,999,Judy Blume,"Tales of a Fourth Grade Nothing (Fudge, #1)",Kids,42916,5,1348
53415,999,Judy Blume,"Tales of a Fourth Grade Nothing (Fudge, #1)",Kids,44534,3,1348


In [10]:
#Creating a pivot table
rmat = books_df.pivot_table(
    columns = 'user_id',
    index = 'book_id',
    values = 'rating'
).fillna(0)

def standardize(row):
    new_row = (row-row.mean())/(row.max()-row.min())
    return new_row

rmat = rmat.apply(standardize)


In [11]:
#Compute the cosine similarity matrix 
cosine_sim = cosine_similarity(rmat, rmat)
cosine_sim = pd.DataFrame(cosine_sim, index=rmat.index, columns=rmat.index)

In [12]:
cosine_sim

book_id,1,2,3,4,5,6,7,8,9,10,...,990,991,992,993,994,995,996,997,998,999
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,-0.002021,-0.001994,-0.002148,-0.001958,-0.002209,-0.002082,-0.001995,-0.002048,-0.001955,...,-0.008536,-0.009257,-0.008874,-0.007721,-0.007721,-0.006488,-0.008874,-0.007721,-0.007965,-0.007721
2,-0.002021,1.000000,-0.002170,-0.002014,-0.002067,-0.002553,-0.002342,-0.002174,-0.002282,-0.002030,...,-0.010933,-0.011866,-0.011370,-0.009877,-0.009877,-0.008279,-0.011370,-0.009877,-0.010194,-0.009877
3,-0.001994,-0.002170,1.000000,-0.002391,-0.001955,-0.001990,-0.001937,-0.001925,-0.001928,-0.001983,...,-0.006649,-0.007201,-0.006908,-0.006025,-0.006025,-0.005085,-0.006908,-0.006025,-0.006212,-0.006025
4,-0.002148,-0.002014,-0.002391,1.000000,-0.002230,-0.002918,-0.002635,-0.002396,-0.002551,-0.002166,...,-0.013221,-0.014355,-0.013753,-0.011938,-0.011938,-0.009992,-0.013753,-0.011938,-0.012322,-0.011938
5,-0.001958,-0.002067,-0.001955,-0.002230,1.000000,-0.002110,-0.002012,-0.001956,-0.001989,-0.001954,...,-0.007745,-0.008396,-0.008050,-0.007010,-0.007010,-0.005900,-0.008050,-0.007010,-0.007230,-0.007010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-0.006488,-0.008279,-0.005085,-0.009992,-0.005900,-0.003589,-0.004254,-0.005063,-0.004503,-0.006349,...,0.001014,0.001228,0.001116,0.000758,0.000758,1.000000,0.001116,0.000758,0.000837,0.000758
996,-0.008874,-0.011370,-0.006908,-0.013753,-0.008050,-0.004794,-0.005737,-0.006877,-0.006088,-0.008679,...,0.002283,0.002655,0.002459,0.001842,0.001842,0.001116,1.000000,0.001842,0.001977,0.001842
997,-0.007721,-0.009877,-0.006025,-0.011938,-0.007010,-0.004207,-0.005017,-0.005999,-0.005319,-0.007552,...,0.001701,0.001999,0.001842,0.001346,0.001346,0.000758,0.001842,1.000000,0.001455,0.001346
998,-0.007965,-0.010194,-0.006212,-0.012322,-0.007230,-0.004331,-0.005169,-0.006185,-0.005482,-0.007791,...,0.001828,0.002142,0.001977,0.001455,0.001455,0.000837,0.001977,0.001455,1.000000,0.001455


Collaborative Filtering

In [13]:
#Creating SVD Collaborative Filter System
reader = Reader()

data = Dataset.load_from_df(books_df[['user_id', 'book_id','rating']], reader)

trainset, testset = train_test_split(data, test_size=0.3,random_state=10)

svd = SVD()
svd.fit(trainset)



<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1cae0641c70>

In [14]:
# run the trained model against the testset
test_pred = svd.test(testset)

In [15]:
# get RMSE
from surprise import accuracy
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.9435


0.9434573862760584

In [16]:
#making a hybird recommender system to recommend movies 
def hybrid(user_id, book_id, n_recs, books_df, cosine_sim, svd_model = svd):
    '''
    This function represents a hybrid recommendation system, it will have the following flow:
        1. Use a content-based model (cosine_similarity) to compute the 50 most similar books
        2. Compute the predicted ratings that the user might give these 50 books using a collaborative
           filtering model (SVD)
        3. Return the top n books with the highest predicted rating
        
    params:
        user_id (Integer) : The user_id 
        book_id (Integer) : The book_id 
        n_recs (Integer) : The number of recommendations you want
        df (DataFrame) : Original dataframe with all book information 
        cosine_sim (DataFrame) : The cosine similarity dataframe
        svd_model (Model) : SVD model
    '''
    #Vectoring the array
    vectorize = np.vectorize(np.int_)
    # sort similarity values in decreasing order and take top 50 results
    sim = list(enumerate(cosine_sim[book_id]))
    sim = sorted(sim, key=lambda x: x[1], reverse=True)
    sim = sim[1:50]
    
    # get book metadata
    book_idx = [i[0] for i in sim]
    books = books_df.iloc[book_idx][['user_id','book_id','title','rating']]
    
    # predict using the svd_model
    books['est'] = books.apply(lambda x: svd_model.predict(user_id, x['book_id'], x['rating']).est, axis = 1)
    
    # sort predictions in decreasing order and return top n_recs
    books = books.sort_values('est', ascending=False)
    return books.head(n_recs)

In [17]:
u_id = books_df['user_id'].values[1]
b_id = books_df['book_id'].values[1]
nrecs = 10

In [18]:
#books_df['user_id'].value_counts()

In [19]:
#Testing the model

hybrid(u_id,b_id,nrecs,books_df,cosine_sim,svd)

Unnamed: 0,user_id,book_id,title,rating,est
14,8840,1,"The Hunger Games (The Hunger Games, #1)",5,4.857603
21,11642,1,"The Hunger Games (The Hunger Games, #1)",5,4.857603
107,17908,1,"The Hunger Games (The Hunger Games, #1)",5,4.857603
29,12172,1,"The Hunger Games (The Hunger Games, #1)",5,4.857603
190,21430,1,"The Hunger Games (The Hunger Games, #1)",5,4.857603
77,15800,1,"The Hunger Games (The Hunger Games, #1)",2,4.857603
24,2081,1,"The Hunger Games (The Hunger Games, #1)",4,4.857603
50,13537,1,"The Hunger Games (The Hunger Games, #1)",3,4.857603
69,11405,1,"The Hunger Games (The Hunger Games, #1)",5,4.857603
127,18706,1,"The Hunger Games (The Hunger Games, #1)",4,4.857603
