In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn.metrics as metrics
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import correlation
from sklearn.metrics.pairwise import pairwise_distances
import warnings
warnings.filterwarnings('ignore')
import os,sys
import re

In [None]:
# Load the three semicolon-separated CSV files.
# - on_bad_lines='warn' skips malformed rows with a warning; it replaces the
#   error_bad_lines=False flag, which was removed in pandas 2.0.
# - "ANSI" is an alias of the Windows-only 'mbcs' codec and raises LookupError
#   on other platforms; 'latin-1' is portable and can decode every byte value.
#   NOTE(review): latin-1 differs from Windows-1252 for bytes 0x80-0x9F - confirm
#   the files contain no characters in that range that matter.
books=pd.read_csv('Books.csv',sep=';',on_bad_lines='warn',encoding='latin-1')
books.columns=['ISBN','bookTitle','bookAuthor','yearOfPublication','publisher','imageUrlS','imageUrlM','imageUrlL']
users=pd.read_csv('users.csv',sep=';',on_bad_lines='warn',encoding='latin-1')
users.columns=['userID','Location','Age']
ratings=pd.read_csv('Book-Ratings.csv',sep=';',on_bad_lines='warn',encoding='latin-1')
ratings.columns=['userID','ISBN','bookRating']


In [None]:
books.shape

In [None]:
users.shape

In [None]:
ratings.shape

In [None]:
books.head()

In [None]:
# Drop the three image-URL columns. errors='ignore' keeps the cell re-runnable:
# the in-place version raised KeyError on a second run, once the columns were gone.
books=books.drop(columns=['imageUrlS','imageUrlM','imageUrlL'],errors='ignore')

In [None]:
books.head()

In [None]:
books.dtypes

In [None]:
# Show full cell contents without truncation. None is the supported way to say
# "no limit"; the old sentinel -1 is deprecated and rejected by current pandas.
pd.set_option('display.max_colwidth',None)

In [None]:
books.yearOfPublication.unique()

In [None]:
books.loc[books.yearOfPublication =='DK Publishing Inc',:]

In [None]:
# Hand-written corrections for two ISBNs (presumably the rows surfaced by the
# preceding cell, where yearOfPublication held 'DK Publishing Inc').
dk_corrections={
    '078946697X':{
        'yearOfPublication':2000,
        'bookAuthor':"Michael Teitelbaum",
        'bookTitle':"DK Readers: Creating the X-Men, How It All Began",
        'publisher':"DK Publishing Inc",
    },
    '0789466953':{
        'yearOfPublication':2000,
        'bookAuthor':"James Buckley",
        'bookTitle':"DK Readers: Creating the X-Men, How Comic Books Come to Life",
        'publisher':"DK Publishing Inc",
    },
}
for isbn,fixes in dk_corrections.items():
    row_mask=books.ISBN==isbn
    for column,value in fixes.items():
        books.loc[row_mask,column]=value

In [None]:
books.loc[books.yearOfPublication =='Gallimard',:]

In [None]:
# Hand-written correction for one ISBN (presumably the row surfaced by the
# preceding cell, where yearOfPublication held 'Gallimard').
gallimard_mask=books.ISBN=='2070426769'
gallimard_fixes=[
    ('yearOfPublication',2003),
    ('bookAuthor',"Jean-Marie Gustave Le ClÃ?Â©zio"),
    ('bookTitle',"Peuple du ciel, suivi de 'Les Bergers"),
    ('publisher',"Gallimard"),
]
for column,value in gallimard_fixes:
    books.loc[gallimard_mask,column]=value


In [None]:
books.yearOfPublication=pd.to_numeric(books.yearOfPublication,errors='coerce')

In [None]:
sorted(books['yearOfPublication'].unique())

In [None]:
# Treat years after 2006 and the placeholder 0 as missing.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
books.loc[(books.yearOfPublication>2006)|(books.yearOfPublication==0),'yearOfPublication']=np.nan

In [None]:
# Impute missing years with the rounded mean year. The result is assigned back
# explicitly: an in-place fillna on a column accessor is a chained assignment
# that pandas does not guarantee will modify the parent frame.
books['yearOfPublication']=books['yearOfPublication'].fillna(round(books['yearOfPublication'].mean()))

In [None]:
books.loc[(books.bookAuthor=='J. K. Rowling')&(books.yearOfPublication==1999),:]

In [None]:
books.yearOfPublication=books.yearOfPublication.astype(np.int32)

In [None]:
books.loc[books.publisher.isnull(),:]

In [None]:
# Give these two ISBNs the placeholder publisher 'other'.
isbns_without_publisher=['193169656X','1931696993']
books.loc[books.ISBN.isin(isbns_without_publisher),'publisher']='other'

In [None]:
books.loc[books.publisher.isnull(),:]

In [None]:
users.shape

In [None]:
# Fill missing ages with the rounded mean age. The result is assigned back
# explicitly: an in-place fillna on a column accessor is a chained assignment
# that pandas does not guarantee will modify the parent frame.
users['Age']=users['Age'].fillna(round(users['Age'].mean()))
users.dtypes

In [None]:
users.head()

In [None]:
users.userID.values

In [None]:
sorted(users.Age.unique())

In [None]:
# Ages above 90 or below 5 are treated as invalid and re-imputed.
users.loc[(users.Age>90) |(users.Age<5),'Age']=np.nan
# Round the mean before filling: astype(int32) truncates towards zero, so an
# unrounded mean would be floored, unlike the rounded mean used by the earlier
# Age imputation cell.
users.Age=users.Age.fillna(round(users.Age.mean()))
users.Age=users.Age.astype(np.int32)

In [None]:
sorted(users.Age.unique())

In [None]:
ratings.shape

In [None]:
# Number of cells a dense user x book matrix would have.
n_users,n_books=len(users),len(books)
n_users*n_books

In [None]:
ratings.head(5)

In [None]:
# Keep only ratings whose book and user both exist in the reference tables.
known_book=ratings.ISBN.isin(books.ISBN)
known_user=ratings.userID.isin(users.userID)
ratings_new=ratings[known_book & known_user]

In [None]:
ratings.shape

In [None]:
ratings_new.shape

In [None]:
# Fraction of the user x book grid that has no rating at all.
sparisty=1.0-ratings_new.shape[0]/float(n_users*n_books)

In [None]:
sparisty

In [None]:
ratings.bookRating.unique()

In [None]:
# A rating of 0 is kept apart as "implicit"; every other rating is "explicit".
is_implicit=ratings_new.bookRating==0
ratings_explicit=ratings_new[~is_implicit]
ratings_implicit=ratings_new[is_implicit]

In [None]:
ratings_explicit

In [None]:
ratings_implicit

In [None]:
# Users who appear in the explicit / implicit rating tables respectively.
has_explicit=users.userID.isin(ratings_explicit.userID)
has_implicit=users.userID.isin(ratings_implicit.userID)
users_exp_ratings=users[has_explicit]
users_imp_ratings=users[has_implicit]

In [None]:
sns.countplot(data=ratings_explicit,x='bookRating')

In [None]:
# Popularity baseline: rank books by the sum of their explicit ratings and show
# the 20 highest together with their metadata.
# NOTE(review): the frame is named top10 but holds 20 rows - confirm which is intended.
ratings_count=ratings_explicit.groupby(['ISBN'])['bookRating'].sum().to_frame()
top10=ratings_count.sort_values(by='bookRating',ascending=False).iloc[:20]
print("Following books are recommended")
top10.merge(books,on='ISBN',how='inner')

In [None]:
ratings_explicit.dtypes

In [None]:
# Keep only explicit ratings of books that have at least 80 explicit ratings.
counts=ratings_explicit.groupby(['ISBN'])['bookRating'].count().to_frame()
frequently_rated=counts.loc[counts.bookRating>=80].index
ratings_explicit=ratings_explicit[ratings_explicit['ISBN'].isin(frequently_rated)]

In [None]:
#counts1=ratings_explicit['userID'].value_counts()
#ratings_explicit=ratings_explicit[ratings_explicit['userID'].isin(counts1[counts1>=100].index)]

In [None]:
ratings_explicit

In [None]:
# User x book matrix of explicit ratings: one row per userID, one column per ISBN.
ratings_matrix=ratings_explicit.pivot(values='bookRating',index='userID',columns='ISBN')
userID,ISBN=ratings_matrix.index,ratings_matrix.columns
ratings_matrix

In [None]:
# Sort books by ISBN and renumber the rows 0..n-1.
# drop=True discards the old index instead of inserting it as an 'index' column;
# the in-place version added that stray column and raised ValueError on a second
# run because the column already existed.
books=books.sort_values(by=['ISBN']).reset_index(drop=True)
books

In [None]:
ratings_matrix.fillna(0,inplace=True)

In [None]:
ratings_matrix

In [None]:
# Default neighbourhood size and distance metric for the item-based model.
# These are ordinary module-level names (a `global` statement at module level has
# no effect, so it is omitted). The functions defined below use them as
# default-argument values, which are captured when each `def` is executed.
k=10
metric='cosine'

In [None]:
def findksimilaritems(item_id, ratings, metric=metric, k=k):
    """Find the items most similar to ``item_id``.

    Parameters
    ----------
    item_id : label
        Column label of the query item in ``ratings``.
    ratings : pandas.DataFrame
        Rating matrix with one row per user and one column per item.
    metric : str
        Distance metric passed to ``NearestNeighbors``.
    k : int
        Number of neighbours requested. It is capped at the number of items,
        because ``kneighbors`` cannot return more neighbours than fitted samples.

    Returns
    -------
    similarities : numpy.ndarray, shape (n_neighbors,)
        ``1 - distance`` for every returned neighbour.
    indices : numpy.ndarray, shape (1, n_neighbors)
        Column positions (in ``ratings``) of the returned neighbours. The query
        item is a fitted sample itself, so it normally appears among them.

    Raises
    ------
    KeyError
        If ``item_id`` is not a column of ``ratings``.
    """
    # One row per item: each item is described by the ratings all users gave it.
    item_vectors = ratings.T
    item_pos = item_vectors.index.get_loc(item_id)

    # Cap against the requested k, not a hard-coded 10, so that a caller's smaller
    # k is honoured and a larger k cannot exceed the number of fitted samples.
    n_neighbors = min(k, item_vectors.shape[0])

    model_knn = NearestNeighbors(metric=metric, algorithm='brute')
    model_knn.fit(item_vectors)
    query = item_vectors.iloc[item_pos, :].values.reshape(1, -1)
    distances, indices = model_knn.kneighbors(query, n_neighbors=n_neighbors)
    similarities = 1 - distances.flatten()

    return similarities, indices

In [None]:
#This function predicts the rating for specified user-item combination based on item-based approach
def predict_itembased(user_id, item_id, ratings, metric = metric, k=k):
    """Predict the rating ``user_id`` would give ``item_id`` (item-based kNN).

    The rating matrix is first reduced to the items this user has rated (non-zero
    entries) plus the target item. The prediction is the similarity-weighted
    average of the user's ratings for the target item's nearest neighbours,
    rounded to an integer and clamped to the range 1..10.

    Parameters
    ----------
    user_id : label
        Row label of the user in ``ratings``.
    item_id : label
        Column label of the item to predict.
    ratings : pandas.DataFrame
        Rating matrix (rows = users, columns = items) where 0 means "not rated".
    metric : str
        Distance metric forwarded to ``findksimilaritems``.
    k : int
        Neighbourhood size forwarded to ``findksimilaritems``.

    Returns
    -------
    int
        Predicted rating between 1 and 10. When the neighbours carry no weight
        the intermediate prediction is 0 and is clamped to 1.

    Raises
    ------
    KeyError
        If ``user_id`` or ``item_id`` is not present in ``ratings``.
    """
    # Keep the columns this user has rated, plus the target item itself.
    user_row = ratings.loc[user_id]
    keep = (user_row != 0) | (user_row.index == item_id)
    ratings_new = ratings.loc[:, keep.values]

    user_loc = ratings_new.index.get_loc(user_id)
    item_loc = ratings_new.columns.get_loc(item_id)

    # Forward metric and k so the caller's choices are actually used.
    similarities, indices = findksimilaritems(item_id, ratings_new, metric=metric, k=k)

    wtd_sum = 0
    sum_wt = 0
    for neighbour_loc, similarity in zip(indices.flatten(), similarities):
        # Skip the target item by position instead of assuming it is always the
        # first neighbour with a similarity of exactly 1.
        if neighbour_loc == item_loc:
            continue
        wtd_sum += ratings_new.iloc[user_loc, neighbour_loc] * similarity
        sum_wt += similarity

    if sum_wt == 0:
        prediction = 0
    else:
        prediction = int(round(wtd_sum / sum_wt))

    # Clamp to the valid explicit-rating range.
    return max(1, min(10, prediction))

In [None]:
def recommendItem(user_id, ratings, metric=metric):
    """Print up to ten book titles recommended for ``user_id``.

    Every item the user has not rated (entry equal to 0) is scored with
    ``predict_itembased``; the ten highest predictions are printed as a numbered
    list of titles looked up in the module-level ``books`` frame.

    Parameters
    ----------
    user_id : int
        Row label of the user in ``ratings``. Python and NumPy integers are
        accepted; anything else, or an id that is not in the index, prints a
        message listing the valid ids and returns.
    ratings : pandas.DataFrame
        Rating matrix (rows = users, columns = items) where 0 means "not rated".
        The column labels are expected to be ISBNs, as in ``ratings_matrix``.
    metric : str
        Distance metric forwarded to ``predict_itembased``.

    Returns
    -------
    None
        Output is printed.
    """
    # Index labels are frequently np.int64, so accept NumPy integers as well.
    # bool is excluded explicitly because it is a subclass of int.
    is_integer_id = isinstance(user_id, (int, np.integer)) and not isinstance(user_id, bool)
    if not is_integer_id or user_id not in ratings.index:
        # List the ids of the matrix that was passed in, not of a global frame.
        print("User id should be a valid integer from this list :\n\n {} ".format(re.sub(r'[\[\]]', '', np.array_str(ratings.index.values))))
        return

    # Score only the items this user has not rated yet.
    user_row = ratings.loc[user_id]
    unrated_items = ratings.columns[(user_row == 0.0).values]
    scores = [predict_itembased(user_id, item, ratings, metric) for item in unrated_items]

    # Index the predictions by item label so each one can be mapped to its own
    # title. A stable sort keeps tied predictions in column order.
    prediction = pd.Series(scores, index=unrated_items, dtype='int64')
    recommended = prediction.sort_values(ascending=False, kind='mergesort').head(10)

    # Look titles up by ISBN. A column position in `ratings` is not a row position
    # in `books`, which holds many more books than the rating matrix has columns.
    matches = books[books.ISBN.isin(recommended.index)].drop_duplicates(subset='ISBN')
    title_by_isbn = matches.set_index('ISBN')['bookTitle']

    print("Following books are recommended...")
    for rank, isbn in enumerate(recommended.index, start=1):
        # Fall back to the raw label if the book is missing from `books`.
        print("{0}. {1}".format(rank, title_by_isbn.get(isbn, isbn)))