In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

bookDF=pd.read_csv('datasets/books/books.csv')
bookDF=bookDF.drop(['image_url','small_image_url','title','best_book_id','isbn','isbn13'],axis=1)
ratingsDF = pd.read_csv('datasets/books/ratings.csv')

# Any results you write to the current directory are saved as output.

In [2]:
bookDF.head(5)

Unnamed: 0,id,book_id,work_id,books_count,authors,original_publication_year,original_title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5
0,1,2767052,2792775,272,Suzanne Collins,2008.0,The Hunger Games,eng,4.34,4780653,4942365,155254,66715,127936,560092,1481305,2706317
1,2,3,4640799,491,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,eng,4.44,4602479,4800065,75867,75504,101676,455024,1156318,3011543
2,3,41865,3212258,226,Stephenie Meyer,2005.0,Twilight,en-US,3.57,3866839,3916824,95009,456191,436802,793319,875073,1355439
3,4,2657,3275794,487,Harper Lee,1960.0,To Kill a Mockingbird,eng,4.25,3198671,3340896,72586,60427,117415,446835,1001952,1714267
4,5,4671,245494,1356,F. Scott Fitzgerald,1925.0,The Great Gatsby,eng,3.89,2683664,2773745,51992,86236,197621,606158,936012,947718


In [3]:
ratingsDF.shape

(981756, 3)

In [4]:
ratingsDF.head(5)

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


This part of the code is responsible for building the item-item similarity matrix
We first convert build a list of dictionary. Each dictionary corresponds to a single book. The user_id is the key, while rating given by the user for the book is its value.

In [5]:
listOfDictonaries=[]
indexMap = {}
reverseIndexMap = {}
ptr=0;
testdf = ratingsDF
testdf=testdf[['user_id','rating']].groupby(testdf['book_id'])
for groupKey in testdf.groups.keys():
    tempDict={}

    groupDF = testdf.get_group(groupKey)
    for i in range(0,len(groupDF)):
        tempDict[groupDF.iloc[i,0]]=groupDF.iloc[i,1]
    indexMap[ptr]=groupKey
    reverseIndexMap[groupKey] = ptr
    ptr=ptr+1
    listOfDictonaries.append(tempDict)

We then use sklearn's DictVectorizer() function to create vectors corresponding to each book. We are trying to create a vector space with users as column vectors. Each point in the vector space represents a book. Rating of the book given an user is its magnitude. We then calculate similarity/distance between books in this vector space.

In [6]:
from sklearn.feature_extraction import DictVectorizer
dictVectorizer = DictVectorizer(sparse=True)
vector = dictVectorizer.fit_transform(listOfDictonaries)

We finally use sklearn's consine_similarity function to calculate pairwise similarity matrix 

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
pairwiseSimilarity = cosine_similarity(vector)

At last we define a function that takes an input id and outputs 5 top most recommendations. Please note that this id is the column id in book.csv. Values of id are in  [1,10000]

In [8]:
def printBookDetails(bookID):
    print("Title:", bookDF[bookDF['id']==bookID]['original_title'].values[0])
    print("Author:",bookDF[bookDF['id']==bookID]['authors'].values[0])
    print("Printing Book-ID:",bookID)
    print("=================++++++++++++++=========================")


def getTopRecommandations(bookID):
    row = reverseIndexMap[bookID]
    #print("------INPUT BOOK--------")
    #printBookDetails(bookID)
    #print("-------RECOMMENDATIONS----------")
    #similarBookIDs = [printBookDetails(indexMap[i]) for i in np.argsort(pairwiseSimilarity[row])[-12:-2][::-1]]
    return np.argsort(pairwiseSimilarity[row])[-12:-2][::-1]

# Getting Base Book for each User

In [9]:
#getting base book for all users
booksUsedForRecommendations = []
usersList = ratingsDF.groupby(by='user_id')
userIds = ratingsDF['user_id'].unique()

for x in range(len(userIds)):
    allUserRatings = pd.DataFrame(usersList.get_group(userIds[x]))
    allUserRatings = allUserRatings.sort_values(['rating'], ascending=[False])
    itemUsedForRecommendation = allUserRatings.head(1)
    booksUsedForRecommendations.append(itemUsedForRecommendation)

# Recommending 10 books for each user based on base book

In [10]:
ids = []
recommendations = []
titles = []
for x in range(len(userIds)):
    userId = userIds[x]
    bookId = booksUsedForRecommendations[x].book_id.item()
    result = getTopRecommandations(bookId) #array of book ids
    for i in range (len(result)):
        ids.append(userIds[x])
        recommendations.append(result[i] + 1)
        bookTitle = bookDF[bookDF['id']==(result[i]+1)]['original_title'].values[0]
        titles.append(bookTitle)
allUsersRecommendations_df = pd.DataFrame(list(zip(ids, recommendations,titles)),
            columns =['user_id', 'book_id', 'title'])

In [11]:
allUsersRecommendations_df

Unnamed: 0,user_id,book_id,title
0,314,31,The Help
1,314,2,Harry Potter and the Philosopher's Stone
2,314,20,Mockingjay
3,314,3,Twilight
4,314,93,The Secret Garden
...,...,...,...
534235,27590,6180,Paddy Clarke Ha Ha Ha
534236,27590,5478,The Line of Beauty
534237,27590,6607,The Rainbow
534238,27590,2612,A Suitable Boy


# Explaning using KNN

# Getting Association Rules 

In [12]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [13]:
#Computing Assosciation Rules

rules = None
item_sets = [
        [item for item in ratingsDF[ratingsDF.user_id == user].book_id]
        for user in ratingsDF.user_id.unique()
    ]

te = TransactionEncoder()
te_ary = te.fit(item_sets).transform(item_sets)

df = pd.DataFrame(te_ary, columns=te.columns_)

frequent_itemsets = apriori(df,
                            min_support=.1,
                            use_colnames=True,
                            max_len=2)

rules = association_rules(frequent_itemsets,
                            metric="lift",
                            min_threshold=.1)
rules = rules[(rules['confidence'] > .1) &
               (rules['lift'] > .1)]

rules.consequents = [list(row.consequents)[0] for _, row in rules.iterrows()]
rules.antecedents = [list(row.antecedents)[0] for _, row in rules.iterrows()]

rules = rules[["consequents", "antecedents", "confidence"]]

ValueError: cannot call `vectorize` on size 0 inputs unless `otypes` is set

In [13]:
item_sets2 = [
        [item2 for item2 in ratingsDF[ratingsDF.user_id == user2].book_id]
        for user2 in ratingsDF.user_id.unique()
    ]

In [20]:
len(item_sets2[3])

187

In [21]:
te2 = TransactionEncoder()
te_ary2 = te2.fit(item_sets2).transform(item_sets2)

df2 = pd.DataFrame(te_ary2, columns=te2.columns_)

In [22]:
df2

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,9991,9992,9993,9994,9995,9996,9997,9998,9999,10000
0,True,False,True,False,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,True,False,False,True,False,False,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False
2,True,False,True,False,False,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
3,True,True,False,True,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53419,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
53420,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
53421,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
53422,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [None]:
frequent_itemsets2 = apriori(df2,
                            min_support=0.00187,
                            use_colnames=True,
                            max_len=2)
frequent_itemsets2

# Model Fidelity

In [188]:
expl = allUsersRecommendations_df[[len(x) > 0 for x in allUsersRecommendations_df.explanations]]
fidelity = expl.groupby('user_id')['title'].count() / 10
modelFidelity = sum(fidelity)/len(userIds)