# Recommender System

Books

April 2019

Based on the recommender system architecture [here](https://github.com/cipher813/recommender_system) and the Book-Crossing dataset [here](http://www2.informatik.uni-freiburg.de/~cziegler/BX/).

In [1]:
from scipy.sparse import csr_matrix 
from sklearn.neighbors import NearestNeighbors

from bm_util import *

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Working directory for the downloaded archive and extracted CSVs.
# The trailing slash matters: later cells build file paths with `PATH + "<name>"`.
PATH = "/tmp/data/"

In [3]:
# Local target for the Book-Crossing archive and the URL it is fetched from.
fp = PATH + "BX-CSV-Dump.zip"
url = 'http://www2.informatik.uni-freiburg.de/~cziegler/BX/BX-CSV-Dump.zip'
# NOTE(review): download_url_to_filepath / unzip_file are not defined in this notebook;
# presumably they come from the `bm_util` star import — confirm their exact behaviour there.
# `fp` is rebound to whatever path the download helper returns.
fp = download_url_to_filepath(fp, url)
unzip_file(fp,PATH)
# Last expression of the cell: list PATH so the extracted files are displayed.
os.listdir(PATH)

['BX-Book-Ratings.csv', 'BX-Books.csv', 'BX-CSV-Dump.zip', 'BX-Users.csv']

In [4]:
def read_BX_csv(filepath,encoding):
    """Read one semicolon-delimited Book-Crossing CSV file into a DataFrame.

    Rows whose field count does not match the header are skipped with a
    warning instead of aborting the whole read.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read.
    encoding : str
        Text encoding passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, without the malformed rows.
    """
    import inspect

    # ``error_bad_lines`` was deprecated in pandas 1.3 and removed in 2.0 in
    # favour of ``on_bad_lines``. Pick whichever keyword the installed pandas
    # understands so the notebook runs on both old and new versions.
    if "on_bad_lines" in inspect.signature(pd.read_csv).parameters:
        bad_line_kwargs = {"on_bad_lines": "warn"}
    else:
        bad_line_kwargs = {"error_bad_lines": False}

    df = pd.read_csv(filepath,delimiter=';',encoding=encoding,**bad_line_kwargs)
    return df

def print_info(df):
    """Print the shape, the ``DataFrame.info()`` summary and per-column unique counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to summarise.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    import io

    print(f"Shape:\n{df.shape}")
    # DataFrame.info() writes its report to a stream and returns None, so
    # interpolating the call directly prints "None" and puts the report before
    # the "Info:" header. Capture the report in a buffer and print it in place.
    info_buffer = io.StringIO()
    df.info(buf=info_buffer)
    print(f"Info:\n{info_buffer.getvalue()}")
    print(f"Unique:\n{df.nunique()}")

In [5]:
# Load the three extracted tables: br = ratings, bu = users, bk = book metadata.
# NOTE(review): ratings and users are decoded with 'unicode_escape' while books use
# 'latin-1'; the reason for the difference is not visible here — confirm it is intended.
br = read_BX_csv(PATH + "BX-Book-Ratings.csv",'unicode_escape')
bu = read_BX_csv(PATH + "BX-Users.csv",'unicode_escape')
bk = read_BX_csv(PATH + "BX-Books.csv",'latin-1')

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'


In [6]:
# Print a summary of each table. The (label, frame) pairs are unpacked into their own
# names so the loop never rebinds the notebook-level `df` to a tuple, which would
# break any later cell that expects `df` to be a DataFrame if this cell is re-run.
for label, frame in [("Ratings",br), ("Users",bu), ("Info",bk)]:
    print(f"\n{label}\n")
    print_info(frame)


Ratings

Shape:
(1149780, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
User-ID        1149780 non-null int64
ISBN           1149780 non-null object
Book-Rating    1149780 non-null int64
dtypes: int64(2), object(1)
memory usage: 26.3+ MB
Info:
None
Unique:
User-ID        105283
ISBN           340556
Book-Rating        11
dtype: int64

Users

Shape:
(278858, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
User-ID     278858 non-null int64
Location    278858 non-null object
Age         168096 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB
Info:
None
Unique:
User-ID     278858
Location     57339
Age            165
dtype: int64

Info

Shape:
(271360, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
ISBN                   271360 non-null object
Book-Title         

In [7]:
# Left-join user attributes and then book metadata onto every rating row.
df1 = br.merge(bu, on='User-ID', how='left')
df = df1.merge(bk, on='ISBN', how='left')

# Number of (non-null) ISBN entries per user, largest first.
user_rank = (
    br.groupby("User-ID")["ISBN"]
    .count()
    .to_frame("User-Rating-Count")
    .sort_values("User-Rating-Count", ascending=False)
)

# Number of (non-null) User-ID entries per ISBN, largest first.
book_rank = (
    br.groupby("ISBN")["User-ID"]
    .count()
    .to_frame("Book-Rating-Count")
    .sort_values("Book-Rating-Count", ascending=False)
)

In [8]:
# Top rows of user_rank, which is sorted by User-Rating-Count in descending order.
user_rank.head()

Unnamed: 0_level_0,User-Rating-Count
User-ID,Unnamed: 1_level_1
11676,13602
198711,7550
153662,6109
98391,5891
35859,5850


In [9]:
# Top rows of book_rank, which is sorted by Book-Rating-Count in descending order.
book_rank.head()

Unnamed: 0_level_0,Book-Rating-Count
ISBN,Unnamed: 1_level_1
971880107,2502
316666343,1295
385504209,883
60928336,732
312195516,723


In [10]:
# Activity thresholds: a row is kept only when its user has given, and its book has
# received, strictly more than this many ratings.
MIN_USER_RATINGS = 100
MIN_BOOK_RATINGS = 100

# Make the cell safe to re-run: if the count columns are already present from an
# earlier execution, merging again would create suffixed duplicates
# (User-Rating-Count_x / _y) and the filter below would raise KeyError.
df = df.drop(columns=["User-Rating-Count","Book-Rating-Count"],errors="ignore")
df = df.merge(user_rank,how='left',on="User-ID")
df = df.merge(book_rank,how='left',on="ISBN")

# filtered to users who have given over 100 ratings, and books that have received over 100 ratings
df = df[(df['User-Rating-Count']>MIN_USER_RATINGS) & (df['Book-Rating-Count']>MIN_BOOK_RATINGS)]
print(df.shape)
df.head()

(65217, 14)


Unnamed: 0,User-ID,ISBN,Book-Rating,Location,Age,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-Rating-Count,Book-Rating-Count
413,276925,002542730X,10,"barcelona, barcelona, spain",22.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,http://images.amazon.com/images/P/002542730X.0...,http://images.amazon.com/images/P/002542730X.0...,108,171
426,276925,0316666343,0,"barcelona, barcelona, spain",22.0,The Lovely Bones: A Novel,Alice Sebold,2002,"Little, Brown",http://images.amazon.com/images/P/0316666343.0...,http://images.amazon.com/images/P/0316666343.0...,http://images.amazon.com/images/P/0316666343.0...,108,1295
429,276925,0385504209,8,"barcelona, barcelona, spain",22.0,The Da Vinci Code,Dan Brown,2003,Doubleday,http://images.amazon.com/images/P/0385504209.0...,http://images.amazon.com/images/P/0385504209.0...,http://images.amazon.com/images/P/0385504209.0...,108,883
453,276925,0804106304,0,"barcelona, barcelona, spain",22.0,The Joy Luck Club,Amy Tan,1994,Prentice Hall (K-12),http://images.amazon.com/images/P/0804106304.0...,http://images.amazon.com/images/P/0804106304.0...,http://images.amazon.com/images/P/0804106304.0...,108,519
457,276925,0971880107,0,"barcelona, barcelona, spain",22.0,Wild Animus,Rich Shapero,2004,Too Far,http://images.amazon.com/images/P/0971880107.0...,http://images.amazon.com/images/P/0971880107.0...,http://images.amazon.com/images/P/0971880107.0...,108,2502


### Prepare Sparse Matrix

In [14]:
def data_to_sparse(data,index,columns,values):
    """Pivot a long-format table to wide form and build a CSR matrix from it.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format table to reshape.
    index, columns, values : str
        Column names forwarded to ``DataFrame.pivot`` as the row labels,
        column labels and cell values respectively.

    Returns
    -------
    tuple
        ``(pivot, sparse)``: the wide DataFrame with missing cells filled
        with 0, and a ``csr_matrix`` built from its values.

    The shape of the sparse matrix is printed as a side effect.
    """
    wide = data.pivot(index=index, columns=columns, values=values)
    wide = wide.fillna(0)  # combinations absent from `data` become 0
    compressed = csr_matrix(wide.values)
    print(compressed.shape)
    return wide, compressed

def fit_knn(sparse):
    """Fit a cosine-metric ``NearestNeighbors`` model on ``sparse`` and return it.

    Parameters
    ----------
    sparse : matrix accepted by ``NearestNeighbors.fit``
        Training data; each row is treated as one sample.

    Returns
    -------
    NearestNeighbors
        The fitted estimator. Its repr is printed as a side effect.
    """
    # fit() returns the estimator itself, so construction and fitting can be chained.
    model = NearestNeighbors(metric='cosine').fit(sparse)
    print(model)
    return model

In [15]:
# Rows are ISBNs, columns are User-IDs, cells are Book-Rating (0 where no rating row exists).
pivot, sparse = data_to_sparse(df,"ISBN","User-ID","Book-Rating")

(721, 1767)


In [16]:
# Fit the cosine nearest-neighbour model on the ISBN-by-user matrix built above.
knn = fit_knn(sparse)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)
