In [1]:
# Operating System
import os

# Numpy, Pandas and Scipy
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, save_npz, load_npz

# Scikit-learn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer


In [4]:
def read_users() -> pd.DataFrame:

    path = os.path.join('BooksUsers.csv')
    data = pd.read_csv(path, names=['User-ID', 'Location', 'Age'], sep=',').drop(0)
    return data

user_data = read_users()
user_data.head()

Unnamed: 0,User-ID,Location,Age
1,2,"stockton, california, usa",18.0
2,8,"timmins, ontario, canada",
3,9,"germantown, tennessee, usa",
4,10,"albacete, wisconsin, spain",26.0
5,12,"fort bragg, california, usa",


In [5]:
def read_books_meta() -> pd.DataFrame:

    path = os.path.join('BooksMetaInfo.csv')
    data = pd.read_csv(path, names=['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher', 'Image-URL-S', 'Image-URL-M', 'Image-URL-L', 'authors', 'description', 'pageCount', 'categories'], sep=',').drop(0)
    return data

books_meta = read_books_meta()
books_meta.head()

  if (await self.run_code(code, result,  async_=asy)):


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,authors,description,pageCount,categories
1,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,"['Mark P. O. Morford', 'Robert J. Lenardon']",Provides an introduction to classical myths pl...,808.0,['Social Science']
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,['Richard Bruce Wright'],"In a small town in Canada, Clara Callan reluct...",414.0,['Actresses']
3,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,"[""Carlo D'Este""]","Here, for the first time in paperback, is an o...",555.0,['1940-1949']
4,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,['Gina Bari Kolata'],"Describes the great flu epidemic of 1918, an o...",330.0,['Medical']
5,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,['E. J. W. Barber'],A look at the incredibly well-preserved ancien...,240.0,['Design']


In [6]:
def read_books_ratings() -> pd.DataFrame:
                             
    path = os.path.join('BookRatings.csv')
    data = pd.read_csv(path, names=['User-ID', 'ISBN', 'Book-Rating'], sep=',').drop(0)
    return data

books_ratings = read_books_ratings()
books_ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
1,99,316748641,7
2,99,446677450,10
3,99,553347594,9
4,99,451166892,3
5,99,671621009,10


In [7]:
def read_test_users() -> pd.DataFrame:
                             
    path = os.path.join('test_users.csv')
    data = pd.read_csv(path, names=['User-ID'], sep=',').drop(0)
    return data
test_users = read_test_users()
test_users.head()

Unnamed: 0,User-ID
1,114
2,507
3,850
4,3346
5,4092


In [8]:
# Check amount of users in training data
user_id_ratings = books_ratings['User-ID'].unique().tolist()
print(len(user_id_ratings))

# Check users in test data
test_user_id_ratings = test_users['User-ID'].unique().tolist()
print(len(test_user_id_ratings))

# Check the test users NOT in training
unique_test_users = [a for a in test_user_id_ratings if a in user_id_ratings]
print(len(unique_test_users))

5719
589
489


In [9]:
def extract_decade(value):
    try: 
        decade = int(10*round(float(value)/10))
        return str(decade)
    except:
        return ""

# Content Based filtering
books_meta['categories'] = books_meta['categories'].astype(str).fillna('')
books_meta['categories'].loc[books_meta['categories'] == 'nan'] = ''
genres = books_meta[['ISBN', 'categories']].groupby(by='ISBN')['categories'].apply(','.join).str.replace(' ', '').str.replace('\[|\]', '').apply(str.lower)
display(genres.head())


ISBN
0001055607           
0001061127    'chess'
0001232088    'bears'
0001360469           
0001711253    'bears'
Name: categories, dtype: object

In [10]:
# Unique Books with ratings
book_isbn_array = books_ratings['ISBN'].unique().tolist()
display(len(book_isbn_array))

# Build new DF with processed data
doc_tags = books_meta[['ISBN','categories', 'Year-Of-Publication']].set_index(['ISBN'])
display(doc_tags.head())

doc_tags['categories'] = doc_tags['categories'].str.replace(' ', '').str.replace('\[|\]', '').apply(str.lower)
doc_tags['Year-Of-Publication'] = doc_tags['Year-Of-Publication'].astype(str).apply(extract_decade)

display(len(doc_tags))
doc_tags = doc_tags.loc[doc_tags.index.isin(book_isbn_array)] # only use books with ratings!
display(doc_tags.head())
display(len(doc_tags))

47768

Unnamed: 0_level_0,categories,Year-Of-Publication
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1
195153448,['Social Science'],2002
2005018,['Actresses'],2001
60973129,['1940-1949'],1991
374157065,['Medical'],1999
393045218,['Design'],1999


112341

Unnamed: 0_level_0,categories,Year-Of-Publication
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1
2005018,'actresses',2000
60973129,'1940-1949',1990
374157065,'medical',2000
399135782,'fiction',1990
425176428,'history',2000


47768

In [11]:
def get_tf_idf(doc_series: pd.Series):
    vectorizer = TfidfVectorizer()
    vec = vectorizer.fit_transform(doc_series)
    doc_tf_idf = csr_matrix(vec)
    
    return doc_tf_idf

# Items Profile
tags_idf = get_tf_idf(doc_tags['categories'])
decades_idf = get_tf_idf(doc_tags['Year-Of-Publication'])
item_profiles = np.hstack([tags_idf.toarray(), decades_idf.toarray()])
item_profiles = csr_matrix(item_profiles)
display(item_profiles)

<47768x2676 sparse matrix of type '<class 'numpy.float64'>'
	with 97325 stored elements in Compressed Sparse Row format>

In [12]:
# Pivot ratings dataframe
pivot_ratings = books_ratings.pivot(index='User-ID', columns='ISBN', values='Book-Rating').astype(float).fillna(0)
display(pivot_ratings.head())


ISBN,0001360469,0001714600,0001935968,0001981307,0001981625,0002005018,0002006588,0002111314,0002116286,0002190915,...,987550095X,987906500X,9879065271,9879789970,9968746037,9971400162,9974643058,9976100256,9997507002,9997508769
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100088,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
def make_user_profiles(R, item_profiles):
    return np.dot(R, item_profiles)

R = csr_matrix(pivot_ratings) 
user_profiles = make_user_profiles(R, item_profiles)

In [14]:
def make_predictions(R, item_profiles, user_profiles):

    preds = cosine_similarity(user_profiles, item_profiles)
    
    # Exclude previously rated items.
    preds[R.nonzero()] = 0
    
    return csr_matrix(preds)

pred = make_predictions(R, item_profiles, user_profiles)


In [15]:
display(pred)

<5719x47768 sparse matrix of type '<class 'numpy.float64'>'
	with 254808222 stored elements in Compressed Sparse Row format>

In [16]:
user_profiles

<5719x2676 sparse matrix of type '<class 'numpy.float64'>'
	with 74251 stored elements in Compressed Sparse Row format>

In [17]:
item_profiles

<47768x2676 sparse matrix of type '<class 'numpy.float64'>'
	with 97325 stored elements in Compressed Sparse Row format>