In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
ratings=pd.read_csv('BX-Book-Ratings.csv',sep=';',error_bad_lines=False)

In [3]:
books=pd.read_csv('BX-Books.csv',sep=';', usecols=['ISBN','Book-Title','Book-Author','Year-Of-Publication','Publisher']
                  ,error_bad_lines=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
users=pd.read_csv('BX-Users.csv',sep=';',error_bad_lines=False)

In [5]:
#rating info
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [6]:
#books info
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [7]:
print(f'Users: {len(users)}\nBooks: {len(books)}\nRatings: {len(ratings)}')

Users: 278858
Books: 271379
Ratings: 1149779


In [8]:
combine_book_rating=pd.merge(ratings,books,on='ISBN')
columns=['Book-Author','Year-Of-Publication','Publisher']
combine_book_rating=combine_book_rating.drop(columns,axis=1)
combine_book_rating.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title
0,276725,034545104X,0,Flesh Tones: A Novel
1,2313,034545104X,5,Flesh Tones: A Novel
2,6543,034545104X,0,Flesh Tones: A Novel
3,8680,034545104X,5,Flesh Tones: A Novel
4,10314,034545104X,9,Flesh Tones: A Novel


In [9]:
book_ratingcount=(combine_book_rating.groupby(by=['Book-Title'])['Book-Rating'].count().reset_index().rename(columns={'Book-Rating':'totalRatingCount'})[['Book-Title','totalRatingCount']])
book_ratingcount.head() 

Unnamed: 0,Book-Title,totalRatingCount
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1


In [10]:
rating_with_totalRatingCount=combine_book_rating.merge(book_ratingcount, left_on='Book-Title', right_on='Book-Title')
rating_with_totalRatingCount.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,totalRatingCount
0,276725,034545104X,0,Flesh Tones: A Novel,60
1,2313,034545104X,5,Flesh Tones: A Novel,60
2,6543,034545104X,0,Flesh Tones: A Novel,60
3,8680,034545104X,5,Flesh Tones: A Novel,60
4,10314,034545104X,9,Flesh Tones: A Novel,60


In [11]:
#statistics of total rating count
pd.set_option('display.float_format',lambda x: '%.3f'% x)
print(book_ratingcount['totalRatingCount'].describe())
#The median book has been rated only once

count   241090.000
mean         4.277
std         16.738
min          1.000
25%          1.000
50%          1.000
75%          3.000
max       2502.000
Name: totalRatingCount, dtype: float64


In [12]:
popularity_threshold=30
rating_popular_book=rating_with_totalRatingCount.query('totalRatingCount >= @popularity_threshold')
rating_popular_book.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,totalRatingCount
0,276725,034545104X,0,Flesh Tones: A Novel,60
1,2313,034545104X,5,Flesh Tones: A Novel,60
2,6543,034545104X,0,Flesh Tones: A Novel,60
3,8680,034545104X,5,Flesh Tones: A Novel,60
4,10314,034545104X,9,Flesh Tones: A Novel,60


In [13]:
combined=rating_popular_book.merge(users,left_on='User-ID',right_on='User-ID',how='left')
combined=combined.drop(['Age','Location'],axis=1)
combined.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,totalRatingCount
0,276725,034545104X,0,Flesh Tones: A Novel,60
1,2313,034545104X,5,Flesh Tones: A Novel,60
2,6543,034545104X,0,Flesh Tones: A Novel,60
3,8680,034545104X,5,Flesh Tones: A Novel,60
4,10314,034545104X,9,Flesh Tones: A Novel,60


In [14]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 372259 entries, 0 to 372258
Data columns (total 5 columns):
User-ID             372259 non-null int64
ISBN                372259 non-null object
Book-Rating         372259 non-null int64
Book-Title          372259 non-null object
totalRatingCount    372259 non-null int64
dtypes: int64(3), object(2)
memory usage: 17.0+ MB


# Implementing kNN

In [15]:
combined= pd.DataFrame(combined)

In [16]:
users_pivot=combined.pivot_table(index='Book-Title',columns='User-ID',values='Book-Rating',aggfunc=np.mean).fillna(0)

In [17]:
users_rating_matrix=csr_matrix(users_pivot.values)

In [18]:
#algorithm will calculate the cosine similarity between rating vectors
model_knn=NearestNeighbors(metric='cosine',n_neighbors=6,n_jobs=1,)
model_knn.fit(users_rating_matrix)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=1, n_neighbors=6, p=2, radius=1.0)

### Test our model and make some recommendations:

In [19]:
#query_index=np.random.choice(users_pivot.shape[0])
query_index=np.where(users_pivot.index.values=="The Green Mile: Coffey's Hands (Green Mile Series)")[0]
distances,indices=model_knn.kneighbors(users_pivot.iloc[query_index,:].values.reshape(1,-1),n_neighbors=6)

In [20]:
for i in range(0,len(distances.flatten())):
    if i==0:
        print('Recommendations for {0} :\n'.format(users_pivot.index[query_index]))
    else:
        print('{0}: {1},with distance of {2}'.format(i,users_pivot.index[indices.flatten()[i]],distances.flatten()[i]))

Recommendations for Index(['The Green Mile: Coffey's Hands (Green Mile Series)'], dtype='object', name='Book-Title') :

1: The Green Mile: Night Journey (Green Mile Series),with distance of 0.2719606608979157
2: The Green Mile: The Mouse on the Mile (Green Mile Series),with distance of 0.31030354618181977
3: The Green Mile: The Bad Death of Eduard Delacroix (Green Mile Series),with distance of 0.3286312781681793
4: The Two Dead Girls (Green Mile Series),with distance of 0.3397658291142378
5: The Green Mile: Coffey on the Mile (Green Mile Series),with distance of 0.38080066489823006


# Evaluation on User-based Knn

In [21]:
data=combined.pivot_table(index='User-ID',columns='Book-Title',values='Book-Rating',dropna=True).fillna(0)

In [22]:
train_data, test_data = train_test_split(data, test_size=0.25)

In [23]:
#removing lines that are completely zero or have just one non-zero rating
index_all=[]
for i in range(len(test_data.values)):
    if (test_data.values[i]==0).all()==True or len(np.where(test_data.values[i]!=0)[0])==1 :
        index_all.append(i)

In [24]:
test_data=np.delete(test_data.values,index_all,axis=0)

In [25]:
print('train_data shape: '+str(train_data.shape) +'\n'+'test_data shape: '+str(test_data.shape))

train_data shape: (41055, 4674)
test_data shape: (4082, 4674)


# Applying one Knn

In [26]:
train_data1=csr_matrix(train_data)
test_data1=csr_matrix(test_data)

In [27]:
# algorithm will calculate the cosine similarity between rating vectors
model_knn=NearestNeighbors(metric='cosine',algorithm='auto',n_jobs=1,n_neighbors=200)
model_knn.fit(train_data1)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=1, n_neighbors=200, p=2,
                 radius=1.0)

In [28]:
pred=[]

In [29]:
for i in range(test_data1.shape[0]):
    distances,indices=model_knn.kneighbors(test_data1[i].toarray()[0].reshape(1,-1),n_neighbors=200)
    pred_rating=train_data1[indices[0]].toarray().sum(axis=0)/model_knn.n_neighbors
    pred.append(pred_rating)

In [30]:
pred=np.array(pred)

In [31]:
MSE1=np.sqrt(mean_squared_error(test_data1.toarray(),pred))
MAE1=mean_absolute_error(test_data1.toarray(), pred)

In [32]:
print('MSE : '+str(MSE1))
print('MAE : '+str(MAE1))

MSE : 0.2924517066847602
MAE : 0.031645749634280164


# Applying multiple Knn

In [33]:
model_knn=NearestNeighbors(metric='cosine',algorithm='auto',n_jobs=1,n_neighbors=200)

In [34]:
pred_rating_weighted=[]

In [35]:
for j in range(test_data.shape[0]):
    pred=np.zeros(test_data[j].shape)
    index=np.where(test_data[j]!=0)[0]
    for i in range(len(index)):
        t=np.delete(index,i)
        b=train_data.values[:,t]
        model_knn.fit(b)
        distances,indices=model_knn.kneighbors(test_data[j][t].reshape(1,-1),n_neighbors=200)
        c=train_data.values[indices[0]][:,i]
        if c.nonzero()[0].size == 0:
            p=5.0
        else:
            #p=c[c.nonzero()[0]].mean()
            weight=distances[0][c.nonzero()[0]]
            weight[weight==0]=0.00001
            p=np.average(c[c.nonzero()[0]],axis=0,weights=1/weight)
        pred[index[i]]=p
    pred_rating_weighted.append(pred)

In [37]:
pred_rating_weighted=np.array(pred_rating_weighted)

In [38]:
MSE2=np.sqrt(mean_squared_error(test_data,pred_rating_weighted))
MAE2=mean_absolute_error(test_data, pred_rating_weighted)

In [39]:
print('MSE : '+str(MSE2))
print('MAE : '+str(MAE2))

MSE : 0.10863143775890322
MAE : 0.0035575949137529738


# Comparison

In [40]:
print('one Knn')
print('MSE : '+str(MSE1))
print('MAE : '+str(MAE1))
print('multiple Knn')
print('MSE : '+str(MSE2))
print('MAE : '+str(MAE2))
print('We have '+str(MSE1-MSE2) + ' decrease in MSE and '+str(MAE1-MAE2)+' decrease in MAE.')

one Knn
MSE : 0.2924517066847602
MAE : 0.031645749634280164
multiple Knn
MSE : 0.10863143775890322
MAE : 0.0035575949137529738
We have 0.183820268925857 decrease in MSE and 0.02808815472052719 decrease in MAE.
