In [1]:
# Package Imports
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import sklearn
from scipy.sparse import coo_matrix
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

# Load Data and Create Sparse Matrix

In [2]:
# Load the book table from red_books.csv and display it
books = pd.read_csv(filepath_or_buffer='red_books.csv')
books

Unnamed: 0.1,Unnamed: 0,book_id,title,avg_rating,description
0,0,12182387,"The Passion (Dark Visions, #3)",4.04,This is the final tale in the bestselling auth...
1,1,20135365,Hope's Daughter,3.80,Life should be simple for Cassie.\nFor the sma...
2,2,21401181,"Half Bad (Half Life, #1)",3.80,Wanted by no one.\nHunted by everyone.\nSixtee...
3,3,10099492,Twelfth Grade Kills (The Chronicles of Vladimi...,4.35,It all comes down to this.\nVlad's running out...
4,4,22642971,The Body Electric,3.71,The future world is at peace.\nElla Shepherd h...
...,...,...,...,...,...
92777,93393,18221503,"Ãlmem Gerekirse (Revenants, #3)",4.21,Sevdigini Kurtarmak Icin Ne Kadarina Hazirsin?...
92778,93394,8987191,"The Mockingbirds (The Mockingbirds, #1)",3.79,Some schools have honor codes.\nOthers have ha...
92779,93395,1885730,Joel and Cat Set the Story Straight,3.78,
92780,93396,23636536,Another Day,3.67,The eagerly anticipated companion to David Lev...


In [3]:
# Load the rating triplets (r_index, c_index, data) that encode the sparse user matrix
sparse_df = pd.read_csv(filepath_or_buffer='users_sparse.csv')
sparse_df

Unnamed: 0,r_index,c_index,data
0,0,25132,3
1,0,77220,2
2,0,32990,5
3,0,74424,5
4,0,20215,5
...,...,...,...
14731903,567805,148,5
14731904,567805,89946,4
14731905,567805,76582,4
14731906,567805,77720,5


In [4]:
# Build the rating matrix from the (row, col, value) triplets in sparse_df and
# convert it to compressed sparse row (CSR) format for the row slicing and
# arithmetic done in later cells.
row = sparse_df['r_index'].to_numpy()
col = sparse_df['c_index'].to_numpy()
dat = sparse_df['data'].to_numpy()

# Size each axis by the largest index present (+1) rather than by the number of
# distinct indices: if any index value is skipped, the distinct count is smaller
# than max + 1 and coo_matrix rejects the entries that fall outside the shape.
# When the indices are contiguous from 0 both sizes are identical.
n_rows = int(row.max()) + 1 if row.size else 0
n_cols = int(col.max()) + 1 if col.size else 0

users = coo_matrix((dat, (row, col)), shape=(n_rows, n_cols)).tocsr()

# Determining k

In [5]:
# Number of distinct row indices (users) present in the rating triplets
n_users = len(np.unique(row))
n_users

567806

In [6]:
# Rule-of-thumb neighborhood size: half the square root of the number of users,
# rounded with Python's round().
# Source: https://medium.com/@rangavamsi5/k-nearest-neighbors-algorithm-in-python-64f38792193
k = round(math.sqrt(n_users) / 2)
k

377

# KNN for Cosine Similarities (superseded by the cosine_similarity approach in the next section; don't use)

In [17]:
# Brute-force nearest-neighbor search over every row of `users`, using cosine distance
knn = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='cosine').fit(users)

# User to query: any row index from 0 to 567805
user_id = 478

# Find the k rows closest to this user's rating vector
query_row = users.getrow(user_id)
nbrs = knn.kneighbors(query_row)

# Shown as returned by kneighbors: distances first, then the matching row indices.
# These are cosine distances, so 0 means a cosine similarity of 1.
nbrs

(array([[0.        , 0.18758896, 0.1918774 , 0.20668076, 0.22179574,
         0.23068979, 0.25324948, 0.26470426, 0.26800421, 0.27870049,
         0.29277368, 0.29386828, 0.29466185, 0.29474137, 0.30592922,
         0.30714825, 0.30813461, 0.30952902, 0.30952902, 0.31204609,
         0.31224229, 0.31268863, 0.31305411, 0.31521318, 0.31522938,
         0.31586439, 0.31666001, 0.31732506, 0.317662  , 0.3181459 ,
         0.31908727, 0.31908727, 0.31996388, 0.31996388, 0.32104126,
         0.3217587 , 0.32188791, 0.32217538, 0.32241643, 0.32241643,
         0.32241643, 0.32241643, 0.32241643, 0.32241643, 0.32241643,
         0.32257648, 0.3227403 , 0.3227403 , 0.3227403 , 0.3227403 ,
         0.32313696, 0.32507363, 0.32532872, 0.32565019, 0.32565019,
         0.32565019, 0.32565019, 0.32590615, 0.32590615, 0.32590615,
         0.32590615, 0.32590615, 0.32590615, 0.32590615, 0.32590615,
         0.32590615, 0.32590615, 0.32590615, 0.32590615, 0.32590615,
         0.32590615, 0.32590615, 0

# With Cosine Similarity Function Instead

In [20]:
# Pick a user id from 0 to 567805
# TODO: presumably this becomes the value passed in from the GUI - confirm
user_id = 478

# Cosine similarity between this user's rating row and every row of `users`
# (the result has a single row with one entry per user)
similarities = cosine_similarity(users.getrow(user_id), users)

# One row per user; the frame's index is the user's row number in `users`
sim_index = pd.DataFrame({'sim': similarities.flatten()})
sim_index = sim_index.sort_values(['sim'])

# Drop users whose similarity is 1 (the user itself and exact matches).
# Compare with a tolerance instead of `!= 1`: floating-point round-off can leave
# such a value a hair away from 1.0, and an exact test would then keep that user
# as the top "neighbor".
is_identical = np.isclose(sim_index['sim'], 1.0, rtol=0.0, atol=1e-9)
sim_index = sim_index[~is_identical]

# Keep the k most similar remaining users. tail(k) is used rather than
# iloc[-k:] because the latter returns every row when k == 0.
# reset_index exposes the user row number as an 'index' column next to 'sim'.
nbrs = sim_index.tail(k).reset_index()

# The same neighbors ordered by user row number, for display
a = nbrs.sort_values(by='index', ascending=True)
a

Unnamed: 0,index,sim
70,366,0.615868
330,1260,0.677260
98,1422,0.619479
310,1494,0.674094
239,1532,0.661711
...,...,...
39,556631,0.608838
229,557157,0.659749
2,560019,0.603328
270,560295,0.668488


# Weighted Adjusted Average

In [9]:
# Books read by the user_id (column indices, in the order they appear in sparse_df)
user_books = np.array(sparse_df[sparse_df['r_index'] == user_id].c_index)

# Books read by at least one neighbor. `users` was built from sparse_df, so the
# stored column indices of a neighbor's CSR row are exactly that neighbor's
# c_index values; reading them from the matrix avoids re-scanning the whole
# ratings table once per neighbor.
neighbor_rows = users[nbrs['index'].to_numpy()]
neighbor_books = np.unique(neighbor_rows.indices)

# The union of the per-neighbor set differences equals
# (union of the neighbors' books) minus (the user's books).
# The result is sorted and unique; cast so the dtype matches the 64-bit
# indices read from sparse_df rather than the matrix's internal index dtype.
set_diff = np.setdiff1d(neighbor_books, user_books).astype(np.int64)

In [10]:
# Row labels: the neighbors' row numbers in `users`
neighbor_ids = np.array(nbrs['index'])

# Slice the rating matrix to the rows corresponding to the neighbors, then to the
# columns corresponding to books read by neighbors but not by the user
neighbor_ratings = users[nbrs['index']][:, set_diff]

# Wrap the slice in a sparse DataFrame for ease of computation, labelling
# columns with the book column indices and rows with the neighbor ids
nbrs_books = (
    pd.DataFrame.sparse.from_spmatrix(neighbor_ratings)
    .set_axis(set_diff, axis=1)
    .set_axis(neighbor_ids, axis=0)
)

In [11]:
# Calculate weighted adjusted averages, one per book in set_diff.
#
# For each book, over the neighbors that gave it a non-zero rating:
#     score = sum(sim * rating) / sum(sim)  -  1 / (number of such neighbors)
# (TODO: confirm this is the formula the team settled on.)
#
# Computed with sparse matrix products on the same neighbors x books slice of
# `users` that nbrs_books wraps, instead of building a DataFrame per book.

# Neighbor row numbers and their similarities, in matching order
neighbor_rows = nbrs['index'].to_numpy()
neighbor_sims = nbrs['sim'].to_numpy(dtype=float)

# Ratings of the candidate books by the neighbors; drop stored zeros so that
# only non-zero ratings count, as in the per-book filter this replaces
ratings = users[neighbor_rows][:, set_diff]
ratings.eliminate_zeros()

# Same sparsity pattern with every stored rating replaced by 1.0
rated = ratings.copy()
rated.data = np.ones_like(rated.data, dtype=float)

weighted_sum = ratings.T @ neighbor_sims   # sum of sim * rating per book
sim_total = rated.T @ neighbor_sims        # sum of sim over the raters per book
n_raters = ratings.getnnz(axis=0)          # number of raters per book

# A book with no non-zero rating, or whose raters all have similarity 0,
# yields NaN here instead of raising ZeroDivisionError.
with np.errstate(divide='ignore', invalid='ignore'):
    weighted_adj_averages = weighted_sum / sim_total - 1.0 / n_raters

In [12]:
# Display the computed scores; entry i belongs to the book column set_diff[i]
weighted_adj_averages

array([1.        , 1.        , 1.        , 2.        , 1.        ,
       3.        , 3.        , 2.        , 3.        , 3.        ,
       3.        , 3.01868276, 1.        , 1.99903698, 3.        ,
       4.        , 3.5       , 1.        , 2.        , 2.5       ,
       3.        , 3.        , 0.        , 2.        , 3.65518298,
       2.        , 2.        , 2.        , 4.        , 3.32941557,
       1.        , 2.        , 1.        , 4.        , 2.        ,
       3.        , 3.        , 3.        , 2.        , 3.01523151,
       2.        , 2.99789456, 2.        , 3.        , 2.        ,
       1.        , 2.66666667, 4.        , 1.        , 1.454196  ,
       2.90570473, 3.        , 1.69591462, 3.        , 0.        ,
       2.        , 2.        , 2.99653951, 3.        , 3.00145574,
       1.97913037, 2.        , 1.        , 3.        , 1.62702037,
       0.        , 1.        , 1.        , 1.        , 2.        ,
       2.        , 4.        , 1.        , 2.        , 2.     