In [None]:
# Package Imports
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import sklearn
from scipy.sparse import coo_matrix
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

# Load Data and Create Sparse Matrix

In [None]:
# Load the book table from CSV (path is relative to the notebook's working directory)
books_csv = 'red_books.csv'
books = pd.read_csv(books_csv)
books

In [None]:
# Load the coordinate-format encoding of the user/book matrix.
# Later cells read the columns r_index, c_index and data from this frame.
sparse_csv = 'users_sparse.csv'
sparse_df = pd.read_csv(sparse_csv)
sparse_df

In [None]:
# Build the user x book matrix from its coordinate encoding, then convert to
# compressed sparse row (CSR) format, which supports the row slicing and
# arithmetic used in later cells.
# Bracket access is used for the columns so the lookup can never be shadowed by
# a DataFrame attribute of the same name.
row = sparse_df['r_index'].to_numpy()
col = sparse_df['c_index'].to_numpy()
dat = sparse_df['data'].to_numpy()

# Size the matrix from the largest index, not from the number of distinct
# indices: if any index value is unused (a gap), "number of unique values" is
# smaller than "max index + 1" and coo_matrix rejects the out-of-range entries.
# For gap-free indices both formulas give the same shape.
n_rows = int(row.max()) + 1 if row.size else 0
n_cols = int(col.max()) + 1 if col.size else 0
users = coo_matrix((dat, (row, col)), shape=(n_rows, n_cols)).tocsr()

# Determining k

In [None]:
# Number of distinct user row indices present in the data
distinct_user_rows = np.unique(row)
n_users = len(distinct_user_rows)
n_users

In [None]:
# Heuristic neighbor count: half the square root of the number of users,
# rounded to the nearest integer (the source of this rule was not recorded).
k = round(math.sqrt(n_users) / 2)
k

In [None]:
# TODO: Systematic approach to choosing k (not yet implemented)

# KNN for Cosine Similarities (don't use)

In [None]:
# Set up KNN: brute-force search using cosine distance.
# Use the k computed above instead of a hard-coded neighbor count so this cell
# stays in sync with the data set size.
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=k)
knn.fit(users)

# Pick a user id (a row index into `users`, i.e. 0 .. users.shape[0] - 1)
user_id = 478

# Get k neighbors for user_id.
# NOTE: the query row is itself part of the fitted data, so it is normally
# returned as one of the k results (at distance 0).
nbrs = knn.kneighbors(users.getrow(user_id))

# Tuple of (distances, user indices). Cosine distance is 1 - cosine similarity,
# so a distance of 0 means cosine similarity == 1.
nbrs

# With Cosine Similarity Function Instead

In [None]:
# Pick a user id (a row index into `users`, i.e. 0 .. users.shape[0] - 1)
user_id = 478

# Cosine similarity between the chosen user and every user; shape (1, n_rows of users)
similarities = cosine_similarity(users.getrow(user_id), users)

# Rank all users by similarity, ascending. The frame's index is the user's row
# position in `users`, which is what later cells use to look ratings up.
sim_index = pd.DataFrame({'sim': similarities.ravel()})
sim_index = sim_index.sort_values('sim')

# Remove the chosen user explicitly. Relying on "similarity != 1" for this is
# fragile: floating-point rounding can make the self-similarity differ from
# exactly 1, and a user whose row is all zeros has self-similarity 0.
sim_index = sim_index.drop(index=user_id)

# As before, also skip users whose similarity is 1 (vectors pointing in the same
# direction), but compare with a small tolerance instead of exact float equality.
is_identical = np.isclose(sim_index['sim'], 1.0, rtol=0.0, atol=1e-9)
sim_index = sim_index[~is_identical]

# Keep the k most similar users (still in ascending order of similarity).
# tail(k) is used rather than iloc[-k:] because iloc[-0:] would return *every*
# row when k == 0.
nbrs = sim_index.tail(k).reset_index()

# Weighted Adjusted Average

In [None]:
# Books (column indices) that have an entry for user_id
user_books = sparse_df.loc[sparse_df['r_index'] == user_id, 'c_index'].to_numpy()

# Candidate books: everything any neighbor has an entry for, minus the user's own books.
# union_i(nbr_i \ user) == (union_i nbr_i) \ user, so a single membership pass over
# sparse_df replaces rescanning the whole frame once per neighbor. np.setdiff1d
# returns sorted unique values, the same as the repeated np.union1d did.
is_nbr_entry = sparse_df['r_index'].isin(nbrs['index'])
all_nbr_books = sparse_df.loc[is_nbr_entry, 'c_index'].to_numpy()
set_diff = np.setdiff1d(all_nbr_books, user_books)

In [None]:
# Row positions in `users` of the selected neighbors
nbr_ids = np.array(nbrs['index'])

# Neighbor rows, restricted to the columns of books the neighbors have but the user does not
candidate_ratings = users[nbr_ids][:, set_diff]

# Wrap in a sparse DataFrame labelled by neighbor id (rows) and book index (columns)
# so later cells can look ratings up by label
nbrs_books = pd.DataFrame.sparse.from_spmatrix(
    candidate_ratings,
    index=nbr_ids,
    columns=set_diff,
)

In [None]:
# Calculate the weighted adjusted average for every candidate book:
#
#     sum_i(sim_i * rating_i) / sum_i(sim_i)  -  1 / n_raters
#
# where i runs over the neighbors with a non-zero rating for the book. The 1/n_raters
# term lowers the score of books that only a few neighbors rated.
#
# Computed for all books at once on the sparse matrix instead of one pandas
# column at a time. Books with no non-zero neighbor rating, or whose raters all
# have similarity 0, get NaN (previously the former raised ZeroDivisionError).

# Similarity of each neighbor, aligned by label to the rows of nbrs_books
sims = (
    nbrs.set_index('index')['sim']
    .reindex(nbrs_books.index)
    .to_numpy(dtype=float)
)

weighted_adj_averages = np.full(len(set_diff), np.nan)

if len(set_diff) > 0 and len(nbrs_books) > 0:
    # (n_neighbors x n_books) ratings; drop stored zeros so "stored" means "rated"
    ratings = nbrs_books.sparse.to_coo().tocsc().astype(float)
    ratings.eliminate_zeros()

    # Same sparsity pattern with every rating replaced by 1 (indicator of "rated")
    rated = ratings.copy()
    rated.data[:] = 1.0

    weighted_sum = ratings.T.dot(sims)   # sum_i sim_i * rating_i, per book
    tot_sim = rated.T.dot(sims)          # sum_i sim_i over the raters, per book
    n_raters = ratings.getnnz(axis=0)    # number of raters, per book

    valid = (n_raters > 0) & (tot_sim != 0)
    weighted_adj_averages[valid] = (
        weighted_sum[valid] / tot_sim[valid] - 1.0 / n_raters[valid]
    )

In [None]:
# Display the weighted adjusted averages (one entry per book in set_diff, same order)
weighted_adj_averages