In [2]:
# Required Packages
import platform                         #used to get the computer name for each user
import requests                         #used to request the books file from Google Drive
from io import StringIO                 #used to read the requested file from Google Drive to be able to convert to csv
import pandas as pd                     #used to create dataframes from the .csv files
import numpy as np                      #used for arrays
import numpy_indexed as npi             #used to remap rows and columns ???
import scipy as sp                      #needed to use sparse matrix 
from scipy.sparse import coo_matrix     #used to create sparse matrix in coordinate formate ???

In [3]:
# The data files (mainly the interactions file) are too large to be stored on GitHub,
# so every collaborator keeps a local copy of them.
# The network name of the machine running the notebook is recorded here; the loading
# cells below compare against it to pick the right file location for that machine.
curr_comp = platform.uname().node
curr_comp

'DESKTOP-ARTEMI5'

### Load Data

In [4]:
# Read in book data

# Only four columns are needed:
#   - book id, title and average rating are used by the recommendation system
#   - the description was kept for a possible sentiment-analysis exploration
colnames = ['book_id','title','avg_rating','description']

# On these machines the books file is downloaded from Google Drive.
if curr_comp in ('DESKTOP-ARTEMI5', 'Mandey-Lappy-Toppy'):
    #https://stackoverflow.com/questions/56611698/pandas-how-to-read-csv-file-from-google-drive-public
    # sharing link of the books data
    orig_url='https://drive.google.com/file/d/15DvRQIdkXVg3qXVkyDm2GsgXVcTmgzYj/view?usp=sharing'
    # the Drive file id is the second-to-last path segment of the sharing link
    file_id = orig_url.split('/')[-2]
    # direct-download form of the link for that file id
    dwn_url='https://drive.google.com/uc?export=download&id=' + file_id
    # fetch the file; fail loudly on an HTTP error instead of parsing an error page as CSV,
    # and do not hang forever if the server stops responding
    response = requests.get(dwn_url, timeout=60)
    response.raise_for_status()
    url = response.text
    # wrap the downloaded text in a file-like object so pandas can read it
    books_raw = StringIO(url)
    # parse the text as CSV, supplying the column names listed above
    books = pd.read_csv(books_raw,names=colnames)

# On these machines the books file is read from a local copy instead.
elif curr_comp in ('sfort-laptop', 'sfort-desktop'):
    books = pd.read_csv('toobig/books.csv',names=colnames)

# Any other machine: stop with an explicit message rather than a NameError on `books` below.
else:
    raise RuntimeError(
        f"Unrecognised computer name {curr_comp!r}: add a branch above that says "
        "where the books file should be loaded from on this machine."
    )

# display the books table
books

Unnamed: 0,book_id,title,avg_rating,description
0,12182387,"The Passion (Dark Visions, #3)",4.04,This is the final tale in the bestselling auth...
1,20135365,Hope's Daughter,3.80,Life should be simple for Cassie.\nFor the sma...
2,21401181,"Half Bad (Half Life, #1)",3.80,Wanted by no one.\nHunted by everyone.\nSixtee...
3,10099492,Twelfth Grade Kills (The Chronicles of Vladimi...,4.35,It all comes down to this.\nVlad's running out...
4,22642971,The Body Electric,3.71,The future world is at peace.\nElla Shepherd h...
...,...,...,...,...
93393,18221503,"Ãlmem Gerekirse (Revenants, #3)",4.21,Sevdigini Kurtarmak Icin Ne Kadarina Hazirsin?...
93394,8987191,"The Mockingbirds (The Mockingbirds, #1)",3.79,Some schools have honor codes.\nOthers have ha...
93395,1885730,Joel and Cat Set the Story Straight,3.78,
93396,23636536,Another Day,3.67,The eagerly anticipated companion to David Lev...


In [5]:
# Read in interactions data

# desired columns from the interactions data
colnames = ['user_id','book_id','is_read','rating']

# This file was too large to pull directly from Google Drive: Drive answers with a
# warning for large files and none of the workarounds found online worked.
# Each collaborator therefore downloads the file once and it is read from the path
# where it lives on that collaborator's computer.
if curr_comp == 'DESKTOP-ARTEMI5':
    interactions = pd.read_csv('C:\\DSCI478_Project_Files\\interactions.csv',names=colnames)
elif curr_comp == 'Mandey-Lappy-Toppy':
    # every backslash is escaped; the path value is the same as before, without the invalid '\H' escape
    interactions = pd.read_csv('C:\\Users\\HP\\Dropbox\\000000 CSU\\CSU 2022 Spring\\DSCI 478\\Final Project\\interactions.csv',names=colnames)
elif curr_comp in ('sfort-laptop', 'sfort-desktop'):
    interactions = pd.read_csv('toobig/interactions.csv',names=colnames)
else:
    # stop with an explicit message rather than a NameError on `interactions` below
    raise RuntimeError(
        f"Unrecognised computer name {curr_comp!r}: add a branch above with the "
        "local path of interactions.csv on this machine."
    )

# Keep only interactions that are marked as read AND carry a non-zero rating.
# A single combined mask avoids building an intermediate copy of this very large table.
interactions = interactions[(interactions['is_read']==True) & (interactions['rating']!=0)]
interactions

Unnamed: 0,user_id,book_id,is_read,rating
3,8842281e1d1347389f2ab93d60773d4d,8684868,True,3
4,8842281e1d1347389f2ab93d60773d4d,8423493,True,2
5,8842281e1d1347389f2ab93d60773d4d,87976,True,5
6,8842281e1d1347389f2ab93d60773d4d,18116,True,5
7,8842281e1d1347389f2ab93d60773d4d,2767052,True,5
...,...,...,...,...
34919249,b0883ebf8e31731f1c5d91e678c26666,19057,True,5
34919250,b0883ebf8e31731f1c5d91e678c26666,13335037,True,4
34919251,b0883ebf8e31731f1c5d91e678c26666,1295102,True,4
34919252,b0883ebf8e31731f1c5d91e678c26666,11870085,True,5


In [7]:
# Sparsity of the collaborative filtering (user x book) matrix
#https://stackoverflow.com/questions/38708621/how-to-calculate-percentage-of-sparsity-for-a-numpy-array-matrix

# Sparsity = 1 - (number of non-zero values / size of the table), reported as a percentage

# each remaining interaction row is one rating, i.e. one non-zero cell of the table
num_interaction = len(interactions)
print(num_interaction)

# size of the table = number of distinct users * number of distinct books in the interactions
num_users = interactions['user_id'].unique().size
print(num_users)
num_books = interactions['book_id'].unique().size
print(num_books)

fill_ratio = num_interaction / (num_users * num_books)
sparsity = (1 - fill_ratio) * 100
print(sparsity)
# about 99.97% of the collaborative filtering matrix is empty (zero)

num_entires = 100 - sparsity
print(num_entires)
# only about 0.028% of the matrix cells hold a rating

14731908
567806
92782
99.97203625773099
0.027963742269008662


In [None]:
# Remove books with no interactions

# book ids that occur at least once in the filtered interactions
unique_books_int = interactions.book_id.unique()

# Keep only the books whose id occurs in the interactions.
# One vectorised membership test replaces the per-book loop, which rescanned the id
# array and re-filtered the whole table for every single book.
books = books[books.book_id.isin(unique_books_int)]

In [None]:
# Save the current book table to disk; the DataFrame index is written as the first column
books.to_csv('red_books.csv', index=True)

# Sparse Matrix

Coordinate sparse matrix encoding works by storing the row index, column index, and data value for each non-zero entry in the matrix. We need to map each `user_id` to a row in our sparse matrix, and each `book_id` to a column in our sparse matrix. So we create a map from unique `user_id`s to integers, and another map from each `book_id` to integers. Then we apply this map and build a sparse matrix from coordinate encoding. 

In [8]:
# Map between user_ids and index: each distinct user gets one row number,
# numbered in order of first appearance in the interactions
unique_users = np.array(interactions.user_id.unique())
print(unique_users.size)
user_index = np.arange(len(unique_users))
user_map = dict(zip(unique_users,user_index))

# Map between book_ids and index: each book gets the column number of its position in `books`
book_ids = np.array(books.book_id)
book_index = np.arange(len(book_ids))
book_map = dict(zip(book_ids,book_index))

# Apply maps to interactions set.
# missing='raise' makes the remap fail if an id has no entry in its map; with the
# default ('ignore') such an id is left unchanged, so a raw book_id could silently
# end up being used as a column index.
row = npi.remap(interactions.user_id, list(user_map.keys()), list(user_map.values()), missing='raise')
col = npi.remap(interactions.book_id, list(book_map.keys()), list(book_map.values()), missing='raise')
# the ratings are the values stored at each (row, col) position
dat = np.array(interactions.rating)

567806


In [9]:
# Build the ratings matrix in coordinate (COO) format:
# one row per distinct user, one column per entry of book_ids
users_size, books_size = unique_users.size, book_ids.size
users = coo_matrix((dat, (row, col)), shape=(users_size, books_size))

# Creating CSV

We need our recommendation system to have no dependence on the original datasets, since they are quite large. Therefore we need to build a much smaller dataset that still carries over all of the necessary information. We can take the (row, column, data) triples used in constructing our COO matrix and put them into a CSV. We also need the `book_id`s in order to recreate the bijection defined earlier purely from the data, but the `book_id`s are part of the `books.csv` dataset, which is small enough to be stored.

In [10]:
# Make a dataframe of the coordinate triples (row index, column index, rating)
# that were used to build the COO matrix.
# The columns are built directly from the arrays instead of zipping them into a
# list with one Python tuple per interaction. The index arrays are cast to int64
# so the columns are integer-typed even if `row` is held as an object array.
users_sparse_df = pd.DataFrame({
    'r_index': np.asarray(row, dtype=np.int64),
    'c_index': np.asarray(col, dtype=np.int64),
    'data': np.asarray(dat),
})
users_sparse_df

Unnamed: 0,r_index,c_index,data
0,0,25318,3
1,0,77724,2
2,0,33225,5
3,0,74913,5
4,0,20364,5
...,...,...,...
14731903,567805,148,5
14731904,567805,90540,4
14731905,567805,77084,4
14731906,567805,78226,5


In [None]:
# Write the coordinate triples to disk, leaving out the DataFrame index
users_sparse_df.to_csv(path_or_buf='users_sparse.csv', index=False)