In [1]:
# Package Imports
import pandas as pd
import numpy as np
import scipy as sp
from scipy.sparse import coo_matrix
import numpy_indexed as npi

import requests
from io import StringIO

In [29]:
# Read in book data
colnames = ['book_id','title','avg_rating','description']

orig_url='https://drive.google.com/file/d/15DvRQIdkXVg3qXVkyDm2GsgXVcTmgzYj/view?usp=sharing'

file_id = orig_url.split('/')[-2]
dwn_url='https://drive.google.com/uc?export=download&id=' + file_id
url = requests.get(dwn_url).text
books_raw = StringIO(url)

books = pd.read_csv(books_raw, names=colnames)
books.head()

Unnamed: 0,book_id,title,avg_rating,description
0,12182387,"The Passion (Dark Visions, #3)",4.04,This is the final tale in the bestselling auth...
1,20135365,Hope's Daughter,3.8,Life should be simple for Cassie.\nFor the sma...
2,21401181,"Half Bad (Half Life, #1)",3.8,Wanted by no one.\nHunted by everyone.\nSixtee...
3,10099492,Twelfth Grade Kills (The Chronicles of Vladimi...,4.35,It all comes down to this.\nVlad's running out...
4,22642971,The Body Electric,3.71,The future world is at peace.\nElla Shepherd h...


In [2]:
# Read in review data
rev_colnames = ['user_id','book_id','review_text','rating']

rev_url='https://drive.google.com/file/d/1Ctew6lEpW1V-is9JyRLmO_z-KgVI5c95/view?usp=sharing'

rev_file_id = rev_url.split('/')[-2]
rev_dwn_url='https://drive.google.com/uc?export=download&id=' + rev_file_id
rev_url = requests.get(rev_dwn_url).text
reviews_raw = StringIO(rev_url)

reviews = pd.read_csv(reviews_raw, names=rev_colnames)
reviews.head()
# Isolate reviews with non-zero rating (note if a rating was given, 0 was not an option)
reviews = reviews[reviews['rating']!=0]
reviews

Unnamed: 0,user_id,book_id,review_text,rating
0,<!DOCTYPE html><html><head><title>Google Drive...,,,
1,.goog-inline-block{position:relative;display:-...,sans-serif;margin:0}.grecaptcha-badge{visibili...,.uc-warning-caption{color:#222;font-size:16px}...,


In [7]:
# Map between user_ids and index
unique_users = np.array(reviews.user_id.unique())
user_index = np.array(range(len(unique_users)))
user_map = dict(zip(unique_users,user_index))

# Map between book_ids and index
book_ids = np.array(books.book_id)
book_index = np.array(range(len(book_ids)))
book_map = dict(zip(book_ids,book_index))

# Apply maps to review set
row = npi.remap(reviews.user_id, list(user_map.keys()), list(user_map.values()))
col = npi.remap(reviews.book_id, list(book_map.keys()), list(book_map.values()))
dat = np.array(reviews.rating)

NameError: name 'reviews' is not defined

In [None]:
# Create sparse matrix in coordinate format
users_size = unique_users.size
books_size = book_ids.size
users = coo_matrix((dat, (row,col)), shape=(users_size,books_size))

In [None]:
# Make dataframe of col, row, dat
users_sparse_df = pd.DataFrame(list(zip(row,col,dat)),
                              columns=['r_index','c_index','data'])

In [None]:
# Export to CSV
users_sparse_df.to_csv('users_sparse.csv',index=False)