In [2]:
import numpy as np
import pandas as pd
import scipy.sparse 
import multiprocessing as mp
import glob
import re
import time, timeit
import matplotlib.pyplot as plt
import sys
import random
import os
# Add the project's local module directory to the import search path. The entry is
# inserted at index 1, i.e. directly after the first element of sys.path. The path
# is relative, so it is resolved against the current working directory.
sys.path.insert(1, '../src/mf_algorithms')

# NOTE(review): the wildcard import hides where names come from. `mf`, which the
# cells below call, is presumably defined in `functions` — confirm and consider
# importing it explicitly.
from functions import *

## Load and format data

The following dataset was found on a lab website from UCSD: https://cseweb.ucsd.edu/~jmcauley/datasets.html

Specifically, I went with the dataset of Amazon gift card purchases (a subset of their Amazon dataset), which is smaller than most of the other datasets at only 147,136 rows.

The dataset is first read from a csv file into a dataframe. After duplicates are removed, the dataframe is pivoted and then cast into a matrix. The resulting matrix is (as expected) extremely sparse.

In [3]:
# Path to the ratings CSV. Set the GIFT_CARDS_CSV environment variable to point at a
# local copy; the fallback is the path this notebook was originally run with.
DATA_PATH = os.environ.get('GIFT_CARDS_CSV', 'C:\\Users\\Edwin\\Downloads\\Gift_Cards.csv')

# Column names are supplied explicitly via `names`, so the first line of the file is
# read as data rather than as a header.
rawdata = pd.read_csv(DATA_PATH, names=['productid', 'reviewerid', 'rating', 'timestamp'])

# Keep only the first row for each (productid, reviewerid) pair; the pivot that
# follows needs unique index/column combinations. `drop_duplicates` replaces the
# original `rawdata[-mask]`, which relied on unary minus over a boolean mask.
rawdata = rawdata.drop_duplicates(subset=['productid', 'reviewerid'])
rawdata.shape

(147136, 4)

In [12]:
# Reshape the long ratings table into a wide one: one row per reviewer, one column
# per product, cell value = rating. Pairs with no rating come out as NaN.
ratings_wide = rawdata.pivot(index='reviewerid', columns='productid', values='rating')

# Replace the missing ratings with 0 and hand the result on as a plain NumPy array.
testmat = ratings_wide.fillna(0).to_numpy()
testmat

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Algorithm Comparison

ALS was compared to the fast version of BRK (no weighted sampling was done for the subiteration steps). An arbitrary factor dimension of 50 was chosen for both algorithms.

### ALS 

In [5]:
%%time
A, S, error = mf(testmat, k = 50, s = 1, niter = 100, siter = 1, solver = 'als')
error

Wall time: 38.3 s


94.26144505035776

In [6]:
%%time
A, S, error = mf(testmat, k = 50, s = 1, niter = 500, siter = 1, solver = 'als')
error

Wall time: 2min 45s


83.01018823901303

In [7]:
%%time
A, S, error = mf(testmat, k = 50, s = 1, niter = 1000, siter = 1, solver = 'als')
error

Wall time: 5min 45s


69.80336213665748

### BRK

In [11]:
%%time
A, S, error = mf(testmat, k = 50, s = 12500, niter = 1000, siter = 1, solver = 'quickbrk')
error

Wall time: 1min 12s


70.58115028044224

In [9]:
%%time
A, S, error = mf(testmat, k = 50, s = 25000, niter = 1000, siter = 1, solver = 'quickbrk')
error

Wall time: 2min 37s


70.00416029088552

In [10]:
%%time
A, S, error = mf(testmat, k = 50, s = 50000, niter = 1000, siter = 1, solver = 'quickbrk')
error

Wall time: 5min 35s


69.82047589504444

## Discussion

These results suggest that BRK is advantageous over ALS in terms of computational time without much sacrifice in factorization "quality". Decreasing the sample size to about one third of the total number of rows (50000) did not reduce the computation time by much (5min 35s versus 5min 45s for ALS with 1000 iterations), and the relative error stayed more or less the same (69.82 versus 69.80). Reducing the sample size further, to about one sixth (25000) and one twelfth (12500) of the rows, increased the relative error only slightly (70.00 and 70.58) while decreasing the computation time significantly (2min 37s and 1min 12s).