update to scipy sparse array interface
david-cortes committed Jul 3, 2023
1 parent c204011 commit 5bee294
Showing 4 changed files with 12 additions and 14 deletions.
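The change replaces scipy's legacy sparse-matrix classes (coo_matrix, csr_matrix, csc_matrix) with the newer sparse-array interface (coo_array and friends), and switches input detection from class-name comparison to issparse() plus the .format attribute. A minimal sketch of those two points, assuming scipy >= 1.11 (toy data, not from the package):

    import numpy as np
    from scipy.sparse import coo_array, coo_matrix, issparse

    rows, cols = np.array([0, 1, 2]), np.array([1, 0, 2])
    vals = np.array([3.0, 1.0, 2.0])

    X_arr = coo_array((vals, (rows, cols)), shape=(3, 3))   # sparse-array interface
    X_mat = coo_matrix((vals, (rows, cols)), shape=(3, 3))  # legacy matrix interface

    # New-style detection: issparse() is True for both families,
    # and .format == "coo" identifies the layout without naming a class.
    for X in (X_arr, X_mat):
        print(issparse(X), X.format)        # True coo

    # Old-style detection by class name no longer matches array inputs:
    print(X_arr.__class__.__name__ == 'coo_matrix')   # False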
17 changes: 8 additions & 9 deletions hpfrec/__init__.py
@@ -2,7 +2,7 @@
import multiprocessing, os, warnings
from . import cython_loops_float, cython_loops_double, _check_openmp
import ctypes, types, inspect
-from scipy.sparse import coo_matrix, csr_matrix
+from scipy.sparse import coo_array, issparse
### TODO: don't do this, use iloc/loc and make copies instead
pd.options.mode.chained_assignment = None

@@ -381,11 +381,11 @@ def fit(self, counts_df, val_set=None):
Parameters
----------
-counts_df : pandas data frame (nobs, 3) or coo_matrix
+counts_df : pandas data frame (nobs, 3) or coo_array
Input data with one row per non-zero observation, consisting of triplets ('UserId', 'ItemId', 'Count').
Must contain columns 'UserId', 'ItemId', and 'Count'.
Combinations of users and items not present are implicitly assumed to be zero by the model.
-Can also pass a sparse coo_matrix, in which case 'reindex' will be forced to 'False'.
+Can also pass a sparse coo_array, in which case 'reindex' will be forced to 'False'.
val_set : pandas data frame (nobs, 3)
Validation set on which to monitor log-likelihood. Same format as counts_df.
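As a usage illustration of the two input forms this docstring accepts, a hedged sketch follows; the class name HPF and the k parameter are recalled from the package's README rather than taken from this diff, and the data is far too small to be meaningful:

    import numpy as np, pandas as pd
    from scipy.sparse import coo_array
    from hpfrec import HPF

    counts_df = pd.DataFrame({
        'UserId': [0, 0, 1, 2],
        'ItemId': [1, 2, 0, 2],
        'Count':  [3, 1, 2, 5]
    })
    HPF(k=5).fit(counts_df)          # triplet data frame form

    counts_coo = coo_array(
        (counts_df.Count.values,
         (counts_df.UserId.values, counts_df.ItemId.values)),
        shape=(3, 3))
    HPF(k=5).fit(counts_coo)         # sparse form; 'reindex' is forced to False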
@@ -445,7 +445,7 @@ def _process_data(self, input_df):
assert 'Count' in input_df.columns.values
input_df = input_df[['UserId', 'ItemId', 'Count']]

-elif input_df.__class__.__name__ == 'coo_matrix':
+elif issparse(input_df) and (input_df.format == "coo"):
self.nusers = input_df.shape[0]
self.nitems = input_df.shape[1]
input_df = pd.DataFrame({
@@ -456,7 +456,7 @@
self.reindex = False
calc_n = False
else:
raise ValueError("'input_df' must be a pandas data frame, numpy array, or scipy sparse coo_matrix.")
raise ValueError("'input_df' must be a pandas data frame, numpy array, or scipy sparse coo_array.")

if self.stop_crit in ['maxiter', 'diff-norm']:
thr = 0
@@ -539,7 +539,7 @@ def _process_valset(self, val_set, valset=True):
assert 'Count' in val_set.columns.values
self.val_set = val_set[['UserId', 'ItemId', 'Count']]

-elif val_set.__class__.__name__ == 'coo_matrix':
+elif issparse(val_set) and (val_set.format == "coo"):
assert val_set.shape[0] <= self.nusers
assert val_set.shape[1] <= self.nitems
self.val_set = pd.DataFrame({
@@ -548,7 +548,7 @@
'Count' : val_set.data
})
else:
raise ValueError("'val_set' must be a pandas data frame, numpy array, or sparse coo_matrix.")
raise ValueError("'val_set' must be a pandas data frame, numpy array, or sparse coo_array.")

if self.stop_crit == 'val-llk':
thr = 0
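For context on the sparse branches above: a COO input is unpacked into the ('UserId', 'ItemId', 'Count') triplet frame through its .row, .col, and .data attributes, which coo_array exposes just like coo_matrix did. A small sketch of that unpacking (toy values, not from the package):

    import numpy as np
    import pandas as pd
    from scipy.sparse import coo_array

    counts = coo_array(
        (np.array([3., 1., 2.]),
         (np.array([0, 2, 2]), np.array([1, 0, 1]))),
        shape=(3, 2))

    triplets = pd.DataFrame({
        'UserId': counts.row,
        'ItemId': counts.col,
        'Count':  counts.data
    })
    print(triplets)   # one row per stored (user, item, count) entry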
@@ -591,8 +591,7 @@ def _store_metadata(self, for_partial_fit=False):
cython_loops = cython_loops_float if self.use_float else cython_loops_double
if self.verbose and for_partial_fit:
print("Creating user indices for stochastic optimization...")
-X = coo_matrix((self.input_df.Count.values, (self.input_df.UserId.values, self.input_df.ItemId.values)), shape=(self.nusers, self.nitems))
-X = csr_matrix(X)
+X = coo_array((self.input_df.Count.values, (self.input_df.UserId.values, self.input_df.ItemId.values)), shape=(self.nusers, self.nitems)).tocsr()
self._n_seen_by_user = X.indptr[1:] - X.indptr[:-1]
if for_partial_fit:
self._st_ix_user = X.indptr.astype(cython_loops.obj_ind_type)
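The _store_metadata change collapses the two-step coo_matrix → csr_matrix conversion into a single coo_array(...).tocsr() call; the per-user counts are then read off the CSR indptr exactly as before. A minimal sketch of that pattern under the same scipy >= 1.11 assumption (toy triplets, illustrative names):

    import numpy as np
    from scipy.sparse import coo_array

    users  = np.array([0, 0, 1, 2, 2, 2])
    items  = np.array([1, 3, 0, 0, 2, 3])
    counts = np.array([2., 1., 5., 1., 1., 3.])

    X = coo_array((counts, (users, items)), shape=(3, 4)).tocsr()

    # Rows are users, so consecutive indptr differences give the number of
    # distinct items each user interacted with (what _n_seen_by_user stores).
    n_seen_by_user = X.indptr[1:] - X.indptr[:-1]
    print(n_seen_by_user)   # [2 1 3]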
5 changes: 2 additions & 3 deletions hpfrec/cython_loops.pxi
@@ -37,9 +37,8 @@ def cast_ind_type(n):
### Procedures reusable by package ctpfrec
##########################################
def get_csc_data(ix_u, ix_i, Y, nU, nI):
-from scipy.sparse import coo_matrix, csc_matrix
-X = coo_matrix((Y, (ix_u, ix_i)), shape=(nU, nI))
-X = csc_matrix(X)
+from scipy.sparse import coo_array
+X = coo_array((Y, (ix_u, ix_i)), shape=(nU, nI)).tocsc()
return X.indptr.astype(obj_ind_type), X.indices.astype(obj_ind_type), X.data.astype(c_real_t)

def get_unique_items_batch(np.ndarray[ind_type, ndim=1] users_this_batch,
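get_csc_data in cython_loops.pxi gets the same treatment on the item-major side: build the COO array, convert with .tocsc() in one expression, and hand back indptr/indices/data with the dtypes the Cython loops expect. A rough stand-alone equivalent with hypothetical inputs; the obj_ind_type/c_real_t casts are replaced with plain numpy dtypes here:

    import numpy as np
    from scipy.sparse import coo_array

    ix_u = np.array([0, 1, 1, 2])        # user indices (illustrative)
    ix_i = np.array([0, 0, 2, 1])        # item indices
    Y    = np.array([1., 4., 2., 3.])

    X = coo_array((Y, (ix_u, ix_i)), shape=(3, 3)).tocsc()

    # Column-major layout: X.indptr[i]:X.indptr[i+1] slices the entries of item i.
    indptr  = X.indptr.astype(np.int64)
    indices = X.indices.astype(np.int64)
    data    = X.data.astype(np.float64)
    print(indptr)    # [0 2 3 4]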
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,4 +1,4 @@
pandas>=0.24
numpy>=1.18
-scipy
+scipy>=1.11.1
cython
2 changes: 1 addition & 1 deletion setup.py
@@ -213,7 +213,7 @@ def test_supports_compile_arg(self, comm, with_omp=False):
'scipy',
'cython'
],
-version = '0.2.5-11',
+version = '0.2.6',
description = 'Hierarchical Poisson matrix factorization for recommender systems',
author = 'David Cortes',
url = 'https://github.com/david-cortes/hpfrec',
