update to scipy sparse array interface
david-cortes committed Jul 3, 2023
1 parent c204011 commit 5bee294
Showing 4 changed files with 12 additions and 14 deletions.
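The change replaces scipy's legacy sparse-matrix classes (coo_matrix, csr_matrix, csc_matrix) with the newer sparse-array interface (coo_array and friends), and switches input detection from class-name comparison to issparse() plus the .format attribute. A minimal sketch of those two points, assuming scipy >= 1.11 (toy data, not from the package):

    import numpy as np
    from scipy.sparse import coo_array, coo_matrix, issparse

    rows, cols = np.array([0, 1, 2]), np.array([1, 0, 2])
    vals = np.array([3.0, 1.0, 2.0])

    X_arr = coo_array((vals, (rows, cols)), shape=(3, 3))   # sparse-array interface
    X_mat = coo_matrix((vals, (rows, cols)), shape=(3, 3))  # legacy matrix interface

    # New-style detection: issparse() is True for both families,
    # and .format == "coo" identifies the layout without naming a class.
    for X in (X_arr, X_mat):
        print(issparse(X), X.format)        # True coo

    # Old-style detection by class name no longer matches array inputs:
    print(X_arr.__class__.__name__ == 'coo_matrix')   # False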
17 changes: 8 additions & 9 deletions hpfrec/__init__.py
@@ -2,7 +2,7 @@
import multiprocessing, os, warnings
from . import cython_loops_float, cython_loops_double, _check_openmp
import ctypes, types, inspect
-from scipy.sparse import coo_matrix, csr_matrix
+from scipy.sparse import coo_array, issparse
### TODO: don't do this, use iloc/loc and make copies instead
pd.options.mode.chained_assignment = None

@@ -381,11 +381,11 @@ def fit(self, counts_df, val_set=None):
Parameters
----------
-counts_df : pandas data frame (nobs, 3) or coo_matrix
+counts_df : pandas data frame (nobs, 3) or coo_array
Input data with one row per non-zero observation, consisting of triplets ('UserId', 'ItemId', 'Count').
Must contain columns 'UserId', 'ItemId', and 'Count'.
Combinations of users and items not present are implicitly assumed to be zero by the model.
-Can also pass a sparse coo_matrix, in which case 'reindex' will be forced to 'False'.
+Can also pass a sparse coo_array, in which case 'reindex' will be forced to 'False'.
val_set : pandas data frame (nobs, 3)
Validation set on which to monitor log-likelihood. Same format as counts_df.
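As a usage illustration of the two input forms this docstring accepts, a hedged sketch follows; the class name HPF and the k parameter are recalled from the package's README rather than taken from this diff, and the data is far too small to be meaningful:

    import numpy as np, pandas as pd
    from scipy.sparse import coo_array
    from hpfrec import HPF

    counts_df = pd.DataFrame({
        'UserId': [0, 0, 1, 2],
        'ItemId': [1, 2, 0, 2],
        'Count':  [3, 1, 2, 5]
    })
    HPF(k=5).fit(counts_df)          # triplet data frame form

    counts_coo = coo_array(
        (counts_df.Count.values,
         (counts_df.UserId.values, counts_df.ItemId.values)),
        shape=(3, 3))
    HPF(k=5).fit(counts_coo)         # sparse form; 'reindex' is forced to False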
@@ -445,7 +445,7 @@ def _process_data(self, input_df):
assert 'Count' in input_df.columns.values
input_df = input_df[['UserId', 'ItemId', 'Count']]

-elif input_df.__class__.__name__ == 'coo_matrix':
+elif issparse(input_df) and (input_df.format == "coo"):
self.nusers = input_df.shape[0]
self.nitems = input_df.shape[1]
input_df = pd.DataFrame({
@@ -456,7 +456,7 @@
self.reindex = False
calc_n = False
else:
raise ValueError("'input_df' must be a pandas data frame, numpy array, or scipy sparse coo_matrix.")
raise ValueError("'input_df' must be a pandas data frame, numpy array, or scipy sparse coo_array.")

if self.stop_crit in ['maxiter', 'diff-norm']:
thr = 0
@@ -539,7 +539,7 @@ def _process_valset(self, val_set, valset=True):
assert 'Count' in val_set.columns.values
self.val_set = val_set[['UserId', 'ItemId', 'Count']]

-elif val_set.__class__.__name__ == 'coo_matrix':
+elif issparse(val_set) and (val_set.format == "coo"):
assert val_set.shape[0] <= self.nusers
assert val_set.shape[1] <= self.nitems
self.val_set = pd.DataFrame({
@@ -548,7 +548,7 @@
'Count' : val_set.data
})
else:
raise ValueError("'val_set' must be a pandas data frame, numpy array, or sparse coo_matrix.")
raise ValueError("'val_set' must be a pandas data frame, numpy array, or sparse coo_array.")

if self.stop_crit == 'val-llk':
thr = 0
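For context on the sparse branches above: a COO input is unpacked into the ('UserId', 'ItemId', 'Count') triplet frame through its .row, .col, and .data attributes, which coo_array exposes just like coo_matrix did. A small sketch of that unpacking (toy values, not from the package):

    import numpy as np
    import pandas as pd
    from scipy.sparse import coo_array

    counts = coo_array(
        (np.array([3., 1., 2.]),
         (np.array([0, 2, 2]), np.array([1, 0, 1]))),
        shape=(3, 2))

    triplets = pd.DataFrame({
        'UserId': counts.row,
        'ItemId': counts.col,
        'Count':  counts.data
    })
    print(triplets)   # one row per stored (user, item, count) entry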
@@ -591,8 +591,7 @@ def _store_metadata(self, for_partial_fit=False):
cython_loops = cython_loops_float if self.use_float else cython_loops_double
if self.verbose and for_partial_fit:
print("Creating user indices for stochastic optimization...")
-X = coo_matrix((self.input_df.Count.values, (self.input_df.UserId.values, self.input_df.ItemId.values)), shape=(self.nusers, self.nitems))
-X = csr_matrix(X)
+X = coo_array((self.input_df.Count.values, (self.input_df.UserId.values, self.input_df.ItemId.values)), shape=(self.nusers, self.nitems)).tocsr()
self._n_seen_by_user = X.indptr[1:] - X.indptr[:-1]
if for_partial_fit:
self._st_ix_user = X.indptr.astype(cython_loops.obj_ind_type)
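The _store_metadata change collapses the two-step coo_matrix → csr_matrix conversion into a single coo_array(...).tocsr() call; the per-user counts are then read off the CSR indptr exactly as before. A minimal sketch of that pattern under the same scipy >= 1.11 assumption (toy triplets, illustrative names):

    import numpy as np
    from scipy.sparse import coo_array

    users  = np.array([0, 0, 1, 2, 2, 2])
    items  = np.array([1, 3, 0, 0, 2, 3])
    counts = np.array([2., 1., 5., 1., 1., 3.])

    X = coo_array((counts, (users, items)), shape=(3, 4)).tocsr()

    # Rows are users, so consecutive indptr differences give the number of
    # distinct items each user interacted with (what _n_seen_by_user stores).
    n_seen_by_user = X.indptr[1:] - X.indptr[:-1]
    print(n_seen_by_user)   # [2 1 3]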
5 changes: 2 additions & 3 deletions hpfrec/cython_loops.pxi
@@ -37,9 +37,8 @@ def cast_ind_type(n):
### Procedures reusable by package ctpfrec
##########################################
def get_csc_data(ix_u, ix_i, Y, nU, nI):
-from scipy.sparse import coo_matrix, csc_matrix
-X = coo_matrix((Y, (ix_u, ix_i)), shape=(nU, nI))
-X = csc_matrix(X)
+from scipy.sparse import coo_array
+X = coo_array((Y, (ix_u, ix_i)), shape=(nU, nI)).tocsc()
return X.indptr.astype(obj_ind_type), X.indices.astype(obj_ind_type), X.data.astype(c_real_t)

def get_unique_items_batch(np.ndarray[ind_type, ndim=1] users_this_batch,
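get_csc_data in cython_loops.pxi gets the same treatment on the item-major side: build the COO array, convert with .tocsc() in one expression, and hand back indptr/indices/data with the dtypes the Cython loops expect. A rough stand-alone equivalent with hypothetical inputs; the obj_ind_type/c_real_t casts are replaced with plain numpy dtypes here:

    import numpy as np
    from scipy.sparse import coo_array

    ix_u = np.array([0, 1, 1, 2])        # user indices (illustrative)
    ix_i = np.array([0, 0, 2, 1])        # item indices
    Y    = np.array([1., 4., 2., 3.])

    X = coo_array((Y, (ix_u, ix_i)), shape=(3, 3)).tocsc()

    # Column-major layout: X.indptr[i]:X.indptr[i+1] slices the entries of item i.
    indptr  = X.indptr.astype(np.int64)
    indices = X.indices.astype(np.int64)
    data    = X.data.astype(np.float64)
    print(indptr)    # [0 2 3 4]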
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,4 +1,4 @@
pandas>=0.24
numpy>=1.18
-scipy
+scipy>=1.11.1
cython
2 changes: 1 addition & 1 deletion setup.py
@@ -213,7 +213,7 @@ def test_supports_compile_arg(self, comm, with_omp=False):
'scipy',
'cython'
],
-version = '0.2.5-11',
+version = '0.2.6',
description = 'Hierarchical Poisson matrix factorization for recommender systems',
author = 'David Cortes',
url = 'https://github.com/david-cortes/hpfrec',
