Skip to content

Commit

Permalink
Changed indexers to size_t to fit larger datasets
Browse files Browse the repository at this point in the history
  • Loading branch information
david-cortes committed Nov 4, 2018
1 parent d3ecf1f commit 016278a
Show file tree
Hide file tree
Showing 3 changed files with 117 additions and 115 deletions.
44 changes: 22 additions & 22 deletions hpfrec/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,8 +471,8 @@ def _process_data(self, input_df):
pf.write("random seed: None\n")

self.input_df['Count'] = self.input_df.Count.astype('float32')
self.input_df['UserId'] = self.input_df.UserId.astype(ctypes.c_int)
self.input_df['ItemId'] = self.input_df.ItemId.astype(ctypes.c_int)
self.input_df['UserId'] = self.input_df.UserId.astype(ctypes.c_size_t)
self.input_df['ItemId'] = self.input_df.ItemId.astype(ctypes.c_size_t)

if self.users_per_batch != 0:
if self.nusers < self.users_per_batch:
Expand Down Expand Up @@ -527,18 +527,18 @@ def _process_valset(self, val_set, valset=True):
else:
self.val_set.reset_index(drop=True, inplace=True)
self.val_set['Count'] = self.val_set.Count.astype('float32')
self.val_set['UserId'] = self.val_set.UserId.astype(ctypes.c_int)
self.val_set['ItemId'] = self.val_set.ItemId.astype(ctypes.c_int)
self.val_set['UserId'] = self.val_set.UserId.astype(ctypes.c_size_t)
self.val_set['ItemId'] = self.val_set.ItemId.astype(ctypes.c_size_t)
return None

def _store_metadata(self, for_partial_fit=False):
if self.verbose and for_partial_fit:
print("Creating user indices for stochastic optimization...")
X = coo_matrix((self.input_df.Count.values, (self.input_df.UserId.values, self.input_df.ItemId.values)))
X = coo_matrix((self.input_df.Count.values, (self.input_df.UserId.values, self.input_df.ItemId.values)), shape=(self.nusers, self.nitems))
X = csr_matrix(X)
self._n_seen_by_user = X.indptr[1:] - X.indptr[:-1]
if for_partial_fit:
self._st_ix_user = X.indptr.astype(ctypes.c_int)
self._st_ix_user = X.indptr.astype(ctypes.c_size_t)
self.input_df.sort_values('UserId', inplace=True)
else:
self._st_ix_user = X.indptr[:-1]
Expand All @@ -549,9 +549,9 @@ def _cast_before_fit(self):
## setting all parameters and data to the right type
self.Theta = np.empty((self.nusers, self.k), dtype='float32')
self.Beta = np.empty((self.nitems, self.k), dtype='float32')
self.k = cython_loops.cast_int(self.k)
self.nusers = cython_loops.cast_int(self.nusers)
self.nitems = cython_loops.cast_int(self.nitems)
self.k = cython_loops.cast_size_t(self.k)
self.nusers = cython_loops.cast_size_t(self.nusers)
self.nitems = cython_loops.cast_size_t(self.nitems)
self.ncores = cython_loops.cast_int(self.ncores)
self.maxiter = cython_loops.cast_int(self.maxiter)
self.verbose = cython_loops.cast_int(self.verbose)
Expand All @@ -576,14 +576,14 @@ def _fit(self):
if self.val_set is None:
use_valset = cython_loops.cast_int(0)
self.val_set = pd.DataFrame(np.empty((0,3)), columns=['UserId','ItemId','Count'])
self.val_set['UserId'] = self.val_set.UserId.astype(ctypes.c_int)
self.val_set['ItemId'] = self.val_set.ItemId.astype(ctypes.c_int)
self.val_set['UserId'] = self.val_set.UserId.astype(ctypes.c_size_t)
self.val_set['ItemId'] = self.val_set.ItemId.astype(ctypes.c_size_t)
self.val_set['Count'] = self.val_set.Count.values.astype('float32')
else:
use_valset = cython_loops.cast_int(1)

if self.users_per_batch == 0:
self._st_ix_user = np.arange(1).astype(ctypes.c_int)
self._st_ix_user = np.arange(1).astype(ctypes.c_size_t)

self.niter, temp = cython_loops.fit_hpf(
self.a, self.a_prime, self.b_prime,
Expand All @@ -593,7 +593,7 @@ def _fit(self):
self.maxiter, self.stop_crit, self.check_every, self.stop_thr,
self.users_per_batch, self.items_per_batch,
self.step_size, cython_loops.cast_int(self.sum_exp_trick),
self._st_ix_user.astype(ctypes.c_int),
self._st_ix_user.astype(ctypes.c_size_t),
self.save_folder, self.random_seed, self.verbose,
self.ncores, cython_loops.cast_int(self.allow_inconsistent_math),
use_valset,
Expand Down Expand Up @@ -641,7 +641,7 @@ def _process_data_single(self, counts_df):
if (counts_df.ItemId == -1).sum() > 0:
raise ValueError("Can only make calculations for items that were in the training set.")

counts_df['ItemId'] = counts_df.ItemId.values.astype(ctypes.c_int)
counts_df['ItemId'] = counts_df.ItemId.values.astype(ctypes.c_size_t)
counts_df['Count'] = counts_df.ItemId.values.astype(ctypes.c_float)
return counts_df

Expand Down Expand Up @@ -795,17 +795,17 @@ def partial_fit(self, counts_df, batch_type='users', step_size=None,
assert counts_df.shape[0] > 0

Y_batch = counts_df.Count.values.astype('float32')
ix_u_batch = counts_df.UserId.values.astype(ctypes.c_int)
ix_i_batch = counts_df.ItemId.values.astype(ctypes.c_int)
ix_u_batch = counts_df.UserId.values.astype(ctypes.c_size_t)
ix_i_batch = counts_df.ItemId.values.astype(ctypes.c_size_t)

if users_in_batch is None:
users_in_batch = np.unique(ix_u_batch)
else:
users_in_batch = np.array(users_in_batch).astype(ctypes.c_int)
users_in_batch = np.array(users_in_batch).astype(ctypes.c_size_t)
if items_in_batch is None:
items_in_batch = np.unique(ix_i_batch)
else:
items_in_batch = np.array(items_in_batch).astype(ctypes.c_int)
items_in_batch = np.array(items_in_batch).astype(ctypes.c_size_t)

if (self.Theta is None) or (self.Beta is None):
self._cast_before_fit()
Expand Down Expand Up @@ -850,7 +850,7 @@ def partial_fit(self, counts_df, batch_type='users', step_size=None,
self.Lambda_shp, self.Lambda_rte,
self.k_rte, self.t_rte,
add_k_rte, add_t_rte, self.a, self.c,
k_shp, t_shp, cython_loops.cast_int(self.k),
k_shp, t_shp, cython_loops.cast_size_t(self.k),
users_in_batch, items_in_batch,
cython_loops.cast_int(self.allow_inconsistent_math),
cython_loops.cast_float(step_size), cython_loops.cast_float(multiplier_batch),
Expand Down Expand Up @@ -970,7 +970,7 @@ def predict_factors(self, counts_df, maxiter=10, ncores=1, random_seed=1, stop_t
Theta, self.Beta,
self.Lambda_shp,
self.Lambda_rte,
cython_loops.cast_int(counts_df.shape[0]), cython_loops.cast_int(self.k),
cython_loops.cast_size_t(counts_df.shape[0]), cython_loops.cast_size_t(self.k),
cython_loops.cast_int(int(maxiter)), cython_loops.cast_int(ncores),
cython_loops.cast_int(int(random_seed)), cython_loops.cast_float(stop_thr),
cython_loops.cast_int(bool(return_all))
Expand Down Expand Up @@ -1059,7 +1059,7 @@ def add_user(self, user_id, counts_df, update_existing=False, maxiter=10, ncores

if update_all_params:
counts_df['UserId'] = user_id
counts_df['UserId'] = counts_df.UserId.astype(ctypes.c_int)
counts_df['UserId'] = counts_df.UserId.astype(ctypes.c_size_t)
self.partial_fit(counts_df, new_users=(not update_existing))
Theta_prev = self.Theta[-1].copy()
for i in range(maxiter - 1):
Expand All @@ -1080,7 +1080,7 @@ def add_user(self, user_id, counts_df, update_existing=False, maxiter=10, ncores
Theta, self.Beta,
self.Lambda_shp,
self.Lambda_rte,
cython_loops.cast_int(counts_df.shape[0]), cython_loops.cast_int(self.k),
cython_loops.cast_size_t(counts_df.shape[0]), cython_loops.cast_size_t(self.k),
cython_loops.cast_int(maxiter), cython_loops.cast_int(ncores),
cython_loops.cast_int(random_seed), cython_loops.cast_int(stop_thr),
cython_loops.cast_int(self.keep_all_objs)
Expand Down

0 comments on commit 016278a

Please sign in to comment.