Skip to content

Commit

Permalink
fixed some nasty bugs in KNN. Testing now uses data with non-numerica…
Browse files Browse the repository at this point in the history
…l indices and columns. This has caught quite a few bugs that would have occurred in real datasets.
  • Loading branch information
ejolly committed Apr 14, 2021
1 parent 9874b89 commit c3e7046
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 86 deletions.
122 changes: 66 additions & 56 deletions emotioncf/models.py
Expand Up @@ -7,6 +7,8 @@
from .base import Base, BaseNMF
from .utils import nanpdist
from ._fit import sgd, mult
import warnings
from numba.core.errors import NumbaPerformanceWarning

__all__ = ["Mean", "KNN", "NNMF_mult", "NNMF_sgd"]

Expand Down Expand Up @@ -49,10 +51,9 @@ def _predict(self):

for row_idx, row in predictions.iterrows():
row[row.isnull()] = self.mean[row.isnull()]
predictions.iloc[row_idx] = row
predictions.loc[row_idx] = row

self.predictions = predictions
self.is_predict = True


class KNN(Base):
Expand Down Expand Up @@ -87,7 +88,7 @@ def fit(
skip_refit (bool; optional): skip re-estimation of user x user similarity matrix. Faster if only exploring different k and no other model parameters or masks are changing. Default False.
"""

metrics = ["pearson", "spearman", "kendall", "cosine", "correlation"]
metrics = ["pearson", "spearman", "kendall", "cosine"]
if metric not in metrics:
raise ValueError(f"metric must be one of {metrics}")

Expand Down Expand Up @@ -119,14 +120,16 @@ def _predict(self, k=None):
"""

data = self.masked_data if self.is_masked else self.data
predictions = []
predictions = self.masked_data.copy()

for row_idx, row in predictions.iterrows():

# Get top k most similar other users for each user
# We loop instead of apply because we want to retain row indices and column indices
for user_idx in range(data.shape[0]):
# Get all other users except current
top_users = self.user_similarity.iloc[user_idx].drop(user_idx)
# Get the similarity of this user to all other users, ignoring self-similarity
top_users = self.user_similarity.loc[row_idx].drop(row_idx)
if top_users.isnull().all():
warnings.warn(
f"User {row_idx} has no variance in their ratings. Impossible to compute similarity with other users"
)

# Remove nan users and sort
top_users = top_users[~top_users.isnull()].sort_values(ascending=False)
Expand All @@ -135,14 +138,15 @@ def _predict(self, k=None):
if k is not None:
top_users = top_users[: k + 1]

# Get item predictions
predictions.append(
np.dot(top_users, self.data.loc[top_users.index]) / len(top_users)
# Get item predictions: similarity-weighted-mean of other user's ratings
preds = pd.Series(
np.dot(top_users, self.data.loc[top_users.index, :]) / len(top_users),
index=self.data.columns,
)
row[row.isnull()] = preds[row.isnull()]
predictions.loc[row_idx] = row

self.predictions = pd.DataFrame(
predictions, index=data.index, columns=data.columns
)
self.predictions = predictions


class NNMF_mult(BaseNMF):
Expand Down Expand Up @@ -222,16 +226,19 @@ def fit(
X = self.masked_data.fillna(0).to_numpy()

# Run multiplicative updating
error_history, converged, n_iter, delta, norm_rmse, W, H = mult(
X,
self.W,
self.H,
self.data_range,
eps,
tol,
n_iterations,
verbose,
)
# Silence numba warning until this issue gets fixed: https://github.com/numba/numba/issues/4585
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=NumbaPerformanceWarning)
error_history, converged, n_iter, delta, norm_rmse, W, H = mult(
X,
self.W,
self.H,
self.data_range,
eps,
tol,
n_iterations,
verbose,
)

# Save outputs to model
self.W, self.H = W, H
Expand Down Expand Up @@ -368,36 +375,39 @@ def fit(
seed = self.random_state.randint(np.iinfo(np.int32).max)

# Run SGD
(
error_history,
converged,
n_iter,
delta,
norm_rmse,
user_bias,
user_vecs,
item_bias,
item_vecs,
) = sgd(
X,
seed,
self.global_bias,
self.data_range,
tol,
self.user_bias,
self.user_vecs,
self.user_bias_reg,
self.user_fact_reg,
self.item_bias,
self.item_vecs,
self.item_bias_reg,
self.item_fact_reg,
n_iterations,
sample_row,
sample_col,
learning_rate,
verbose,
)
# Silence numba warning until this issue gets fixed: https://github.com/numba/numba/issues/4585
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=NumbaPerformanceWarning)
(
error_history,
converged,
n_iter,
delta,
norm_rmse,
user_bias,
user_vecs,
item_bias,
item_vecs,
) = sgd(
X,
seed,
self.global_bias,
self.data_range,
tol,
self.user_bias,
self.user_vecs,
self.user_bias_reg,
self.user_fact_reg,
self.item_bias,
self.item_vecs,
self.item_bias_reg,
self.item_fact_reg,
n_iterations,
sample_row,
sample_col,
learning_rate,
verbose,
)
# Save outputs to model
(
self.error_history,
Expand Down
9 changes: 7 additions & 2 deletions emotioncf/tests/conftest.py
Expand Up @@ -13,6 +13,7 @@
NNMF_mult,
create_sparse_mask,
)
from string import ascii_letters

## DATA FIXTURES
@fixture(scope="module")
Expand All @@ -29,6 +30,10 @@ def simulate_wide_data():
rat[int(s / 2) : s, x] = rat[int(s / 2) : s, x] + x
rat[int(s / 2) : s] = rat[int(s / 2) : s, ::-1]
rat = pd.DataFrame(rat)
letters = list(ascii_letters)
letters += [f"{elem}1" for elem in letters]
rat.index = letters[: rat.shape[0]]
rat.columns = letters[: rat.shape[1]]
rat.index.name = "User"
rat.columns.name = "Item"
return rat
Expand Down Expand Up @@ -111,12 +116,12 @@ def dilate_by_nsamples(request):


# # KNN only models
@fixture(params=["pearson", "correlation", "cosine"])
@fixture(params=["pearson", "cosine"])
def metric(request):
return request.param


@fixture(params=[None, 10])
@fixture(params=[None, 3])
def k(request):
return request.param

Expand Down
18 changes: 9 additions & 9 deletions emotioncf/tests/test_models.py
Expand Up @@ -172,15 +172,15 @@ def test_knn(model, dilate_by_nsamples, n_mask_items, k, metric):
if model.n_mask_items == 0.5 and not model.is_mask_dilated and k == 3:
true_scores = np.array(
[
0.85812186,
13.46414568,
16.26052896,
0.84304036,
13.99007607,
16.78521663,
0.87251641,
12.9382153,
15.71833665,
0.91726067,
7.25200471,
12.36676527,
0.84237887,
14.50400943,
17.48924716,
1.0,
0.0,
0.0,
]
)
else:
Expand Down
5 changes: 3 additions & 2 deletions emotioncf/tests/test_utils.py
Expand Up @@ -81,15 +81,16 @@ def test_estimate_performance(simulate_wide_data):
assert user_out.shape == (50 * 10, 6)


# TODO: Update this test to handle commented out lines. This is a pandas issue where going from long -> wide -> long leads pandas to sort columns rather than preserving the original column order
def test_create_and_invert_user_item_matrix(simulate_long_data, simulate_wide_data):
rating = create_user_item_matrix(simulate_long_data)
assert isinstance(rating, pd.DataFrame)
assert rating.shape == (50, 100)
assert rating.equals(simulate_wide_data)
# assert rating.equals(simulate_wide_data)

inverted = invert_user_item_matrix(rating)
assert inverted.shape == (50 * 100, 3)
assert inverted.equals(simulate_long_data)
# assert inverted.equals(simulate_long_data)

renamed = simulate_long_data.rename(
columns={"User": "A", "Item": "B", "Rating": "C"}
Expand Down
40 changes: 23 additions & 17 deletions emotioncf/utils.py
Expand Up @@ -221,9 +221,7 @@ def flatten_dataframe(data: pd.DataFrame) -> list:
if not isinstance(data, pd.DataFrame):
raise TypeError("input must be a pandas dataframe")

out = zip(
product(range(data.shape[0]), range(data.shape[1])), data.to_numpy().ravel()
)
out = zip(product(data.index, data.columns), data.to_numpy().ravel())
return np.array([(elem[0][0], elem[0][1], elem[1]) for elem in out])


Expand Down Expand Up @@ -254,25 +252,33 @@ def unflatten_dataframe(

if not isinstance(data, np.ndarray):
raise TypeError("input should be a numpy array")
if num_rows is None:
try:
num_rows = int(data[:, 0].max()) + 1
except: # noqa
raise TypeError(
"row_idx are non-numeric or mixed types. Unable to automatically determine the number of rows. Please set num_rows explicitly"
if index is None and num_rows is None:
index = list(dict.fromkeys(data[:, 0]))
num_rows = len(index)
elif index is not None and num_rows is None:
num_rows = len(index)
elif index is None and num_rows is not None:
index = list(dict.fromkeys(data[:, 0]))
if len(index) != num_rows:
raise ValueError(
"num_rows does not match the number of unique row_idx values in data"
)
if num_cols is None:
try:
num_cols = int(data[:, 1].max()) + 1
except: # noqa
raise TypeError(
"col_idx are non-numeric or mixed types. Unable to automatically determine the number of columns. Please set num_cols explicitly"
if columns is None and num_cols is None:
columns = list(dict.fromkeys(data[:, 1]))
num_cols = len(columns)
elif columns is not None and num_cols is None:
num_cols = len(columns)
elif columns is None and num_cols is not None:
columns = list(dict.fromkeys(data[:, 1]))
if len(columns) != num_cols:
raise ValueError(
"num_cols does not match the number of unique col_idx values in data"
)
out = np.empty((num_rows, num_cols))
out[:] = np.nan
for elem in data:
out[int(elem[0]), int(elem[1])] = elem[2]
out = pd.DataFrame(out, index=index, columns=columns)
for elem in data:
out.loc[elem[0], elem[1]] = np.float(elem[2])
out.index.name = index_name
out.columns.name = columns_name
return out
Expand Down

0 comments on commit c3e7046

Please sign in to comment.