From 1e6ef2b245d44c36c608f5d7700c7b18d567079f Mon Sep 17 00:00:00 2001 From: ljchang Date: Sun, 24 Jun 2018 23:15:31 -0400 Subject: [PATCH 1/2] Fixed bug with pandas 0.23.0 where result of .apply() needs to specify broadcast to return full dataframe. --- .pytest_cache/v/cache/lastfailed | 4 ++ .pytest_cache/v/cache/nodeids | 9 ++++ emotioncf/cf.py | 91 ++++++++++++++++---------------- requirements.txt | 4 +- setup.py | 25 ++++++--- 5 files changed, 80 insertions(+), 53 deletions(-) create mode 100644 .pytest_cache/v/cache/lastfailed create mode 100644 .pytest_cache/v/cache/nodeids diff --git a/.pytest_cache/v/cache/lastfailed b/.pytest_cache/v/cache/lastfailed new file mode 100644 index 0000000..7c973a7 --- /dev/null +++ b/.pytest_cache/v/cache/lastfailed @@ -0,0 +1,4 @@ +{ + "emotioncf/tests/test_core.py::test_cf_mean": true, + "emotioncf/tests/test_core.py::test_cf_nnmf_multiplicative": true +} \ No newline at end of file diff --git a/.pytest_cache/v/cache/nodeids b/.pytest_cache/v/cache/nodeids new file mode 100644 index 0000000..8a2320c --- /dev/null +++ b/.pytest_cache/v/cache/nodeids @@ -0,0 +1,9 @@ +[ + "emotioncf/tests/test_core.py::test_create_sub_by_item_matrix", + "emotioncf/tests/test_core.py::test_cf_mean", + "emotioncf/tests/test_core.py::test_cf_knn", + "emotioncf/tests/test_core.py::test_cf_knn_dil", + "emotioncf/tests/test_core.py::test_cf_nnmf_multiplicative", + "emotioncf/tests/test_core.py::test_cf_nnmf_sgd", + "emotioncf/tests/test_core.py::test_downsample" +] \ No newline at end of file diff --git a/emotioncf/cf.py b/emotioncf/cf.py index 661232c..0b3fac8 100644 --- a/emotioncf/cf.py +++ b/emotioncf/cf.py @@ -21,7 +21,7 @@ class BaseCF(object): def __init__(self, ratings, mask=None, n_train_items=None): if not isinstance(ratings, pd.DataFrame): - raise ValueError('ratings must be a pandas dataframe instance') + raise ValueError('ratings must be a pandas dataframe instance') self.ratings = ratings self.predicted_ratings = None self.is_fit = 
False @@ -47,7 +47,7 @@ def __repr__(self): def get_mse(self, data='all'): ''' Get overall mean squared error for predicted compared to actual for all items and subjects. ''' - + if not self.is_fit: raise ValueError('You must fit() model first before using this method.') if not self.is_predict: @@ -149,7 +149,7 @@ def split_train_test(self, n_train_items=20): n_train_items: (int) number of items for test dictionary or list of specific items ''' - + self.n_train_items = int(n_train_items) self.train_mask = self.ratings.copy() self.train_mask.loc[:,:] = np.zeros(self. ratings.shape).astype(bool) @@ -164,12 +164,12 @@ def plot_predictions(self): ''' Create plot of actual and predicted ratings''' import matplotlib.pyplot as plt - import seaborn as sns + import seaborn as sns if not self.is_fit: raise ValueError('You must fit() model first before using this method.') if not self.is_predict: raise ValueError('You must predict() model first before using this method.') - + if self.is_mask: f, ax = plt.subplots(nrows=1,ncols=3, figsize=(15,8)) else: @@ -205,12 +205,12 @@ def plot_predictions(self): def downsample(self, sampling_freq=None, target=None, target_type='samples'): ''' Downsample rating matrix to a new target frequency or number of samples using averaging. 
- + Args: - sampling_freq: Sampling frequency of data + sampling_freq: Sampling frequency of data target: downsampling target target_type: type of target can be [samples,seconds,hz] - + ''' if sampling_freq is None: @@ -237,20 +237,20 @@ def ds(ratings, sampling_freq=sampling_freq, target=None, target_type='samples') idx = np.concatenate([idx, np.repeat(idx[-1]+1,ratings.shape[0]-len(idx))]) return ratings.groupby(idx).mean().T - self.ratings = ds(self.ratings, sampling_freq=sampling_freq, target=target, + self.ratings = ds(self.ratings, sampling_freq=sampling_freq, target=target, target_type=target_type) if self.is_mask: - self.train_mask = ds(self.train_mask, sampling_freq=sampling_freq, + self.train_mask = ds(self.train_mask, sampling_freq=sampling_freq, target=target, target_type=target_type) self.train_mask.loc[:,:] = self.train_mask>0 if self.is_predict: - self.predicted_ratings = ds(self.predicted_ratings, + self.predicted_ratings = ds(self.predicted_ratings, sampling_freq=sampling_freq, target=target, target_type=target_type) def to_long_df(self): - + ''' Create a long format pandas dataframe with observed, predicted, and mask.''' observed = pd.DataFrame(columns=['Subject','Item','Rating','Condition']) @@ -281,7 +281,7 @@ def to_long_df(self): def _conv_ts_mean_overlap(self, sub_rating, n_samples=5): '''Dilate each rating by n samples (centered). If dilated samples are overlapping they will be averaged. - + Args: sub_rating: vector of ratings for subject n_samples: number of samples to dilate each rating @@ -305,7 +305,7 @@ def _conv_ts_mean_overlap(self, sub_rating, n_samples=5): def _dilate_ts_rating_samples(self, n_samples=None): - ''' Helper function to dilate sparse time-series ratings by n_samples. + ''' Helper function to dilate sparse time-series ratings by n_samples. 
Overlapping ratings will be averaged Args: @@ -314,15 +314,18 @@ def _dilate_ts_rating_samples(self, n_samples=None): Returns: masked_ratings: pandas ratings instance that has been dilated by n_samples ''' - + if n_samples is None: raise ValueError('Please specify number of samples to dilate.') - + if not self.is_mask: raise ValueError('Make sure cf instance has been masked.') masked_ratings = self.ratings[self.train_mask] - return masked_ratings.apply(lambda x: self._conv_ts_mean_overlap(x, n_samples=n_samples), axis=1) + return masked_ratings.apply(lambda x: self._conv_ts_mean_overlap(x, + n_samples=n_samples), + axis=1, + result_type='broadcast') class Mean(BaseCF): @@ -338,12 +341,12 @@ def fit(self, dilate_ts_n_samples=None): Args: metric: type of similarity {"correlation","cosine"} - dilate_ts_n_samples: will dilate masked samples by n_samples to leverage auto-correlation + dilate_ts_n_samples: will dilate masked samples by n_samples to leverage auto-correlation in estimating time-series ratings ''' - if self.is_mask: + if self.is_mask: if dilate_ts_n_samples is None: self.mean = self.ratings[self.train_mask].mean(skipna=True, axis=0) else: @@ -386,12 +389,12 @@ def fit(self, metric='pearson', dilate_ts_n_samples=None): Args: metric: type of similarity {"pearson",,"spearman","correlation","cosine"}. Note pearson and spearman are way faster. 
- dilate_ts_n_samples: will dilate masked samples by n_samples to leverage auto-correlation + dilate_ts_n_samples: will dilate masked samples by n_samples to leverage auto-correlation in estimating time-series ratings ''' - if self.is_mask: + if self.is_mask: if dilate_ts_n_samples is None: ratings = self.ratings[self.train_mask] else: @@ -411,9 +414,9 @@ def cosine_similarity(x,y): for x in ratings.iterrows(): for y in ratings.iterrows(): if metric is 'correlation': - sim.loc[x[0],y[0]] = pearsonr(x[1][(~x[1].isnull()) & (~y[1].isnull())],y[1][(~x[1].isnull()) & (~y[1].isnull())])[0] + sim.loc[x[0],y[0]] = pearsonr(x[1][(~x[1].isnull()) & (~y[1].isnull())],y[1][(~x[1].isnull()) & (~y[1].isnull())])[0] elif metric is 'cosine': - sim.loc[x[0],y[0]] = cosine_similarity(x[1][(~x[1].isnull()) & (~y[1].isnull())],y[1][(~x[1].isnull()) & (~y[1].isnull())]) + sim.loc[x[0],y[0]] = cosine_similarity(x[1][(~x[1].isnull()) & (~y[1].isnull())],y[1][(~x[1].isnull()) & (~y[1].isnull())]) else: raise NotImplementedError("%s is not implemented yet. Try ['pearson','spearman','correlation','cosine']" % metric ) self.subject_similarity = sim @@ -448,24 +451,24 @@ def predict(self, k=None): self.is_predict = True class NNMF_multiplicative(BaseCF): - ''' Train non negative matrix factorization model using multiplicative updates. + ''' Train non negative matrix factorization model using multiplicative updates. Allows masking to only learn the training weights. 
Based on http://stackoverflow.com/questions/22767695/ python-non-negative-matrix-factorization-that-handles-both-zeros-and-missing-dat - + ''' - + def __init__(self, ratings, mask=None, n_train_items=None): super(NNMF_multiplicative, self).__init__(ratings, mask, n_train_items) self.H = None self.W = None - + def fit(self, - n_factors = None, + n_factors = None, max_iterations = 100, - error_limit = 1e-6, - fit_error_limit = 1e-6, + error_limit = 1e-6, + fit_error_limit = 1e-6, verbose = False, dilate_ts_n_samples = None): @@ -477,7 +480,7 @@ def fit(self, error_limit (float): error tolerance (default=1e-6) fit_error_limit (float): fit error tolerance (default=1e-6) verbose (bool): verbose output during fitting procedure (default=True) - dilate_ts_n_samples (int): will dilate masked samples by n_samples to leverage auto-correlation + dilate_ts_n_samples (int): will dilate masked samples by n_samples to leverage auto-correlation in estimating time-series ratings ''' @@ -492,7 +495,7 @@ def fit(self, avg = np.sqrt(np.nanmean(self.ratings)/n_factors) self.H = avg*np.random.rand(n_items, n_factors) # H = Y self.W = avg*np.random.rand(n_users, n_factors) # W = A - + if self.is_mask: if dilate_ts_n_samples is None: mask = self.train_mask.values @@ -550,21 +553,21 @@ def predict(self): self.is_predict = True class NNMF_sgd(BaseCF): - ''' Train non negative matrix factorization model using stochastic gradient descent. + ''' Train non negative matrix factorization model using stochastic gradient descent. Allows masking to only learn the training weights. 
- This code is based off of Ethan Rosenthal's excellent tutorial + This code is based off of Ethan Rosenthal's excellent tutorial on collaborative filtering https://blog.insightdatascience.com/ explicit-matrix-factorization-als-sgd-and-all-that-jazz-b00e4d9b21ea#.kkr7mzvr2 - + ''' - + def __init__(self, ratings, mask=None, n_train_items=None): super(NNMF_sgd, self).__init__(ratings, mask, n_train_items) - def fit(self, - n_factors=None, - item_fact_reg=0.0, + def fit(self, + n_factors=None, + item_fact_reg=0.0, user_fact_reg=0.0, item_bias_reg=0.0, user_bias_reg=0.0, @@ -581,7 +584,7 @@ def fit(self, error_limit (float): error tolerance (default=1e-6) fit_error_limit (float): fit error tolerance (default=1e-6) verbose (bool): verbose output during fitting procedure (default=True) - dilate_ts_n_samples (int): will dilate masked samples by n_samples to leverage auto-correlation + dilate_ts_n_samples (int): will dilate masked samples by n_samples to leverage auto-correlation in estimating time-series ratings ''' @@ -606,7 +609,7 @@ def fit(self, sample_row, sample_col = ratings.values.nonzero() self.global_bias = self.ratings[~self.ratings.isnull()].mean().mean() - # initialize latent vectors + # initialize latent vectors self.user_vecs = np.random.normal(scale=1./n_factors, size=(n_users, n_factors)) self.item_vecs = np.random.normal(scale=1./n_factors, size=(n_items, n_factors)) @@ -633,11 +636,11 @@ def fit(self, prediction = self._predict_single(u,i) e = (ratings.iloc[u,i] - prediction) # error - + # Update biases self.user_bias[u] += (learning_rate * (e - self.user_bias_reg * self.user_bias[u])) self.item_bias[i] += (learning_rate * (e - self.item_bias_reg * self.item_bias[i])) - + # Update latent factors self.user_vecs[u, :] += (learning_rate * (e * self.item_vecs[i, :] - self.user_fact_reg * self.user_vecs[u,:])) self.item_vecs[i, :] += (learning_rate * (e * self.user_vecs[u, :] - self.item_fact_reg * self.item_vecs[i,:])) @@ -666,5 +669,3 @@ def 
_predict_single(self, u, i):
         prediction = self.global_bias + self.user_bias[u] + self.item_bias[i]
         prediction += self.user_vecs[u, :].dot(self.item_vecs[i, :].T)
         return prediction
-
-
diff --git a/requirements.txt b/requirements.txt
index af3fc8b..adea6fe 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 numpy
-pandas
+pandas >= 0.23.0
 scipy
 matplotlib
-seaborn
\ No newline at end of file
+seaborn
diff --git a/setup.py b/setup.py
index 313d52a..018e070 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,12 @@
-from emotioncf.version import __version__
 from setuptools import setup, find_packages
 
+version = {}
+with open("emotioncf/version.py") as f:
+    exec(f.read(), version)
+
+with open('requirements.txt') as f:
+    requirements = f.read().splitlines()
+
 extra_setuptools_args = dict(
     tests_require=['pytest']
 )
@@ -8,14 +14,21 @@
 setup(
     name="emotioncf",
-    version=__version__,
-    description="Emotion Rating Collaborative Filtering",
+    version=version['__version__'],
+    description='A Python package for performing Collaborative Filtering on '
+    'sparse emotion ratings',
     maintainer='Luke Chang',
     maintainer_email='luke.j.chang@dartmouth.edu',
     url='http://github.com/ljchang/emotionCF',
-    install_requires=['numpy', 'scipy', 'pandas', 'matplotlib', 'seaborn'],
+    install_requires=requirements,
     packages=find_packages(exclude=['emotioncf/tests']),
     license='MIT',
-    # download_url='https://github.com/ljchang/emotionCF/archive/%s.tar.gz' %
-    # __version__,
+    keywords = ['emotion', 'collaborative filtering', 'recommender','machine-learning'],
+    classifiers = [
+        "Programming Language :: Python :: 2.7",
+        "Programming Language :: Python :: 3.6",
+        "Operating System :: OS Independent",
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: MIT License"
+    ],
     **extra_setuptools_args
-)
\ No newline at end of file
+)
From b285cb694d8e81a82dc87882c20360a199530595 Mon Sep 17 00:00:00 2001
From: ljchang
Date: Sun, 24 Jun 2018 23:18:02 -0400
Subject: [PATCH 2/2] updated version to 0.0.2

---
emotioncf/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/emotioncf/version.py b/emotioncf/version.py index b8023d8..d18f409 100644 --- a/emotioncf/version.py +++ b/emotioncf/version.py @@ -1 +1 @@ -__version__ = '0.0.1' +__version__ = '0.0.2'