
Merge branch 'type_hinting' of github.com:dkaslovsky/Coupled-Biased-Random-Walks into integrate_py3_only_branch
dkaslovsky committed Aug 30, 2020
2 parents 1e839f9 + f0e6495 commit 27cbd1c
Showing 15 changed files with 164 additions and 146 deletions.
4 changes: 1 addition & 3 deletions .travis.yml
@@ -2,10 +2,8 @@ language: python

matrix:
  include:
-    - python: 2.7
-    - python: 3.5
-    - python: 3.6
    - python: 3.7
+    - python: 3.8

install:
- pip install -r requirements.txt
6 changes: 5 additions & 1 deletion CHANGELOG.md
@@ -1,8 +1,12 @@
+## 2.0.0 / 2020-08-30
+* [Added] type hints
+* [Changed] removed support for Python 2 and <3.7
+* [Changed] updated dependencies to latest versions
+
## 1.1.1 / 2020-08-29
* [Fixed] enforce feature weight and stationary probability sum normalization

## 1.1.0 / 2020-08-23

* [Added] `CBRW.value_scores()` function to return individual value scores of an observation
* [Added] exceptions inherit from base `CBRWError` exception
* [Added] test cases for NaN values
9 changes: 6 additions & 3 deletions README.md
@@ -3,9 +3,12 @@ Outlier detection for categorical data

[![Build Status](https://travis-ci.org/dkaslovsky/Coupled-Biased-Random-Walks.svg?branch=master)](https://travis-ci.org/dkaslovsky/Coupled-Biased-Random-Walks)
[![Coverage Status](https://coveralls.io/repos/github/dkaslovsky/Coupled-Biased-Random-Walks/badge.svg?branch=master)](https://coveralls.io/github/dkaslovsky/Coupled-Biased-Random-Walks?branch=master)
+![PyPI - Python Version](https://img.shields.io/pypi/pyversions/Coupled-Biased-Random-Walks)

### Overview
-Python [2.7, 3.x] implementation of the Coupled Biased Random Walks (CBRW) outlier detection algorithm described by Pang, Cao, and Chen in https://www.ijcai.org/Proceedings/16/Papers/272.pdf.
+Python implementation of the Coupled Biased Random Walks (CBRW) outlier detection algorithm described by Pang, Cao, and Chen in https://www.ijcai.org/Proceedings/16/Papers/272.pdf.
+
+__NOTE__: Only Python>=3.7 is supported as of version 2.0.0.

This implementation operates on Python dicts rather than Pandas DataFrames. This has the advantage of allowing the model to be updated with new observations in a trivial manner and is more efficient in certain aspects. However, these advantages come at the cost of iterating a (potentially large) dict of observed values more times than might otherwise be necessary using an underlying DataFrame implementation.

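For reference, the DataFrame-to-dicts conversion this paragraph alludes to is typically a one-liner in pandas; a minimal sketch (the `df` contents here are illustrative, not from the repository):

```
import pandas as pd

# rows become the list-of-dicts format the detector consumes
df = pd.DataFrame([{'A': 'x', 'B': 'y'}, {'A': 'x', 'B': 'z'}])
observations = df.to_dict(orient='records')  # [{'A': 'x', 'B': 'y'}, {'A': 'x', 'B': 'z'}]
```
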
@@ -14,7 +17,7 @@ If one is working with data previously loaded into a DataFrame, simply use the r
### Installation
This package is hosted on PyPI and can be installed via `pip`:
```
-$ pip install coupled_biased_random_walks
+$ pip install coupled-biased-random-walks
```
To instead install from source:
```
@@ -26,7 +29,7 @@ $ python setup.py install
### Example
Let's run the CBRW detection algorithm on the authors' example data set from the paper:

-<img src="./example_table.png" width="400">
+<img src="./img/example_table.png" width="400">

This data is saved as a [CSV file](./data/CBRW_paper_example.csv) in this repository and is loaded into memory as a list of dicts by [example.py](./example.py). Note that we drop the `Cheat?` column when loading the data, as this is essentially the target variable indicating the anomalous activity to be detected. The detector is instantiated and observations are added as follows:
```
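# NOTE: the diff truncates the README example at this point; the lines
# below are a hedged reconstruction based on the CBRW API visible in
# detect.py in this commit (add_observations, fit, score). The top-level
# import path and the `data` variable name are assumptions.
from coupled_biased_random_walks import CBRW

detector = CBRW()
detector.add_observations(data)  # data: list of dicts loaded from the CSV
detector.fit()                   # compute feature weights and stationary probabilities
scores = detector.score(data)    # one anomaly score per observation
```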
43 changes: 19 additions & 24 deletions coupled_biased_random_walks/count.py
@@ -1,14 +1,9 @@
from collections import Counter, defaultdict
+from collections.abc import Mapping
from itertools import combinations, tee
+from typing import Any, Dict, Hashable, Iterable, Tuple

-from six import iteritems
-
-try:
-    # python 2
-    from collections import Mapping
-except ImportError:
-    # python 3
-    from collections.abc import Mapping
+from coupled_biased_random_walks.types import obs_item_type, observation_type

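The new `coupled_biased_random_walks.types` module is not among the files shown in this diff; a plausible minimal sketch, consistent with how the aliases are used in this file and in detect.py (the exact definitions are an assumption):

```
# coupled_biased_random_walks/types.py -- hypothetical reconstruction
from typing import Any, Dict, Hashable, Tuple

# a single observation item: ('feature_name', 'feature_value')
obs_item_type = Tuple[Hashable, Any]

# a full observation: {feature_name: feature_value, ...}
observation_type = Dict[Hashable, Any]
```
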

class IncrementingDict(Mapping):
@@ -24,7 +19,7 @@ def __init__(self):
        self._d = {}
        self._next_val = 0

-    def insert(self, key):
+    def insert(self, key: Hashable) -> None:
        """
        Inserts a (strictly new) key
        :param key: any hashable object to be used as a key
@@ -47,7 +42,7 @@ def __repr__(self):
        return self._d.__repr__()

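A quick illustration of the IncrementingDict contract (a sketch; it assumes re-inserting an existing key is a no-op, which the "strictly new" docstring suggests but the truncated hunk does not show):

```
d = IncrementingDict()
d.insert('a')  # 'a' -> 0
d.insert('b')  # 'b' -> 1
d.insert('a')  # assumed no-op: 'a' keeps index 0
assert d['a'] == 0 and d['b'] == 1 and len(d) == 2
```
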

-class ObservationCounter(object):
+class ObservationCounter:

    """
    Counts single and joint occurrences of key/value pairs in a dict with
@@ -69,18 +64,18 @@ def __init__(self):
        self._index = IncrementingDict()

    @property
-    def counts(self):
+    def counts(self) -> Dict[str, Counter]:
        return dict(self._counts)

    @property
-    def joint_counts(self):
+    def joint_counts(self) -> Dict[Tuple[obs_item_type, obs_item_type], int]:
        return dict(self._joint_counts)

    @property
-    def index(self):
+    def index(self) -> IncrementingDict:
        return self._index

-    def update(self, observation_iterable):
+    def update(self, observation_iterable: Iterable[observation_type]) -> None:
        """
        Update counts with new observation(s)
        :param observation_iterable: list of dicts
@@ -91,13 +86,13 @@ def update(self, observation_iterable):
            # feature name with value NaN represents a missing feature in the
            # observation (e.g., a missing value is NaN-filled in a pandas DataFrame) so
            # we remove any such features from the observation to avoid including in counts
-            obs = {key: value for key, value in iteritems(observation) if not isnan(value)}
+            obs = {key: value for key, value in observation.items() if not isnan(value)}
            # create iterators of obs for updating counts
-            obs1, obs2 = tee(iteritems(obs), 2)
+            obs1, obs2 = tee(obs.items(), 2)
            self._update_counts(obs1)
            self._update_joint_counts(obs2)

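A small example of the counting structures this method maintains (a sketch; the layout follows from get_count and _update_counts below, so treat the printed shapes as an illustration rather than guaranteed output):

```
counter = ObservationCounter()
counter.update([
    {'A': 'x', 'B': 'y'},
    {'A': 'x', 'B': 'z'},
])
# counts: per-feature Counters keyed by ('feature_name', 'feature_value') items
# {'A': Counter({('A', 'x'): 2}), 'B': Counter({('B', 'y'): 1, ('B', 'z'): 1})}
# joint_counts: pair counts keyed by sorted item pairs
# {(('A', 'x'), ('B', 'y')): 1, (('A', 'x'), ('B', 'z')): 1}
print(counter.counts, counter.joint_counts)
```
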
-    def get_count(self, item):
+    def get_count(self, item: obs_item_type) -> int:
        """
        Getter to safely retrieve count from internal data structure of defaultdict(Counter)
        :param item: tuple of the form ('feature_name', 'feature_value')
@@ -111,7 +106,7 @@ def get_count(self, item):
            # meaning there is no count for the feature_name
            return 0

-    def _update_counts(self, observation):
+    def _update_counts(self, observation: Iterable[obs_item_type]) -> None:
        """
        Update single counts
        :param observation: iterable of tuples of the form ('feature_name', 'feature_value')
@@ -122,34 +117,34 @@ def _update_counts(self, observation):
            self._index.insert(item)
            self.n_obs.update([feature_name])

-    def _update_joint_counts(self, observation):
+    def _update_joint_counts(self, observations: Iterable[obs_item_type]) -> None:
        """
        Update joint counts
        :param observations: iterable of tuples of the form ('feature_name', 'feature_value')
        """
-        pairs = combinations(sorted(observation), 2)
+        pairs = combinations(sorted(observations), 2)
        self._joint_counts.update(pairs)


# Helper functions

-def get_feature_name(feature_tuple):
+def get_feature_name(feature_tuple: obs_item_type) -> str:
    """
    Helper function to return feature name from tuple representation
    :param feature_tuple: tuple of the form (feature_name, feature_value)
    """
    return feature_tuple[0]


-def get_feature_value(feature_tuple):
+def get_feature_value(feature_tuple: obs_item_type) -> str:
    """
    Helper function to return feature value from tuple representation
    :param feature_tuple: tuple of the form (feature_name, feature_value)
    """
    return feature_tuple[1]


-def get_mode(counter):
+def get_mode(counter: Counter) -> int:
    """
    Helper function to return the count of the most common
    element from an instance of Counter()
@@ -163,7 +158,7 @@ def get_mode(counter):
    return mode[0][1]


-def isnan(x):
+def isnan(x: Any) -> bool:
    """
    Return True if x is NaN where x can be of any type
    :param x: any object for which (in)equality can be checked
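The body of isnan is truncated above; a minimal sketch of the standard type-agnostic check the docstring hints at (an assumption, not necessarily the author's exact implementation):

```
from typing import Any

def isnan(x: Any) -> bool:
    # NaN is the only value that compares unequal to itself
    return x != x
```
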
69 changes: 38 additions & 31 deletions coupled_biased_random_walks/detect.py
@@ -1,9 +1,10 @@
-from __future__ import division
+from __future__ import annotations

from collections import defaultdict
+from typing import Dict, Iterable, List, Optional

import numpy as np
-from six import iteritems, itervalues
+from scipy.sparse import csr_matrix

from coupled_biased_random_walks.count import (
ObservationCounter,
@@ -16,9 +17,10 @@
    random_walk,
    row_normalize_csr_matrix,
)
+from coupled_biased_random_walks.types import obs_item_type, observation_type


-class CBRW(object):
+class CBRW:

    """ Class implementing Coupled Biased Random Walks algorithm """

@@ -29,25 +31,29 @@ class CBRW(object):
        'max_iter': 100  # max number of steps to take
    }

-    def __init__(self, rw_params=None, ignore_unknown=False):
+    def __init__(
+        self,
+        rw_params: Optional[Dict[str, float]] = None,
+        ignore_unknown: bool = False,
+    ):
        """
        :param rw_params: random walk parameters to override defaults
        :param ignore_unknown: if True, score an observation containing unknown feature names
            or values based only on features seen during training; if False, score such an observation
            as nan (default)
        """
-        self.rw_params = rw_params if rw_params else self.PRESET_RW_PARAMS
+        self.rw_params = rw_params or self.PRESET_RW_PARAMS
        self._unknown_feature_score = 0 if ignore_unknown else np.nan

        self._counter = ObservationCounter()
-        self._stationary_prob = None
-        self._feature_relevance = None
+        self._stationary_prob = None  # type: Optional[Dict[obs_item_type, float]]
+        self._feature_relevance = None  # type: Optional[Dict[str, float]]

    @property
-    def feature_weights(self):
+    def feature_weights(self) -> Optional[Dict[str, float]]:
        return self._feature_relevance

-    def add_observations(self, observation_iterable):
+    def add_observations(self, observation_iterable: Iterable[observation_type]) -> CBRW:
        """
        Add observations to be modeled
        :param observation_iterable: list of dicts with each dict representing an observation
@@ -56,7 +62,7 @@ def add_observations(self, observation_iterable):
        self._counter.update(observation_iterable)
        return self

-    def fit(self):
+    def fit(self) -> CBRW:
        """
        Compute model based on current observations in state
        """
@@ -74,22 +80,22 @@ def fit(self):
        # allocate probability by feature
        stationary_prob = {}
        feature_relevance = defaultdict(int)
-        for feature, idx in iteritems(self._counter.index):
+        for feature, idx in self._counter.index.items():
            prob = pi[idx]
            stationary_prob[feature] = prob
            feature_relevance[get_feature_name(feature)] += prob

        # sum normalize feature_relevance
-        feature_rel_sum = sum(itervalues(feature_relevance))
+        feature_rel_sum = sum(feature_relevance.values())
        if feature_rel_sum < EPS:
            raise CBRWFitError('feature weights sum approximately zero')
-        feature_relevance = {key: val/feature_rel_sum for key, val in iteritems(feature_relevance)}
+        feature_relevance = {key: val/feature_rel_sum for key, val in feature_relevance.items()}

        self._stationary_prob = stationary_prob
        self._feature_relevance = feature_relevance
        return self

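In the paper's notation, fit() computes the stationary probability of the biased random walk and turns it into normalized feature weights; a restatement of the code above (with W_b the biased transition matrix returned by _compute_biased_transition_matrix()):

```
\pi^{\top} = \pi^{\top} W_b, \qquad \sum_{v} \pi(v) = 1

\mathrm{rel}(f) = \frac{\sum_{v \in f} \pi(v)}{\sum_{f'} \sum_{v' \in f'} \pi(v')}
```
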
-    def score(self, observation_iterable):
+    def score(self, observation_iterable: Iterable[observation_type]) -> np.ndarray:
        """
        Compute an anomaly score for each observation in observation_iterable
        :param observation_iterable: iterable of dict observations with each dict
@@ -101,14 +107,17 @@ def score(self, observation_iterable):
            observation_iterable = [observation_iterable]
        return np.array([self._score(obs) for obs in observation_iterable])

-    def _score(self, observation):
+    def _score(self, observation: observation_type) -> float:
        """
        Compute the weighted anomaly score (object_score in the paper) for an observation
        :param observation: dict of the form {feature_name: feature_value, ...}
        """
-        return sum(itervalues(self._value_scores(observation)))
+        return sum(self._value_scores(observation).values())

-    def value_scores(self, observation_iterable):
+    def value_scores(
+        self,
+        observation_iterable: Iterable[observation_type],
+    ) -> List[Dict[str, float]]:
        """
        Compute an anomaly sub-score for each value of each observation in observation_iterable
        :param observation_iterable: iterable of dict observations with each dict
@@ -123,7 +132,7 @@ def value_scores(
            observation_iterable = [observation_iterable]
        return [self._value_scores(obs) for obs in observation_iterable]

-    def _value_scores(self, observation):
+    def _value_scores(self, observation: observation_type) -> Dict[str, float]:
        """
        Compute the weighted value scores for each feature value of an observation
        :param observation: dict of the form {feature_name: feature_value, ...}
@@ -132,26 +141,26 @@ def _value_scores(self, observation):
            get_feature_name(item):
                self._get_feature_relevance(item) *
                self._stationary_prob.get(item, self._unknown_feature_score)
-            for item in iteritems(observation)
+            for item in observation.items()
        }

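Combining _score and _value_scores, the object score is the weighted sum from the paper (a restatement of the code, with rel and pi as computed by fit()):

```
\mathrm{Score}(o) = \sum_{(f, v) \in o} \mathrm{rel}(f) \cdot \pi\big((f, v)\big)
```
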
-    def _get_feature_relevance(self, feature_tuple):
+    def _get_feature_relevance(self, feature_tuple: obs_item_type) -> float:
        """
        Getter for the relevance (weight) of a feature (category)
        :param feature_tuple: tuple of the form (feature_name, feature_value)
        """
        feature_name = get_feature_name(feature_tuple)
        return self._feature_relevance.get(feature_name, 0)

-    def _compute_biased_transition_matrix(self):
+    def _compute_biased_transition_matrix(self) -> csr_matrix:
        """
        Computes biased probability transition matrix of conditional probabilities
        """
-        prob_idx = {}
+        prob_idx = {}  # type: Dict[obs_item_type, float]

        bias_dict = self._compute_biases()

-        for (feature1, feature2), joint_count in iteritems(self._counter.joint_counts):
+        for (feature1, feature2), joint_count in self._counter.joint_counts.items():

            # get index for features
            feature1_idx = self._counter.index[feature1]
@@ -180,19 +189,17 @@ def _compute_biased_transition_matrix(self):
        trans_matrix = dict_to_csr_matrix(prob_idx, shape=n_features)
        return row_normalize_csr_matrix(trans_matrix)

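row_normalize_csr_matrix lives in the matrix module, which this diff does not show; a typical implementation sketch under that assumption (not necessarily the author's code):

```
import numpy as np
from scipy.sparse import csr_matrix

def row_normalize_csr_matrix(m: csr_matrix) -> csr_matrix:
    # scale every stored value by the inverse of its row sum
    row_sums = np.asarray(m.sum(axis=1)).ravel()
    row_sums[row_sums == 0] = 1.0  # leave empty rows untouched
    scale = np.repeat(1.0 / row_sums, np.diff(m.indptr))
    return csr_matrix((m.data * scale, m.indices, m.indptr), shape=m.shape)
```
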
-    def _compute_biases(self):
+    def _compute_biases(self) -> Dict[obs_item_type, float]:
        """
        Computes bias for random walk for each feature tuple
        """
-        bias_dict = {}
-        for feature_name, value_counts in iteritems(self._counter.counts):
+        bias_dict = {}  # type: Dict[obs_item_type, float]
+        for feature_name, value_counts in self._counter.counts.items():
            mode = get_mode(value_counts)
            base = 1 - (mode / self._counter.n_obs[feature_name])
-            bias = {
-                feature_val: (1 - (count / mode) + base) / 2
-                for feature_val, count in iteritems(value_counts)
-            }
-            bias_dict.update(bias)
+            for feature_val, count in value_counts.items():
+                bias = (1 - (count / mode) + base) / 2
+                bias_dict[feature_val] = bias
        return bias_dict

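A worked instance of the bias formula above: for a feature observed 4 times with value counts {x: 3, y: 1}, mode = 3 and base = 1 - 3/4 = 0.25, so bias(x) = (1 - 3/3 + 0.25)/2 = 0.125 and bias(y) = (1 - 1/3 + 0.25)/2 ≈ 0.458. Rarer values receive larger biases, steering the random walk toward outlying values.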

