
Merge branch 'type_hinting' of github.com:dkaslovsky/Coupled-Biased-Random-Walks into integrate_py3_only_branch
dkaslovsky committed Aug 30, 2020
2 parents 1e839f9 + f0e6495 commit 27cbd1c
Showing 15 changed files with 164 additions and 146 deletions.
4 changes: 1 addition & 3 deletions .travis.yml
@@ -2,10 +2,8 @@ language: python

matrix:
  include:
-    - python: 2.7
-    - python: 3.5
-    - python: 3.6
    - python: 3.7
+    - python: 3.8

install:
- pip install -r requirements.txt
6 changes: 5 additions & 1 deletion CHANGELOG.md
@@ -1,8 +1,12 @@
+## 2.0.0 / 2020-08-30
+* [Added] type hints
+* [Changed] removed support for Python 2 and <3.7
+* [Changed] updated dependencies to latest versions
+
## 1.1.1 / 2020-08-29
* [Fixed] enforce feature weight and stationary probability sum normalization

## 1.1.0 / 2020-08-23

* [Added] `CBRW.value_scores()` function to return individual value scores of an observation
* [Added] exceptions inherit from base `CBRWError` exception
* [Added] test cases for NaN values
9 changes: 6 additions & 3 deletions README.md
@@ -3,9 +3,12 @@ Outlier detection for categorical data

[![Build Status](https://travis-ci.org/dkaslovsky/Coupled-Biased-Random-Walks.svg?branch=master)](https://travis-ci.org/dkaslovsky/Coupled-Biased-Random-Walks)
[![Coverage Status](https://coveralls.io/repos/github/dkaslovsky/Coupled-Biased-Random-Walks/badge.svg?branch=master)](https://coveralls.io/github/dkaslovsky/Coupled-Biased-Random-Walks?branch=master)
+![PyPI - Python Version](https://img.shields.io/pypi/pyversions/Coupled-Biased-Random-Walks)

### Overview
-Python [2.7, 3.x] implementation of the Coupled Biased Random Walks (CBRW) outlier detection algorithm described by Pang, Cao, and Chen in https://www.ijcai.org/Proceedings/16/Papers/272.pdf.
+Python implementation of the Coupled Biased Random Walks (CBRW) outlier detection algorithm described by Pang, Cao, and Chen in https://www.ijcai.org/Proceedings/16/Papers/272.pdf.
+
+__NOTE__: Only Python>=3.7 is supported as of version 2.0.0.

This implementation operates on Python dicts rather than Pandas DataFrames. This has the advantage of allowing the model to be updated with new observations in a trivial manner and is more efficient in certain aspects. However, these advantages come at the cost of iterating a (potentially large) dict of observed values more times than might otherwise be necessary using an underlying DataFrame implementation.

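For reference, the DataFrame-to-dicts conversion this paragraph alludes to is typically a one-liner in pandas; a minimal sketch (the `df` contents here are illustrative, not from the repository):

```
import pandas as pd

# rows become the list-of-dicts format the detector consumes
df = pd.DataFrame([{'A': 'x', 'B': 'y'}, {'A': 'x', 'B': 'z'}])
observations = df.to_dict(orient='records')  # [{'A': 'x', 'B': 'y'}, {'A': 'x', 'B': 'z'}]
```
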
@@ -14,7 +17,7 @@ If one is working with data previously loaded into a DataFrame, simply use the r
### Installation
This package is hosted on PyPI and can be installed via `pip`:
```
-$ pip install coupled_biased_random_walks
+$ pip install coupled-biased-random-walks
```
To instead install from source:
```
@@ -26,7 +29,7 @@ $ python setup.py install
### Example
Let's run the CBRW detection algorithm on the authors' example data set from the paper:

-<img src="./example_table.png" width="400">
+<img src="./img/example_table.png" width="400">

This data is saved as a [CSV file](./data/CBRW_paper_example.csv) in this repository and is loaded into memory as a list of dicts by [example.py](./example.py). Note that we drop the `Cheat?` column when loading the data, as this is essentially the target variable indicating the anomalous activity to be detected. The detector is instantiated and observations are added as follows:
```
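# NOTE: the diff truncates the README example at this point; the lines
# below are a hedged reconstruction based on the CBRW API visible in
# detect.py in this commit (add_observations, fit, score). The top-level
# import path and the `data` variable name are assumptions.
from coupled_biased_random_walks import CBRW

detector = CBRW()
detector.add_observations(data)  # data: list of dicts loaded from the CSV
detector.fit()                   # compute feature weights and stationary probabilities
scores = detector.score(data)    # one anomaly score per observation
```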
43 changes: 19 additions & 24 deletions coupled_biased_random_walks/count.py
@@ -1,14 +1,9 @@
from collections import Counter, defaultdict
+from collections.abc import Mapping
from itertools import combinations, tee
+from typing import Any, Dict, Hashable, Iterable, Tuple

-from six import iteritems
-
-try:
-    # python 2
-    from collections import Mapping
-except ImportError:
-    # python 3
-    from collections.abc import Mapping
+from coupled_biased_random_walks.types import obs_item_type, observation_type

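The new `coupled_biased_random_walks.types` module is not among the files shown in this diff; a plausible minimal sketch, consistent with how the aliases are used in this file and in detect.py (the exact definitions are an assumption):

```
# coupled_biased_random_walks/types.py -- hypothetical reconstruction
from typing import Any, Dict, Hashable, Tuple

# a single observation item: ('feature_name', 'feature_value')
obs_item_type = Tuple[Hashable, Any]

# a full observation: {feature_name: feature_value, ...}
observation_type = Dict[Hashable, Any]
```
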

class IncrementingDict(Mapping):
@@ -24,7 +19,7 @@ def __init__(self):
        self._d = {}
        self._next_val = 0

-    def insert(self, key):
+    def insert(self, key: Hashable) -> None:
        """
        Inserts a (strictly new) key
        :param key: any hashable object to be used as a key
@@ -47,7 +42,7 @@ def __repr__(self):
        return self._d.__repr__()

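A quick illustration of the IncrementingDict contract (a sketch; it assumes re-inserting an existing key is a no-op, which the "strictly new" docstring suggests but the truncated hunk does not show):

```
d = IncrementingDict()
d.insert('a')  # 'a' -> 0
d.insert('b')  # 'b' -> 1
d.insert('a')  # assumed no-op: 'a' keeps index 0
assert d['a'] == 0 and d['b'] == 1 and len(d) == 2
```
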

-class ObservationCounter(object):
+class ObservationCounter:

    """
    Counts single and joint occurrences of key/value pairs in a dict with
@@ -69,18 +64,18 @@ def __init__(self):
        self._index = IncrementingDict()

    @property
-    def counts(self):
+    def counts(self) -> Dict[str, Counter]:
        return dict(self._counts)

    @property
-    def joint_counts(self):
+    def joint_counts(self) -> Dict[Tuple[obs_item_type, obs_item_type], int]:
        return dict(self._joint_counts)

    @property
-    def index(self):
+    def index(self) -> IncrementingDict:
        return self._index

-    def update(self, observation_iterable):
+    def update(self, observation_iterable: Iterable[observation_type]) -> None:
        """
        Update counts with new observation(s)
        :param observation_iterable: list of dicts
@@ -91,13 +86,13 @@ def update(self, observation_iterable):
            # feature name with value NaN represents a missing feature in the
            # observation (e.g., a missing value is NaN-filled in a pandas DataFrame) so
            # we remove any such features from the observation to avoid including in counts
-            obs = {key: value for key, value in iteritems(observation) if not isnan(value)}
+            obs = {key: value for key, value in observation.items() if not isnan(value)}
            # create iterators of obs for updating counts
-            obs1, obs2 = tee(iteritems(obs), 2)
+            obs1, obs2 = tee(obs.items(), 2)
            self._update_counts(obs1)
            self._update_joint_counts(obs2)

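A small example of the counting structures this method maintains (a sketch; the layout follows from get_count and _update_counts below, so treat the printed shapes as an illustration rather than guaranteed output):

```
counter = ObservationCounter()
counter.update([
    {'A': 'x', 'B': 'y'},
    {'A': 'x', 'B': 'z'},
])
# counts: per-feature Counters keyed by ('feature_name', 'feature_value') items
# {'A': Counter({('A', 'x'): 2}), 'B': Counter({('B', 'y'): 1, ('B', 'z'): 1})}
# joint_counts: pair counts keyed by sorted item pairs
# {(('A', 'x'), ('B', 'y')): 1, (('A', 'x'), ('B', 'z')): 1}
print(counter.counts, counter.joint_counts)
```
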
-    def get_count(self, item):
+    def get_count(self, item: obs_item_type) -> int:
        """
        Getter to safely retrieve count from internal data structure of defaultdict(Counter)
        :param item: tuple of the form ('feature_name', 'feature_value')
@@ -111,7 +106,7 @@ def get_count(self, item):
            # meaning there is no count for the feature_name
            return 0

-    def _update_counts(self, observation):
+    def _update_counts(self, observation: Iterable[obs_item_type]) -> None:
        """
        Update single counts
        :param observation: iterable of tuples of the form ('feature_name', 'feature_value')
@@ -122,34 +117,34 @@ def _update_counts(self, observation):
            self._index.insert(item)
            self.n_obs.update([feature_name])

-    def _update_joint_counts(self, observation):
+    def _update_joint_counts(self, observations: Iterable[obs_item_type]) -> None:
        """
        Update joint counts
        :param observations: iterable of tuples of the form ('feature_name', 'feature_value')
        """
-        pairs = combinations(sorted(observation), 2)
+        pairs = combinations(sorted(observations), 2)
        self._joint_counts.update(pairs)


# Helper functions

-def get_feature_name(feature_tuple):
+def get_feature_name(feature_tuple: obs_item_type) -> str:
    """
    Helper function to return feature name from tuple representation
    :param feature_tuple: tuple of the form (feature_name, feature_value)
    """
    return feature_tuple[0]


-def get_feature_value(feature_tuple):
+def get_feature_value(feature_tuple: obs_item_type) -> str:
    """
    Helper function to return feature value from tuple representation
    :param feature_tuple: tuple of the form (feature_name, feature_value)
    """
    return feature_tuple[1]


-def get_mode(counter):
+def get_mode(counter: Counter) -> int:
    """
    Helper function to return the count of the most common
    element from an instance of Counter()
@@ -163,7 +158,7 @@ def get_mode(counter):
    return mode[0][1]


-def isnan(x):
+def isnan(x: Any) -> bool:
    """
    Return True if x is NaN where x can be of any type
    :param x: any object for which (in)equality can be checked
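The body of isnan is truncated above; a minimal sketch of the standard type-agnostic check the docstring hints at (an assumption, not necessarily the author's exact implementation):

```
from typing import Any

def isnan(x: Any) -> bool:
    # NaN is the only value that compares unequal to itself
    return x != x
```
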
69 changes: 38 additions & 31 deletions coupled_biased_random_walks/detect.py
@@ -1,9 +1,10 @@
-from __future__ import division
+from __future__ import annotations

from collections import defaultdict
+from typing import Dict, Iterable, List, Optional

import numpy as np
-from six import iteritems, itervalues
+from scipy.sparse import csr_matrix

from coupled_biased_random_walks.count import (
ObservationCounter,
@@ -16,9 +17,10 @@
    random_walk,
    row_normalize_csr_matrix,
)
+from coupled_biased_random_walks.types import obs_item_type, observation_type


-class CBRW(object):
+class CBRW:

    """ Class implementing Coupled Biased Random Walks algorithm """

@@ -29,25 +31,29 @@ class CBRW(object):
        'max_iter': 100  # max number of steps to take
    }

-    def __init__(self, rw_params=None, ignore_unknown=False):
+    def __init__(
+        self,
+        rw_params: Optional[Dict[str, float]] = None,
+        ignore_unknown: bool = False,
+    ):
        """
        :param rw_params: random walk parameters to override defaults
        :param ignore_unknown: if True, score an observation containing unknown feature names
            or values based only on features seen during training; if False, score such an observation
            as nan (default)
        """
-        self.rw_params = rw_params if rw_params else self.PRESET_RW_PARAMS
+        self.rw_params = rw_params or self.PRESET_RW_PARAMS
        self._unknown_feature_score = 0 if ignore_unknown else np.nan

        self._counter = ObservationCounter()
-        self._stationary_prob = None
-        self._feature_relevance = None
+        self._stationary_prob = None  # type: Optional[Dict[obs_item_type, float]]
+        self._feature_relevance = None  # type: Optional[Dict[str, float]]

    @property
-    def feature_weights(self):
+    def feature_weights(self) -> Optional[Dict[str, float]]:
        return self._feature_relevance

-    def add_observations(self, observation_iterable):
+    def add_observations(self, observation_iterable: Iterable[observation_type]) -> CBRW:
        """
        Add observations to be modeled
        :param observation_iterable: list of dicts with each dict representing an observation
@@ -56,7 +62,7 @@ def add_observations(self, observation_iterable):
        self._counter.update(observation_iterable)
        return self

-    def fit(self):
+    def fit(self) -> CBRW:
        """
        Compute model based on current observations in state
        """
@@ -74,22 +80,22 @@ def fit(self):
        # allocate probability by feature
        stationary_prob = {}
        feature_relevance = defaultdict(int)
-        for feature, idx in iteritems(self._counter.index):
+        for feature, idx in self._counter.index.items():
            prob = pi[idx]
            stationary_prob[feature] = prob
            feature_relevance[get_feature_name(feature)] += prob

        # sum normalize feature_relevance
-        feature_rel_sum = sum(itervalues(feature_relevance))
+        feature_rel_sum = sum(feature_relevance.values())
        if feature_rel_sum < EPS:
            raise CBRWFitError('feature weights sum approximately zero')
-        feature_relevance = {key: val/feature_rel_sum for key, val in iteritems(feature_relevance)}
+        feature_relevance = {key: val/feature_rel_sum for key, val in feature_relevance.items()}

        self._stationary_prob = stationary_prob
        self._feature_relevance = feature_relevance
        return self

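In the paper's notation, fit() computes the stationary probability of the biased random walk and turns it into normalized feature weights; a restatement of the code above (with W_b the biased transition matrix returned by _compute_biased_transition_matrix()):

```
\pi^{\top} = \pi^{\top} W_b, \qquad \sum_{v} \pi(v) = 1

\mathrm{rel}(f) = \frac{\sum_{v \in f} \pi(v)}{\sum_{f'} \sum_{v' \in f'} \pi(v')}
```
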
-    def score(self, observation_iterable):
+    def score(self, observation_iterable: Iterable[observation_type]) -> np.ndarray:
        """
        Compute an anomaly score for each observation in observation_iterable
        :param observation_iterable: iterable of dict observations with each dict
@@ -101,14 +107,17 @@ def score(self, observation_iterable):
            observation_iterable = [observation_iterable]
        return np.array([self._score(obs) for obs in observation_iterable])

-    def _score(self, observation):
+    def _score(self, observation: observation_type) -> float:
        """
        Compute the weighted anomaly score (object_score in the paper) for an observation
        :param observation: dict of the form {feature_name: feature_value, ...}
        """
-        return sum(itervalues(self._value_scores(observation)))
+        return sum(self._value_scores(observation).values())

-    def value_scores(self, observation_iterable):
+    def value_scores(
+        self,
+        observation_iterable: Iterable[observation_type],
+    ) -> List[Dict[str, float]]:
        """
        Compute an anomaly sub-score for each value of each observation in observation_iterable
        :param observation_iterable: iterable of dict observations with each dict
@@ -123,7 +132,7 @@ def value_scores(
            observation_iterable = [observation_iterable]
        return [self._value_scores(obs) for obs in observation_iterable]

-    def _value_scores(self, observation):
+    def _value_scores(self, observation: observation_type) -> Dict[str, float]:
        """
        Compute the weighted value scores for each feature value of an observation
        :param observation: dict of the form {feature_name: feature_value, ...}
@@ -132,26 +141,26 @@ def _value_scores(self, observation):
            get_feature_name(item):
                self._get_feature_relevance(item) *
                self._stationary_prob.get(item, self._unknown_feature_score)
-            for item in iteritems(observation)
+            for item in observation.items()
        }

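Combining _score and _value_scores, the object score is the weighted sum from the paper (a restatement of the code, with rel and pi as computed by fit()):

```
\mathrm{Score}(o) = \sum_{(f, v) \in o} \mathrm{rel}(f) \cdot \pi\big((f, v)\big)
```
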
-    def _get_feature_relevance(self, feature_tuple):
+    def _get_feature_relevance(self, feature_tuple: obs_item_type) -> float:
        """
        Getter for the relevance (weight) of a feature (category)
        :param feature_tuple: tuple of the form (feature_name, feature_value)
        """
        feature_name = get_feature_name(feature_tuple)
        return self._feature_relevance.get(feature_name, 0)

-    def _compute_biased_transition_matrix(self):
+    def _compute_biased_transition_matrix(self) -> csr_matrix:
        """
        Computes biased probability transition matrix of conditional probabilities
        """
-        prob_idx = {}
+        prob_idx = {}  # type: Dict[obs_item_type, float]

        bias_dict = self._compute_biases()

-        for (feature1, feature2), joint_count in iteritems(self._counter.joint_counts):
+        for (feature1, feature2), joint_count in self._counter.joint_counts.items():

            # get index for features
            feature1_idx = self._counter.index[feature1]
@@ -180,19 +189,17 @@ def _compute_biased_transition_matrix(self):
        trans_matrix = dict_to_csr_matrix(prob_idx, shape=n_features)
        return row_normalize_csr_matrix(trans_matrix)

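row_normalize_csr_matrix lives in the matrix module, which this diff does not show; a typical implementation sketch under that assumption (not necessarily the author's code):

```
import numpy as np
from scipy.sparse import csr_matrix

def row_normalize_csr_matrix(m: csr_matrix) -> csr_matrix:
    # scale every stored value by the inverse of its row sum
    row_sums = np.asarray(m.sum(axis=1)).ravel()
    row_sums[row_sums == 0] = 1.0  # leave empty rows untouched
    scale = np.repeat(1.0 / row_sums, np.diff(m.indptr))
    return csr_matrix((m.data * scale, m.indices, m.indptr), shape=m.shape)
```
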
-    def _compute_biases(self):
+    def _compute_biases(self) -> Dict[obs_item_type, float]:
        """
        Computes bias for random walk for each feature tuple
        """
-        bias_dict = {}
-        for feature_name, value_counts in iteritems(self._counter.counts):
+        bias_dict = {}  # type: Dict[obs_item_type, float]
+        for feature_name, value_counts in self._counter.counts.items():
            mode = get_mode(value_counts)
            base = 1 - (mode / self._counter.n_obs[feature_name])
-            bias = {
-                feature_val: (1 - (count / mode) + base) / 2
-                for feature_val, count in iteritems(value_counts)
-            }
-            bias_dict.update(bias)
+            for feature_val, count in value_counts.items():
+                bias = (1 - (count / mode) + base) / 2
+                bias_dict[feature_val] = bias
        return bias_dict

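A worked instance of the bias formula above: for a feature observed 4 times with value counts {x: 3, y: 1}, mode = 3 and base = 1 - 3/4 = 0.25, so bias(x) = (1 - 3/3 + 0.25)/2 = 0.125 and bias(y) = (1 - 1/3 + 0.25)/2 ≈ 0.458. Rarer values receive larger biases, steering the random walk toward outlying values.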

