Merge pull request #13 from datasciencebr/irio-meal-price-outlier

Anomalies in meal prices
okfn-brasil · Dec 14, 2016 · 2a153c6 · 2a153c6
2 parents 015b355 + bd88b05
commit 2a153c6
Show file tree

Hide file tree

Showing 19 changed files with 490 additions and 91 deletions.
diff --git a/.codeclimate.yml b/.codeclimate.yml
@@ -0,0 +1,6 @@
+engines:
+  pep8:
+    enabled: true
+ratings:
+   paths:
+   - "**.py"
diff --git a/.coveragerc b/.coveragerc
@@ -0,0 +1,2 @@
+  [run]
+  source = rosie
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,6 @@
-rosie/serenata-toolbox
-rosie/pip-delete-this-directory.txt
-__pycache__
+*.pyc
+.coverage
+.python-version
+__pycache__/
+htmlcov/
+
diff --git a/.travis.yml b/.travis.yml
@@ -1,4 +1,10 @@
 language: python
 python: 3.5
-install: ./setup
-script: python -m unittest discover tests
+install: 
+  - ./setup
+  - pip install coveralls
+script: 
+  - coverage run rosie.py test
+
+after_success:
+  - coveralls
diff --git a/README.md b/README.md
@@ -1,5 +1,9 @@
 # Rosie, the robot
 
+[![Build Status](https://travis-ci.org/datasciencebr/rosie.svg?branch=master)](https://travis-ci.org/datasciencebr/rosie)
+[![Code Climate](https://codeclimate.com/github/datasciencebr/rosie/badges/gpa.svg)](https://codeclimate.com/github/datasciencebr/rosie)
+[![Coverage Status](https://coveralls.io/repos/github/datasciencebr/rosie/badge.svg?branch=master)](https://coveralls.io/github/datasciencebr/rosie?branch=master)
+
 A Python application that reads receipts from the Quota for Exercising Parliamentary Activity (aka CEAP, from the Brazilian Chamber of Deputies) and outputs, for each receipt, a "probability of corruption" and a list of reasons why it is considered this way.
 
 - [x] Fetch CEAP dataset from Chamber of Deputies
@@ -25,14 +29,19 @@ $ ./setup
 ## Running
 
 ```console
-$ python rosie/main.py
+$ python rosie.py run
 ```
 
 A `/tmp/serenata-data/irregularities.xz` file will be created. It's a compressed CSV with all the irregularities Rosie is able to find.
 
+A target directory (where files are saved) can also be passed — for example:
+
+```console
+$ python rosie.py run /my/serenata/directory/
+```
 
 ## Test suite
 
 ```console
-$ python -m unittest discover tests
+$ python rosie.py test
 ```
diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,4 @@
--e git+https://github.com/datasciencebr/serenata-toolbox.git#egg=serenata-toolbox
+git+https://github.com/datasciencebr/serenata-toolbox.git#egg=serenata-toolbox
 scikit-learn>=0.17
 scipy>=0.18
 geopy>=1.11.0
diff --git a/rosie.py b/rosie.py
@@ -0,0 +1,36 @@
+from sys import argv
+
+
+def entered_command(argv):
+    """Return the sub-command found in an argument vector.
+
+    `argv` is laid out like `sys.argv`: the script name comes first and the
+    sub-command, when given, is the second item. Returns None when no
+    sub-command was supplied.
+    """
+    return argv[1] if len(argv) >= 2 else None
+
+
+def help():
+    """Print the supported command-line invocations to stdout."""
+    usage_lines = [
+        'Usages:',
+        '  python rosie.py run',
+        '  python rosie.py run <path to output directory>',
+        '  python rosie.py test',
+    ]
+    print(*usage_lines, sep='\n')
+
+
+def run():
+    """Run Rosie, writing its output to the requested directory.
+
+    The directory is the second command-line argument when present and
+    '/tmp/serenata-data/' otherwise.
+    """
+    # Imported here so the package is only loaded when this command runs.
+    import rosie
+    if len(argv) >= 3:
+        target_directory = argv[2]
+    else:
+        target_directory = '/tmp/serenata-data/'
+    rosie.main(target_directory)
+
+
+def test():
+    """Discover and run the suite under `tests`, failing the process on errors.
+
+    Raises:
+        SystemExit: with status 1 when any test fails or errors, so that
+            callers such as a CI job see a non-zero exit code.
+    """
+    import unittest
+    suite = unittest.TestLoader().discover('tests')
+    result = unittest.TextTestRunner().run(suite)
+    # TextTestRunner.run() only reports the outcome; without this check the
+    # script would exit with status 0 even when tests fail.
+    if not result.wasSuccessful():
+        raise SystemExit(1)
+
+
+# Dispatch table: sub-command name -> handler. A missing or unknown
+# sub-command falls back to printing the usage text.
+commands = dict(run=run, test=test)
+command = commands.get(entered_command(argv), help)
+command()
diff --git a/rosie/__init__.py b/rosie/__init__.py
@@ -0,0 +1,63 @@
+import os.path
+
+import numpy as np
+from sklearn.externals import joblib
+
+from rosie.dataset import Dataset
+from rosie.meal_price_outlier_classifier import MealPriceOutlierClassifier
+from rosie.monthly_subquota_limit_classifier import MonthlySubquotaLimitClassifier
+from rosie.traveled_speeds_classifier import TraveledSpeedsClassifier
+
+
+class Rosie:
+    """Run every registered classifier over a dataset and save the verdicts.
+
+    Attributes:
+        dataset: DataFrame handed to each classifier.
+        data_path: directory holding the pickled models and the output file.
+        irregularities: copy of the DATASET_KEYS columns of `dataset`; one
+            extra column is added per classifier by `predict`.
+    """
+
+    # Classifier class -> name of the output column holding its verdicts.
+    CLASSIFIERS = {
+        MealPriceOutlierClassifier: 'meal_price_outlier',
+        MonthlySubquotaLimitClassifier: 'over_monthly_subquota_limit',
+        TraveledSpeedsClassifier: 'suspicious_traveled_speed_day',
+    }
+    # Columns copied from the dataset to identify each row in the output.
+    DATASET_KEYS = ['applicant_id', 'year', 'document_id']
+
+    def __init__(self, dataset, data_path):
+        self.dataset = dataset
+        self.data_path = data_path
+        self.irregularities = self.dataset[self.DATASET_KEYS].copy()
+
+    def run_classifiers(self):
+        """Apply each classifier and write `irregularities.xz` to data_path."""
+        for classifier, irregularity in self.CLASSIFIERS.items():
+            model = self.load_trained_model(classifier)
+            self.predict(model, irregularity)
+
+        self.irregularities.to_csv(os.path.join(self.data_path, 'irregularities.xz'),
+                                   compression='xz',
+                                   encoding='utf-8',
+                                   index=False)
+
+    def load_trained_model(self, classifier):
+        """Return a fitted instance of `classifier`, cached on disk if possible.
+
+        The model is loaded from `<data_path>/<lowercased class name>.pkl`
+        when that file exists; otherwise it is fitted on the dataset and
+        dumped there for the next run.
+        """
+        filename = '{}.pkl'.format(classifier.__name__.lower())
+        path = os.path.join(self.data_path, filename)
+        # palliative since this model is outputting
+        # a model too large to be loaded with joblib
+        cacheable = filename != 'monthlysubquotalimitclassifier.pkl'
+
+        if cacheable and os.path.isfile(path):
+            # NOTE: joblib.load unpickles the file, which can execute
+            # arbitrary code; data_path must be a trusted directory.
+            return joblib.load(path)
+
+        model = classifier()
+        model.fit(self.dataset)
+        if cacheable:
+            joblib.dump(model, path)
+        return model
+
+    def predict(self, model, irregularity):
+        """Store the predictions of `model` in the `irregularity` column.
+
+        Integer predictions are mapped to booleans: 1 becomes False and
+        -1 becomes True. Other values are stored untouched.
+        """
+        model.transform(self.dataset)
+        y = model.predict(self.dataset)
+        self.irregularities[irregularity] = y
+        # np.issubdtype accepts any integer width; the removed `np.int`
+        # alias only matched the platform's default integer type.
+        if np.issubdtype(y.dtype, np.integer):
+            self.irregularities.loc[y == 1, irregularity] = False
+            self.irregularities.loc[y == -1, irregularity] = True
+
+
+def main(target_directory='/tmp/serenata-data'):
+    """Build the dataset under `target_directory` and run all classifiers.
+
+    The same directory is used for the downloaded data and for the output.
+    """
+    rosie = Rosie(Dataset(target_directory).get(), target_directory)
+    rosie.run_classifiers()
diff --git a/rosie/__main__.py b/rosie/__main__.py
@@ -0,0 +1,4 @@
+from rosie import main
+
+
+# Executed when the package is run as a script; uses main()'s default
+# target directory.
+main()
diff --git a/rosie/dataset.py b/rosie/dataset.py
@@ -0,0 +1,52 @@
+import os
+
+import numpy as np
+import pandas as pd
+from serenata_toolbox.ceap_dataset import CEAPDataset
+from serenata_toolbox.datasets import fetch
+
+
+class Dataset:
+    """Download and assemble the reimbursements and companies data.
+
+    Attributes:
+        path: directory where the dataset files are downloaded and read.
+    """
+
+    # File name of the companies dataset, both when fetching and reading.
+    COMPANIES_DATASET = '2016-09-03-companies.xz'
+
+    def __init__(self, path):
+        self.path = path
+
+    def get(self):
+        """Refresh the local files and return reimbursements with companies.
+
+        Returns:
+            A DataFrame joining reimbursements (`cnpj_cpf`) to companies
+            (`cnpj`). pd.merge's default inner join is used, so rows
+            without a match on the other side are dropped.
+        """
+        self.update_datasets()
+        reimbursements = self.get_reimbursements()
+        companies = self.get_companies()
+        return pd.merge(reimbursements, companies,
+                        left_on='cnpj_cpf',
+                        right_on='cnpj')
+
+    def update_datasets(self):
+        """Create `path` if needed and download/prepare the source files."""
+        os.makedirs(self.path, exist_ok=True)
+        ceap = CEAPDataset(self.path)
+        ceap.fetch()
+        ceap.convert_to_csv()
+        ceap.translate()
+        ceap.clean()
+        fetch(self.COMPANIES_DATASET, self.path)
+
+    def get_reimbursements(self):
+        """Read `reimbursements.xz`, keeping identifier columns as text.
+
+        `issue_date` is parsed to datetimes; unparseable values become NaT.
+        """
+        # Builtin `str` replaces the removed `np.str` alias (same type).
+        dataset = \
+            pd.read_csv(os.path.join(self.path, 'reimbursements.xz'),
+                        dtype={'applicant_id': str,
+                               'cnpj_cpf': str,
+                               'congressperson_id': str,
+                               'subquota_number': str},
+                        low_memory=False)
+        dataset['issue_date'] = pd.to_datetime(dataset['issue_date'],
+                                               errors='coerce')
+        return dataset
+
+    def get_companies(self):
+        """Read the companies file, restricted to a bounding box.
+
+        Only rows whose longitude/latitude fall inside the `is_in_brazil`
+        box are kept, and `cnpj` is reduced to its digits.
+        """
+        is_in_brazil = ('(-73.992222 < longitude < -34.7916667) & '
+                        '(-33.742222 < latitude < 5.2722222)')
+        # `cnpj` is the column actually used below and in get(); read it as
+        # text too so the `.str` accessor always applies.
+        dataset = pd.read_csv(os.path.join(self.path, self.COMPANIES_DATASET),
+                              dtype={'cnpj': str, 'cnpj_cpf': str},
+                              low_memory=False)
+        # .copy() so the assignment below writes to an independent frame
+        # instead of a slice of the unfiltered data.
+        dataset = dataset.query(is_in_brazil).copy()
+        # regex=True is explicit: newer pandas versions treat the pattern as
+        # a literal string by default, which would leave `cnpj` unchanged.
+        dataset['cnpj'] = dataset['cnpj'].str.replace(r'\D', '', regex=True)
+        return dataset
diff --git a/rosie/main.py b/rosie/main.py
diff --git a/rosie/meal_price_outlier_classifier.py b/rosie/meal_price_outlier_classifier.py
@@ -0,0 +1,84 @@
+import unicodedata
+
+import numpy as np
+import pandas as pd
+from sklearn.base import TransformerMixin
+from sklearn.cluster import KMeans
+
+
+class MealPriceOutlierClassifier(TransformerMixin):
+    """Flag meal reimbursements priced far above what is usual.
+
+    Companies are summarized by the mean and standard deviation of their
+    meal prices and grouped into three clusters with KMeans. A receipt is
+    an outlier when its value exceeds the threshold of its company:
+    mean + 3 * std of the company itself when the company has enough
+    history (more than 3 congresspeople and more than 20 records),
+    otherwise mean + 4 * std of the cluster the company falls in.
+
+    Attributes set by `fit`:
+        cluster_model: the fitted KMeans instance.
+        clusters: DataFrame with one row per cluster and the columns
+            `cluster`, `mean`, `std` and `threshold`.
+    """
+
+    # Matches "hotel", "hotels" and "hoteis" in normalized supplier names.
+    HOTEL_REGEX = r'hote(?:(?:ls?)|is)'
+    # Company statistics used as clustering features.
+    CLUSTER_KEYS = ['mean', 'std']
+
+    def fit(self, X):
+        """Cluster well-known companies and compute one threshold per cluster.
+
+        Returns:
+            self, to allow call chaining.
+        """
+        _X = X[self.__applicable_rows(X)]
+        companies = _X.groupby('cnpj_cpf').apply(self.__company_stats) \
+            .reset_index()
+        # .copy() because a column is added below; assigning to a filtered
+        # slice is ambiguous in pandas.
+        companies = \
+            companies[self.__applicable_company_rows(companies)].copy()
+
+        self.cluster_model = KMeans(n_clusters=3)
+        self.cluster_model.fit(companies[self.CLUSTER_KEYS])
+        companies['cluster'] = \
+            self.cluster_model.predict(companies[self.CLUSTER_KEYS])
+        self.clusters = companies.groupby('cluster') \
+            .apply(self.__cluster_stats) \
+            .reset_index()
+        self.clusters['threshold'] = \
+            self.clusters['mean'] + 4 * self.clusters['std']
+        return self
+
+    def transform(self, X=None):
+        """Do nothing; present so the classifier exposes `transform`."""
+        pass
+
+    def predict(self, X):
+        """Classify every row of X as regular (1) or outlier (-1).
+
+        Rows that are not applicable meal expenses, or whose company has no
+        threshold, are always classified as 1.
+
+        Returns:
+            A Series of 1 / -1 named `y`, indexed like X.
+        """
+        _X = X.copy()
+        companies = _X[self.__applicable_rows(_X)] \
+            .groupby('cnpj_cpf').apply(self.__company_stats) \
+            .reset_index()
+        companies['cluster'] = \
+            self.cluster_model.predict(companies[self.CLUSTER_KEYS])
+        companies = pd.merge(companies,
+                             self.clusters[['cluster', 'threshold']],
+                             how='left',
+                             on='cluster')
+        _X = pd.merge(_X, companies[['cnpj_cpf', 'threshold']],
+                      how='left',
+                      on='cnpj_cpf')
+
+        # Companies with enough history get their own, stricter threshold.
+        # `companies` holds one row per cnpj_cpf, so this is computed
+        # column-wise; the shape of a groupby().apply() result would depend
+        # on the number of groups and on the pandas version.
+        known_companies = companies[self.__applicable_company_rows(companies)]
+        known_thresholds = known_companies[['cnpj_cpf']].copy()
+        known_thresholds['cnpj_threshold'] = \
+            known_companies['mean'] + 3 * known_companies['std']
+        _X = pd.merge(_X, known_thresholds, how='left', on='cnpj_cpf')
+        has_own_threshold = _X['cnpj_threshold'].notnull()
+        _X.loc[has_own_threshold, 'threshold'] = \
+            _X.loc[has_own_threshold, 'cnpj_threshold']
+
+        # pd.merge returns a freshly numbered index. The left joins above
+        # keep the row order and count of X (right-hand keys are unique),
+        # so the caller's index can be put back for label alignment.
+        _X.index = X.index
+
+        _X['y'] = 1
+        is_outlier = self.__applicable_rows(_X) & \
+            _X['threshold'].notnull() & \
+            (_X['total_net_value'] > _X['threshold'])
+        _X.loc[is_outlier, 'y'] = -1
+        return _X['y']
+
+    def __applicable_rows(self, X):
+        """Mask of meal expenses with a 14-character CNPJ, hotels excluded."""
+        normalized_supplier = X['supplier'].apply(self.__normalize_string)
+        return (X['subquota_description'] == 'Congressperson meal') & \
+            (X['cnpj_cpf'].str.len() == 14) & \
+            (~normalized_supplier.str.contains(self.HOTEL_REGEX))
+
+    def __applicable_company_rows(self, companies):
+        """Mask of companies with enough history to be judged on their own."""
+        return (companies['congresspeople'] > 3) & (companies['records'] > 20)
+
+    def __company_stats(self, X):
+        """Summarize the receipts of one company (one groupby group)."""
+        stats = {'mean': np.mean(X['total_net_value']),
+                 'std': np.std(X['total_net_value']),
+                 'congresspeople': len(np.unique(X['applicant_id'])),
+                 'records': len(X)}
+        return pd.Series(stats)
+
+    def __cluster_stats(self, X):
+        """Average the company means and company stds of one cluster."""
+        stats = {'mean': np.mean(X['mean']),
+                 'std': np.mean(X['std'])}
+        return pd.Series(stats)
+
+    def __normalize_string(self, string):
+        """Lowercase `string` and drop accents and other non-ASCII characters."""
+        nfkd_form = unicodedata.normalize('NFKD', string.lower())
+        return nfkd_form.encode('ASCII', 'ignore').decode('utf-8')