Merge pull request #13 from datasciencebr/irio-meal-price-outlier
Anomalies in meal prices
cabral committed Dec 14, 2016
2 parents 015b355 + bd88b05 commit 2a153c6
Showing 19 changed files with 490 additions and 91 deletions.
6 changes: 6 additions & 0 deletions .codeclimate.yml
@@ -0,0 +1,6 @@
engines:
  pep8:
    enabled: true
ratings:
  paths:
    - "**.py"
2 changes: 2 additions & 0 deletions .coveragerc
@@ -0,0 +1,2 @@
[run]
source = rosie
9 changes: 6 additions & 3 deletions .gitignore
@@ -1,3 +1,6 @@
-rosie/serenata-toolbox
-rosie/pip-delete-this-directory.txt
-__pycache__
+*.pyc
+.coverage
+.python-version
+__pycache__/
+htmlcov/
+
10 changes: 8 additions & 2 deletions .travis.yml
@@ -1,4 +1,10 @@
language: python
python: 3.5
-install: ./setup
-script: python -m unittest discover tests
+install:
+  - ./setup
+  - pip install coveralls
+script:
+  - coverage run rosie.py test
+
+after_success:
+  - coveralls
13 changes: 11 additions & 2 deletions README.md
@@ -1,5 +1,9 @@
# Rosie, the robot

[![Build Status](https://travis-ci.org/datasciencebr/rosie.svg?branch=master)](https://travis-ci.org/datasciencebr/rosie)
[![Code Climate](https://codeclimate.com/github/datasciencebr/rosie/badges/gpa.svg)](https://codeclimate.com/github/datasciencebr/rosie)
[![Coverage Status](https://coveralls.io/repos/github/datasciencebr/rosie/badge.svg?branch=master)](https://coveralls.io/github/datasciencebr/rosie?branch=master)

A Python application that reads receipts from the Quota for Exercising Parliamentary Activity (aka CEAP, from the Brazilian Chamber of Deputies) and outputs, for each receipt, a "probability of corruption" and a list of reasons why it is considered suspicious.

- [x] Fetch CEAP dataset from Chamber of Deputies
@@ -25,14 +29,19 @@ $ ./setup
## Running

```console
-$ python rosie/main.py
+$ python rosie.py run
```

A `/tmp/serenata-data/irregularities.xz` file will be created. It's a compressed CSV with all the irregularities Rosie is able to find.
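
To inspect the result, the file can be read back with pandas. A minimal sketch (`compression='xz'` mirrors how the file is written, and the irregularity column names come from the `CLASSIFIERS` mapping in `rosie/__init__.py` below):

```python
import pandas as pd

# load Rosie's output; each irregularity column is a boolean flag
irregularities = pd.read_csv('/tmp/serenata-data/irregularities.xz',
                             compression='xz')
print(irregularities['meal_price_outlier'].sum(), 'meal price outliers')
```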

A target directory (where files are saved) can also be passed, for example:

```console
$ python rosie.py run /my/serenata/directory/
```

## Test suite

```console
-$ python -m unittest discover tests
+$ python rosie.py test
```
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,4 +1,4 @@
--e git+https://github.com/datasciencebr/serenata-toolbox.git#egg=serenata-toolbox
+git+https://github.com/datasciencebr/serenata-toolbox.git#egg=serenata-toolbox
scikit-learn>=0.17
scipy>=0.18
geopy>=1.11.0
36 changes: 36 additions & 0 deletions rosie.py
@@ -0,0 +1,36 @@
from sys import argv


def entered_command(argv):
    if len(argv) >= 2:
        return argv[1]
    return None


def help():
    message = (
        'Usages:',
        ' python rosie.py run',
        ' python rosie.py run <path to output directory>',
        ' python rosie.py test',
    )
    print('\n'.join(message))


def run():
    import rosie
    target_directory = argv[2] if len(argv) >= 3 else '/tmp/serenata-data/'
    rosie.main(target_directory)


def test():
    import unittest
    loader = unittest.TestLoader()
    tests = loader.discover('tests')
    runner = unittest.runner.TextTestRunner()
    runner.run(tests)


# dispatch on the first CLI argument, falling back to the usage message
commands = {'run': run, 'test': test}
command = commands.get(entered_command(argv), help)
command()
63 changes: 63 additions & 0 deletions rosie/__init__.py
@@ -0,0 +1,63 @@
import os.path

import numpy as np
from sklearn.externals import joblib

from rosie.dataset import Dataset
from rosie.meal_price_outlier_classifier import MealPriceOutlierClassifier
from rosie.monthly_subquota_limit_classifier import MonthlySubquotaLimitClassifier
from rosie.traveled_speeds_classifier import TraveledSpeedsClassifier


class Rosie:
    CLASSIFIERS = {
        MealPriceOutlierClassifier: 'meal_price_outlier',
        MonthlySubquotaLimitClassifier: 'over_monthly_subquota_limit',
        TraveledSpeedsClassifier: 'suspicious_traveled_speed_day',
    }
    DATASET_KEYS = ['applicant_id', 'year', 'document_id']

    def __init__(self, dataset, data_path):
        self.dataset = dataset
        self.data_path = data_path
        self.irregularities = self.dataset[self.DATASET_KEYS].copy()

    def run_classifiers(self):
        for classifier, irregularity in self.CLASSIFIERS.items():
            model = self.load_trained_model(classifier)
            self.predict(model, irregularity)

        self.irregularities.to_csv(os.path.join(self.data_path, 'irregularities.xz'),
                                   compression='xz',
                                   encoding='utf-8',
                                   index=False)

    def load_trained_model(self, classifier):
        filename = '{}.pkl'.format(classifier.__name__.lower())
        path = os.path.join(self.data_path, filename)
        # palliative: this classifier trains a model too large to be
        # dumped and loaded with joblib, so it is refitted on every run
        if filename == 'monthlysubquotalimitclassifier.pkl':
            model = classifier()
            model.fit(self.dataset)
        else:
            if os.path.isfile(path):
                model = joblib.load(path)
            else:
                model = classifier()
                model.fit(self.dataset)
                joblib.dump(model, path)
        return model

    def predict(self, model, irregularity):
        model.transform(self.dataset)
        y = model.predict(self.dataset)
        self.irregularities[irregularity] = y
        # outlier classifiers follow the scikit-learn convention of
        # returning 1 for regular records and -1 for outliers; convert
        # those labels to booleans (True means suspicious)
        if y.dtype == np.int:
            self.irregularities.loc[y == 1, irregularity] = False
            self.irregularities.loc[y == -1, irregularity] = True


def main(target_directory='/tmp/serenata-data'):
    dataset = Dataset(target_directory).get()
    Rosie(dataset, target_directory).run_classifiers()
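
Each classifier follows the scikit-learn outlier convention of returning `1` for regular records and `-1` for suspicious ones, which `predict` above converts to booleans (`True` means suspicious). Besides the `rosie.py` wrapper, the pipeline can be driven straight from Python; a minimal sketch (assuming the dependencies are installed; note that the first run downloads the full CEAP dataset):

```python
import rosie

# fetches/updates the datasets, runs every classifier and writes
# irregularities.xz into the given directory
rosie.main('/tmp/serenata-data')
```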
4 changes: 4 additions & 0 deletions rosie/__main__.py
@@ -0,0 +1,4 @@
from rosie import main


main()
52 changes: 52 additions & 0 deletions rosie/dataset.py
@@ -0,0 +1,52 @@
import os

import numpy as np
import pandas as pd
from serenata_toolbox.ceap_dataset import CEAPDataset
from serenata_toolbox.datasets import fetch


class Dataset:
    COMPANIES_DATASET = '2016-09-03-companies.xz'

    def __init__(self, path):
        self.path = path

    def get(self):
        self.update_datasets()
        reimbursements = self.get_reimbursements()
        companies = self.get_companies()
        return pd.merge(reimbursements, companies,
                        left_on='cnpj_cpf',
                        right_on='cnpj')

    def update_datasets(self):
        os.makedirs(self.path, exist_ok=True)
        ceap = CEAPDataset(self.path)
        ceap.fetch()
        ceap.convert_to_csv()
        ceap.translate()
        ceap.clean()
        fetch(self.COMPANIES_DATASET, self.path)

    def get_reimbursements(self):
        dataset = \
            pd.read_csv(os.path.join(self.path, 'reimbursements.xz'),
                        dtype={'applicant_id': np.str,
                               'cnpj_cpf': np.str,
                               'congressperson_id': np.str,
                               'subquota_number': np.str},
                        low_memory=False)
        dataset['issue_date'] = pd.to_datetime(dataset['issue_date'],
                                               errors='coerce')
        return dataset

    def get_companies(self):
        # rough bounding box around Brazilian territory, used to drop
        # companies geocoded to clearly bogus coordinates
        is_in_brazil = ('(-73.992222 < longitude < -34.7916667) & '
                        '(-33.742222 < latitude < 5.2722222)')
        dataset = pd.read_csv(os.path.join(self.path, self.COMPANIES_DATASET),
                              dtype={'cnpj_cpf': np.str},
                              low_memory=False)
        dataset = dataset.query(is_in_brazil)
        # keep only digits so companies can be matched against the
        # reimbursements' cnpj_cpf column
        dataset['cnpj'] = dataset['cnpj'].str.replace(r'\D', '')
        return dataset
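
The merge in `get` only matches rows because both sides use bare digits: reimbursements carry `cnpj_cpf` as a 14-digit string, and `get_companies` strips punctuation from `cnpj`. A toy sketch of that normalization with made-up values (`regex=True` is written out for newer pandas; it was the implicit default here):

```python
import pandas as pd

reimbursements = pd.DataFrame({'cnpj_cpf': ['12345678000195'],
                               'total_net_value': [42.0]})
companies = pd.DataFrame({'cnpj': ['12.345.678/0001-95'],
                          'name': ['RESTAURANTE EXEMPLO']})

# the same \D normalization used in Dataset.get_companies
companies['cnpj'] = companies['cnpj'].str.replace(r'\D', '', regex=True)
print(pd.merge(reimbursements, companies,
               left_on='cnpj_cpf', right_on='cnpj'))
```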
74 changes: 0 additions & 74 deletions rosie/main.py

This file was deleted.

84 changes: 84 additions & 0 deletions rosie/meal_price_outlier_classifier.py
@@ -0,0 +1,84 @@
import unicodedata

import numpy as np
import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.cluster import KMeans


class MealPriceOutlierClassifier(TransformerMixin):

    # matches "hotel"/"hotels" and the Portuguese plural "hoteis"
    HOTEL_REGEX = r'hote(?:(?:ls?)|is)'
    CLUSTER_KEYS = ['mean', 'std']

    def fit(self, X):
        _X = X[self.__applicable_rows(X)]
        companies = _X.groupby('cnpj_cpf').apply(self.__company_stats) \
            .reset_index()
        companies = companies[self.__applicable_company_rows(companies)]

        self.cluster_model = KMeans(n_clusters=3)
        self.cluster_model.fit(companies[self.CLUSTER_KEYS])
        companies['cluster'] = self.cluster_model.predict(companies[self.CLUSTER_KEYS])
        self.clusters = companies.groupby('cluster') \
            .apply(self.__cluster_stats) \
            .reset_index()
        self.clusters['threshold'] = \
            self.clusters['mean'] + 4 * self.clusters['std']
        return self

    def transform(self, X=None):
        # no-op: present only to satisfy the scikit-learn transformer interface
        pass

    def predict(self, X):
        _X = X.copy()
        companies = _X[self.__applicable_rows(_X)] \
            .groupby('cnpj_cpf').apply(self.__company_stats) \
            .reset_index()
        companies['cluster'] = \
            self.cluster_model.predict(companies[self.CLUSTER_KEYS])
        companies = pd.merge(companies,
                             self.clusters[['cluster', 'threshold']],
                             how='left')
        _X = pd.merge(_X, companies[['cnpj_cpf', 'threshold']], how='left')
        known_companies = companies[self.__applicable_company_rows(companies)]
        known_thresholds = known_companies \
            .groupby('cnpj_cpf') \
            .apply(lambda x: x['mean'] + 3 * x['std']) \
            .reset_index() \
            .rename(columns={0: 'cnpj_threshold'})
        _X = pd.merge(_X, known_thresholds, how='left')
        if 'cnpj_threshold' in _X.columns:
            _X.loc[_X['cnpj_threshold'].notnull(),
                   'threshold'] = _X['cnpj_threshold']
        _X['y'] = 1
        is_outlier = self.__applicable_rows(_X) & \
            _X['threshold'].notnull() & \
            (_X['total_net_value'] > _X['threshold'])
        _X.loc[is_outlier, 'y'] = -1
        return _X['y']

    def __applicable_rows(self, X):
        return (X['subquota_description'] == 'Congressperson meal') & \
            (X['cnpj_cpf'].str.len() == 14) & \
            (~X['supplier'].apply(self.__normalize_string).str.contains(self.HOTEL_REGEX))

    def __applicable_company_rows(self, companies):
        return (companies['congresspeople'] > 3) & (companies['records'] > 20)

    def __company_stats(self, X):
        stats = {'mean': np.mean(X['total_net_value']),
                 'std': np.std(X['total_net_value']),
                 'congresspeople': len(np.unique(X['applicant_id'])),
                 'records': len(X)}
        return pd.Series(stats)

    def __cluster_stats(self, X):
        stats = {'mean': np.mean(X['mean']),
                 'std': np.mean(X['std'])}
        return pd.Series(stats)

    def __normalize_string(self, string):
        nfkd_form = unicodedata.normalize('NFKD', string.lower())
        return nfkd_form.encode('ASCII', 'ignore').decode('utf-8')
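
In short, `fit` clusters per-company price statistics (`mean`, `std`) with KMeans and sets each cluster's threshold at the mean plus four standard deviations, while `predict` flags a meal whose `total_net_value` exceeds the applicable threshold, preferring the company's own mean plus three standard deviations when the company has enough history (more than 3 congresspeople and more than 20 records). A usage sketch on synthetic data (all values are made up; it assumes the repository root is on `PYTHONPATH`):

```python
import numpy as np
import pandas as pd

from rosie.meal_price_outlier_classifier import MealPriceOutlierClassifier

rng = np.random.RandomState(0)
rows = []
# three fake companies, each with 25 meals by 5 congresspeople, enough
# to pass the "more than 3 congresspeople and 20 records" filter
for i, cnpj in enumerate(['11111111000111', '22222222000122',
                          '33333333000133']):
    for j in range(25):
        rows.append({'subquota_description': 'Congressperson meal',
                     'cnpj_cpf': cnpj,
                     'supplier': 'RESTAURANTE {}'.format(i),
                     'applicant_id': str(j % 5),
                     'total_net_value': 30.0 * (i + 1) + rng.randn()})
rows[-1]['total_net_value'] = 500.0  # one implausibly expensive meal

meals = pd.DataFrame(rows)
model = MealPriceOutlierClassifier().fit(meals)
labels = model.predict(meals)  # 1 = regular, -1 = suspicious
print(meals.loc[labels == -1, ['cnpj_cpf', 'total_net_value']])
```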