This repository has been archived by the owner on Mar 1, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 60
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #13 from datasciencebr/irio-meal-price-outlier
Anomalies in meal prices
- Loading branch information
Showing
19 changed files
with
490 additions
and
91 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
engines: | ||
pep8: | ||
enabled: true | ||
ratings: | ||
paths: | ||
- "**.py" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
[run] | ||
source = rosie |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,6 @@ | ||
rosie/serenata-toolbox | ||
rosie/pip-delete-this-directory.txt | ||
__pycache__ | ||
*.pyc | ||
.coverage | ||
.python-version | ||
__pycache__/ | ||
htmlcov/ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,10 @@ | ||
language: python | ||
python: 3.5 | ||
install: ./setup | ||
script: python -m unittest discover tests | ||
install: | ||
- ./setup | ||
- pip install coveralls | ||
script: | ||
- coverage run rosie.py test | ||
|
||
after_success: | ||
- coveralls |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
-e git+https://github.com/datasciencebr/serenata-toolbox.git#egg=serenata-toolbox | ||
git+https://github.com/datasciencebr/serenata-toolbox.git#egg=serenata-toolbox | ||
scikit-learn>=0.17 | ||
scipy>=0.18 | ||
geopy>=1.11.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
from sys import argv | ||
|
||
|
||
def entered_command(argv):
    """Return the sub-command from an argument vector, or ``None``.

    ``argv`` follows the ``sys.argv`` layout: element 0 is the script name
    and element 1, when present, is the sub-command.
    """
    try:
        return argv[1]
    except IndexError:
        # Only the script name (or nothing at all) was supplied.
        return None
|
||
|
||
def help():
    """Print the list of supported command-line invocations to stdout."""
    usage_lines = [
        'Usages:',
        ' python rosie.py run',
        ' python rosie.py run <path to output directory>',
        ' python rosie.py test',
    ]
    # One print call, one line per entry, single trailing newline.
    print(*usage_lines, sep='\n')
|
||
|
||
def run():
    """Call ``rosie.main`` with the directory given as second CLI argument.

    When the command line carries no directory, ``/tmp/serenata-data/`` is
    used instead.
    """
    # Imported lazily so the other sub-commands do not load the package.
    import rosie
    if len(argv) >= 3:
        output_directory = argv[2]
    else:
        output_directory = '/tmp/serenata-data/'
    rosie.main(output_directory)
|
||
|
||
def test():
    """Discover and run the unit tests found under ``tests``.

    Returns normally when every test passes. When any test fails or errors,
    the process exits with status 1 so that callers relying on the exit code
    (for example a CI job invoking ``rosie.py test``) see the failure instead
    of an unconditional success.
    """
    import sys
    import unittest
    loader = unittest.TestLoader()
    tests = loader.discover('tests')
    testRunner = unittest.runner.TextTestRunner()
    result = testRunner.run(tests)
    # TextTestRunner only reports; it never changes the exit status itself.
    if not result.wasSuccessful():
        sys.exit(1)
|
||
|
||
# Dispatch table: sub-command name -> handler. Anything that is not a known
# sub-command (including no sub-command at all) falls back to the usage text.
commands = dict(run=run, test=test)
command = commands.get(entered_command(argv), help)
command()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
import os.path | ||
|
||
import numpy as np | ||
from sklearn.externals import joblib | ||
|
||
from rosie.dataset import Dataset | ||
from rosie.meal_price_outlier_classifier import MealPriceOutlierClassifier | ||
from rosie.monthly_subquota_limit_classifier import MonthlySubquotaLimitClassifier | ||
from rosie.traveled_speeds_classifier import TraveledSpeedsClassifier | ||
|
||
|
||
class Rosie:
    """Run every registered classifier over a dataset and save the flags.

    The result is a table holding the ``DATASET_KEYS`` columns of the dataset
    plus one column per classifier, written to ``irregularities.xz`` inside
    ``data_path``.
    """

    # Classifier class -> name of the column its predictions are stored in.
    CLASSIFIERS = {
        MealPriceOutlierClassifier: 'meal_price_outlier',
        MonthlySubquotaLimitClassifier: 'over_monthly_subquota_limit',
        TraveledSpeedsClassifier: 'suspicious_traveled_speed_day',
    }
    # Columns copied from the dataset to identify each row of the output.
    DATASET_KEYS = ['applicant_id', 'year', 'document_id']

    def __init__(self, dataset, data_path):
        """Keep the dataset and the directory used for models and output.

        dataset: table indexable by ``DATASET_KEYS`` (a pandas DataFrame in
            ``main``).
        data_path: directory where trained models are cached and where the
            irregularities file is written.
        """
        self.dataset = dataset
        self.data_path = data_path
        self.irregularities = self.dataset[self.DATASET_KEYS].copy()

    def run_classifiers(self):
        """Apply every classifier, then write the irregularities file."""
        for classifier, irregularity in self.CLASSIFIERS.items():
            model = self.load_trained_model(classifier)
            self.predict(model, irregularity)

        output_path = os.path.join(self.data_path, 'irregularities.xz')
        self.irregularities.to_csv(output_path,
                                   compression='xz',
                                   encoding='utf-8',
                                   index=False)

    def load_trained_model(self, classifier):
        """Return a fitted instance of ``classifier``.

        A model previously dumped to ``<lowercased class name>.pkl`` inside
        ``data_path`` is reused; otherwise a new instance is fitted on the
        dataset and dumped there for the next run.
        """
        filename = '{}.pkl'.format(classifier.__name__.lower())
        path = os.path.join(self.data_path, filename)
        # palliative since this model is outputting
        # a model too large to be loaded with joblib
        cacheable = filename != 'monthlysubquotalimitclassifier.pkl'

        if cacheable and os.path.isfile(path):
            # NOTE: joblib.load unpickles the file, so data_path must only
            # ever point at a directory whose contents are trusted.
            return joblib.load(path)

        model = classifier()
        model.fit(self.dataset)
        if cacheable:
            joblib.dump(model, path)
        return model

    def predict(self, model, irregularity):
        """Store the predictions of ``model`` in column ``irregularity``.

        Integer predictions are treated as +1 / -1 labels and stored as
        booleans, ``True`` meaning the row was labelled -1. Predictions of
        any other dtype are stored unchanged.
        """
        model.transform(self.dataset)
        y = model.predict(self.dataset)
        # np.issubdtype replaces the removed ``np.int`` alias and accepts
        # every integer width, not only the platform default.
        if np.issubdtype(y.dtype, np.integer):
            # Build the boolean column in one step instead of writing
            # booleans into an integer column.
            y = (y == -1)
        self.irregularities[irregularity] = y
|
||
|
||
def main(target_directory='/tmp/serenata-data'):
    """Load the dataset kept in ``target_directory`` and classify it.

    The same directory is handed to ``Rosie`` as its data path.
    """
    loader = Dataset(target_directory)
    Rosie(loader.get(), target_directory).run_classifiers()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
import rosie

# Run Rosie with its default target directory.
rosie.main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
import os | ||
|
||
import numpy as np | ||
import pandas as pd | ||
from serenata_toolbox.ceap_dataset import CEAPDataset | ||
from serenata_toolbox.datasets import fetch | ||
|
||
|
||
class Dataset:
    """Download and load the data stored under one directory.

    ``get`` refreshes the files in ``path`` and returns the reimbursements
    joined with the companies dataset.
    """

    # File name of the companies dataset, both remotely and inside ``path``.
    COMPANIES_DATASET = '2016-09-03-companies.xz'

    def __init__(self, path):
        """Remember ``path``, the directory holding the dataset files."""
        self.path = path

    def get(self):
        """Update the files on disk and return the merged table.

        Reimbursements are inner-joined with companies, matching the
        reimbursement ``cnpj_cpf`` against the company ``cnpj``.
        """
        self.update_datasets()
        reimbursements = self.get_reimbursements()
        companies = self.get_companies()
        return pd.merge(reimbursements, companies,
                        left_on='cnpj_cpf',
                        right_on='cnpj')

    def update_datasets(self):
        """Create ``path`` if needed and fetch/prepare the source files."""
        os.makedirs(self.path, exist_ok=True)
        ceap = CEAPDataset(self.path)
        ceap.fetch()
        ceap.convert_to_csv()
        ceap.translate()
        ceap.clean()
        fetch(self.COMPANIES_DATASET, self.path)

    def get_reimbursements(self):
        """Read ``reimbursements.xz`` from ``path``.

        Identifier columns are kept as strings so leading zeros survive, and
        ``issue_date`` is parsed to datetimes (unparseable values become
        ``NaT``).
        """
        # Built-in ``str`` replaces the removed ``np.str`` alias.
        dataset = pd.read_csv(os.path.join(self.path, 'reimbursements.xz'),
                              dtype={'applicant_id': str,
                                     'cnpj_cpf': str,
                                     'congressperson_id': str,
                                     'subquota_number': str},
                              low_memory=False)
        dataset['issue_date'] = pd.to_datetime(dataset['issue_date'],
                                               errors='coerce')
        return dataset

    def get_companies(self):
        """Read the companies dataset, keeping only rows inside Brazil.

        Rows are kept when their coordinates fall inside the bounding box
        below; the ``cnpj`` column is reduced to its digits so it can be
        matched against ``cnpj_cpf`` in the reimbursements.
        """
        is_in_brazil = ('(-73.992222 < longitude < -34.7916667) & '
                        '(-33.742222 < latitude < 5.2722222)')
        dataset = pd.read_csv(os.path.join(self.path, self.COMPANIES_DATASET),
                              dtype={'cnpj_cpf': str, 'cnpj': str},
                              low_memory=False)
        # Copy the filtered rows so the column assignment below targets an
        # independent frame rather than a view of the full table.
        dataset = dataset.query(is_in_brazil).copy()
        # regex=True is required: recent pandas treats the pattern as a
        # literal string by default, which would leave the CNPJ untouched.
        dataset['cnpj'] = dataset['cnpj'].str.replace(r'\D', '', regex=True)
        return dataset
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
import unicodedata | ||
|
||
import numpy as np | ||
import pandas as pd | ||
from sklearn.base import TransformerMixin | ||
from sklearn.cluster import KMeans | ||
|
||
|
||
class MealPriceOutlierClassifier(TransformerMixin):
    """Flag meal reimbursements that are expensive for their supplier.

    ``fit`` clusters suppliers with enough history by the mean and standard
    deviation of their meal prices and derives one price threshold per
    cluster. ``predict`` labels an applicable reimbursement ``-1`` when its
    ``total_net_value`` exceeds the threshold of its supplier, and ``1``
    otherwise.
    """

    # Matched against the normalized supplier name to leave hotels out.
    HOTEL_REGEX = r'hote(?:(?:ls?)|is)'
    # Per-company statistics used as clustering features.
    CLUSTER_KEYS = ['mean', 'std']

    def fit(self, X):
        """Learn the clusters and their thresholds from ``X``.

        Sets ``cluster_model`` (a 3-cluster KMeans) and ``clusters`` (one row
        per cluster with its ``threshold`` = mean + 4 * std) and returns
        ``self``.
        """
        _X = X[self.__applicable_rows(X)]
        companies = _X.groupby('cnpj_cpf').apply(self.__company_stats) \
            .reset_index()
        # Copy so that adding the 'cluster' column below writes to an
        # independent frame instead of a filtered slice.
        companies = companies[self.__applicable_company_rows(companies)].copy()

        self.cluster_model = KMeans(n_clusters=3)
        self.cluster_model.fit(companies[self.CLUSTER_KEYS])
        companies['cluster'] = \
            self.cluster_model.predict(companies[self.CLUSTER_KEYS])
        self.clusters = companies.groupby('cluster') \
            .apply(self.__cluster_stats) \
            .reset_index()
        self.clusters['threshold'] = \
            self.clusters['mean'] + 4 * self.clusters['std']
        return self

    def transform(self, X=None):
        """Do nothing; present so callers can invoke it before ``predict``."""

    def predict(self, X):
        """Return a Series of ``1`` / ``-1`` labels indexed like ``X``.

        Every supplier seen in the applicable rows gets the threshold of the
        cluster it is assigned to; suppliers with enough history of their own
        use mean + 3 * std of their own prices instead. A row is labelled
        ``-1`` when it is applicable and its value exceeds that threshold.
        """
        applicable = self.__applicable_rows(X)
        y = pd.Series(1, index=X.index, name='y')
        if not applicable.any():
            # No meal reimbursement to judge: nothing can be an outlier.
            return y

        companies = X[applicable] \
            .groupby('cnpj_cpf').apply(self.__company_stats) \
            .reset_index()
        companies['cluster'] = \
            self.cluster_model.predict(companies[self.CLUSTER_KEYS])
        companies = pd.merge(companies,
                             self.clusters[['cluster', 'threshold']],
                             how='left', on='cluster')

        # Companies with enough records replace the cluster threshold by one
        # computed from their own prices.
        own_threshold = companies['mean'] + 3 * companies['std']
        use_own = self.__applicable_company_rows(companies) & \
            own_threshold.notnull()
        companies['threshold'] = companies['threshold'].mask(use_own,
                                                             own_threshold)

        # ``companies`` has one row per supplier, so this left merge keeps
        # the rows of X in their original order and number.
        priced = pd.merge(X[['cnpj_cpf', 'total_net_value']],
                          companies[['cnpj_cpf', 'threshold']],
                          how='left', on='cnpj_cpf')
        is_outlier = applicable.values & \
            priced['threshold'].notnull().values & \
            (priced['total_net_value'] > priced['threshold']).values
        y[is_outlier] = -1
        return y

    def __applicable_rows(self, X):
        """Mask of meal expenses from 14-character suppliers, hotels excluded."""
        supplier = X['supplier'].apply(self.__normalize_string)
        return (X['subquota_description'] == 'Congressperson meal') & \
            (X['cnpj_cpf'].str.len() == 14) & \
            (~supplier.str.contains(self.HOTEL_REGEX))

    def __applicable_company_rows(self, companies):
        """Mask of companies with over 3 congresspeople and over 20 records."""
        return (companies['congresspeople'] > 3) & (companies['records'] > 20)

    def __company_stats(self, X):
        """Summarize the reimbursements of one company."""
        stats = {'mean': np.mean(X['total_net_value']),
                 'std': np.std(X['total_net_value']),
                 'congresspeople': len(np.unique(X['applicant_id'])),
                 'records': len(X)}
        return pd.Series(stats)

    def __cluster_stats(self, X):
        """Average the company means and the company stds of one cluster."""
        stats = {'mean': np.mean(X['mean']),
                 'std': np.mean(X['std'])}
        return pd.Series(stats)

    def __normalize_string(self, string):
        """Lowercase ``string`` and strip accents and non-ASCII characters.

        Values that are not strings (such as a missing supplier) normalize
        to the empty string rather than raising.
        """
        if not isinstance(string, str):
            return ''
        nfkd_form = unicodedata.normalize('NFKD', string.lower())
        return nfkd_form.encode('ASCII', 'ignore').decode('utf-8')
Oops, something went wrong.