Skip to content
This repository has been archived by the owner on Mar 1, 2018. It is now read-only.

Commit

Permalink
Merge 054ddac into 40af34e
Browse files Browse the repository at this point in the history
  • Loading branch information
cuducos committed Apr 18, 2017
2 parents 40af34e + 054ddac commit 202b490
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 76 deletions.
72 changes: 6 additions & 66 deletions rosie/chamber_of_deputies/__init__.py
Original file line number Diff line number Diff line change
@@ -1,69 +1,9 @@
import os.path

import numpy as np
from sklearn.externals import joblib

from rosie.chamber_of_deputies.dataset import Dataset
from rosie.chamber_of_deputies.classifiers.election_expenses_classifier import ElectionExpensesClassifier
from rosie.core.classifiers.invalid_cnpj_cpf_classifier import InvalidCnpjCpfClassifier
from rosie.chamber_of_deputies.classifiers.meal_price_outlier_classifier import MealPriceOutlierClassifier
from rosie.chamber_of_deputies.classifiers.monthly_subquota_limit_classifier import MonthlySubquotaLimitClassifier
from rosie.chamber_of_deputies.classifiers.traveled_speeds_classifier import TraveledSpeedsClassifier
from rosie.chamber_of_deputies.classifiers.irregular_companies_classifier import IrregularCompaniesClassifier


class ChamberOfDeputies:
    """Run every suspicion classifier over the Chamber of Deputies
    reimbursements and persist the verdicts as `irregularities.xz`.

    :param dataset: (pandas.DataFrame) reimbursements to analyze; must
        contain at least the DATASET_KEYS columns.
    :param data_path: (str) directory where trained models are cached and
        where the output CSV is written.
    """

    # maps each classifier class to the output column it is responsible for
    CLASSIFIERS = {
        MealPriceOutlierClassifier: 'meal_price_outlier',
        MonthlySubquotaLimitClassifier: 'over_monthly_subquota_limit',
        TraveledSpeedsClassifier: 'suspicious_traveled_speed_day',
        InvalidCnpjCpfClassifier: 'invalid_cnpj_cpf',
        ElectionExpensesClassifier: 'election_expenses',
        IrregularCompaniesClassifier: 'irregular_companies_classifier'
    }

    # columns copied to the output so each row can be traced back to a
    # specific reimbursement
    DATASET_KEYS = ['applicant_id', 'year', 'document_id']

    def __init__(self, dataset, data_path):
        self.dataset = dataset
        self.data_path = data_path
        self.irregularities = self.dataset[self.DATASET_KEYS].copy()

    def run_classifiers(self):
        """Train/load every classifier, collect its predictions and save
        the result as an LZMA compressed CSV."""
        for classifier, irregularity in self.CLASSIFIERS.items():
            model = self.load_trained_model(classifier)
            self.predict(model, irregularity)

        output = os.path.join(self.data_path, 'irregularities.xz')
        self.irregularities.to_csv(output,
                                   compression='xz',
                                   encoding='utf-8',
                                   index=False)

    def load_trained_model(self, classifier):
        """Return a fitted model for `classifier`, reusing a pickled copy
        from `data_path` when one exists (and saving one when it doesn't)."""
        filename = '{}.pkl'.format(classifier.__name__.lower())
        path = os.path.join(self.data_path, filename)

        # palliative since this classifier is outputting a model too large
        # to be loaded with joblib; compare the class name directly instead
        # of the derived filename string (same effect, less indirection)
        if classifier.__name__ == 'MonthlySubquotaLimitClassifier':
            model = classifier()
            model.fit(self.dataset)
        elif os.path.isfile(path):
            model = joblib.load(path)
        else:
            model = classifier()
            model.fit(self.dataset)
            joblib.dump(model, path)

        return model

    def predict(self, model, irregularity):
        """Run `model` over the dataset and store its verdict in the
        `irregularity` column of `self.irregularities`."""
        model.transform(self.dataset)
        y = model.predict(self.dataset)
        self.irregularities[irregularity] = y
        # outlier-style classifiers answer in sklearn's 1/-1 convention;
        # convert those to booleans. `np.int` is deprecated/removed in
        # modern NumPy, so test the dtype family instead.
        if np.issubdtype(y.dtype, np.integer):
            self.irregularities.loc[y == 1, irregularity] = False
            self.irregularities.loc[y == -1, irregularity] = True
from rosie.chamber_of_deputies import settings
from rosie.chamber_of_deputies.adapter import Adapter
from rosie.core import Core


def main(target_directory='/tmp/serenata-data'):
    """Entry point: run the Chamber of Deputies suspicion pipeline.

    Builds this module's adapter (which fetches and merges the datasets
    into `target_directory`) and hands it, together with this module's
    settings, to the generic core pipeline.

    :param target_directory: (str) where datasets, trained models and the
        output file are stored.
    """
    # The stale Dataset/ChamberOfDeputies calls were dropped: those names
    # are no longer imported by this module and would raise NameError; the
    # Core object now drives the whole pipeline through the adapter.
    adapter = Adapter(target_directory)
    core = Core(settings, adapter)
    core()
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,14 @@
from serenata_toolbox.datasets import fetch


class Dataset:
class Adapter:
COMPANIES_DATASET = '2016-09-03-companies.xz'

def __init__(self, path):
self.path = path

def get(self):
@property
def dataset(self):
self.update_datasets()
reimbursements = self.get_reimbursements()
companies = self.get_companies()
Expand Down
20 changes: 20 additions & 0 deletions rosie/chamber_of_deputies/settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from rosie.chamber_of_deputies.classifiers.election_expenses_classifier import ElectionExpensesClassifier
from rosie.chamber_of_deputies.classifiers.meal_price_outlier_classifier import MealPriceOutlierClassifier
from rosie.chamber_of_deputies.classifiers.monthly_subquota_limit_classifier import MonthlySubquotaLimitClassifier
from rosie.chamber_of_deputies.classifiers.traveled_speeds_classifier import TraveledSpeedsClassifier
from rosie.chamber_of_deputies.classifiers.irregular_companies_classifier import IrregularCompaniesClassifier
from rosie.core.classifiers.invalid_cnpj_cpf_classifier import InvalidCnpjCpfClassifier


# Maps a human readable (snake cased) suspicion name to the classifier
# class that detects it; the generic core pipeline iterates these pairs.
CLASSIFIERS = {
    'meal_price_outlier': MealPriceOutlierClassifier,
    'over_monthly_subquota_limit': MonthlySubquotaLimitClassifier,
    'suspicious_traveled_speed_day': TraveledSpeedsClassifier,
    'invalid_cnpj_cpf': InvalidCnpjCpfClassifier,
    'election_expenses': ElectionExpensesClassifier,
    'irregular_companies_classifier': IrregularCompaniesClassifier
}

# Columns that uniquely identify a row of this module's dataset.
# NOTE(review): rosie.core reads `settings.UNIQUE_IDS` — confirm whether
# this constant should be named UNIQUE_IDS instead of DATASET_KEYS.
DATASET_KEYS = ('applicant_id', 'year', 'document_id')

# Column holding the total net value of the transaction in each row.
VALUE = 'total_net_value'
70 changes: 62 additions & 8 deletions rosie/core/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,66 @@
import os.path
import numpy as np


class Core:
    """
    This is Rosie's core object: it implements a generic pipeline to collect
    data, clean and normalize it, analyze the data and output a dataset with
    suspicions. Its initialization takes a settings module and an adapter.

    The settings module should have three constants:

    * CLASSIFIERS (dict) with pairs of human readable name (snake case) for
      each classifier and the object (class) of the classifiers.
    * UNIQUE_IDS (str or iterable) with the column(s) that should be taken as
      unique identifiers of the main dataset of each module.
    * VALUE (str) with the column that should be taken as the total net value
      of the transaction represented by each row of the dataset.

    The adapter should be an object with:

    * A `dataset` property with the main dataset to be analyzed;
    * A `path` property with the path to the datasets (where the output will
      be saved).
    """

    def __init__(self, settings, adapter, data_path=None):
        """`data_path` is accepted for backward compatibility but ignored:
        the output directory always comes from `adapter.path` (callers such
        as this package's `main` instantiate Core with two arguments, so the
        parameter must not be required)."""
        self.settings = settings
        self.dataset = adapter.dataset
        self.data_path = adapter.path

        # Accept a single column name as well as an iterable of names.
        # Fix: the original tested the non-existent `UNIQUE_IDENTIFIERS`
        # attribute, which would raise AttributeError on every run.
        if isinstance(self.settings.UNIQUE_IDS, str):
            self.settings.UNIQUE_IDS = (self.settings.UNIQUE_IDS,)

        # list() so pandas treats the ids as a column selection even when
        # UNIQUE_IDS is a tuple
        self.suspicions = self.dataset[list(self.settings.UNIQUE_IDS)].copy()

    def __call__(self):
        """Run every configured classifier and save the suspicions as an
        LZMA compressed CSV in the adapter's path."""
        for name, classifier in self.settings.CLASSIFIERS.items():
            model = self.load_trained_model(classifier)
            self.predict(model, name)

        output = os.path.join(self.data_path, 'suspicions.xz')
        # fix: the original built kwargs with tuple syntax — a SyntaxError
        kwargs = dict(compression='xz', encoding='utf-8', index=False)
        self.suspicions.to_csv(output, **kwargs)

    def load_trained_model(self, classifier):
        """Return a fitted `classifier` instance, reusing a pickled model
        from disk whenever possible (and pickling a fresh one otherwise)."""
        # joblib comes from sklearn; imported lazily so this module can be
        # imported (and most of the pipeline unit-tested) without sklearn
        from sklearn.externals import joblib

        filename = '{}.pkl'.format(classifier.__name__.lower())
        path = os.path.join(self.data_path, filename)

        # palliative: this outputs a model too large for joblib
        if classifier.__name__ == 'MonthlySubquotaLimitClassifier':
            model = classifier()
            model.fit(self.dataset)
        elif os.path.isfile(path):
            model = joblib.load(path)
        else:
            model = classifier()
            model.fit(self.dataset)
            joblib.dump(model, path)

        return model

    def predict(self, model, name):
        """Store `model`'s predictions in the `name` column of
        `self.suspicions`."""
        model.transform(self.dataset)
        prediction = model.predict(self.dataset)
        # fixes: the original assigned to an undefined `suspicion` key,
        # misspelled `prediction`/`suspicions`, and used the deprecated
        # `np.int` for the dtype test
        self.suspicions[name] = prediction
        # outlier classifiers answer in sklearn's 1/-1 convention; map to
        # booleans so every suspicion column reads the same way
        if np.issubdtype(prediction.dtype, np.integer):
            self.suspicions.loc[prediction == 1, name] = False
            self.suspicions.loc[prediction == -1, name] = True

0 comments on commit 202b490

Please sign in to comment.