Skip to content
This repository has been archived by the owner on Mar 1, 2018. It is now read-only.

Commit

Permalink
Merge 054ddac into 40af34e
Browse files Browse the repository at this point in the history
  • Loading branch information
cuducos committed Apr 18, 2017
2 parents 40af34e + 054ddac commit 202b490
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 76 deletions.
72 changes: 6 additions & 66 deletions rosie/chamber_of_deputies/__init__.py
Original file line number Diff line number Diff line change
@@ -1,69 +1,9 @@
import os.path

import numpy as np
from sklearn.externals import joblib

from rosie.chamber_of_deputies.dataset import Dataset
from rosie.chamber_of_deputies.classifiers.election_expenses_classifier import ElectionExpensesClassifier
from rosie.core.classifiers.invalid_cnpj_cpf_classifier import InvalidCnpjCpfClassifier
from rosie.chamber_of_deputies.classifiers.meal_price_outlier_classifier import MealPriceOutlierClassifier
from rosie.chamber_of_deputies.classifiers.monthly_subquota_limit_classifier import MonthlySubquotaLimitClassifier
from rosie.chamber_of_deputies.classifiers.traveled_speeds_classifier import TraveledSpeedsClassifier
from rosie.chamber_of_deputies.classifiers.irregular_companies_classifier import IrregularCompaniesClassifier


class ChamberOfDeputies:
    """Run every suspicion classifier over the Chamber of Deputies
    reimbursements and persist the verdicts as `irregularities.xz`.

    :param dataset: (pandas.DataFrame) reimbursements to analyze; must
        contain at least the DATASET_KEYS columns.
    :param data_path: (str) directory where trained models are cached and
        where the output CSV is written.
    """

    # maps each classifier class to the output column it is responsible for
    CLASSIFIERS = {
        MealPriceOutlierClassifier: 'meal_price_outlier',
        MonthlySubquotaLimitClassifier: 'over_monthly_subquota_limit',
        TraveledSpeedsClassifier: 'suspicious_traveled_speed_day',
        InvalidCnpjCpfClassifier: 'invalid_cnpj_cpf',
        ElectionExpensesClassifier: 'election_expenses',
        IrregularCompaniesClassifier: 'irregular_companies_classifier'
    }

    # columns copied to the output so each row can be traced back to a
    # specific reimbursement
    DATASET_KEYS = ['applicant_id', 'year', 'document_id']

    def __init__(self, dataset, data_path):
        self.dataset = dataset
        self.data_path = data_path
        self.irregularities = self.dataset[self.DATASET_KEYS].copy()

    def run_classifiers(self):
        """Train/load every classifier, collect its predictions and save
        the result as an LZMA compressed CSV."""
        for classifier, irregularity in self.CLASSIFIERS.items():
            model = self.load_trained_model(classifier)
            self.predict(model, irregularity)

        output = os.path.join(self.data_path, 'irregularities.xz')
        self.irregularities.to_csv(output,
                                   compression='xz',
                                   encoding='utf-8',
                                   index=False)

    def load_trained_model(self, classifier):
        """Return a fitted model for `classifier`, reusing a pickled copy
        from `data_path` when one exists (and saving one when it doesn't)."""
        filename = '{}.pkl'.format(classifier.__name__.lower())
        path = os.path.join(self.data_path, filename)

        # palliative since this classifier is outputting a model too large
        # to be loaded with joblib; compare the class name directly instead
        # of the derived filename string (same effect, less indirection)
        if classifier.__name__ == 'MonthlySubquotaLimitClassifier':
            model = classifier()
            model.fit(self.dataset)
        elif os.path.isfile(path):
            model = joblib.load(path)
        else:
            model = classifier()
            model.fit(self.dataset)
            joblib.dump(model, path)

        return model

    def predict(self, model, irregularity):
        """Run `model` over the dataset and store its verdict in the
        `irregularity` column of `self.irregularities`."""
        model.transform(self.dataset)
        y = model.predict(self.dataset)
        self.irregularities[irregularity] = y
        # outlier-style classifiers answer in sklearn's 1/-1 convention;
        # convert those to booleans. `np.int` is deprecated/removed in
        # modern NumPy, so test the dtype family instead.
        if np.issubdtype(y.dtype, np.integer):
            self.irregularities.loc[y == 1, irregularity] = False
            self.irregularities.loc[y == -1, irregularity] = True
from rosie.chamber_of_deputies import settings
from rosie.chamber_of_deputies.adapter import Adapter
from rosie.core import Core


def main(target_directory='/tmp/serenata-data'):
    """Entry point: run the Chamber of Deputies suspicion pipeline.

    Builds this module's adapter (which fetches and merges the datasets
    into `target_directory`) and hands it, together with this module's
    settings, to the generic core pipeline.

    :param target_directory: (str) where datasets, trained models and the
        output file are stored.
    """
    # The stale Dataset/ChamberOfDeputies calls were dropped: those names
    # are no longer imported by this module and would raise NameError; the
    # Core object now drives the whole pipeline through the adapter.
    adapter = Adapter(target_directory)
    core = Core(settings, adapter)
    core()
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,14 @@
from serenata_toolbox.datasets import fetch


class Dataset:
class Adapter:
COMPANIES_DATASET = '2016-09-03-companies.xz'

def __init__(self, path):
self.path = path

def get(self):
@property
def dataset(self):
self.update_datasets()
reimbursements = self.get_reimbursements()
companies = self.get_companies()
Expand Down
20 changes: 20 additions & 0 deletions rosie/chamber_of_deputies/settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from rosie.chamber_of_deputies.classifiers.election_expenses_classifier import ElectionExpensesClassifier
from rosie.chamber_of_deputies.classifiers.meal_price_outlier_classifier import MealPriceOutlierClassifier
from rosie.chamber_of_deputies.classifiers.monthly_subquota_limit_classifier import MonthlySubquotaLimitClassifier
from rosie.chamber_of_deputies.classifiers.traveled_speeds_classifier import TraveledSpeedsClassifier
from rosie.chamber_of_deputies.classifiers.irregular_companies_classifier import IrregularCompaniesClassifier
from rosie.core.classifiers.invalid_cnpj_cpf_classifier import InvalidCnpjCpfClassifier


# Maps a human readable (snake cased) suspicion name to the classifier
# class that detects it; the generic core pipeline iterates these pairs.
CLASSIFIERS = {
    'meal_price_outlier': MealPriceOutlierClassifier,
    'over_monthly_subquota_limit': MonthlySubquotaLimitClassifier,
    'suspicious_traveled_speed_day': TraveledSpeedsClassifier,
    'invalid_cnpj_cpf': InvalidCnpjCpfClassifier,
    'election_expenses': ElectionExpensesClassifier,
    'irregular_companies_classifier': IrregularCompaniesClassifier
}

# Columns that uniquely identify a row of this module's dataset.
# NOTE(review): rosie.core reads `settings.UNIQUE_IDS` — confirm whether
# this constant should be named UNIQUE_IDS instead of DATASET_KEYS.
DATASET_KEYS = ('applicant_id', 'year', 'document_id')

# Column holding the total net value of the transaction in each row.
VALUE = 'total_net_value'
70 changes: 62 additions & 8 deletions rosie/core/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,66 @@
import os.path
import numpy as np


class Core:
    """
    This is Rosie's core object: it implements a generic pipeline to collect
    data, clean and normalize it, analyze the data and output a dataset with
    suspicions. Its initialization takes a settings module and an adapter.

    The settings module should have three constants:

    * CLASSIFIERS (dict) with pairs of human readable name (snake case) for
      each classifier and the object (class) of the classifiers.
    * UNIQUE_IDS (str or iterable) with the column(s) that should be taken as
      unique identifiers of the main dataset of each module.
    * VALUE (str) with the column that should be taken as the total net value
      of the transaction represented by each row of the dataset.

    The adapter should be an object with:

    * A `dataset` property with the main dataset to be analyzed;
    * A `path` property with the path to the datasets (where the output will
      be saved).
    """

    def __init__(self, settings, adapter, data_path=None):
        """`data_path` is accepted for backward compatibility but ignored:
        the output directory always comes from `adapter.path` (callers such
        as this package's `main` instantiate Core with two arguments, so the
        parameter must not be required)."""
        self.settings = settings
        self.dataset = adapter.dataset
        self.data_path = adapter.path

        # Accept a single column name as well as an iterable of names.
        # Fix: the original tested the non-existent `UNIQUE_IDENTIFIERS`
        # attribute, which would raise AttributeError on every run.
        if isinstance(self.settings.UNIQUE_IDS, str):
            self.settings.UNIQUE_IDS = (self.settings.UNIQUE_IDS,)

        # list() so pandas treats the ids as a column selection even when
        # UNIQUE_IDS is a tuple
        self.suspicions = self.dataset[list(self.settings.UNIQUE_IDS)].copy()

    def __call__(self):
        """Run every configured classifier and save the suspicions as an
        LZMA compressed CSV in the adapter's path."""
        for name, classifier in self.settings.CLASSIFIERS.items():
            model = self.load_trained_model(classifier)
            self.predict(model, name)

        output = os.path.join(self.data_path, 'suspicions.xz')
        # fix: the original built kwargs with tuple syntax — a SyntaxError
        kwargs = dict(compression='xz', encoding='utf-8', index=False)
        self.suspicions.to_csv(output, **kwargs)

    def load_trained_model(self, classifier):
        """Return a fitted `classifier` instance, reusing a pickled model
        from disk whenever possible (and pickling a fresh one otherwise)."""
        # joblib comes from sklearn; imported lazily so this module can be
        # imported (and most of the pipeline unit-tested) without sklearn
        from sklearn.externals import joblib

        filename = '{}.pkl'.format(classifier.__name__.lower())
        path = os.path.join(self.data_path, filename)

        # palliative: this outputs a model too large for joblib
        if classifier.__name__ == 'MonthlySubquotaLimitClassifier':
            model = classifier()
            model.fit(self.dataset)
        elif os.path.isfile(path):
            model = joblib.load(path)
        else:
            model = classifier()
            model.fit(self.dataset)
            joblib.dump(model, path)

        return model

    def predict(self, model, name):
        """Store `model`'s predictions in the `name` column of
        `self.suspicions`."""
        model.transform(self.dataset)
        prediction = model.predict(self.dataset)
        # fixes: the original assigned to an undefined `suspicion` key,
        # misspelled `prediction`/`suspicions`, and used the deprecated
        # `np.int` for the dtype test
        self.suspicions[name] = prediction
        # outlier classifiers answer in sklearn's 1/-1 convention; map to
        # booleans so every suspicion column reads the same way
        if np.issubdtype(prediction.dtype, np.integer):
            self.suspicions.loc[prediction == 1, name] = False
            self.suspicions.loc[prediction == -1, name] = True

0 comments on commit 202b490

Please sign in to comment.