Merge branch 'master' into cuducos-docker

okfn-brasil · Dec 23, 2016 · 4b9f917
2 parents 22ef11d + 5ec072c
commit 4b9f917
Show file tree

Hide file tree

Showing 10 changed files with 41 additions and 16 deletions.
diff --git a/README.md b/README.md
@@ -6,16 +6,6 @@
 
 A Python application that reads receipts from the [Quota for Exercising Parliamentary Activity](https://github.com/datasciencebr/serenata-de-amor/blob/master/CONTRIBUTING.md#more-about-the-quota-for-exercising-parliamentary-activity-ceap) (aka CEAP) from the Brazilian Chamber of Deputies and outputs, for each of the receipts, a _probability of corruption_ and a list of reasons why it was considered this way.
 
-- [x] Fetch CEAP dataset from Chamber of Deputies
-- [x] Convert XML to CSV
-- [x] Translate CSVs to English
-- [x] Read CEAP dataset from CSV into Pandas
-- [ ] Process in the "corruption pipeline"
-    - [ ] Monthly limits - quota
-    - [x] Monthly limits - subquota
-    - [ ] Machine Learning models using scikit-learn
-- [ ] Task to push to Jarbas via API
-
 ## Running
 
 ### With Docker

diff --git a/rosie/dataset.py b/rosie/dataset.py
@@ -17,6 +17,7 @@ def get(self):
         reimbursements = self.get_reimbursements()
         companies = self.get_companies()
         return pd.merge(reimbursements, companies,
+                        how='left',
                         left_on='cnpj_cpf',
                         right_on='cnpj')
 
@@ -45,7 +46,7 @@ def get_companies(self):
         is_in_brazil = ('(-73.992222 < longitude < -34.7916667) & '
                         '(-33.742222 < latitude < 5.2722222)')
         dataset = pd.read_csv(os.path.join(self.path, self.COMPANIES_DATASET),
-                              dtype={'cnpj_cpf': np.str},
+                              dtype={'cnpj': np.str},
                               low_memory=False)
         dataset = dataset.query(is_in_brazil)
         dataset['cnpj'] = dataset['cnpj'].str.replace(r'\D', '')

diff --git a/rosie/traveled_speeds_classifier.py b/rosie/traveled_speeds_classifier.py
@@ -63,7 +63,7 @@ def __classify_dataset(self, X):
 
     def __applicable_rows(self, X):
         return (X['subquota_description'] == 'Congressperson meal') & \
-            X['congressperson_id'].notnull()
+            X[['congressperson_id', 'latitude', 'longitude']].notnull().all(axis=1)
 
     def __calculate_sum_distances(self, X):
         coordinate_list = X[['latitude', 'longitude']].values

diff --git a/tests/fixtures/companies.xz b/tests/fixtures/companies.xz
diff --git a/tests/meal_price_outlier_classifier.csv b/tests/fixtures/meal_price_outlier_classifier.csv
diff --git a/tests/fixtures/reimbursements.xz b/tests/fixtures/reimbursements.xz
diff --git a/tests/traveled_speeds_classifier.csv b/tests/fixtures/traveled_speeds_classifier.csv
@@ -10,6 +10,8 @@ applicant_id,congressperson_id,issue_date,subquota_description,cnpj_cpf,latitude
 999,999,2016-01-01,Congressperson meal,08378940000120,-9.9753770,-67.8248977
 999,999,2016-01-01,Flight ticket issue,08378940000120,-29.2310464,-51.1597365
 999,,2016-01-01,Congressperson meal,08378940000120,-9.9753770,-67.8248977
+999,999,2016-01-01,Congressperson meal,08378940000120,,-67.8248977
+999,999,2016-01-01,Congressperson meal,08378940000120,-9.9753770,
 999,999,2016-01-02,Congressperson meal,08378940000120,-9.9753770,-67.8248977
 999,999,2016-01-03,Congressperson meal,08378940000120,-9.9753770,-67.8248977
 999,999,2016-01-03,Congressperson meal,08378940000120,-9.9753770,-67.8248977

diff --git a/tests/test_dataset.py b/tests/test_dataset.py
@@ -0,0 +1,24 @@
+import os.path
+from tempfile import mkdtemp
+from unittest import TestCase
+from unittest.mock import patch
+from shutil import copy2
+
+from rosie import Dataset
+
+
+class TestDataset(TestCase):
+
+    def setUp(self):
+        temp_path = mkdtemp()
+        copy2('tests/fixtures/companies.xz',
+              os.path.join(temp_path, Dataset.COMPANIES_DATASET))
+        copy2('tests/fixtures/reimbursements.xz', temp_path)
+        self.subject = Dataset(temp_path)
+
+    @patch('rosie.dataset.CEAPDataset')
+    @patch('rosie.dataset.fetch')
+    def test_get_performs_a_left_merge_between_reimbursements_and_companies(self, _ceap_dataset, _fetch):
+        dataset = self.subject.get()
+        self.assertEqual(5, len(dataset))
+        self.assertEqual(1, dataset['legal_entity'].isnull().sum())
diff --git a/tests/test_meal_price_outlier_classifier.py b/tests/test_meal_price_outlier_classifier.py
@@ -11,7 +11,7 @@
 class TestMealPriceOutlierClassifier(TestCase):
 
     def setUp(self):
-        self.dataset = pd.read_csv('tests/meal_price_outlier_classifier.csv',
+        self.dataset = pd.read_csv('tests/fixtures/meal_price_outlier_classifier.csv',
                                    dtype={'cnpj_cpf': np.str})
         self.subject = MealPriceOutlierClassifier()
         self.subject.fit(self.dataset)

diff --git a/tests/test_traveled_speeds_classifier.py b/tests/test_traveled_speeds_classifier.py
@@ -11,7 +11,7 @@
 class TestTraveledSpeedsClassifier(TestCase):
 
     def setUp(self):
-        self.dataset = pd.read_csv('tests/traveled_speeds_classifier.csv',
+        self.dataset = pd.read_csv('tests/fixtures/traveled_speeds_classifier.csv',
                                    dtype={'cnpj_cpf': np.str})
         self.subject = TraveledSpeedsClassifier()
         self.subject.fit(self.dataset)
@@ -34,7 +34,7 @@ def test_predict_considers_meal_reimbursements_in_days_with_more_than_8_outliers
 
     def test_predict_considers_non_meal_reibursement_an_inlier(self):
         prediction = self.subject.predict(self.dataset)
-        self.assertEqual(1, prediction[12])
+        self.assertEqual(1, prediction[14])
 
     def test_predict_considers_non_meal_reibursement_an_inlier_even_when_more_than_8_meal_reimbursements(self):
         prediction = self.subject.predict(self.dataset)
@@ -44,11 +44,19 @@ def test_predict_considers_meal_reibursement_without_congressperson_id_an_inlier
         prediction = self.subject.predict(self.dataset)
         self.assertEqual(1, prediction[10])
 
+    def test_predict_considers_meal_reibursement_without_latitude_an_inlier_even_when_more_than_8_meal_reimbursements(self):
+        prediction = self.subject.predict(self.dataset)
+        self.assertEqual(1, prediction[11])
+
+    def test_predict_considers_meal_reibursement_without_longitude_an_inlier_even_when_more_than_8_meal_reimbursements(self):
+        prediction = self.subject.predict(self.dataset)
+        self.assertEqual(1, prediction[12])
+
     def test_predict_uses_learned_thresholds_from_fit_dataset(self):
         subject = TraveledSpeedsClassifier(contamination=.6)
         subject.fit(self.dataset)
         assert_array_equal(
-            np.repeat(-1, 6), subject.predict(self.dataset[11:17]))
+            np.repeat(-1, 6), subject.predict(self.dataset[13:19]))
 
     def test_predict_limits_the_number_of_outliers_with_contamination_param(self):
         subject = TraveledSpeedsClassifier(contamination=.5)