Skip to content
This repository has been archived by the owner on Mar 1, 2018. It is now read-only.

Commit

Permalink
Merge branch 'master' into cuducos-docker
Browse files Browse the repository at this point in the history
  • Loading branch information
cuducos committed Dec 23, 2016
2 parents 22ef11d + 5ec072c commit 4b9f917
Show file tree
Hide file tree
Showing 10 changed files with 41 additions and 16 deletions.
10 changes: 0 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,6 @@

A Python application that reads receipts from the [Quota for Exercising Parliamentary Activity](https://github.com/datasciencebr/serenata-de-amor/blob/master/CONTRIBUTING.md#more-about-the-quota-for-exercising-parliamentary-activity-ceap) (aka CEAP) of the Brazilian Chamber of Deputies and outputs, for each receipt, a _probability of corruption_ and a list of reasons why it was considered this way.

- [x] Fetch CEAP dataset from Chamber of Deputies
- [x] Convert XML to CSV
- [x] Translate CSVs to English
- [x] Read CEAP dataset from CSV into Pandas
- [ ] Process in the "corruption pipeline"
- [ ] Monthly limits - quota
- [x] Monthly limits - subquota
- [ ] Machine Learning models using scikit-learn
- [ ] Task to push to Jarbas via API

## Running

### With Docker
Expand Down
3 changes: 2 additions & 1 deletion rosie/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def get(self):
reimbursements = self.get_reimbursements()
companies = self.get_companies()
return pd.merge(reimbursements, companies,
how='left',
left_on='cnpj_cpf',
right_on='cnpj')

Expand Down Expand Up @@ -45,7 +46,7 @@ def get_companies(self):
is_in_brazil = ('(-73.992222 < longitude < -34.7916667) & '
'(-33.742222 < latitude < 5.2722222)')
dataset = pd.read_csv(os.path.join(self.path, self.COMPANIES_DATASET),
dtype={'cnpj_cpf': np.str},
dtype={'cnpj': np.str},
low_memory=False)
dataset = dataset.query(is_in_brazil)
dataset['cnpj'] = dataset['cnpj'].str.replace(r'\D', '')
Expand Down
2 changes: 1 addition & 1 deletion rosie/traveled_speeds_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def __classify_dataset(self, X):

def __applicable_rows(self, X):
return (X['subquota_description'] == 'Congressperson meal') & \
X['congressperson_id'].notnull()
X[['congressperson_id', 'latitude', 'longitude']].notnull().all(axis=1)

def __calculate_sum_distances(self, X):
coordinate_list = X[['latitude', 'longitude']].values
Expand Down
Binary file added tests/fixtures/companies.xz
Binary file not shown.
Binary file added tests/fixtures/reimbursements.xz
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ applicant_id,congressperson_id,issue_date,subquota_description,cnpj_cpf,latitude
999,999,2016-01-01,Congressperson meal,08378940000120,-9.9753770,-67.8248977
999,999,2016-01-01,Flight ticket issue,08378940000120,-29.2310464,-51.1597365
999,,2016-01-01,Congressperson meal,08378940000120,-9.9753770,-67.8248977
999,999,2016-01-01,Congressperson meal,08378940000120,,-67.8248977
999,999,2016-01-01,Congressperson meal,08378940000120,-9.9753770,
999,999,2016-01-02,Congressperson meal,08378940000120,-9.9753770,-67.8248977
999,999,2016-01-03,Congressperson meal,08378940000120,-9.9753770,-67.8248977
999,999,2016-01-03,Congressperson meal,08378940000120,-9.9753770,-67.8248977
Expand Down
24 changes: 24 additions & 0 deletions tests/test_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import os.path
from shutil import copy2, rmtree
from tempfile import mkdtemp
from unittest import TestCase
from unittest.mock import patch

from rosie import Dataset


class TestDataset(TestCase):
    """Check ``Dataset.get`` against the compressed fixture datasets."""

    def setUp(self):
        # Stage the fixtures in a throwaway directory and hand that directory
        # to Dataset. The cleanup is registered right after the directory is
        # created so it is removed even when a later setUp step fails.
        temp_path = mkdtemp()
        self.addCleanup(rmtree, temp_path, ignore_errors=True)
        copy2('tests/fixtures/companies.xz',
              os.path.join(temp_path, Dataset.COMPANIES_DATASET))
        copy2('tests/fixtures/reimbursements.xz', temp_path)
        self.subject = Dataset(temp_path)

    # Stacked ``patch`` decorators inject their mocks bottom-up: the mock for
    # ``fetch`` is passed first, then the one for ``CEAPDataset``.
    @patch('rosie.dataset.CEAPDataset')
    @patch('rosie.dataset.fetch')
    def test_get_performs_a_left_merge_between_reimbursements_and_companies(self, _fetch, _ceap_dataset):
        dataset = self.subject.get()
        # The fixtures are expected to produce 5 merged rows, exactly one of
        # which has no ``legal_entity`` value.
        self.assertEqual(5, len(dataset))
        self.assertEqual(1, dataset['legal_entity'].isnull().sum())
2 changes: 1 addition & 1 deletion tests/test_meal_price_outlier_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
class TestMealPriceOutlierClassifier(TestCase):

def setUp(self):
self.dataset = pd.read_csv('tests/meal_price_outlier_classifier.csv',
self.dataset = pd.read_csv('tests/fixtures/meal_price_outlier_classifier.csv',
dtype={'cnpj_cpf': np.str})
self.subject = MealPriceOutlierClassifier()
self.subject.fit(self.dataset)
Expand Down
14 changes: 11 additions & 3 deletions tests/test_traveled_speeds_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
class TestTraveledSpeedsClassifier(TestCase):

def setUp(self):
self.dataset = pd.read_csv('tests/traveled_speeds_classifier.csv',
self.dataset = pd.read_csv('tests/fixtures/traveled_speeds_classifier.csv',
dtype={'cnpj_cpf': np.str})
self.subject = TraveledSpeedsClassifier()
self.subject.fit(self.dataset)
Expand All @@ -34,7 +34,7 @@ def test_predict_considers_meal_reimbursements_in_days_with_more_than_8_outliers

def test_predict_considers_non_meal_reibursement_an_inlier(self):
prediction = self.subject.predict(self.dataset)
self.assertEqual(1, prediction[12])
self.assertEqual(1, prediction[14])

def test_predict_considers_non_meal_reibursement_an_inlier_even_when_more_than_8_meal_reimbursements(self):
prediction = self.subject.predict(self.dataset)
Expand All @@ -44,11 +44,19 @@ def test_predict_considers_meal_reibursement_without_congressperson_id_an_inlier
prediction = self.subject.predict(self.dataset)
self.assertEqual(1, prediction[10])

def test_predict_considers_meal_reibursement_without_latitude_an_inlier_even_when_more_than_8_meal_reimbursements(self):
prediction = self.subject.predict(self.dataset)
self.assertEqual(1, prediction[11])

def test_predict_considers_meal_reibursement_without_longitude_an_inlier_even_when_more_than_8_meal_reimbursements(self):
prediction = self.subject.predict(self.dataset)
self.assertEqual(1, prediction[12])

def test_predict_uses_learned_thresholds_from_fit_dataset(self):
subject = TraveledSpeedsClassifier(contamination=.6)
subject.fit(self.dataset)
assert_array_equal(
np.repeat(-1, 6), subject.predict(self.dataset[11:17]))
np.repeat(-1, 6), subject.predict(self.dataset[13:19]))

def test_predict_limits_the_number_of_outliers_with_contamination_param(self):
subject = TraveledSpeedsClassifier(contamination=.5)
Expand Down

0 comments on commit 4b9f917

Please sign in to comment.