Skip to content
This repository has been archived by the owner on Mar 1, 2018. It is now read-only.

Commit

Permalink
First try at ranking suspicions
Browse files Browse the repository at this point in the history
  • Loading branch information
jtemporal committed Aug 30, 2017
1 parent ebbf7e9 commit 77813b2
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 0 deletions.
Empty file added lead_scoring/__init__.py
Empty file.
74 changes: 74 additions & 0 deletions lead_scoring/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from collections import OrderedDict
import math
import os.path

import pandas as pd
import numpy as np

# Ordered mapping from dataset column name to its Portuguese display label.
# The key order is significant: display() selects exactly these columns, in
# this order, for its output.
DISPLAY_KEYS = OrderedDict([
    ('url', 'URL'),
    ('issue_date', 'Data do gasto'),
    ('congressperson_name', 'Deputado'),
    ('total_net_value', 'Valor'),
    ('meal_price_outlier', 'Preço de refeição suspeito?'),
    ('over_monthly_subquota_limit', 'Acima da subcota?'),
    ('suspicious_traveled_speed_day', 'Distância viajada suspeita?'),
    ('invalid_cnpj_cpf', 'CNPJ ou CPF inválido?'),
    ('election_expenses', 'É gasto de eleição?'),
    ('irregular_companies_classifier', 'Empresa irregular?'),
    ('has_receipt', 'Tem recibo?'),
    ('is_in_office', 'Em mandato?'),
    ('year', 'Ano'),
    ('document_id', 'ID'),
    ('applicant_id', 'ID Deputado'),
])


def display(dataset):
    """Return a presentation-ready copy of ``dataset``.

    The input frame is not modified. In the copy:

    * ``issue_date`` is truncated to its first 10 characters
      (presumably the ``YYYY-MM-DD`` part of a timestamp string -- confirm
      against the data source);
    * a ``url`` column is built from ``document_id``;
    * only the columns named in ``DISPLAY_KEYS`` are kept, in that order.
    """
    url_template = 'https://jarbas.datasciencebr.com/#/documentId/{}'

    view = dataset.copy()
    view['issue_date'] = view['issue_date'].str.slice(stop=10)
    view['url'] = view['document_id'].apply(url_template.format)

    columns = list(DISPLAY_KEYS)
    return view[columns]

def _display_percentage(values):
return '{0:.2f}%'.format(values * 100)

def ranking():
    """Build the ranked, display-ready table of suspicious reimbursements.

    Steps:

    1. load the suspicions joined with reimbursement data;
    2. attach the per-applicant ``is_in_office`` flag;
    3. derive ``has_receipt`` as ``year > 2011``;
    4. sort so rows with ``is_in_office`` and ``has_receipt`` set come first;
    5. drop rows considered part of the same case (the first row in sorted
       order is the one kept);
    6. format the result through ``display``.
    """
    suspicions = _irregularities()
    ranked = suspicions.merge(_is_in_office(suspicions))
    ranked['has_receipt'] = ranked['year'].gt(2011)
    ranked = ranked.sort_values(by=['is_in_office', 'has_receipt'],
                                ascending=[False, False])
    # De-duplication happens in place; the helper hands back the same frame.
    ranked = remove_receipts_from_same_case(ranked)
    return display(ranked)

def remove_receipts_from_same_case(data):
    """Drop rows that repeat an already-seen case, mutating ``data`` in place.

    Two passes are made, each keeping the first row for every distinct
    combination of the listed columns:

    * applicant + issue date + ``suspicious_traveled_speed_day``;
    * applicant + month + ``over_monthly_subquota_limit``.

    The same (mutated) frame is returned for convenience.
    """
    case_key_groups = (
        # One row per applicant, day and traveled-speed flag value.
        ['applicant_id', 'issue_date', 'suspicious_traveled_speed_day'],
        # One row per applicant, month and subquota-limit flag value.
        ['applicant_id', 'month', 'over_monthly_subquota_limit'],
    )
    for case_keys in case_key_groups:
        data.drop_duplicates(subset=case_keys, inplace=True)
    return data

def _is_in_office(data):
return data \
.groupby('applicant_id') \
.apply(lambda x: x['year'].max() >= 2016) \
.reset_index() \
.rename(columns={0: 'is_in_office'})


def _irregularities():
data = pd.read_csv('suspicions.xz'),
low_memory=False)
is_valid_suspicion = data.select_dtypes(include=[np.bool]).any(axis=1)
data = data[is_valid_suspicion]
reimbursements = pd.read_csv('reimbursements.xz'),
low_memory=False)
reimbursements = reimbursements.query('congressperson_id.notnull()')
return pd.merge(data, reimbursements)

0 comments on commit 77813b2

Please sign in to comment.