<a href="https://colab.research.google.com/github/ggarciabas/ml_engineer/blob/dspipe/pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pipeline de Fraude

- [sklearn pipes](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html)

In [4]:
from sklearn.pipeline import Pipeline

## Leitura dos dados e isolamento (treino, teste e validação)

### Funções

In [5]:
def plot_fraudes_data (pdM, start=None, end=None)->tuple:
  """
    Plot the daily number of fraudulent and genuine transactions.

    pdM: dataframe with the columns 'fecha_d' (transaction date), 'fraude' (0/1 label) and 'a'
    start: optional inclusive lower bound applied to 'fecha_d'
    end: optional inclusive upper bound applied to 'fecha_d'

    Returns the tuple (total frauds, total genuine) for the selected period.
  """
  pdf = pdM.copy()
  # Each bound is applied on its own, so an open-ended range (only start or only end) also works
  if start is not None:
    pdf = pdf[pdf['fecha_d']>=start]
  if end is not None:
    pdf = pdf[pdf['fecha_d']<=end]
  # Rows per (day, label); column 'a' is only used as the column being counted
  pdf = pdf[['fecha_d', 'fraude','a']].groupby(['fecha_d', 'fraude']).count().reset_index()
  # One row per day, one column per label value (0 = genuine, 1 = fraud)
  pdf = pdf.pivot(index="fecha_d", columns=["fraude"], values="a")
  print (f"Fraude {pdf[1].mean()} avg.\tGenuine: {pdf[0].mean()} avg.")
  pdf.plot()
  return pdf[1].sum(),pdf[0].sum()

### Leitura dos dados

In [6]:
import pandas as pd
# Build the initial DataFrame from a publicly shared Google Drive CSV
url='https://drive.google.com/file/d/1dRDvoSOtdtsgOG65UVKLTBlzejg_cX4P/view?usp=sharing' 
# The file id is the second-to-last path segment of the sharing link
# https://stackoverflow.com/questions/56611698/pandas-how-to-read-csv-file-from-google-drive-public
url2='https://drive.google.com/uc?export=download&id=' + url.rsplit('/', 2)[1]
pd_dados = pd.read_csv(url2)

In [7]:
# Keep the first 10 characters of each 'fecha' value, i.e. the date without the time
pd_dados['fecha_d'] = pd_dados['fecha'].map(lambda raw_ts: raw_ts[:10])

In [8]:
# Convert both timestamp columns to pandas datetimes
for date_col in ('fecha', 'fecha_d'):
    pd_dados[date_col] = pd.to_datetime(pd_dados[date_col])

### Feature store

Conhecimento de risco para categoria do produto e país.

Essa informação precisa ser recalculada periodicamente.

In [9]:
import math
def calc_risk (pdf, feat, delay=7, janelas=(1, 7)):
    """
      Compute the fraud risk of a feature over rolling time windows.

      For every row and every window size in `janelas`, two columns are added:
        - `{feat}_trx_{janela}`: number of transactions inside the window
        - `{feat}_risk_{janela}`: share of those transactions that were fraud
          (0 when the window holds no transaction)
      Each window covers the `janela` days that come right before the most
      recent `delay` days, because the most recent `delay` days are subtracted.

      pdf: dataframe already filtered to the feature value being evaluated;
           needs a datetime 'fecha_d' column and a 'fraude' column
      feat: prefix used to name the generated columns
      delay: number of days it takes for a fraud to be identified
      janelas: window sizes in days (1 and 7 by default, since there is little data)

      Returns a new dataframe sorted by 'fecha_d'; the input is not modified.
    """
    pdf = pdf.sort_values('fecha_d')
    pdf = pdf.set_index('fecha_d')
    label = pdf['fraude']
    # fraud count and transaction count inside the delay period
    fraudes = label.rolling(f'{delay}D').sum()
    trxS = label.rolling(f'{delay}D').count()
    for janela in janelas:
      fraudeJ = label.rolling(f'{delay+janela}D').sum()
      trxJ = label.rolling(f'{delay+janela}D').count()
      # subtracting the delay period leaves only the `janela` days before it
      trx_janela = trxJ-trxS
      # 0/0 (no transaction in the window) yields NaN, which is reported as risk 0
      riskJ = ((fraudeJ-fraudes)/trx_janela).fillna(0)
      # positional assignment: the date index may contain repeated values
      pdf[f'{feat}_trx_{janela}'] = trx_janela.to_numpy()
      pdf[f'{feat}_risk_{janela}'] = riskJ.to_numpy()
    pdf = pdf.reset_index()
    return pdf

In [10]:
# Work on an independent copy so the raw transactions stay untouched
feature_store = pd_dados.copy(deep=True)

In [None]:
# Compute the risk columns separately for every value of 'j'.
# The groups are iterated explicitly instead of using groupby.apply: recent pandas
# versions deprecate passing the grouping column to the applied function, and the
# 'j' column must be present in the result.
feature_store = (
    pd.concat([calc_risk(grupo, 'j') for _, grupo in feature_store.groupby('j')])
    .sort_values('fecha_d')
    .reset_index(drop=True)
)

In [None]:
# Compute the risk columns separately for every value of 'g'.
# The groups are iterated explicitly instead of using groupby.apply: recent pandas
# versions deprecate passing the grouping column to the applied function, and the
# 'g' column must be present in the result.
feature_store = (
    pd.concat([calc_risk(grupo, 'g') for _, grupo in feature_store.groupby('g')])
    .sort_values('fecha_d')
    .reset_index(drop=True)
)

In [None]:
# Persist the date, the 'g' value and its risk columns
feature_store.to_csv(
    'feature_store_g.csv',
    columns=['fecha_d', 'g', 'g_trx_1', 'g_risk_1', 'g_trx_7', 'g_risk_7'],
)

In [None]:
# Persist the date, the 'j' value and its risk columns
feature_store.to_csv(
    'feature_store_j.csv',
    columns=['fecha_d', 'j', 'j_trx_1', 'j_risk_1', 'j_trx_7', 'j_risk_7'],
)

### Divide dados

- Treino: `2020-03-22` até `2020-03-28`
- Delay: `2020-03-29` até `2020-04-04`
- Teste: `2020-04-05` até `2020-04-11`

In [None]:
# Training window (both ends inclusive)
dataTrain1 = dict(start='2020-03-22', end='2020-03-28')
pdfTrain1 = pd_dados.copy()
pdfTrain1 = pdfTrain1[pdfTrain1['fecha_d'].between(dataTrain1['start'], dataTrain1['end'])]
print (pdfTrain1.shape)
# Daily fraud/genuine plot and the totals for the same window
fraude,genuine = plot_fraudes_data(pd_dados, **dataTrain1)
print (f"Fraude: {fraude} ({fraude/pdfTrain1.shape[0]})\tGenuine: {genuine} ({genuine/pdfTrain1.shape[0]})")

In [None]:
# Delay window between training and test (both ends inclusive)
dataDelay1 = dict(start='2020-03-29', end='2020-04-04')
plot_fraudes_data(pd_dados, **dataDelay1)

In [None]:
# Test window (both ends inclusive)
dataTest1 = dict(start='2020-04-05', end='2020-04-11')
pdfTest1 = pd_dados.copy()
pdfTest1 = pdfTest1[pdfTest1['fecha_d'].between(dataTest1['start'], dataTest1['end'])]
print (pdfTest1.shape)
# Daily fraud/genuine plot and the totals for the same window
fraude,genuine = plot_fraudes_data(pd_dados, **dataTest1)
print (f"Fraude: {fraude} ({fraude/pdfTest1.shape[0]})\tGenuine: {genuine} ({genuine/pdfTest1.shape[0]})")

## Classes

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
## Transformer that adds the risk of a feature
import math
class RiskTransformer(BaseEstimator, TransformerMixin):
  """
    Adds the risk columns of a feature to X by looking them up in the feature store.

    feature_name: name of the column whose risk is looked up (e.g. 'g')
    feat_store: dataframe holding the key columns 'fecha_d' and `feature_name`
                plus the risk columns to be added
  """
  def __init__(self, feature_name, feat_store):
    self.feature_name = feature_name
    self.feat_store = feat_store

  def fit(self, X, y = None):
    # Nothing is learned from the data
    return self

  def transform(self, X, y = None):
    """
      Left-join the feature store onto X using 'fecha_d' and the feature column as keys.

      The result keeps exactly one row per row of X: the store is reduced to one row
      per key before the join (the first row found for each key is used).
    """
    keys = ['fecha_d', self.feature_name]
    store = self.feat_store
    # Never bring back a column X already has, otherwise merge renames both to *_x / *_y
    extra = [c for c in store.columns if c not in keys and c not in X.columns]
    # Prefer the columns that belong to this feature (named '<feature_name>_...');
    # fall back to every new column when the store does not follow that naming
    prefix = f'{self.feature_name}_'
    own = [c for c in extra if str(c).startswith(prefix)]
    lookup = store[keys + (own or extra)]
    # A store loaded from CSV has text dates; align them with a datetime 'fecha_d' in X
    if pd.api.types.is_datetime64_any_dtype(X['fecha_d']):
      lookup = lookup.assign(fecha_d=pd.to_datetime(lookup['fecha_d']))
    # Repeated keys in the store would duplicate rows of X in a left join
    lookup = lookup.drop_duplicates(subset=keys)
    return X.merge(lookup, on=keys, how='left')

In [None]:
# Transformer that flags weekend operations
class IsWeekendTransformer(BaseEstimator, TransformerMixin):
  """
    Flags whether each operation happened on a weekend.

    feature_date: name of the date column to inspect
  """
  def __init__(self, feature_date):
    self.feature_date = feature_date

  def fit(self, X, y = None):
    # Nothing is learned from the data
    return self

  def transform(self, X, y = None):
    # NOTE(review): only the 0/1 flag Series is returned, not X with an extra
    # column - confirm this is what the downstream steps expect.
    def _weekend_flag(moment):
      # weekday() >= 5 means Saturday or Sunday for date/datetime values
      if moment.weekday()>=5:
        return 1
      return 0

    dates = X.copy()[self.feature_date]
    return dates.apply(_weekend_flag)

## Versão tratamento manual

In [None]:
# Load the feature store.
# 'fecha_d' is parsed as a date so it can be merged with the datetime 'fecha_d'
# column of the transactions (a text column cannot be merged with a datetime one).
# NOTE(review): the cells above write 'feature_store_g.csv' and 'feature_store_j.csv';
# confirm that 'feature_store.csv' is produced elsewhere.
feat_store = pd.read_csv('feature_store.csv', parse_dates=['fecha_d'])

In [None]:
# Manual feature list (kept for reference)
# feat = ['a', 'b', 'c', 'd', 'e', 'f', 'h', 'k', 'l',
#         'm', 'n',  'monto', 'weekend', 'night', 'cat_trx_1', 'cat_risk_1', 'cat_trx_7',
#         'cat_risk_7', 'pais_trx_1', 'pais_risk_1', 'pais_trx_7', 'pais_risk_7',
#         'miss_o', 'is_p']
# The label to predict is the fraud flag, not the transaction timestamp ('fecha')
target = 'fraude'
# cols = feat+[target]+['fecha_d']

In [None]:
# One feature-store lookup step per feature, applied in this order
p_manual = Pipeline(steps=[
    ('risk_g', RiskTransformer('g', feat_store)),
    ('risk_j', RiskTransformer('j', feat_store)),
])

In [None]:
# Fit every step on the training window; the target column is passed as y
p_manual.fit(X=pdfTrain1, y=pdfTrain1[target])
#p_manual.score(X_test, y_test)

In [None]:
# Run the training window through every step of the pipeline
output = p_manual.transform(pdfTrain1)

In [None]:
# Display the result of the pipeline
output