# CUNEF MUCD 2022/2023
## Machine Learning
## Análisis de Fraude

### Autores:
- Gozde Yazganoglu
- Irma Sanchez


# Importación de las librerías

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from collections import Counter
from sklearn import metrics
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import pickle

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, \
                            silhouette_score, recall_score, precision_score, make_scorer, \
                            roc_auc_score, f1_score, precision_recall_curve

En la parte de EDA hemos tratado los datos nulos y comentado la situación de los outliers. En esta sección vamos a continuar con la selección de variables y los pipelines.

# Selección de las Variables y Creación del Preprocesador

Hemos guardado los datos en formato 'parquet' para que usen menos memoria y se procesen más rápido.

In [6]:
# Load the processed payments dataset from its parquet file.
data = pd.read_parquet('../data/processed/dataset_payments_fraud_processed.parquet')
# Report the number of rows and columns that were read.
print("El dataset está compuesto por", data.shape[0], "filas y", data.shape[1], "columnas")
# Preview the first rows as the cell output.
data.head()

El dataset está compuesto por 1048575 filas y 19 columnas


Unnamed: 0,step,type,amount,gender,device,connection_time,nameOrig,race,oldbalanceOrg,age,newbalanceOrig,zone,user_number,nameDest,user_connections,security_alert,oldbalanceDest,newbalanceDest,isFraud
0,1,PAYMENT,9839.64,man,mac,0.140039,C1231006815,black,170136.0,85,160296.36,capital,138,M1979787155,5,1,0.0,0.0,0
1,1,PAYMENT,1864.28,woman,mac,0.49689,C1666544295,asian,21249.0,57,19384.72,country,909,M2044282225,1,0,0.0,0.0,0
2,1,TRANSFER,181.0,man,pc,0.78115,C1305486145,asian,181.0,66,0.0,capital,2569,C553264065,10,0,0.0,0.0,1
3,1,CASH_OUT,181.0,man,mac,0.565068,C840083671,black,181.0,31,0.0,country,1787,C38997010,3,0,21182.0,0.0,1
4,1,PAYMENT,11668.14,unknow,mac,0.517114,C2048537720,black,41554.0,90,29885.86,country,3997,M1230701703,8,0,0.0,0.0,0


Para poder trabajar con pipelines, clasificamos las variables por tipo. Primero agrupamos las variables numéricas.

In [7]:
# Numeric view of the data: drop the two name columns, the categorical
# columns and the isFraud target so that only the remaining columns are kept.
data_numeric = data.drop(
    columns=[
        'nameDest', 'nameOrig',
        'type', 'gender', 'race', 'zone', 'device',
        'isFraud',
    ]
)
# Report the size of the numeric view and preview its first rows.
print("El dataset está compuesto por", data_numeric.shape[0], "filas y", data_numeric.shape[1], "columnas")
data_numeric.head()



El dataset está compuesto por 1048575 filas y 11 columnas


Unnamed: 0,step,amount,connection_time,oldbalanceOrg,age,newbalanceOrig,user_number,user_connections,security_alert,oldbalanceDest,newbalanceDest
0,1,9839.64,0.140039,170136.0,85,160296.36,138,5,1,0.0,0.0
1,1,1864.28,0.49689,21249.0,57,19384.72,909,1,0,0.0,0.0
2,1,181.0,0.78115,181.0,66,0.0,2569,10,0,0.0,0.0
3,1,181.0,0.565068,181.0,31,0.0,1787,3,0,21182.0,0.0
4,1,11668.14,0.517114,41554.0,90,29885.86,3997,8,0,0.0,0.0


In [8]:
# Categorical view of the data: keep only the two name columns and the
# five categorical columns, in the order listed here.
data_cat = data.filter(
    items=[
        'nameDest', 'nameOrig',
        'type', 'gender', 'race', 'zone', 'device',
    ]
)
# Report the size of the categorical view and preview its first rows.
print("El dataset está compuesto por", data_cat.shape[0], "filas y", data_cat.shape[1], "columnas")
data_cat.head()

El dataset está compuesto por 1048575 filas y 7 columnas


Unnamed: 0,nameDest,nameOrig,type,gender,race,zone,device
0,M1979787155,C1231006815,PAYMENT,man,black,capital,mac
1,M2044282225,C1666544295,PAYMENT,woman,asian,country,mac
2,C553264065,C1305486145,TRANSFER,man,asian,capital,pc
3,C38997010,C840083671,CASH_OUT,man,black,country,mac
4,M1230701703,C2048537720,PAYMENT,unknow,black,country,mac


Ahora seguimos con las variables categóricas. Podemos hacer target encoding o one-hot encoding; esta decisión depende de cuántos valores únicos tenga cada variable. También tenemos variables de tipo identificador de cliente (nameDest, nameOrig) que, como comentamos antes, no nos pueden dar mucha información.

El resto de las variables categóricas no tienen muchos valores únicos, por lo que se pueden transformar con dummies (one-hot encoding). Podemos transformar las columnas utilizando un pipeline.

# Pipelines

In [20]:
# Names of the categorical columns handled by the one-hot branch of the preprocessor.
list_cat = [
    'type',
    'gender',
    'race',
    'zone',
    'device',
]

In [21]:
# Names of the numeric columns handled by the numeric branch of the preprocessor.
list_num = [
    'step',
    'amount',
    'connection_time',
    'oldbalanceOrg',
    'age',
    'newbalanceOrig',
    'user_number',
    'user_connections',
    'security_alert',
    'oldbalanceDest',
    'newbalanceDest',
]

In [22]:
# Numeric branch: fill missing values with the column median, then
# standardise each column with StandardScaler.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
    ])


# Categorical branch: replace missing values with the explicit label
# 'missing', then one-hot encode.
# The previous fill_value=np.nan replaced NaN with NaN (a no-op), so missing
# values reached the encoder untouched; a string placeholder gives them their
# own well-defined category instead.
# handle_unknown='ignore' keeps transform() from failing on categories that
# were not seen during fit.
onehot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

In [23]:
# Combine both branches into a single preprocessor: the numeric pipeline is
# applied to the columns in list_num and the one-hot pipeline to those in
# list_cat. Columns named in neither list are not routed to any transformer
# here (ColumnTransformer's default `remainder` setting applies to them).
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, list_num),
    ('cat', onehot_transformer, list_cat),
])

In [24]:
# Save the preprocessor object to disk with pickle.
# NOTE(review): no fit() call is visible before this point, so the pickled
# object appears to be unfitted — confirm that later steps fit it.
with open('../models/preprocessor.pickle', 'wb') as preprocessor_file:
    pickle.dump(preprocessor, preprocessor_file)

El preprocesador se va a utilizar en los siguientes pasos. De esta manera podremos reutilizarlo en el futuro con nuevos datos para evaluarlos con los modelos.

# Creación de los datos de Train y Test

In [14]:
# Split features and target into train and test sets.
# - features: every column except the isFraud target
# - test_size=0.20: one fifth of the rows go to the test set
# - random_state=0: fixed seed so the split is reproducible
# - stratify on isFraud: keeps the target proportions similar in both sets
xtrain, xtest, ytrain, ytest = train_test_split(
    data.drop(columns=['isFraud']),
    data['isFraud'],
    test_size=0.20,
    random_state=0,
    stratify=data['isFraud'],
)

In [15]:
# Report the size of the training features and preview the first rows.
print("El dataset está compuesto por", xtrain.shape[0], "filas y", xtrain.shape[1], "columnas")
xtrain.head()

El dataset está compuesto por 838860 filas y 18 columnas


Unnamed: 0,step,type,amount,gender,device,connection_time,nameOrig,race,oldbalanceOrg,age,newbalanceOrig,zone,user_number,nameDest,user_connections,security_alert,oldbalanceDest,newbalanceDest
398013,18,PAYMENT,18122.77,man,iphone,0.203923,C1108831516,black,51886.0,44,33763.23,africa,2047,M1405479153,10,0,0.0,0.0
57644,9,CASH_OUT,162508.35,man,pc,0.811282,C1561673457,latin,35137.0,18,0.0,capital,881,C1623277492,2,0,19824.0,437067.2
112607,11,PAYMENT,4531.2,woman,pc,0.80982,C227445222,black,85289.39,67,80758.19,africa,1769,M1765972463,4,0,0.0,0.0
438762,18,CASH_OUT,133330.4,unknow,pc,0.82284,C1518253350,other,0.0,25,0.0,country,2369,C1398318749,7,0,1989858.29,2789940.33
734829,38,CASH_IN,1937.66,woman,mac,0.462285,C425461927,asian,3515915.02,52,3517852.68,country,4838,C672262190,1,0,1506327.42,1748125.69


In [16]:
# Report the size of the test features and preview the first rows.
print("El dataset está compuesto por", xtest.shape[0], "filas y", xtest.shape[1], "columnas")
xtest.head()

El dataset está compuesto por 209715 filas y 18 columnas


Unnamed: 0,step,type,amount,gender,device,connection_time,nameOrig,race,oldbalanceOrg,age,newbalanceOrig,zone,user_number,nameDest,user_connections,security_alert,oldbalanceDest,newbalanceDest
913673,43,CASH_OUT,183961.06,woman,iphone,0.188162,C1745487332,black,10110.0,14,0.0,country,4304,C220065931,7,0,3065000.46,3248961.52
217560,13,CASH_OUT,116454.41,man,mac,0.381502,C896968055,asian,231298.0,23,114843.59,africa,4689,C562262220,3,0,883739.58,1000193.98
738438,38,CASH_IN,177780.66,unknow,mac,0.453677,C904478850,asian,2820628.04,83,2998408.71,country,3408,C1919486048,10,0,990784.88,813004.21
632204,35,PAYMENT,16824.94,unknow,other,0.925336,C1272616090,asian,337204.0,83,320379.06,africa,1524,M575239164,10,0,0.0,0.0
529704,20,CASH_IN,109133.06,woman,iphone,0.018892,C1607953481,asian,17106267.15,45,17215400.22,country,1458,C22043098,9,1,6624653.56,6515520.5


Elegimos un tamaño de test del 20% para evaluar después los modelos. Como tenemos un problema de clase minoritaria (desbalanceo), es importante repartir bien los valores. Hemos usado stratify=data['isFraud'] para conservar la proporción de la variable objetivo en ambos conjuntos. Para asegurarnos, hacemos otro análisis rápido para comprobar que hemos separado bien los datos.

In [17]:
import sweetviz as sv

In [18]:
#compare_report = sv.compare([xtrain, "train"],[xtest, "test"])
#compare_report.show_html("../html/compare_variables.html")

# The code above is left commented out because it is more convenient to work this way.
# The resulting report is kept with the html files; the code can be re-run if needed.

El informe de comparación nos muestra que los datos de train y test se han separado bien, así que podemos seguir con esta separación. Al separar elegimos stratify=data['isFraud'] para obtener dos conjuntos con aproximadamente la misma proporción de valores 1. En este trabajo no utilizamos datos sintéticos para evitar sus desventajas.

In [19]:
# Persist the train/test feature frames as parquet files.
xtrain.to_parquet("../data/processed/xtrain.parquet")
xtest.to_parquet("../data/processed/xtest.parquet")
# The targets come from a single column, so each one is turned into a
# one-column frame before being written.
ytrain.to_frame().to_parquet("../data/processed/ytrain.parquet")
ytest.to_frame().to_parquet("../data/processed/ytest.parquet")
# Confirmation message once all four files have been written.
print('guardado')


guardado
