# CUNEF MUCD 2022/2023

# Machine Learning
# Análisis de Fraude¶

### Autores:
- Gozde Yazganoglu
- Irma Sanchez


# Importación de Librerías 

In [2]:

import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, \
                            silhouette_score, recall_score, precision_score, make_scorer, \
                            roc_auc_score, f1_score, precision_recall_curve
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, roc_auc_score, \
                            classification_report, confusion_matrix
from sklearn import metrics
from sklearn.metrics import plot_confusion_matrix

from sklearn.dummy import DummyClassifier 
from sklearn.naive_bayes import GaussianNB

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import pickle
import warnings
warnings.filterwarnings('ignore')

from aux_func import evaluate_model, cargar_modelo


# Cargando Datos y Pipelines

In [3]:
xtrain = pd.read_parquet("../data/processed/xtrain.parquet")
ytrain = pd.read_parquet("../data/processed/ytrain.parquet")
xtest = pd.read_parquet("../data/processed/xtest.parquet")
ytest = pd.read_parquet("../data/processed/ytest.parquet")

In [4]:
preprocessor = cargar_modelo('../models/preprocessor.pickle')


In [5]:
preprocessor

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 ['step', 'amount', 'connection_time',
                                  'oldbalanceOrg', 'age', 'newbalanceOrig',
                                  'user_number', 'user_connections',
                                  'security_alert', 'oldbalanceDest',
                                  'newbalanceDest']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value=nan,
                                                                strategy='constant')),
                                                 ('onehot',
                                                  OneHotEncoder(h

# Implementacion del Modelo

In [16]:
model_dummy = Pipeline(steps=[
    ('preprocesador', preprocessor),
    ('clasificador', DummyClassifier(strategy='stratified', random_state=0))])

In [17]:
model_dummy.fit(xtrain, ytrain)

Pipeline(steps=[('preprocesador',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['step', 'amount',
                                                   'connection_time',
                                                   'oldbalanceOrg', 'age',
                                                   'newbalanceOrig',
                                                   'user_number',
                                                   'user_connections',
                                                   'security_alert',
                                                   'oldbalanceDest',
      

In [18]:
with open('../models/model_dummy.pickle', 'wb') as f:
    pickle.dump(model_dummy, f)

In [19]:
ypred = model_dummy.predict(xtest)
ypred_proba = model_dummy.predict_proba(xtest)
evaluate_model(ytest,ypred,ypred_proba)

ROC-AUC score of the model: 0.49943194565772575
Accuracy of the model: 0.9977779367236488

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    209487
           1       0.00      0.00      0.00       228

    accuracy                           1.00    209715
   macro avg       0.50      0.50      0.50    209715
weighted avg       1.00      1.00      1.00    209715


Confusion matrix: 
[[209249    238]
 [   228      0]]



In [24]:
# keep probabilities for the positive outcome only
yhat = ypred_proba[:, 1]
# calculate roc curves
fpr, tpr, thresholds = roc_curve(ytest, yhat)

gmeans = np.sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))

ypred_new_threshold = (ypred_proba[:,1]>thresholds[ix]).astype(int)
evaluate_model(ytest,ypred_new_threshold,ypred_proba)


Best Threshold=1.000000, G-Mean=0.881
ROC-AUC score of the model: 0.9359664029732113
Accuracy of the model: 0.8883961566888396

Classification report: 
              precision    recall  f1-score   support

           0       1.00      0.89      0.94    209487
           1       0.01      0.87      0.02       228

    accuracy                           0.89    209715
   macro avg       0.50      0.88      0.48    209715
weighted avg       1.00      0.89      0.94    209715


Confusion matrix: 
[[186112  23375]
 [    30    198]]



In [20]:
model_NB = Pipeline(steps=[
    ('preprocesador', preprocessor),
    ('clasificador', GaussianNB())])

In [21]:
with open('../models/model_NB.pickle', 'wb') as f:
    pickle.dump(model_NB, f)

In [22]:
model_NB.fit(xtrain, ytrain)

Pipeline(steps=[('preprocesador',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['step', 'amount',
                                                   'connection_time',
                                                   'oldbalanceOrg', 'age',
                                                   'newbalanceOrig',
                                                   'user_number',
                                                   'user_connections',
                                                   'security_alert',
                                                   'oldbalanceDest',
      

In [23]:
ypred = model_dummy.predict(xtest)
ypred_proba = model_NB.predict_proba(xtest)
evaluate_model(ytest,ypred,ypred_proba)

ROC-AUC score of the model: 0.9359664029732113
Accuracy of the model: 0.9977779367236488

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    209487
           1       0.00      0.00      0.00       228

    accuracy                           1.00    209715
   macro avg       0.50      0.50      0.50    209715
weighted avg       1.00      1.00      1.00    209715


Confusion matrix: 
[[209249    238]
 [   228      0]]



#  Ajuste de Umbral

In [13]:
# keep probabilities for the positive outcome only
yhat = ypred_proba[:, 1]
# calculate roc curves
fpr, tpr, thresholds = roc_curve(ytest, yhat)

gmeans = np.sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))

ypred_new_threshold = (ypred_proba[:,1]>thresholds[ix]).astype(int)
evaluate_model(ytest,ypred_new_threshold,ypred_proba)

Best Threshold=2.000000, G-Mean=0.000
ROC-AUC score of the model: 0.49943194565772575
Accuracy of the model: 0.9989128102424719

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    209487
           1       0.00      0.00      0.00       228

    accuracy                           1.00    209715
   macro avg       0.50      0.50      0.50    209715
weighted avg       1.00      1.00      1.00    209715


Confusion matrix: 
[[209487      0]
 [   228      0]]



Como estamos con un problema de minoridad, todos los modelos que trabajaremos va a tener alto accuracy. Pero los dos modelos no detecta los fraudes. los intenciones de ajustar los modelos no nos ayuda mucho tampoco. Analizaremos mas detallada en seccion de model selection.