# CUNEF MUCD 2022/2023

# Machine Learning
# Análisis de Fraude¶

### Autores:
- Gozde Yazganoglu
- Irma Sanchez


# Importación de Librerías 

In [2]:

import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, \
                            silhouette_score, recall_score, precision_score, make_scorer, \
                            roc_auc_score, f1_score, precision_recall_curve
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, roc_auc_score, \
                            classification_report, confusion_matrix
from sklearn import metrics
from sklearn.metrics import plot_confusion_matrix

from sklearn.dummy import DummyClassifier 
from sklearn.naive_bayes import GaussianNB

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import pickle
import warnings
warnings.filterwarnings('ignore')

from aux_func import evaluate_model, cargar_modelo


# Cargando Datos y Pipelines

In [3]:
xtrain = pd.read_parquet("../data/processed/xtrain.parquet")
ytrain = pd.read_parquet("../data/processed/ytrain.parquet")
xtest = pd.read_parquet("../data/processed/xtest.parquet")
ytest = pd.read_parquet("../data/processed/ytest.parquet")

In [4]:
preprocessor = cargar_modelo('../models/preprocessor.pickle')


In [5]:
preprocessor

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 ['step', 'amount', 'connection_time',
                                  'oldbalanceOrg', 'age', 'newbalanceOrig',
                                  'user_number', 'user_connections',
                                  'security_alert', 'oldbalanceDest',
                                  'newbalanceDest']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value=nan,
                                                                strategy='constant')),
                                                 ('onehot',
                                                  OneHotEncoder(h

# Implementacion del Modelo

In [7]:
model_dummy = Pipeline(steps=[
    ('preprocesador', preprocessor),
    ('clasificador', DummyClassifier(strategy='stratified', random_state=0))])

In [8]:
xtrain

Unnamed: 0,step,type,amount,gender,device,connection_time,nameOrig,race,oldbalanceOrg,age,newbalanceOrig,zone,user_number,nameDest,user_connections,security_alert,oldbalanceDest,newbalanceDest
398013,18,PAYMENT,18122.77,man,iphone,0.203923,C1108831516,black,51886.00,44,33763.23,africa,2047,M1405479153,10,0,0.00,0.00
57644,9,CASH_OUT,162508.35,man,pc,0.811282,C1561673457,latin,35137.00,18,0.00,capital,881,C1623277492,2,0,19824.00,437067.20
112607,11,PAYMENT,4531.20,woman,pc,0.809820,C227445222,black,85289.39,67,80758.19,africa,1769,M1765972463,4,0,0.00,0.00
438762,18,CASH_OUT,133330.40,unknow,pc,0.822840,C1518253350,other,0.00,25,0.00,country,2369,C1398318749,7,0,1989858.29,2789940.33
734829,38,CASH_IN,1937.66,woman,mac,0.462285,C425461927,asian,3515915.02,52,3517852.68,country,4838,C672262190,1,0,1506327.42,1748125.69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
909,1,PAYMENT,10497.33,woman,iphone,0.084828,C1717739363,asian,2684.69,17,0.00,capital,465,M138536309,3,1,0.00,0.00
341087,16,CASH_OUT,47913.17,man,other,0.978752,C1307321015,latin,0.00,41,0.00,capital,2012,C49966531,9,0,94368.72,142281.89
291112,15,PAYMENT,28546.07,man,pc,0.799663,C1317149827,asian,0.00,41,0.00,country,3310,M850052135,2,0,0.00,0.00
923030,43,CASH_OUT,186171.10,man,iphone,0.177884,C1784836589,black,0.00,92,0.00,unknown,4019,C984817850,8,0,2781445.17,2967616.27


In [9]:
model_dummy.fit(xtrain, ytrain)

In [10]:
with open('../models/model_dummy.pickle', 'wb') as f:
    pickle.dump(model_dummy, f)

In [11]:
ypred = model_dummy.predict(xtest)
ypred_proba = model_dummy.predict_proba(xtest)
evaluate_model(ytest,ypred,ypred_proba)

ROC-AUC score of the model: 0.49943194565772575
Accuracy of the model: 0.9977779367236488

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    209487
           1       0.00      0.00      0.00       228

    accuracy                           1.00    209715
   macro avg       0.50      0.50      0.50    209715
weighted avg       1.00      1.00      1.00    209715


Confusion matrix: 
[[209249    238]
 [   228      0]]



In [12]:
model_NB = Pipeline(steps=[
    ('preprocesador', preprocessor),
    ('clasificador', GaussianNB())])

In [13]:
with open('../models/model_NB.pickle', 'wb') as f:
    pickle.dump(model_NB, f)

In [14]:
model_NB.fit(xtrain, ytrain)

#  Ajuste de Umbral

In [15]:
#NEW
# keep probabilities for the positive outcome only
yhat = ypred_proba[:, 1]
# calculate roc curves
fpr, tpr, thresholds = roc_curve(ytest, yhat)

gmeans = np.sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))

ypred_new_threshold = (ypred_proba[:,1]>thresholds[ix]).astype(int)
evaluate_model(ytest,ypred_new_threshold,ypred_proba)

Best Threshold=2.000000, G-Mean=0.000
ROC-AUC score of the model: 0.49943194565772575
Accuracy of the model: 0.9989128102424719

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    209487
           1       0.00      0.00      0.00       228

    accuracy                           1.00    209715
   macro avg       0.50      0.50      0.50    209715
weighted avg       1.00      1.00      1.00    209715


Confusion matrix: 
[[209487      0]
 [   228      0]]

