# CUNEF MUCD 2021/2022  
## Machine Learning
## Analisis de Fraude

### Autores:
- Gozde Yazganoglu
- Irma Sanchez


# Importación de Librerías 

In [2]:
#Librerías
import pandas as pd
import numpy as np

from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, \
                            silhouette_score, recall_score, precision_score, make_scorer, \
                            roc_auc_score, f1_score, precision_recall_curve

from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.linear_model import LogisticRegressionCV 

from sklearn.pipeline import Pipeline



from aux_func import evaluate_model, cargar_modelo
import pickle
import warnings
warnings.filterwarnings('ignore')

In [3]:
xtrain = pd.read_parquet("../data/processed/xtrain.parquet")
ytrain = pd.read_parquet("../data/processed/ytrain.parquet")
xtest = pd.read_parquet("../data/processed/xtest.parquet")
ytest = pd.read_parquet("../data/processed/ytest.parquet")

# Cargando Datos y Pipelines

In [4]:
#Cargamos pipeline preprocesado
preprocessor = cargar_modelo('../models/preprocessor.pickle')

# Regresión Logística (Ridge)

La regresión de Ridge se desarrolló como una posible solución a la imprecisión de los estimadores de mínimos cuadrados cuando los modelos de regresión lineal tienen algunas variables independientes multicolineales (altamente correlacionadas), mediante la creación de un estimador de regresión de Ridge (RR). Esto proporciona una estimación más precisa de los parámetros de Ridge, ya que su varianza y estimador cuadrático medio son a menudo más pequeños que los estimadores mínimos cuadrados derivados anteriormente.

In [5]:
clf_l2 = Pipeline(steps=[
    ('preprocesador', preprocessor), 
    ('clasificador', LogisticRegressionCV(cv=8, n_jobs=4, penalty='l2', random_state=0))])

In [6]:
clf_l2.fit(xtrain, ytrain)
warnings.filterwarnings('ignore')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [7]:
with open('../models/model_LR2.pickle', 'wb') as f:
    pickle.dump(clf_l2, f)

In [8]:
# Para no tener que ejecutar, saltarse el fit y ejecutar a partir de aquí
cargar_modelo('../models/model_LR2.pickle')

Pipeline(steps=[('preprocesador',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['step', 'amount',
                                                   'connection_time',
                                                   'oldbalanceOrg', 'age',
                                                   'newbalanceOrig',
                                                   'user_number',
                                                   'user_connections',
                                                   'security_alert',
                                                   'oldbalanceDest',
      

In [9]:
ypred = clf_l2.predict(xtest)
ypred_proba = clf_l2.predict_proba(xtest)
evaluate_model(ytest,ypred, ypred_proba)

ROC-AUC score of the model: 0.9932801382223694
Accuracy of the model: 0.9993228905896097

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    209487
           1       0.91      0.42      0.57       228

    accuracy                           1.00    209715
   macro avg       0.96      0.71      0.79    209715
weighted avg       1.00      1.00      1.00    209715


Confusion matrix: 
[[209478      9]
 [   133     95]]



In [None]:
model_analysis(clf_l2, xtest, ytest)

ROC CURVE

In [None]:
import matplotlib.pyplot as plt
# plot the roc curve for the model
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(fpr, tpr, marker='.', label='SVM')
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
# show the plot
plt.show()

In [None]:
# calculate the g-mean for each threshold
gmeans = np.sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))

PR CURVE

In [None]:
# calculate pr-curve
precision, recall, thresholds = precision_recall_curve(ytest, yhat)
# plot the roc curve for the model
no_skill = len(ytest[ytest==1]) / len(ytest)
plt.plot([0,1], [no_skill,no_skill], linestyle='--', label='No Skill')
plt.plot(recall, precision, marker='.', label='SVM')
# axis labels
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
# show the plot
plt.show()

In [None]:
# convert to f score
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f' % (thresholds[ix], fscore[ix]))
# plot the roc curve for the model
no_skill = len(ytest[ytest==1]) / len(ytest)
plt.plot([0,1], [no_skill,no_skill], linestyle='--', label='No Skill')
plt.plot(recall, precision, marker='.', label='SVM')
plt.scatter(recall[ix], precision[ix], s=100, marker='o', color='black', label='Best')
# axis labels
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
# show the plot
plt.show()

In [None]:
# evaluate the model
score = f1_score(ytest, ypred)
print('F-Score: %.5f' % score)

## Ajuste del umbral de predicción

In [10]:
# keep probabilities for the positive outcome only
yhat = ypred_proba[:, 1]
# calculate roc curves
fpr, tpr, thresholds = roc_curve(ytest, yhat)

gmeans = np.sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))

ypred_new_threshold = (ypred_proba[:,1]>thresholds[ix]).astype(int)
evaluate_model(ytest,ypred_new_threshold,ypred_proba)

Best Threshold=0.001651, G-Mean=0.960
ROC-AUC score of the model: 0.9932801382223694
Accuracy of the model: 0.9379491214266982

Classification report: 
              precision    recall  f1-score   support

           0       1.00      0.94      0.97    209487
           1       0.02      0.98      0.03       228

    accuracy                           0.94    209715
   macro avg       0.51      0.96      0.50    209715
weighted avg       1.00      0.94      0.97    209715


Confusion matrix: 
[[196479  13008]
 [     5    223]]



In [None]:
model_analysis(clf_l2, xtest, ytest)

# Regresión Logística (Lasso)

La regresión de Lasso es parecida a la regresión lineal, pero utiliza una técnica de "contracción" en la que los coeficientes de determinación se reducen a cero . 
La regresión lineal le brinda coeficientes de regresión como se observa en el conjunto de datos. La regresión Lasso le permite reducir o regularizar estos coeficientes para evitar el sobreajuste y hacer que funcionen mejor en diferentes conjuntos de datos. 
Este tipo de regresión se usa cuando el conjunto de datos muestra una alta multicolinealidad o cuando desea automatizar la eliminación de variables y la selección de características .

In [16]:
clf_l1 = Pipeline(steps=[
    ('preprocesador', preprocessor), 
    ('clasificador', LogisticRegression(C=1.5,random_state=0, n_jobs=2, penalty='l1', solver='liblinear', tol= 0.0005))])

In [17]:
clf_l1.fit(xtrain, ytrain)

Pipeline(steps=[('preprocesador',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['step', 'amount',
                                                   'connection_time',
                                                   'oldbalanceOrg', 'age',
                                                   'newbalanceOrig',
                                                   'user_number',
                                                   'user_connections',
                                                   'security_alert',
                                                   'oldbalanceDest',
      

In [15]:
with open('../models/model_LR1.pickle', 'wb') as f:
    pickle.dump(clf_l1, f)

In [19]:
cargar_modelo('../models/model_LR1.pickle')

Pipeline(steps=[('preprocesador',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['step', 'amount',
                                                   'connection_time',
                                                   'oldbalanceOrg', 'age',
                                                   'newbalanceOrig',
                                                   'user_number',
                                                   'user_connections',
                                                   'security_alert',
                                                   'oldbalanceDest',
      

In [13]:
ypred = clf_l1.predict(xtest)
ypred_proba = clf_l1.predict_proba(xtest)
evaluate_model(ytest, ypred, ypred_proba)

ROC-AUC score of the model: 0.9923340090860221
Accuracy of the model: 0.9992990487089621

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    209487
           1       0.89      0.40      0.56       228

    accuracy                           1.00    209715
   macro avg       0.95      0.70      0.78    209715
weighted avg       1.00      1.00      1.00    209715


Confusion matrix: 
[[209476     11]
 [   136     92]]



## Ajuste del umbral de predicción

In [17]:
# keep probabilities for the positive outcome only
yhat = ypred_proba[:, 1]
# calculate roc curves
fpr, tpr, thresholds = roc_curve(ytest, yhat)

gmeans = np.sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))

ypred_new_threshold = (ypred_proba[:,1]>thresholds[ix]).astype(int)
evaluate_model(ytest,ypred_new_threshold,ypred_proba)

Best Threshold=0.001460, G-Mean=0.954
ROC-AUC score of the model: 0.9923340090860221
Accuracy of the model: 0.9261330853777746

Classification report: 
              precision    recall  f1-score   support

           0       1.00      0.93      0.96    209487
           1       0.01      0.98      0.03       228

    accuracy                           0.93    209715
   macro avg       0.51      0.95      0.49    209715
weighted avg       1.00      0.93      0.96    209715


Confusion matrix: 
[[194001  15486]
 [     5    223]]



Modelos Ridge y Lasso están mejores en antes de ajuste de umbral. esta enseña que nuestro ajuste basado a g-means esta mejorando recall.

In [None]:
model_analysis(clf_l1, xtest, ytest)