----------
**Credit Card Fraud Detection - VotingClassifier : F1_score = 0.88 (Recall = 0.83 - Precision = 0.95)**
=====================================

***Vincent Lugat***

*May 2018*

----------


![](https://image.noelshack.com/fichiers/2018/21/2/1526990364-resume-2.png)



-------------------



- <a href='#1'>1. Load libraries and read the data</a>  
     - <a href='#1.1'>1.1. Load libraries</a>
     - <a href='#1.2'>1.2. Read the data</a>
- <a href='#2'>2. Quick EDA</a>
     - <a href='#2.1'>2.1. Variables distribution and describe</a>
     - <a href='#2.2'>2.2. Class distribution</a>
     - <a href='#2.3'>2.3. Time vs Class</a>
     - <a href='#2.4'>2.4. Time vs Amount vs Class</a>
     - <a href='#2.5'>2.5. Boxplots Amount vs Class</a>
     - <a href='#2.6'>2.6. Correlation Matrix by Class</a>  
- <a href='#3'>3. Data preparation</a>
    - <a href='#3.1'>3.1. Cut dataset</a>
    - <a href='#3.2'>3.2. Normalization Amount</a>
    - <a href='#3.3'>3.3. Drop useless variables</a>
- <a href='#4'>4. Define metrics</a>
    - <a href='#4.1'>4.1. Confusion matrix</a>
    - <a href='#4.2'>4.2. Recall, Precision and F1_score</a>
    - <a href='#4.3'>4.3. Precision - Recall Curve</a>
    - <a href='#4.4'>4.4. ROC Curve</a>
    - <a href='#4.5'>4.5. Feature Importance</a>
    - <a href='#4.6'>4.6. Define (X , y) and Stratified K-folds</a>
- <a href='#5'>5. Logistic Regression (LOG)</a>
    - <a href='#5.1'>5.1. LOG - Without hyperparameters tuning</a>
    - <a href='#5.2'>5.2. LOG - Grid Search CV to optimise Recall</a>
    - <a href='#5.3'>5.3. LOG - With best hyperparameters</a>
- <a href='#6'>6. Extreme Gradient Boosting (XGB)</a>
    - <a href='#6.1'>6.1. XGB - Without hyperparameters tuning</a>
    - <a href='#6.2'>6.2. XGB - Grid Search CV to find best n_estimators (F1_score)</a>
    - <a href='#6.3'>6.3. XGB - With best hyperparameters</a>
- <a href='#7'>7. Random Forest (RF)</a>
    - <a href='#7.1'>7.1. RF - Without hyperparameters tuning</a>
    - <a href='#7.2'>7.2. RF - Grid Search CV to optimise F1_score</a>
    - <a href='#7.3'>7.3. RF - With best hyperparameters</a>  
- <a href='#8'>8. VotingClassifier = LOG - XGB - RND (F1 = 0.881)</a>
    - <a href='#8.1'>8.1. VotingClassifier</a>
    - <a href='#8.2'>8.2. Precision - Recall – Threshold Curve</a>
    - <a href='#8.3'>8.3. Compare ROC Curves (all models)</a>
    - <a href='#8.4'>8.4. Compare Precision - Recall Curves (all models)</a>
- <a href='#9'>9. VotingClassifier : Validation (F1 = 0.884)</a>
     - <a href='#9.1'>9.1. Normalize Amount, drop useless variables and define (X, y)</a>    
     - <a href='#9.2'>9.2. VotingClassifier on valid_data (trained & tested in 8.1.)</a>
     - <a href='#9.3'>9.3. Threshold selection</a>


-------------------


# <a id='1'>1. Load libraries and read the data</a>


## <a id='1.1'>1.1. Load libraries</a>

In [None]:
# Python libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import itertools
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, roc_curve, precision_recall_curve
import warnings

## <a id='1.2'>1.2. Read the data</a>

In [None]:
# Load the credit-card transactions CSV into a DataFrame
# (the path is relative to the notebook's working directory).
data = pd.read_csv('../input/creditcard.csv')

In [None]:
# Silence every Python warning from this point on.
# NOTE(review): this also hides deprecation and convergence warnings — confirm that is intended.
warnings.filterwarnings('ignore') #ignore warning messages 

# <a id='2'>2. Quick EDA</a>

## <a id='2.1'>2.1. Variables distribution and describe </a> 

Some things are useless, but the code is here for you :)

In [None]:
# Graph distribution: one 50-bin histogram per column of `data`
data.hist (bins=50, figsize=(20,15), color = 'deepskyblue')

plt.show()

In [None]:
# Describe: summary statistics of `data` (last expression, so the notebook displays it)
data.describe()

 ## <a id='2.2'>2.2. Class distribution</a>

The dataset is highly unbalanced: the positive class (frauds) accounts for 0.172% of all transactions.

In [None]:
# Class distribution: bar chart, then counts and percentages per class
my_pal = {0: 'deepskyblue', 1: 'deeppink'}  # colour per class label (0 = normal, 1 = fraud)

plt.figure(figsize=(12, 6))
ax = sns.countplot(x='Class', data=data, palette=my_pal)
plt.title('Class Distribution')
plt.show()

# Number of rows in each class
Count_Normal_transacation = len(data[data['Class'] == 0])
Count_Fraud_transacation = len(data[data['Class'] == 1])

# Share of each class in the whole dataset (fraction between 0 and 1)
Percentage_of_Normal_transacation = Count_Normal_transacation / (Count_Normal_transacation + Count_Fraud_transacation)
Percentage_of_Fraud_transacation = Count_Fraud_transacation / (Count_Normal_transacation + Count_Fraud_transacation)

# Report in the same order as before: normal first, then fraud
for caption, value in (
        ('% of normal transacation       :', Percentage_of_Normal_transacation * 100),
        ('Number of normal transaction   :', Count_Normal_transacation),
        ('% of fraud transacation        :', Percentage_of_Fraud_transacation * 100),
        ('Number of fraud transaction    :', Count_Fraud_transacation)):
    print(caption, value)

## <a id='2.3'>2.3. Time vs Class</a>

In [None]:
# Histogram of transaction times, one panel per class, sharing the x axis
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(15, 8))

bins = 50

for panel, class_label, colour, heading in (
        (ax1, 1, 'deeppink', 'Fraud'),
        (ax2, 0, 'deepskyblue', 'Normal')):
    panel.hist(data.Time[data.Class == class_label], bins=bins, color=colour)
    panel.set_title(heading)

# pyplot labels go to the current axes, i.e. the bottom panel
plt.xlabel('Time (in Seconds)')
plt.ylabel('Number of Transactions')
plt.show()

## <a id='2.4'>2.4. Time vs Amount vs Class</a>

In [None]:
# Scatter of Amount against Time, one panel per class, sharing the x axis
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(15, 8))

for panel, class_label, colour, heading in (
        (ax1, 1, 'deeppink', 'Fraud'),
        (ax2, 0, 'deepskyblue', 'Normal')):
    in_class = data.Class == class_label
    panel.scatter(data.Time[in_class], data.Amount[in_class], color=colour)
    panel.set_title(heading)

# pyplot labels go to the current axes, i.e. the bottom panel
plt.xlabel('Time (in Seconds)')
plt.ylabel('Amount')
plt.show()

## <a id='2.5'>2.5. Boxplots Amount vs Class</a>

In [None]:
# Box plot of Amount for each class, coloured with the per-class palette `my_pal`
plt.figure(figsize = (12, 6))
ax = sns.boxplot(x = 'Class', y = 'Amount', data = data, palette = my_pal)
# Clip the y axis to [0, 300]; anything above that is simply not shown
ax.set_ylim([0, 300])
plt.title('Boxplot Amount vs Class')
plt.show()

## <a  id='2.6'>2.6. Correlation Matrix by Class</a>

In [None]:
# Correlation matrix of the features, computed separately for each class
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 8))

# 'Class' and 'Time' are excluded from both matrices.
# `axis=1` is passed by keyword: recent pandas rejects the positional form.
sns.heatmap(data.query('Class==1').drop(['Class', 'Time'], axis=1).corr(),
            vmax=.8, square=True, ax=ax1, cmap='YlGnBu')
ax1.set_title('Fraud')

sns.heatmap(data.query('Class==0').drop(['Class', 'Time'], axis=1).corr(),
            vmax=.8, square=True, ax=ax2, cmap='YlGnBu')
ax2.set_title('Normal')

plt.show()

# <a id='3'>3. Data preparation</a>

## <a id='3.1'>3.1. Cut dataset</a>

The dataset is cut in two: 90% is used for training and testing (via stratified k-folds), and the remaining 10% is held out for validation.

In [None]:
# Cut in 2 datasets, one for training & test, other for validation
from sklearn.model_selection import train_test_split
# 10% of the rows are held out; random_state fixes the shuffle so the split is reproducible.
# NOTE(review): no `stratify` argument is given, so the fraud ratio of the two parts may differ — confirm this is acceptable.
train_set, test_set = train_test_split (data, test_size = 0.1, random_state = 42)

In [None]:
# Reassign dataset names: from here on `data` is the 90% part and
# `valid_data` the 10% hold-out (both are aliases, not copies)
data = train_set
valid_data = test_set

## <a id='3.2'>3.2. Normalization Amount</a>

![](https://image.noelshack.com/fichiers/2018/20/5/1526651303-normalization.png)
source : https://www.slideshare.net/DhianaDevaRocha/qcon-rio-machine-learning-for-everyone

In [None]:
# Normalization Amount: standardise 'Amount' with a StandardScaler fitted on `data`
# and store the result in a new column 'normAmount'
from sklearn.preprocessing import StandardScaler
# reshape(-1,1) turns the 1-D values into the single-column 2-D array passed to the scaler
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1,1))

In [None]:
# Density of Amount before (top) and after (bottom) normalization
f, (ax1, ax2) = plt.subplots(2,1,figsize =( 15, 8))

# NOTE(review): `shade=` may be deprecated in newer seaborn releases — confirm against the installed version
sns.kdeplot(data['Amount'],shade=True, ax = ax1, color='red')
ax1.set_title('Before Normalization')

sns.kdeplot(data['normAmount'],shade=True, ax = ax2, color='blue')
ax2.set_title('After Normalization')

plt.show()

## <a id='3.3'>3.3. Drop useless variables</a>

In [None]:
# Drop useless variables: the raw 'Amount' (replaced by 'normAmount') and 'Time'.
# `drop` returns a new frame, so `train_set` keeps its original columns.
data = data.drop(['Amount','Time'],axis=1)

In [None]:
# Summary statistics of the prepared training data
data.describe()

# <a id='4'>4. Define metrics</a>

![](https://image.noelshack.com/fichiers/2018/20/5/1526651367-qcon-rio-machine-learning-for-everyone-51-638-1.jpg)
source : https://www.slideshare.net/DhianaDevaRocha/qcon-rio-machine-learning-for-everyone

## <a id='4.1'>4.1. Confusion matrix</a>

In [None]:
# confusion matrix 
def plot_confusion_matrix(cm, classes,
                          normalize = False,
                          title = 'Confusion matrix',
                          cmap = plt.cm.Blues) :
    """Draw a confusion matrix as an annotated heat map on the current figure.

    The caller is responsible for creating the figure beforehand and for
    calling ``plt.show()`` / ``plt.savefig()`` afterwards.

    Parameters
    ----------
    cm : array-like, shape (n_classes, n_classes)
        Confusion matrix; rows are labelled 'True label' and columns
        'Predicted label'.
    classes : sequence
        Tick labels, one per class, in matrix order.
    normalize : bool, default False
        When True, each row is divided by its sum so that cells show the
        fraction of the true class (printed with two decimals). Rows that sum
        to zero are left at zero.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the cells.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Avoid 0/0 for classes with no true sample: such rows stay at 0
        row_totals[row_totals == 0] = 1
        cm = cm.astype(float) / row_totals

    plt.imshow(cm, interpolation = 'nearest', cmap = cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation = 0)
    plt.yticks(tick_marks, classes)

    # Cells darker than half of the maximum get white text for contrast
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])) :
        cell_text = '{:.2f}'.format(cm[i, j]) if normalize else cm[i, j]
        plt.text(j, i, cell_text,
                 horizontalalignment = 'center',
                 color = 'white' if cm[i, j] > thresh else 'black')

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

![](https://image.noelshack.com/fichiers/2018/20/5/1526651914-cs-heezweaa5hp7.jpg)
source : https://twitter.com/bearda24

## <a id='4.2'>4.2. Recall, Precision and F1_score</a>

In [None]:
def show_metrics(cm=None):
    """Print precision, recall and F1 score of a 2x2 confusion matrix.

    Parameters
    ----------
    cm : array-like of shape (2, 2), optional
        Confusion matrix laid out as [[tn, fp], [fn, tp]]. When omitted, the
        notebook-level variable ``cm`` is used, so existing ``show_metrics()``
        calls keep working.

    Raises
    ------
    NameError
        If no matrix is passed and no global ``cm`` exists.

    Notes
    -----
    A metric whose denominator is zero is reported as 0.000 instead of nan.
    """
    if cm is None:
        try:
            cm = globals()['cm']
        except KeyError:
            raise NameError("name 'cm' is not defined") from None
    cm = np.asarray(cm)

    tp = cm[1, 1]
    fn = cm[1, 0]
    fp = cm[0, 1]

    precision_value = tp / (tp + fp) if (tp + fp) else 0.0
    recall_value = tp / (tp + fn) if (tp + fn) else 0.0
    # F1 = 2PR / (P + R), rewritten on raw counts so that tp == 0 yields 0
    f1_denominator = 2 * tp + fp + fn
    f1_value = 2 * tp / f1_denominator if f1_denominator else 0.0

    print('Precision =     {:.3f}'.format(precision_value))
    print('Recall    =     {:.3f}'.format(recall_value))
    print('F1_score  =     {:.3f}'.format(f1_value))

## <a id='4.3'>4.3. Precision - Recall Curve</a>

In [None]:
# precision-recall curve
def plot_precision_recall():
    """Plot the precision-recall curve.

    Reads the notebook-level arrays ``recall`` and ``precision``; takes no
    argument and returns nothing. The area under the step curve is shaded and
    the curve itself is drawn on top, then the figure is shown.
    """
    shading = dict(color='b', alpha=0.2)
    plt.step(recall, precision, where='post', **shading)
    plt.fill_between(recall, precision, step='post', **shading)

    plt.plot(recall, precision, linewidth=2)
    plt.xlim([0.0, 1])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision Recall Curve')
    plt.show()

## <a id='4.4'>4.4. ROC Curve</a>

![](https://image.noelshack.com/fichiers/2018/20/5/1526651444-roc-curve.png)
source : https://www.slideshare.net/DhianaDevaRocha/qcon-rio-machine-learning-for-everyone

In [None]:
# ROC curve
def plot_roc():
    """Plot the ROC curve from the notebook-level arrays ``fpr`` and ``tpr``.

    The x axis is restricted to [0, 0.001], so only the very low
    false-positive-rate region is visible. A dashed diagonal is drawn as well.
    """
    diagonal = [0, 1]
    plt.plot(fpr, tpr, linewidth=2, label='ROC curve')
    plt.plot(diagonal, diagonal, 'k--', linewidth=2)
    plt.xlim([0.0, 0.001])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.show()

## <a id='4.5'>4.5. Feature Importance</a>

In [None]:
# Feature names used to label the feature-importance plot: V1..V28 followed by
# the standardised amount. The last name is spelled 'normAmount', exactly like
# the column created during data preparation.
# NOTE(review): assumed to follow the column order of X — confirm.
predictors = ['V{}'.format(i) for i in range(1, 29)] + ['normAmount']

In [None]:
#feature importance plot
def plot_feature_importance(model):
    """Show a bar chart of a fitted model's feature importances, largest first.

    Parameters
    ----------
    model : estimator exposing ``feature_importances_``
        Its importances are paired, position by position, with the
        notebook-level list ``predictors``.
    """
    importance_table = pd.DataFrame(
        {'Feature': predictors,
         'Feature importance': model.feature_importances_}
    ).sort_values(by='Feature importance', ascending=False)

    plt.figure(figsize=(15, 8))
    plt.title('Features importance', fontsize=14)
    bars = sns.barplot(x='Feature', y='Feature importance', data=importance_table)
    # Rotate the feature names so they do not overlap
    bars.set_xticklabels(bars.get_xticklabels(), rotation=90)
    plt.show()

## <a id='4.6'>4.6. Define (X , y) and Stratified K-folds</a>

In [None]:
# Define the target vector y and the feature matrix X
y = np.array(data.Class.tolist())
# `axis=1` by keyword: recent pandas rejects the positional form
data = data.drop('Class', axis=1)
# `.values` replaces `as_matrix()`, which no longer exists in current pandas
X = np.array(data.values)

![](https://image.noelshack.com/fichiers/2018/20/6/1526716452-general-tips-for-participating-kaggle-competitions-13-638.jpg)
source : https://www.slideshare.net/markpeng/general-tips-for-participating-kaggle-competitions

In [None]:
# Stratified k-folds: 5 shuffled folds with a fixed seed
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# All folds are generated, but only the LAST (train, test) split is kept:
# every model below is fitted and evaluated on that single split.
train_index, test_index = list(skf.split(X, y))[-1]
X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]

# <a id='5'>5. Logistic Regression (LOG)</a>

##  <a id='5.1'>5.1. LOG - Without hyperparameters tuning</a>

In [None]:
# Logistic regression with default hyperparameters, fitted on the training split
log_cfl = LogisticRegression()

log_cfl.fit(X_train, y_train)
y_pred = log_cfl.predict(X_test)
# Continuous scores (decision function) used for the ROC and precision-recall curves
y_score = log_cfl.decision_function(X_test)  

# Confusion matrix & metrics (show_metrics reads the notebook-level `cm`)
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm, 
                      classes = class_names, 
                      title = 'LOG Confusion matrix')
plt.show()

show_metrics()

# ROC curve (plot_roc reads the notebook-level `fpr` / `tpr`)
fpr, tpr, t = roc_curve(y_test, y_score)
plot_roc()

# Precision-recall curve (plot_precision_recall reads `precision` / `recall`)
precision, recall, thresholds = precision_recall_curve(y_test, y_score)
plot_precision_recall()

In [None]:
# Show the parameters currently used by the logistic regression model
from pprint import pprint
print('Parameters currently in use:\n')
pprint(log_cfl.get_params())

##  <a id='5.2'>5.2. LOG - Grid Search CV to optimise Recall</a>

In [None]:
# Find the best parameters : recall optimisation
from sklearn.model_selection import GridSearchCV
# Grid of 2 penalties x 2 class weights x 4 values of C = 16 combinations.
# NOTE(review): 'l1' is only accepted by some LogisticRegression solvers — confirm that the
# default solver of the installed scikit-learn supports it.
param_grid = {
            'penalty' : ['l1','l2'], 
            'class_weight' : ['balanced', None], 
            'C' : [0.1, 1, 10, 100]
            }

# Exhaustive search scored on recall, fitted on the training split with n_jobs = -1
CV_log_cfl = GridSearchCV(estimator = log_cfl, param_grid = param_grid , scoring = 'recall', verbose = 1, n_jobs = -1)
CV_log_cfl.fit(X_train, y_train)

# `best_parameters` is read by the next section to rebuild the model
best_parameters = CV_log_cfl.best_params_
print('The best parameters for using this model is', best_parameters)

## <a id='5.3'>5.3. LOG - With best hyperparameters</a>

In [None]:
# Logistic regression rebuilt with the parameters selected by the grid search
log_cfl = LogisticRegression(C = best_parameters['C'], 
                             penalty = best_parameters['penalty'], 
                             class_weight = best_parameters['class_weight'])

log_cfl.fit(X_train, y_train)
y_pred = log_cfl.predict(X_test)
# Continuous scores (decision function) used for the ROC and precision-recall curves
y_score = log_cfl.decision_function(X_test)

# Confusion matrix & metrics (show_metrics reads the notebook-level `cm`)
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm, 
                      classes=class_names, 
                      title='LOG Confusion matrix')

# Save before show(), while the figure is still the current one
plt.savefig('4.log_cfl_confusion_matrix.png')
plt.show()

show_metrics()

# ROC curve (plot_roc reads the notebook-level `fpr` / `tpr`)
fpr, tpr, t = roc_curve(y_test, y_score)
plot_roc()

# Precision-recall curve (plot_precision_recall reads `precision` / `recall`)
precision, recall, thresholds = precision_recall_curve(y_test, y_score)
plot_precision_recall()

# Keep this model's curves under dedicated names for the comparison plots of section 8
fpr_log, tpr_log, t_log = fpr, tpr, t
precision_log, recall_log, thresholds_log = precision, recall, thresholds

# <a id='6'>6. Extreme Gradient Boosting (XGB)</a>

## <a id='6.1'>6.1. XGB - Without hyperparameters tuning</a>

In [None]:
# XGBoost classifier with default hyperparameters (n_jobs = -1), fitted on the training split
xgb_cfl = xgb.XGBClassifier(n_jobs = -1)

xgb_cfl.fit(X_train, y_train)
y_pred = xgb_cfl.predict(X_test)
# Second column of predict_proba, i.e. the score of class 1, used for the curves
y_score = xgb_cfl.predict_proba(X_test)[:,1]

# Confusion matrix & metrics (show_metrics reads the notebook-level `cm`)
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm, 
                      classes=class_names, 
                      title='XGB Confusion matrix')
plt.show()

show_metrics()

# ROC curve (plot_roc reads the notebook-level `fpr` / `tpr`)
fpr, tpr, t = roc_curve(y_test, y_score)
plot_roc()

# Precision-recall curve (plot_precision_recall reads `precision` / `recall`)
precision, recall, thresholds = precision_recall_curve(y_test, y_score)
plot_precision_recall()

## <a id='6.2'>6.2. XGB - Grid Search CV to find best n_estimators (F1_score)</a>

Grid search CV is really slow, you can add or remove some hyperparameters (or use early_stop) : 

    param_grid = {
                'n_estimators': [100, 200, 300, 400]
                  }

    CV_xgb_cfl = GridSearchCV(estimator = xgb_cfl, param_grid = param_grid, scoring ='f1', verbose = 2)
    CV_xgb_cfl.fit(X_train, y_train)

    best_parameters = CV_xgb_cfl.best_params_
    print("The best parameters for using this model is", best_parameters)

![](https://image.noelshack.com/fichiers/2018/20/6/1526746266-ok.png)

## <a id='6.3'>6.3. XGB - With best hyperparameters</a>

In [None]:
# XGBoost classifier with n_estimators hard-coded to 200
# (presumably the value found by the grid search described above — confirm)
xgb_cfl = xgb.XGBClassifier(n_jobs = -1, 
                            n_estimators = 200)

xgb_cfl.fit(X_train, y_train)
y_pred = xgb_cfl.predict(X_test)
# Second column of predict_proba, i.e. the score of class 1, used for the curves
y_score = xgb_cfl.predict_proba(X_test)[:,1]

# Confusion matrix & metrics (show_metrics reads the notebook-level `cm`)
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm, 
                      classes = class_names, 
                      title = 'XGB Confusion matrix')
# Save before show(), while the figure is still the current one
plt.savefig('2.xgb_cfl_confusion_matrix.png')
plt.show()

show_metrics()

# ROC curve (plot_roc reads the notebook-level `fpr` / `tpr`)
fpr, tpr, t = roc_curve(y_test, y_score)
plot_roc()

# Precision-recall curve (plot_precision_recall reads `precision` / `recall`)
precision, recall, thresholds = precision_recall_curve(y_test, y_score)
plot_precision_recall()

# Keep this model's curves under dedicated names for the comparison plots of section 8
fpr_xgb, tpr_xgb, t_xgb = fpr, tpr, t
precision_xgb, recall_xgb, thresholds_xgb = precision, recall, thresholds

In [None]:
# Bar chart of the fitted XGBoost model's feature importances
plot_feature_importance(xgb_cfl)

# <a id='7'>7. Random Forest (RF)</a>

## <a id='7.1'>7.1. RF - Without hyperparameters tuning</a>

In [None]:
# Random forest classifier with default hyperparameters and a fixed seed
rf_cfl = RandomForestClassifier(n_jobs = -1,
                                random_state = 42)

rf_cfl.fit(X_train, y_train)
y_pred = rf_cfl.predict(X_test)
# Second column of predict_proba, i.e. the score of class 1, used for the curves
y_score = rf_cfl.predict_proba(X_test)[:,1]

# Confusion matrix & metrics (show_metrics reads the notebook-level `cm`)
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm, 
                      classes = class_names, 
                      title = 'RF Confusion matrix')
plt.show()

show_metrics()

# ROC curve (plot_roc reads the notebook-level `fpr` / `tpr`)
fpr, tpr, t = roc_curve(y_test, y_score)
plot_roc()

# Precision-recall curve (plot_precision_recall reads `precision` / `recall`)
precision, recall, thresholds = precision_recall_curve(y_test, y_score)
plot_precision_recall()

## <a id='7.2'>7.2. RF - Grid Search CV to optimise F1_score</a>

Grid search CV is really slow, you can add or remove some parameters  : 

    from sklearn.model_selection import GridSearchCV

    param_grid = {
                'n_estimators': [100, 200, 500],
                'max_features': [2, 3],
                'min_samples_leaf': [1, 2, 4],
                'min_samples_split': [2, 5, 10]
                }

    CV_rnd_cfl = GridSearchCV(estimator = rf_cfl, param_grid = param_grid, scoring = 'f1', verbose = 10, n_jobs = -1)
    CV_rnd_cfl.fit(X_train, y_train)

    best_parameters = CV_rnd_cfl.best_params_
    print("The best parameters for using this model is", best_parameters)

![](https://image.noelshack.com/fichiers/2018/20/5/1526651240-gridcv-rnd.png)

## <a id='7.3'>7.3. RF - With best hyperparameters</a>

In [None]:
# Random forest classifier with hard-coded hyperparameters
# (presumably those found by the grid search described above — confirm)
rf_cfl = RandomForestClassifier(n_estimators = 200, 
                                 max_features = 3, 
                                 min_samples_leaf = 1, 
                                 min_samples_split = 2, 
                                 n_jobs = -1,
                                random_state = 42)

rf_cfl.fit(X_train, y_train)
y_pred = rf_cfl.predict(X_test)
# Second column of predict_proba, i.e. the score of class 1, used for the curves
y_score = rf_cfl.predict_proba(X_test)[:,1]

# Confusion matrix & metrics (show_metrics reads the notebook-level `cm`)
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm, 
                      classes = class_names, 
                      title = 'RF Confusion matrix')
# Save before show(), while the figure is still the current one
plt.savefig('3.rf_cfl_confusion_matrix.png')
plt.show()

show_metrics()

# ROC curve (plot_roc reads the notebook-level `fpr` / `tpr`)
fpr, tpr, t = roc_curve(y_test, y_score)
plot_roc()

# Precision-recall curve (plot_precision_recall reads `precision` / `recall`)
precision, recall, thresholds = precision_recall_curve(y_test, y_score)
plot_precision_recall()

# Keep this model's curves under dedicated names for the comparison plots of section 8
fpr_rf, tpr_rf, t_rf = fpr, tpr, t
precision_rf, recall_rf, thresholds_rf = precision, recall, thresholds

In [None]:
# Bar chart of the fitted random forest's feature importances
plot_feature_importance(rf_cfl)

# <a id='8'>8. VotingClassifier = LOG - XGB - RND (F1 = 0.881)</a>

## <a id='8.1'>8.1. VotingClassifier</a>

In [None]:
# Voting classifier: soft voting over the three models built above,
# with weights 1 (xgb), 1 (logistic regression) and 1.33 (random forest)
voting_cfl = VotingClassifier (
        estimators = [('xgb', xgb_cfl), ('lt', log_cfl), ('rf', rf_cfl)],
                     voting='soft', weights = [1, 1, 1.33])

voting_cfl.fit(X_train,y_train)

y_pred = voting_cfl.predict(X_test)
# Second column of predict_proba, i.e. the score of class 1, used for the curves
y_score = voting_cfl.predict_proba(X_test)[:,1]

# Confusion matrix & metrics (show_metrics reads the notebook-level `cm`)
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm, 
                      classes = class_names, 
                      title = 'VOTING Confusion matrix')
# Save before show(), while the figure is still the current one
plt.savefig('1.voting_confusion_matrix.png')
plt.show()

show_metrics()

# ROC curve (plot_roc reads the notebook-level `fpr` / `tpr`)
fpr, tpr, t = roc_curve(y_test, y_score)
plot_roc()

# Precision-recall curve (plot_precision_recall reads `precision` / `recall`)
precision, recall, thresholds = precision_recall_curve(y_test, y_score)
plot_precision_recall()

# Keep this model's curves under dedicated names for the comparison plots below
fpr_voting, tpr_voting, t_voting = fpr, tpr, t
precision_voting, recall_voting, thresholds_voting = precision, recall, thresholds

## <a id='8.2'>8.2. Precision - Recall – Threshold Curve</a>

In [None]:
# Hard-coded reference point drawn as guide lines by the plots below:
# `pr` and `rec` as horizontal/vertical markers, `t` as the decision threshold.
# NOTE(review): presumably copied from the voting classifier's printed metrics — confirm.
# `t` previously held the roc_curve thresholds; it is deliberately reset to a scalar here.
pr = 0.937
rec = 0.831
t = 0.5

In [None]:
# Precision-recall-threshold curve : 
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall as functions of the decision threshold.

    Parameters
    ----------
    precisions, recalls : sequences
        Score series; the last element of each is dropped so that they line
        up with ``thresholds``.
    thresholds : sequence
        Decision thresholds, used as x values.

    Guide lines come from the notebook-level ``t`` (vertical) and ``pr`` /
    ``rec`` (horizontal). The figure is saved to '5.prec_recc_threshold.png'
    and then shown.
    """
    plt.figure(figsize=(16, 12))
    plt.title('Precision and Recall Scores (decision threshold)')

    for scores, line_format, name in (
            (precisions, 'b-', 'Precision'),
            (recalls, 'g', 'Recall')):
        plt.plot(thresholds, scores[:-1], line_format, linewidth=2, label=name)

    plt.axvline(t, color='k', linestyle='--', label='Threshold')
    for level, colour in ((pr, 'blue'), (rec, 'green')):
        plt.axhline(level, color=colour, linewidth=2, linestyle='--')

    plt.ylabel('Score')
    plt.xlabel('Decision Threshold')
    plt.legend(loc='best')
    plt.savefig('5.prec_recc_threshold.png')
    plt.show()

In [None]:
# Uses the current `precision`, `recall` and `thresholds`, i.e. those of the voting classifier
plot_precision_recall_vs_threshold(precision, recall, thresholds)



cf. part 9.3 : select the best threshold

## <a id='8.3'>8.3. Compare ROC Curves (all models)</a>

In [None]:
def roc_curve_all_models () :
    """Overlay the ROC curves of the four models on a single figure.

    Reads the notebook-level ``fpr_*`` / ``tpr_*`` arrays of the random
    forest, XGBoost, logistic regression and voting models (drawn in that
    order, which is also the legend order). The x axis is restricted to
    [0, 0.001]. The figure is saved to '6.roc.png' and then shown.
    """
    plt.figure(figsize=(16, 12))

    curves = (
        (fpr_rf, tpr_rf),
        (fpr_xgb, tpr_xgb),
        (fpr_log, tpr_log),
        (fpr_voting, tpr_voting),
    )
    for fpr_values, tpr_values in curves:
        plt.plot(fpr_values, tpr_values, label='ROC curve', linewidth=2)

    diagonal = [0, 1]
    plt.plot(diagonal, diagonal, 'k--', linewidth=2)
    plt.xlim([0.0, 0.001])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC')
    # The explicit list overrides the per-line labels given above
    plt.legend(['Rnd', 'Xgb', 'Log', 'Voting'], loc='upper left')
    plt.savefig('6.roc.png')
    plt.show()

In [None]:
# Draw the ROC comparison of the four models
roc_curve_all_models ()

## <a id='8.4'>8.4. Compare Precision - Recall Curves (all models)</a>

In [None]:
def prec_recall_all_models () :
    """Overlay the precision-recall curves of the four models on one figure.

    Reads the notebook-level ``recall_*`` / ``precision_*`` arrays of the
    random forest, XGBoost, logistic regression and voting models (drawn in
    that order, which is also the legend order). The reference point given by
    the notebook-level ``rec`` / ``pr`` is marked in red, and the title is
    built from those same values so it cannot drift away from the markers.
    The figure is saved to '7.prec_recc.png' and then shown.
    """
    plt.figure(figsize=(16, 12))
    plt.plot(recall_rf, precision_rf, linewidth = 2)
    plt.plot(recall_xgb, precision_xgb, linewidth = 2)
    plt.plot(recall_log, precision_log, linewidth = 2)
    plt.plot(recall_voting, precision_voting, linewidth = 2)

    # Reference point and its projections on both axes
    plt.scatter(rec, pr, linewidth = 2, color = 'red')
    plt.axvline(rec, color = 'red', linewidth = 1, linestyle='--')
    plt.axhline(pr, color = 'red', linewidth = 1, linestyle='--')

    plt.xlim([0.0,1])
    plt.ylim([0.0,1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')

    # F1 of the reference point (the previous literal read "881" instead of 0.881)
    f1_reference = 2 * pr * rec / (pr + rec) if (pr + rec) else 0.0
    plt.title('Precision Recall Curve - PR = {:.3f} - REC = {:.3f} - F1 = {:.3f}'
              .format(pr, rec, f1_reference))

    plt.legend(['Rnd', 'Xgb', 'Log', 'Voting'], loc='upper right')
    plt.savefig('7.prec_recc.png')
    plt.show()

In [None]:
# Draw the precision-recall comparison of the four models
prec_recall_all_models () 

# <a id='9'>9. VotingClassifier : Validation (F1 = 0.884)</a>

dataset = valid_data 

## <a id='9.1'>9.1. Normalize Amount, drop useless variables and define (X, y)</a>    

In [None]:
# Normalize amount in valid_data 'normAmount'
from sklearn.preprocessing import StandardScaler
# The scaler is fitted on the TRAINING amounts only and then applied to the
# validation amounts, so validation data goes through exactly the same scaling
# as the data the models were trained on. Refitting on valid_data would scale
# it with a different mean and standard deviation.
# `train_set` still holds the raw 'Amount' column: the earlier drop created a new frame.
amount_scaler = StandardScaler().fit(train_set['Amount'].values.reshape(-1, 1))
valid_data['normAmount'] = amount_scaler.transform(valid_data['Amount'].values.reshape(-1, 1))

In [None]:
# Drop time and amount in valid_data, mirroring the preparation of the training data
valid_data = valid_data.drop(['Amount','Time'],axis=1)

In [None]:
# Define X & y for the validation set.
# This overwrites the notebook-level X and y built from the training data.
y = np.array(valid_data.Class.tolist())
# `axis=1` by keyword: recent pandas rejects the positional form
valid_data = valid_data.drop('Class', axis=1)
# `.values` replaces `as_matrix()`, which no longer exists in current pandas
X = np.array(valid_data.values)

## <a id='9.2'>9.2. VotingClassifier on valid_data (trained & tested in 8.1.)</a>

In [None]:
# Apply voting_classifier, previously trained, on valid_data (no refit here)
y_pred = voting_cfl.predict(X)
# Second column of predict_proba, i.e. the score of class 1, used for the curves
y_score = voting_cfl.predict_proba(X)[:,1]

In [None]:
# Metrics valid_data
# Confusion matrix (show_metrics reads the notebook-level `cm`)
cm = confusion_matrix(y, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm, 
                      classes = class_names, 
                      title = 'VOTING valid Confusion matrix')
# Save before show(), while the figure is still the current one
plt.savefig('8.votingvf_cfl_confusion_matrix.png')
plt.show()

show_metrics()

# ROC curve (plot_roc reads `fpr` / `tpr`); note that `t` is overwritten with the roc_curve thresholds
fpr, tpr, t = roc_curve(y, y_score)
plot_roc()

# Precision-recall curve (plot_precision_recall reads `precision` / `recall`)
precision, recall, thresholds = precision_recall_curve(y, y_score)
plot_precision_recall()

## <a id='9.3'>9.3. Threshold selection</a>

![](https://image.noelshack.com/fichiers/2018/21/1/1526914109-ok-thresh.png)
source : http://www.prolekare.cz/en/journal-of-czech-physicians-article/roc-analysis-and-the-use-of-cost-benefit-analysis-for-determination-of-the-optimal-cut-point-5403?confirm_rules=1

In [None]:
# Candidate decision thresholds, one confusion matrix per threshold on a 3x3 grid
thresholds_adj = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

plt.figure(figsize = (15,15))

# The class-1 probabilities do not depend on the threshold: compute them once
# instead of calling predict_proba at every iteration.
y_proba = voting_cfl.predict_proba(X)[:,1]

class_names = [0,1]
for j, i in enumerate(thresholds_adj, start = 1):
    # Predict class 1 whenever the probability exceeds the threshold i
    y_score = y_proba > i

    plt.subplot(3,3,j)

    cm = confusion_matrix(y, y_score)

    tp = cm[1,1]
    fn = cm[1,0]
    fp = cm[0,1]

    # F1 = 2PR / (P + R), rewritten on raw counts; reported as 0 rather than
    # nan when the threshold leaves no true positive
    f1_denominator = 2*tp + fp + fn
    f1 = 2*tp / f1_denominator if f1_denominator else 0.0
    print('F1_score w/ threshold = %s :'%i, f1)

    plot_confusion_matrix(cm, 
                          classes=class_names, 
                          title='Threshold = %s'%i) 

plt.savefig('9.confusion_matrix_thresold_select.png')

The best compromise is the threshold = 0.5 (standard)

**Thank you all ! Merci à tous ! :)**
=====================================
----------