# PROJET SISE 2020-2021 BIG DATA MINING

## Classification dans un contexte déséquilibré - une application à la fraude bancaire

##### Equipe projet : Mathias Da Costa Meira, Fabrice Petitfrere

***

## PARTIE TESTS FABRICE

In [1]:
import pandas as pd

In [2]:
X_train = pd.read_csv('X_train.csv')
X_train_s1 = pd.read_csv('X_train_s1.csv')
X_train_s2 = pd.read_csv('X_train_s2.csv')
X_train_s3 = pd.read_csv('X_train_s3.csv')

In [3]:
y_train = pd.read_csv('y_train.csv')
y_train_s1 = pd.read_csv('y_train_s1.csv')
y_train_s2 = pd.read_csv('y_train_s2.csv')
y_train_s3 = pd.read_csv('y_train_s3.csv')

In [4]:
X_test = pd.read_csv('X_test.csv')
y_test = pd.read_csv('y_test.csv')

In [5]:
X_train.shape

(241588, 15)

In [6]:
X_train_s1.shape

(267570, 15)

In [7]:
X_train_s2.shape

(126740, 15)

In [8]:
X_train_s3.shape

(206410, 15)

### Arbre de décision simple

#### Sans resampling

In [12]:
import numpy as np
from time import time
from sklearn.model_selection import GridSearchCV

In [13]:
#Hyperparamètres 
params = {"max_depth": [3,6,9,12, None], 
            "min_samples_leaf": np.arange(1,9,1), 
            "criterion": ["gini", "entropy"]}

In [14]:
# Modélisation
from time import time
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix, roc_auc_score, make_scorer

In [15]:
# Sans resampling
ct = DecisionTreeClassifier()
cv = 5
start = time()
classTree = GridSearchCV(ct, params, cv=cv, n_jobs=-1, scoring=make_scorer(f1_score), verbose=3)
classTree.fit(X_train,y_train)
# Prédiction
y_pred_DTC = classTree.best_estimator_.predict(X_test)
done = time() 
tps = round(done - start,1)
# Métriques sur l'échantillon de validation
cm = confusion_matrix(y_test,y_pred_DTC)
f1 = f1_score(y_test,y_pred_DTC)
auc = roc_auc_score(y_test,y_pred_DTC)

print("Train set size : "+str(X_train.shape[0]))
print("Test set size : "+str(X_test.shape[0]))
print("Best estimator : "+str(classTree.best_params_))
print(cm)
print("F1-score : {0:.3f}".format(f1))
print("ROC AUC score : {0:.3f}".format(auc))
print("Elapsed time : "+str(tps))

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:  5.7min finished


Train set size : 241588
Test set size : 264143
Best estimator : {'criterion': 'entropy', 'max_depth': 12, 'min_samples_leaf': 3}
[[263073    215]
 [   829     26]]
F1-score : 0.047
ROC AUC score : 0.515
Elapsed time : 343.9


#### Sans resampling mais avec class_weight='balanced'

In [16]:
# Modélisation
from time import time
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix, roc_auc_score, make_scorer

In [18]:
# Sans resampling
ct = DecisionTreeClassifier(class_weight='balanced')
cv = 5
start = time()
classTree = GridSearchCV(ct, params, cv=cv, n_jobs=-1, scoring=make_scorer(f1_score), verbose=3)
classTree.fit(X_train,y_train)
# Prédiction
y_pred_DTC = classTree.best_estimator_.predict(X_test)
done = time() 
tps = round(done - start,1)
# Métriques sur l'échantillon de validation
cm = confusion_matrix(y_test,y_pred_DTC)
f1 = f1_score(y_test,y_pred_DTC)
auc = roc_auc_score(y_test,y_pred_DTC)

print("Train set size : "+str(X_train.shape[0]))
print("Test set size : "+str(X_test.shape[0]))
print("Best estimator : "+str(classTree.best_params_))
print(cm)
print("F1-score : {0:.3f}".format(f1))
print("ROC AUC score : {0:.3f}".format(auc))
print("Elapsed time : "+str(tps))

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   13.3s
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:  5.7min finished


Train set size : 241588
Test set size : 264143
Best estimator : {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 4}
[[261352   1936]
 [   790     65]]
F1-score : 0.046
ROC AUC score : 0.534
Elapsed time : 341.8


#### Avec oversampling SMOTE 10/90 (1)

In [13]:
# Avec oversampling
ct_s1 = DecisionTreeClassifier()
cv = [(slice(None), slice(None))]
start = time()
classTree_s1 = GridSearchCV(ct_s1, params, cv=cv, n_jobs=-1, scoring=make_scorer(f1_score))
classTree_s1.fit(X_train_s1,y_train_s1)
# Prédiction
y_pred_DTC_s1 = classTree_s1.predict(X_test)
done = time() 
tps = round(done - start,3)
# Métriques sur l'échantillon de validation s1
cm_s1 = confusion_matrix(y_test,y_pred_DTC_s1)
f1_s1 = f1_score(y_test,y_pred_DTC_s1)
auc_s1 = roc_auc_score(y_test,y_pred_DTC_s1)

print("Train set size : "+str(X_train_s1.shape[0]))
print("Test set size : "+str(X_test.shape[0]))
print("Best estimator : "+str(classTree_s1.best_params_))
print(cm_s1)
print("F1-score : {0:.3f}".format(f1_s1))
print("ROC AUC score : {0:.3f}".format(auc_s1))
print("Elapsed time : "+str(tps))

Train set size : 8860
Test set size : 2000
Best estimator : {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1}
[[1962   22]
 [  13    3]]
F1-score : 0.146
ROC AUC score : 0.588
Elapsed time : 6.18


#### Avec oversampling SMOTE 5/95 + undersampling aléatoire 1:9 (2)

In [14]:
# Avec oversampling + undersampling
ct_s2 = DecisionTreeClassifier()
cv = [(slice(None), slice(None))]
start = time()
classTree_s2 = GridSearchCV(ct_s2, params, cv=cv, n_jobs=-1, scoring=make_scorer(f1_score))
classTree_s2.fit(X_train_s2,y_train_s2)
# Prédiction
y_pred_DTC_s2 = classTree_s2.predict(X_test)
done = time() 
tps = round(done - start,3)
# Métriques sur l'échantillon de validation s2
cm_s2 = confusion_matrix(y_test,y_pred_DTC_s2)
f1_s2 = f1_score(y_test,y_pred_DTC_s2)
auc_s2 = roc_auc_score(y_test,y_pred_DTC_s2)

print("Train set size : "+str(X_train_s2.shape[0]))
print("Test set size : "+str(X_test.shape[0]))
print("Best estimator : "+str(classTree_s2.best_params_))
print(cm_s2)
print("F1-score : {0:.3f}".format(f1_s2))
print("ROC AUC score : {0:.3f}".format(auc_s2))
print("Elapsed time : "+str(tps))

Train set size : 4190
Test set size : 2000
Best estimator : {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1}
[[1951   33]
 [  16    0]]
F1-score : 0.000
ROC AUC score : 0.492
Elapsed time : 3.246


#### Avec oversampling SMOTE 30/70 + undersampling aléatoire 1:1 (3)

In [15]:
# Avec oversampling + undersampling
ct_s3 = DecisionTreeClassifier()
cv = [(slice(None), slice(None))]
start = time()
classTree_s3 = GridSearchCV(ct_s3, params, cv=cv, n_jobs=-1, scoring=make_scorer(f1_score))
classTree_s3.fit(X_train_s3,y_train_s3)
# Prédiction
y_pred_DTC_s3 = classTree_s3.predict(X_test)
done = time() 
tps = round(done - start,3)
# Métriques sur l'échantillon de validation s2
cm_s3 = confusion_matrix(y_test,y_pred_DTC_s3)
f1_s3 = f1_score(y_test,y_pred_DTC_s3)
auc_s3 = roc_auc_score(y_test,y_pred_DTC_s3)

print("Train set size : "+str(X_train_s3.shape[0]))
print("Test set size : "+str(X_test.shape[0]))
print("Best estimator : "+str(classTree_s3.best_params_))
print(cm_s3)
print("F1-score : {0:.3f}".format(f1_s3))
print("ROC AUC score : {0:.3f}".format(auc_s3))
print("Elapsed time : "+str(tps))

Train set size : 6834
Test set size : 2000
Best estimator : {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1}
[[1934   50]
 [  12    4]]
F1-score : 0.114
ROC AUC score : 0.612
Elapsed time : 5.874


La deuxième stratégie de resampling (Avec oversampling SMOTE 5/95 + undersampling aléatoire 1:9) semble être la plus intéressante. Nous testerons celle là dans la suite.  
Vraiment ?

### Bagging

In [55]:
#hyperparamètres
params_rf = {'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 300, 400, 500]}


params_rf_simple = {'max_features': ['sqrt', 0.25, 0.5, 0.75, 1.0],
    'n_estimators': [200, 300, 400, 500]}

params_rf_xxs = {'n_estimators': [5,300], 'max_features': ['sqrt', 0.25]}

#### Sans resampling

In [19]:
#from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier

In [19]:
# Sans resampling
rfc = RandomForestClassifier()
# cv = [(slice(None), slice(None))]
start = time()
random_forest = GridSearchCV(rfc, params_rf_simple, cv=5, n_jobs=-1, scoring=make_scorer(f1_score), verbose=3)
random_forest.fit(X_train,y_train)
# Prédiction
y_pred_bagging = random_forest.best_estimator_.predict(X_test)
done = time() 
tps = round(done - start,3)
# Métriques sur l'échantillon de validation
from sklearn.metrics import f1_score, confusion_matrix, roc_auc_score
cm = confusion_matrix(y_test,y_pred_bagging)
f1 = f1_score(y_test,y_pred_bagging)
auc = roc_auc_score(y_test,y_pred_bagging)

print("Train set size : "+str(X_train.shape[0]))
print("Test set size : "+str(X_test.shape[0]))
print("Best estimator : "+str(random_forest.best_params_))
print(cm)
print("F1-score : {0:.3f}".format(f1))
print("ROC AUC score : {0:.3f}".format(auc))
print("Elapsed time : "+str(tps))

  self.best_estimator_.fit(X, y, **fit_params)


Train set size : 8000
Test set size : 2000
Best estimator : {'max_features': 0.5, 'n_estimators': 200}
[[1984    0]
 [  16    0]]
F1-score : 0.000
ROC AUC score : 0.500
Elapsed time : 368.103


#### Test avec des arbres plus petits et sans gridsearchcv

In [26]:
# Sans resampling
rfc = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=56, class_weight='balanced')
# cv = [(slice(None), slice(None))]
start = time()
#random_forest = GridSearchCV(rfc, params_rf_simple, cv=5, n_jobs=-1, scoring=make_scorer(f1_score), verbose=3)
random_forest = rfc
random_forest.fit(X_train,y_train)
# Prédiction
#y_pred_bagging = random_forest.best_estimator_.predict(X_test)
y_pred_bagging = random_forest.predict(X_test)
done = time() 
tps = round(done - start,3)
# Métriques sur l'échantillon de validation
from sklearn.metrics import f1_score, confusion_matrix, roc_auc_score
cm = confusion_matrix(y_test,y_pred_bagging)
f1 = f1_score(y_test,y_pred_bagging)
auc = roc_auc_score(y_test,y_pred_bagging)

print("Train set size : "+str(X_train.shape[0]))
print("Test set size : "+str(X_test.shape[0]))
#print("Best estimator : "+str(random_forest.best_params_))
print(cm)
print("F1-score : {0:.3f}".format(f1))
print("ROC AUC score : {0:.3f}".format(auc))
print("Elapsed time : "+str(tps))

  import sys


Train set size : 241588
Test set size : 264143
[[249503  13785]
 [   542    313]]
F1-score : 0.042
ROC AUC score : 0.657
Elapsed time : 31.148


#### Avec oversampling SMOTE 5/95 + undersampling aléatoire 1:9 (2)

In [56]:
from sklearn.ensemble import RandomForestClassifier

# Avec oversampling + undersampling
rf_s2 = RandomForestClassifier()
# cv = [(slice(None), slice(None))]
start = time()
random_forest_s2 = GridSearchCV(rf_s2, params_rf_simple, scoring=make_scorer(f1_score), cv=5, n_jobs=-1)
random_forest_s2.fit(X_train_s2,y_train_s2)
# Prédiction
y_pred_bagging_s2 = random_forest_s2.best_estimator_.predict(X_test)
done = time() 
tps = round(done - start,3)
# Métriques sur l'échantillon de validation s2
cm_s2 = confusion_matrix(y_test,y_pred_bagging_s2)
f1_s2 = f1_score(y_test,y_pred_bagging_s2)
auc_s2 = roc_auc_score(y_test,y_pred_bagging_s2)

print("Train set size : "+str(X_train_s2.shape[0]))
print("Test set size : "+str(X_test.shape[0]))
print("Best estimator : "+str(random_forest_s2.best_params_))
print(cm_s2)
print("F1-score : {0:.3f}".format(f1_s2))
print("ROC AUC score : {0:.3f}".format(auc_s2))
print("Elapsed time : "+str(tps))

  self.best_estimator_.fit(X, y, **fit_params)


Train set size : 126740
Test set size : 264143
Best estimator : {'max_features': 0.5, 'n_estimators': 400}
[[262910    378]
 [   797     58]]
F1-score : 0.090
ROC AUC score : 0.533
Elapsed time : 11545.88


#### Avec oversampling SMOTE 30/70 + undersampling aléatoire 1:1 (3)

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Avec oversampling + undersampling
rf_s2 = RandomForestClassifier()
# cv = [(slice(None), slice(None))]
start = time()
random_forest_s3 = GridSearchCV(rf_s2, params_rf_simple, scoring=make_scorer(f1_score), cv=5, n_jobs=-1)
random_forest_s3.fit(X_train_s3,y_train_s3)
# Prédiction
y_pred_bagging_s3 = random_forest_s3.best_estimator_.predict(X_test)
done = time() 
tps = round(done - start,3)
# Métriques sur l'échantillon de validation s2
cm_s3 = confusion_matrix(y_test,y_pred_bagging_s3)
f1_s3 = f1_score(y_test,y_pred_bagging_s3)
auc_s3 = roc_auc_score(y_test,y_pred_bagging_s3)

print("Train set size : "+str(X_train_s3.shape[0]))
print("Test set size : "+str(X_test.shape[0]))
print("Best estimator : "+str(random_forest_s3.best_params_))
print(cm_s3)
print("F1-score : {0:.3f}".format(f1_s3))
print("ROC AUC score : {0:.3f}".format(auc_s3))
print("Elapsed time : "+str(tps))

  self.best_estimator_.fit(X, y, **fit_params)


Train set size : 4190
Test set size : 2000
Best estimator : {'max_features': 0.25, 'n_estimators': 200}
[[1979    5]
 [  14    2]]
F1-score : 0.174
ROC AUC score : 0.561
Elapsed time : 156.729


### Gradient Boosting

#### Sans resampling

In [53]:
#Hyperparamètres 
params_gb_simple={'n_estimators':[16,32,64,100,200], 'learning_rate':[0.25,0.1,0.05,0.025], 
            'max_depth':[1,2,4,8], 'subsample': [0.5,0.9,1], 'max_features':[0.5,0.75]} 

params_gb_more = {
    "loss":["deviance"],
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "min_samples_split": np.linspace(0.1, 0.5, 12),
    "min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth":[3,5,8],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse",  "mae"],
    "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "n_estimators":[10]
    }

params_gb_xxs = {'n_estimators': [200,300], 'max_depth':[8], 'learning_rate':[0.25]}

In [22]:
from sklearn.ensemble import GradientBoostingClassifier

# Sans resampling
gb = GradientBoostingClassifier()
cv = [(slice(None), slice(None))]
start = time()
boosting = GridSearchCV(gb, params_gb_simple, cv=cv, n_jobs=-1, scoring=make_scorer(f1_score))
boosting.fit(X_train,y_train)
# Prédiction
y_pred_boosting = boosting.best_estimator_.predict(X_test)
done = time() 
tps = round(done - start,3)
# Métriques sur l'échantillon de validation
from sklearn.metrics import f1_score, confusion_matrix, roc_auc_score
cm = confusion_matrix(y_test,y_pred_boosting)
f1 = f1_score(y_test,y_pred_boosting)
auc = roc_auc_score(y_test,y_pred_boosting)

print("Train set size : "+str(X_train.shape[0]))
print("Test set size : "+str(X_test.shape[0]))
print("Best estimator : "+str(boosting.best_params_))
print(cm)
print("F1-score : {0:.3f}".format(f1))
print("ROC AUC score : {0:.3f}".format(auc))
print("Elapsed time : "+str(tps))

  return f(**kwargs)


Train set size : 8000
Test set size : 2000
Best estimator : {'learning_rate': 0.25, 'max_depth': 4, 'max_features': 0.5, 'n_estimators': 32, 'subsample': 1}
[[1977    7]
 [  16    0]]
F1-score : 0.000
ROC AUC score : 0.498
Elapsed time : 301.534


#### Avec oversampling SMOTE 5/95 + undersampling aléatoire 1:9 (2)

In [23]:
# Avec oversampling + undersampling
gb_s2 = GradientBoostingClassifier()
cv = [(slice(None), slice(None))]
start = time()
boosting_s2 = GridSearchCV(gb_s2, params_gb_simple, cv=cv, n_jobs=-1, scoring=make_scorer(f1_score))
boosting_s2.fit(X_train_s2,y_train_s2)
# Prédiction
y_pred_boosting_s2 = boosting_s2.best_estimator_.predict(X_test)
done = time() 
tps = round(done - start,3)
# Métriques sur l'échantillon de validation s2
cm_s2 = confusion_matrix(y_test,y_pred_boosting_s2)
f1_s2 = f1_score(y_test,y_pred_boosting_s2)
auc_s2 = roc_auc_score(y_test,y_pred_boosting_s2)

print("Train set size : "+str(X_train_s2.shape[0]))
print("Test set size : "+str(X_test.shape[0]))
print("Best estimator : "+str(boosting_s2.best_params_))
print(cm_s2)
print("F1-score : {0:.3f}".format(f1_s2))
print("ROC AUC score : {0:.3f}".format(auc_s2))
print("Elapsed time : "+str(tps))

  return f(**kwargs)


Train set size : 4190
Test set size : 2000
Best estimator : {'learning_rate': 0.25, 'max_depth': 4, 'max_features': 0.5, 'n_estimators': 64, 'subsample': 0.9}
[[1976    8]
 [  14    2]]
F1-score : 0.154
ROC AUC score : 0.560
Elapsed time : 182.263


#### Avec oversampling SMOTE 30/70 + undersampling aléatoire 1:1 (3)

In [23]:
# Avec oversampling + undersampling
gb_s3 = GradientBoostingClassifier()
cv = [(slice(None), slice(None))]
start = time()
boosting_s3 = GridSearchCV(gb_s3, params_gb_simple, cv=cv, n_jobs=-1, scoring=make_scorer(f1_score))
boosting_s3.fit(X_train_s3,y_train_s3)
# Prédiction
y_pred_boosting_s3 = boosting_s3.best_estimator_.predict(X_test)
done = time() 
tps = round(done - start,3)
# Métriques sur l'échantillon de validation s2
cm_s3 = confusion_matrix(y_test,y_pred_boosting_s3)
f1_s3 = f1_score(y_test,y_pred_boosting_s3)
auc_s3 = roc_auc_score(y_test,y_pred_boosting_s3)

print("Train set size : "+str(X_train_s3.shape[0]))
print("Test set size : "+str(X_test.shape[0]))
print("Best estimator : "+str(boosting_s3.best_params_))
print(cm_s3)
print("F1-score : {0:.3f}".format(f1_s3))
print("ROC AUC score : {0:.3f}".format(auc_s3))
print("Elapsed time : "+str(tps))

  return f(**kwargs)


Train set size : 4190
Test set size : 2000
Best estimator : {'learning_rate': 0.25, 'max_depth': 4, 'max_features': 0.5, 'n_estimators': 64, 'subsample': 0.9}
[[1976    8]
 [  14    2]]
F1-score : 0.154
ROC AUC score : 0.560
Elapsed time : 182.263


### Logistic Regression

#### Scaling

In [27]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train, y_train)
X_train_s1_scaled = scaler.fit_transform(X_train_s1, y_train)
X_train_s2_scaled = scaler.fit_transform(X_train_s2, y_train)
X_train_s3_scaled = scaler.fit_transform(X_train_s3, y_train)

#### Avec oversampling SMOTE 30/70 + undersampling aléatoire 1:1 (3)

In [28]:
#Hyperparamètres 
params_lr_small={'class_weight':[None,'balanced']} 
params_lr_large = {'class_weight':[None,'balanced'], 'penalty':['l2', None], 'C':[0.001,0.01,0.1,1,10]}

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix, roc_auc_score, make_scorer
from time import time

# Sans resampling
model = LogisticRegression()
cv = 5
start = time()
model = GridSearchCV(model, params_lr_large, cv=cv, n_jobs=-1, scoring=make_scorer(f1_score), verbose=3)
model.fit(X_train_scaled,y_train)

# Prédiction
y_pred = model.best_estimator_.predict(X_test)
#y_pred = model.predict(X_test)
done = time() 
tps = round(done - start,3)
#lr_best_params = model.best_params_

# Métriques sur l'échantillon de validation
from sklearn.metrics import f1_score, confusion_matrix, roc_auc_score
cm = confusion_matrix(y_test,y_pred)
f1 = f1_score(y_test,y_pred)
auc = roc_auc_score(y_test,y_pred)

print("Train set size : "+str(X_train.shape[0]))
print("Test set size : "+str(X_test.shape[0]))
print("Best estimator : "+str(model.best_params_))
print(cm)
print("F1-score : {0:.3f}".format(f1))
print("ROC AUC score : {0:.3f}".format(auc))
print("Elapsed time : "+str(tps))

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   31.0s finished
  return f(**kwargs)


Train set size : 241588
Test set size : 264143
Best estimator : {'C': 0.001, 'class_weight': 'balanced', 'penalty': 'l2'}
[[179656  83632]
 [   320    535]]
F1-score : 0.013
ROC AUC score : 0.654
Elapsed time : 32.118


### KNN

In [14]:
#Hyperparamètres 
params_knn_small={'n_neighbors':[5,9,13,25]} 
params_knn_large = {}

#### Avec oversampling SMOTE 30/70 + undersampling aléatoire 1:1 (3)

In [75]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier()
cv = [(slice(None), slice(None))]
start = time()
#model = GridSearchCV(model, params_knn_small, cv=cv, n_jobs=-1, scoring=make_scorer(f1_score), verbose=3)
model.fit(X_train_s3_scaled,y_train_s3)

# Prédiction
#y_pred = model.best_estimator_.predict(X_test)
y_pred = model.predict(X_test)
done = time() 
tps = round(done - start,3)
#knn_best_params = model.best_params_

# Métriques sur l'échantillon de validation
from sklearn.metrics import f1_score, confusion_matrix, roc_auc_score
cm = confusion_matrix(y_test,y_pred)
f1 = f1_score(y_test,y_pred)
auc = roc_auc_score(y_test,y_pred)

print("Train set size : "+str(X_train.shape[0]))
print("Test set size : "+str(X_test.shape[0]))
#print("Best estimator : "+str(model.best_params_))
print(cm)
print("F1-score : {0:.3f}".format(f1))
print("ROC AUC score : {0:.3f}".format(auc))
print("Elapsed time : "+str(tps))

  import sys


Train set size : 965086
Test set size : 264143
[[261923   1365]
 [   854      1]]
F1-score : 0.001
ROC AUC score : 0.498
Elapsed time : 513.323


### VOTING CLASSIFIER

In [21]:
from sklearn.ensemble import VotingClassifier
from time import time
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

start = time()

clf1 = LogisticRegression(C=10, class_weight=None, penalty='l2',random_state=1)
clf2 = KNeighborsClassifier(n_neighbors=5)
clf3 = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_leaf=1,random_state=1)

#ensemble_model = VotingClassifier(estimators=[('lr', clf1),('knn', clf2), ('dt', clf3)], voting='hard')
ensemble_model = VotingClassifier(estimators=[('lr', clf1), ('knn', clf2),('dt', clf3)], voting='soft', weights=[1,1,2])
ensemble_model.fit(X_train_s1_scaled, y_train_s1)

y_pred = ensemble_model.predict(X_test)
done = time() 
tps = round(done - start,3)

# Métriques sur l'échantillon de validation
from sklearn.metrics import f1_score, confusion_matrix, roc_auc_score
cm = confusion_matrix(y_test,y_pred)
f1 = f1_score(y_test,y_pred)
auc = roc_auc_score(y_test,y_pred)

print("Train set size : "+str(X_train.shape[0]))
print("Test set size : "+str(X_test.shape[0]))
#print("Best estimator : "+str(lr.best_params_))
print(cm)
print("F1-score : {0:.3f}".format(f1))
print("ROC AUC score : {0:.3f}".format(auc))
print("Elapsed time : "+str(tps))

  return f(**kwargs)


Train set size : 241588
Test set size : 264143
[[259476   3812]
 [   836     19]]
F1-score : 0.008
ROC AUC score : 0.504
Elapsed time : 135.638
