# Grid search

### Import modules

In [2]:
import pandas as pd
import numpy as np

import product_sub.settings as stg
from product_sub.infrastructure.eco_social import EcoSocioContext
from product_sub.infrastructure.bank_campaign import MarketingCampaign
from product_sub.infrastructure.dataset_builder import DatasetBuilder
from product_sub.domain.data_cleaning import CatImputer, NumImputer
from product_sub.domain.feature_creator import CategoricalCreatorFromNumerical, CategoricalFeatureCreator
from product_sub.domain.feature_encoder import FrequencyEncoder, OneHotEncoder

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline, make_union
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

## Pipeline construction

In [3]:
# Seed shared by the cross-validation splitter and the train/test split.
RANDOM_STATE = 89

# Dataset assembled by DatasetBuilder from the two CSV files.
dataset_merged = DatasetBuilder("data.csv", "socio_eco.csv").create_dataset()

# Shuffled, stratified 5-fold splitter reused by every grid search in this notebook.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Target is the subscription column; the features are every other column.
y = dataset_merged[stg.COL_RAW_SUBSCRIPTION].values
X = dataset_merged.drop(columns=stg.COL_RAW_SUBSCRIPTION)

# Hold out 20% of the rows for the final test scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=RANDOM_STATE, test_size=0.2
)

# Applied to every column whose dtype is not "category".
numeric_transformer = Pipeline([
    ("num_imputer", NumImputer()),
    ("create_categorical", CategoricalCreatorFromNumerical(stg.DICT_TO_CREATE_COLS)),
    ("scaler", MinMaxScaler()),
])

# Applied to the columns whose dtype is "category".
categorical_transformer = Pipeline([
    ("cat_imputer", CatImputer()),
    ("cat_creator", CategoricalFeatureCreator()),
    ("freq_encoder", FrequencyEncoder(stg.COLS_TO_FREQ_ENCODE)),
    ("one_hot_encoder", OneHotEncoder([stg.COL_RAW_JOB])),
])

# Routes each column to one of the two pipelines above based on its dtype.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, selector(dtype_exclude="category")),
    ("cat", categorical_transformer, selector(dtype_include="category")),
])

## Logistic Regression

In [7]:
# Logistic regression stacked on the shared preprocessing transformer.
lr = Pipeline([
    ('preprocessor', preprocessor),
    ('lr', LogisticRegression()),
])

# Single-candidate grid: each list holds one value, so the search only
# cross-validates this configuration. Use lr.get_params().keys() to list the
# tunable names and search.best_params_ to read the selected values.
params_grid_lr = dict(
    lr=[LogisticRegression()],
    lr__penalty=['l1'],
    lr__C=[2.6366],
    lr__solver=['saga'],
)

search = GridSearchCV(
    estimator=lr,
    param_grid=params_grid_lr,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=0,
)
search.fit(X_train, y_train)

# Hold-out predictions, reused by the classification report cell.
y_pred = search.predict(X_test)

# best_score_ is the mean cross-validated accuracy; the trailing numbers are
# the values noted by the author, presumably from an earlier run.
print(f'\nLe meilleur score entraînement est : {search.best_score_}\n') # 0.9067
print(f'\nLe meilleur score test est : {search.score(X_test, y_test)}') # 0.9038

In [83]:
# Per-class precision/recall/F1 of the latest hold-out predictions.
print(classification_report(y_test, y_pred, target_names=['No', 'Yes']))

              precision    recall  f1-score   support

          No       0.92      0.98      0.95      7636
         Yes       0.67      0.37      0.48      1030

    accuracy                           0.90      8666
   macro avg       0.80      0.67      0.71      8666
weighted avg       0.89      0.90      0.89      8666



## Support Vector Classification

In [86]:
# Support vector classifier stacked on the shared preprocessing transformer.
svc = Pipeline([
    ('preprocessor', preprocessor),
    ('svm', SVC()),
])

# Single-candidate grid: each list holds one value. Use svc.get_params().keys()
# to list the tunable names and search.best_params_ to read the selected values.
params_grid_svc = dict(
    svm=[SVC()],
    svm__C=[100],
    svm__gamma=[0.1],
    svm__kernel=['rbf'],
)

search = GridSearchCV(
    estimator=svc,
    param_grid=params_grid_svc,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=0,
)
search.fit(X_train, y_train)

# Hold-out predictions, reused by the classification report cell.
y_pred = search.predict(X_test)

# best_score_ is the mean cross-validated accuracy; the trailing numbers are
# the values noted by the author, presumably from an earlier run.
print(f'\nLe meilleur score entraînement est : {search.best_score_}\n') # 0.9033
print(f'\nLe meilleur score test est : {search.score(X_test, y_test)}') # 0.9005

In [85]:
# Per-class precision/recall/F1 of the latest hold-out predictions.
print(classification_report(y_test, y_pred, target_names=['No', 'Yes']))

              precision    recall  f1-score   support

          No       0.92      0.98      0.95      7636
         Yes       0.65      0.35      0.45      1030

    accuracy                           0.90      8666
   macro avg       0.79      0.66      0.70      8666
weighted avg       0.89      0.90      0.89      8666



## Gaussian Naive Bayes

In [89]:
# Gaussian naive Bayes stacked on the shared preprocessing transformer.
gnb = Pipeline([
    ('preprocessor', preprocessor),
    ('gnb', GaussianNB()),
])

# Single-candidate grid: each list holds one value. Use gnb.get_params().keys()
# to list the tunable names and search.best_params_ to read the selected values.
params_grid_gnb = dict(
    gnb=[GaussianNB()],
    gnb__priors=[None],
    gnb__var_smoothing=[1e-08],
)

search = GridSearchCV(
    estimator=gnb,
    param_grid=params_grid_gnb,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=0,
)
search.fit(X_train, y_train)

# Hold-out predictions, reused by the classification report cell.
y_pred = search.predict(X_test)

# best_score_ is the mean cross-validated accuracy; the trailing numbers are
# the values noted by the author, presumably from an earlier run.
print(f'\nLe meilleur score entraînement est : {search.best_score_}\n') # 0.8350
print(f'\nLe meilleur score test est : {search.score(X_test, y_test)}') # 0.8262

In [88]:
# Per-class precision/recall/F1 of the latest hold-out predictions.
print(classification_report(y_test, y_pred, target_names=['No', 'Yes']))

              precision    recall  f1-score   support

          No       0.92      0.88      0.90      7636
         Yes       0.33      0.46      0.39      1030

    accuracy                           0.83      8666
   macro avg       0.63      0.67      0.64      8666
weighted avg       0.85      0.83      0.84      8666



## KNeighbors Classifier

In [92]:
# k-nearest-neighbours classifier stacked on the shared preprocessing transformer.
knc = Pipeline([
    ('preprocessor', preprocessor),
    ('knc', KNeighborsClassifier()),
])

# Single-candidate grid: each list holds one value. Use knc.get_params().keys()
# to list the tunable names and search.best_params_ to read the selected values.
params_grid_knc = dict(
    knc=[KNeighborsClassifier()],
    knc__n_neighbors=[5],
    knc__leaf_size=[30],
    knc__p=[1],
)

search = GridSearchCV(
    estimator=knc,
    param_grid=params_grid_knc,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=0,
)
search.fit(X_train, y_train)

# Hold-out predictions, reused by the classification report cell.
y_pred = search.predict(X_test)

# best_score_ is the mean cross-validated accuracy; the trailing numbers are
# the values noted by the author, presumably from an earlier run.
print(f'\nLe meilleur score entraînement est : {search.best_score_}\n') # 0.8893
print(f'\nLe meilleur score test est : {search.score(X_test, y_test)}') # 0.8922

In [91]:
# Per-class precision/recall/F1 of the latest hold-out predictions.
print(classification_report(y_test, y_pred, target_names=['No', 'Yes']))

              precision    recall  f1-score   support

          No       0.91      0.98      0.94      7636
         Yes       0.60      0.27      0.38      1030

    accuracy                           0.89      8666
   macro avg       0.76      0.63      0.66      8666
weighted avg       0.87      0.89      0.87      8666



## Decision Tree Classifier

In [95]:
# Decision tree stacked on the shared preprocessing transformer.
dtc = Pipeline([
    ('preprocessor', preprocessor),
    ('dtc', DecisionTreeClassifier()),
])

# Single-candidate grid: each list holds one value. Use dtc.get_params().keys()
# to list the tunable names and search.best_params_ to read the selected values.
params_grid_dtc = dict(
    dtc=[DecisionTreeClassifier()],
    dtc__criterion=["gini"],
    dtc__splitter=["best"],
    dtc__max_depth=[8],
    dtc__min_samples_split=[2],
    dtc__min_samples_leaf=[1],
)

search = GridSearchCV(
    estimator=dtc,
    param_grid=params_grid_dtc,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=0,
)
search.fit(X_train, y_train)

# Hold-out predictions, reused by the classification report cell.
y_pred = search.predict(X_test)

# best_score_ is the mean cross-validated accuracy; the trailing numbers are
# the values noted by the author, presumably from an earlier run.
print(f'\nLe meilleur score entraînement est : {search.best_score_}\n') # 0.9047
print(f'\nLe meilleur score test est : {search.score(X_test, y_test)}') # 0.9075

In [94]:
# Per-class precision/recall/F1 of the latest hold-out predictions.
print(classification_report(y_test, y_pred, target_names=['No', 'Yes']))

              precision    recall  f1-score   support

          No       0.94      0.95      0.95      7636
         Yes       0.62      0.56      0.59      1030

    accuracy                           0.91      8666
   macro avg       0.78      0.76      0.77      8666
weighted avg       0.90      0.91      0.91      8666



## RandomForest Classifier

In [98]:
# Random forest stacked on the shared preprocessing transformer.
rf = Pipeline(steps=[('preprocessor', preprocessor),
                     ('rf', RandomForestClassifier())])

# Use rf.get_params().keys() to list the tunable names and
# search.best_params_ to read the selected values.

# Single-candidate grid: each list holds one value.
# max_features uses 'sqrt' rather than 'auto': for classifiers 'auto' was an
# alias of 'sqrt', and scikit-learn deprecated it in 1.1 and removed it in 1.3,
# so 'sqrt' keeps the same behaviour while running on every version.
params_grid_rf = {'rf': [RandomForestClassifier()],
                  'rf__bootstrap': [True],
                  'rf__max_features': ['sqrt'],
                  'rf__n_estimators': [100],
                  'rf__criterion': ["gini"],
                  'rf__max_depth': [None],
                  'rf__min_samples_split': [2],
                  'rf__min_samples_leaf': [1]}

search = GridSearchCV(rf, params_grid_rf, n_jobs=-1, cv=cv, verbose=0, scoring='accuracy')

search.fit(X_train, y_train)

# Hold-out predictions, reused by the classification report cell.
y_pred = search.predict(X_test)

# best_score_ is the mean cross-validated accuracy; the trailing numbers are
# the values noted by the author, presumably from an earlier run.
print(f'\nLe meilleur score entraînement est : {search.best_score_}\n') # 0.9086
print(f'\nLe meilleur score test est : {search.score(X_test, y_test)}') # 0.9111

In [97]:
# Per-class precision/recall/F1 of the latest hold-out predictions.
print(classification_report(y_test, y_pred, target_names=['No', 'Yes']))

              precision    recall  f1-score   support

          No       0.93      0.97      0.95      7636
         Yes       0.69      0.50      0.58      1030

    accuracy                           0.91      8666
   macro avg       0.81      0.73      0.76      8666
weighted avg       0.91      0.91      0.91      8666



## GradientBoosting Classifier

In [101]:
# Gradient boosting stacked on the shared preprocessing transformer.
gbc = Pipeline(steps=[('preprocessor', preprocessor),
                      ('gbc', GradientBoostingClassifier())])

# Use gbc.get_params().keys() to list the tunable names and
# search.best_params_ to read the selected values.

# Single-candidate grid: each list holds one value.
# 'gbc__loss' is intentionally left at its default. The previous value
# 'deviance' was renamed to 'log_loss' in scikit-learn 1.1 and is rejected
# from 1.3 on, whereas the default selects that same loss on every version.
params_grid_gbc = {'gbc': [GradientBoostingClassifier()],
                   'gbc__n_estimators': [100],
                   'gbc__learning_rate': [1],
                   'gbc__max_depth': [1],
                   'gbc__max_features': ['sqrt'],
                   'gbc__random_state': [10]}

search = GridSearchCV(gbc, params_grid_gbc, n_jobs=-1, cv=cv, verbose=0, scoring='accuracy')

search.fit(X_train, y_train)

# Hold-out predictions, reused by the classification report cell.
y_pred = search.predict(X_test)

# best_score_ is the mean cross-validated accuracy; the trailing numbers are
# the values noted by the author, presumably from an earlier run.
print(f'\nLe meilleur score entraînement est : {search.best_score_}\n') # 0.9058
print(f'\nLe meilleur score test est : {search.score(X_test, y_test)}') # 0.9081

In [100]:
# Per-class precision/recall/F1 of the latest hold-out predictions.
print(classification_report(y_test, y_pred, target_names=['No', 'Yes']))

              precision    recall  f1-score   support

          No       0.93      0.97      0.95      7636
         Yes       0.67      0.44      0.53      1030

    accuracy                           0.91      8666
   macro avg       0.80      0.71      0.74      8666
weighted avg       0.90      0.91      0.90      8666



## AdaBoost Classifier

In [104]:
# AdaBoost stacked on the shared preprocessing transformer.
abc = Pipeline([
    ('preprocessor', preprocessor),
    ('abc', AdaBoostClassifier()),
])

# Single-candidate grid: each list holds one value. Use abc.get_params().keys()
# to list the tunable names and search.best_params_ to read the selected values.
params_grid_abc = dict(
    abc=[AdaBoostClassifier()],
    abc__n_estimators=[100],
    abc__learning_rate=[1],
    abc__random_state=[10],
)

search = GridSearchCV(
    estimator=abc,
    param_grid=params_grid_abc,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=0,
)
search.fit(X_train, y_train)

# Hold-out predictions, reused by the classification report cell.
y_pred = search.predict(X_test)

# best_score_ is the mean cross-validated accuracy; the trailing numbers are
# the values noted by the author, presumably from an earlier run.
print(f'\nLe meilleur score entraînement est : {search.best_score_}\n') # 0.9046
print(f'\nLe meilleur score test est : {search.score(X_test, y_test)}') # 0.9072

In [103]:
# Per-class precision/recall/F1 of the latest hold-out predictions.
print(classification_report(y_test, y_pred, target_names=['No', 'Yes']))

              precision    recall  f1-score   support

          No       0.93      0.97      0.95      7636
         Yes       0.68      0.42      0.52      1030

    accuracy                           0.91      8666
   macro avg       0.80      0.69      0.73      8666
weighted avg       0.90      0.91      0.90      8666



## XGBoost Classifier

In [107]:
# XGBoost classifier stacked on the shared preprocessing transformer.
boost = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb', XGBClassifier()),
])

# Single-candidate grid: each list holds one value. Use boost.get_params().keys()
# to list the tunable names and search.best_params_ to read the selected values.
params_grid_xgb = dict(
    xgb=[XGBClassifier()],
    xgb__booster=['gbtree'],
    xgb__eta=[0.3],
    xgb__gamma=[0],
    xgb__max_depth=[6],
    xgb__min_child_weight=[1],
    xgb__max_delta_step=[0],
    xgb__subsample=[1],
    xgb__colsample_bytree=[1],
)

search = GridSearchCV(
    estimator=boost,
    param_grid=params_grid_xgb,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=0,
)
search.fit(X_train, y_train)

# Hold-out predictions, reused by the classification report cell.
y_pred = search.predict(X_test)

# best_score_ is the mean cross-validated accuracy; the trailing numbers are
# the values noted by the author, presumably from an earlier run.
print(f'\nLe meilleur score entraînement est : {search.best_score_}\n') # 0.9074
print(f'\nLe meilleur score test est : {search.score(X_test, y_test)}') # 0.9091

In [106]:
# Per-class precision/recall/F1 of the latest hold-out predictions.
print(classification_report(y_test, y_pred, target_names=['No', 'Yes']))

              precision    recall  f1-score   support

          No       0.93      0.96      0.95      7636
         Yes       0.65      0.50      0.57      1030

    accuracy                           0.91      8666
   macro avg       0.79      0.73      0.76      8666
weighted avg       0.90      0.91      0.90      8666



## LightGBM Classifier

In [110]:
# LightGBM classifier stacked on the shared preprocessing transformer.
lgbm = Pipeline(steps=[('preprocessor', preprocessor),
                       ('lgbm', LGBMClassifier())])

# Use lgbm.get_params().keys() to list the tunable names and
# search.best_params_ to read the selected values.

# Single-candidate grid: each list holds one value.
# The boosting_type and learning_rate keys previously ended with a space, so
# they did not name the real estimator parameters; the stray space is removed
# so both settings are actually applied.
params_grid_lgbm = {'lgbm': [LGBMClassifier()],
                    'lgbm__boosting_type': ['gbdt'],
                    'lgbm__learning_rate': [0.1],
                    'lgbm__n_estimators': [100],
                    'lgbm__max_depth': [1],
                    'lgbm__min_child_weight': [1e-3],
                    'lgbm__num_leaves': [31],
                    'lgbm__min_split_gain': [0],
                    'lgbm__colsample_bytree': [1]}

search = GridSearchCV(lgbm, params_grid_lgbm, n_jobs=-1, cv=cv, verbose=0, scoring='accuracy')

search.fit(X_train, y_train)

# Hold-out predictions, reused by the classification report cell.
y_pred = search.predict(X_test)

# best_score_ is the mean cross-validated accuracy; the trailing numbers are
# the values noted by the author, presumably from an earlier run.
print(f'\nLe meilleur score entraînement est : {search.best_score_}\n') # 0.90
print(f'\nLe meilleur score test est : {search.score(X_test, y_test)}') # 0.90

In [109]:
# Per-class precision/recall/F1 of the latest hold-out predictions.
print(classification_report(y_test, y_pred, target_names=['No', 'Yes']))

              precision    recall  f1-score   support

          No       0.91      0.99      0.95      7636
         Yes       0.71      0.27      0.39      1030

    accuracy                           0.90      8666
   macro avg       0.81      0.63      0.67      8666
weighted avg       0.89      0.90      0.88      8666



## CatBoost Classifier

In [113]:
# CatBoost classifier stacked on the shared preprocessing transformer.
cat = Pipeline([
    ('preprocessor', preprocessor),
    ('cat', CatBoostClassifier()),
])

# The grid only swaps in a default CatBoostClassifier; no hyper-parameters are
# varied. Use cat.get_params().keys() to list the tunable names and
# search.best_params_ to read the selected values.
params_grid_cat = dict(cat=[CatBoostClassifier()])

search = GridSearchCV(
    estimator=cat,
    param_grid=params_grid_cat,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=0,
)
search.fit(X_train, y_train)

# Hold-out predictions, reused by the classification report cell.
y_pred = search.predict(X_test)

# best_score_ is the mean cross-validated accuracy; the trailing numbers are
# the values noted by the author, presumably from an earlier run.
print(f'\nLe meilleur score entraînement est : {search.best_score_}\n') # 0.9094
print(f'\nLe meilleur score test est : {search.score(X_test, y_test)}') # 0.9125

In [112]:
# Per-class precision/recall/F1 of the latest hold-out predictions.
print(classification_report(y_test, y_pred, target_names=['No', 'Yes']))

              precision    recall  f1-score   support

          No       0.94      0.97      0.95      7636
         Yes       0.67      0.52      0.59      1030

    accuracy                           0.91      8666
   macro avg       0.80      0.74      0.77      8666
weighted avg       0.91      0.91      0.91      8666

