<a href="https://colab.research.google.com/github/darshan-kale-dsi/Bank-Marketing-Campaign/blob/seb_branch/notebooks/model_adjustment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model Adjustment

In [279]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [280]:
dataframe = pd.read_csv('bank-full.csv', sep = ';')
# dataframe.head(5)

In [281]:
# trimming extreme values (outliers)
dataframe = dataframe[(dataframe['balance']<10000) & (dataframe['duration']<1800)]

dataframe['pdays'] = dataframe['pdays'].apply(lambda value: 0 if value==-1 else 1 )
dataframe['default'] = dataframe['default'].apply(lambda value: 1 if value =='yes' else 0)
dataframe['housing'] = dataframe['housing'].apply(lambda value: 1 if value =='yes' else 0)
dataframe['loan'] = dataframe['loan'].apply(lambda value: 1 if value =='yes' else 0)
dataframe['y']= dataframe['y'].apply(lambda value: 1 if value =='yes' else 0)
dataframe['month'] = dataframe['month'].replace({'may':5,
                              'jun':6,
                              'jul':7,
                              'aug':8,
                              'oct':10,
                              'nov':11,
                              'dec':12,
                              'jan':1,
                              'feb':2,
                              'mar':3,
                              'apr':4,
                              'sep':9})



dataframe['day_sine'] = dataframe['day'].apply(lambda value: np.sin(2*value*np.pi/31))
dataframe['day_cosine'] = dataframe['day'].apply(lambda value: np.cos(2*value*np.pi/31))

dataframe['month_sine'] = dataframe['month'].apply(lambda value: np.sin(2*value*np.pi/12))
dataframe['month_cosine'] = dataframe['month'].apply(lambda value: np.cos(2*value*np.pi/12))


  dataframe['month'] = dataframe['month'].replace({'may':5,


In [282]:
dataframe = dataframe.drop(['day', 'month'], axis = 1)

In [283]:
assessment = pd.read_csv('bank.csv', sep = ';')
outcome = assessment['y'].replace({'yes':1,'no':0})
# trimming extreme values (outliers)
# assessment = assessment[(dataframe['balance']<10000) & (dataframe['duration']<1800)]

assessment['pdays'] = assessment['pdays'].apply(lambda value: 0 if value==-1 else 1 )
assessment['default'] = assessment['default'].apply(lambda value: 1 if value =='yes' else 0)
assessment['housing'] = assessment['housing'].apply(lambda value: 1 if value =='yes' else 0)
assessment['loan'] = assessment['loan'].apply(lambda value: 1 if value =='yes' else 0)
assessment['y']= assessment['y'].apply(lambda value: 1 if value =='yes' else 0)
assessment['month'] = assessment['month'].replace({'may':5,
                              'jun':6,
                              'jul':7,
                              'aug':8,
                              'oct':10,
                              'nov':11,
                              'dec':12,
                              'jan':1,
                              'feb':2,
                              'mar':3,
                              'apr':4,
                              'sep':9})



assessment['day_sine'] = assessment['day'].apply(lambda value: np.sin(2*value*np.pi/31))
assessment['day_cosine'] = assessment['day'].apply(lambda value: np.cos(2*value*np.pi/31))

assessment['month_sine'] = assessment['month'].apply(lambda value: np.sin(2*value*np.pi/12))
assessment['month_cosine'] = assessment['month'].apply(lambda value: np.cos(2*value*np.pi/12))


  outcome = assessment['y'].replace({'yes':1,'no':0})
  assessment['month'] = assessment['month'].replace({'may':5,


In [311]:
numeric_columns = ['age', 'default', 'balance', 'housing',
                   'loan', 'campaign', 'pdays', 'previous', 'day_sine',
                   'day_cosine', 'month_sine', 'month_cosine', 'duration']



categorical_columns = ['job','marital', 'education','contact','poutcome']

In [312]:
preprocessor = ColumnTransformer(
    transformers=[('num',StandardScaler(),numeric_columns),
                  ('cat',OneHotEncoder(drop='first'),categorical_columns)
                  ]
    )


In [313]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline(  [
        ('preprocessor', preprocessor),
        ('model', XGBClassifier())
        # ('model', XGBClassifier(colsample_bytree = 0.5,
        #               gamma = 0.0,
        #               learning_rate = 0.1,
        #               max_depth = 12,
        #               reg_alpha = 1,
        #               reg_lambda = 1e-5))
    ])

In [314]:
from imblearn.over_sampling import SMOTENC, SMOTE

X = dataframe.drop('y', axis = 1)
y = dataframe['y']

sm = SMOTENC(random_state=42,
             categorical_features = categorical_columns)

# sm = SMOTE(random_state=42)

X, y = sm.fit_resample(X, y)
# y.value_counts()

In [315]:

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.2,
                                                    random_state = 123)

In [316]:
pipeline.fit(X_train, y_train)

In [317]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss, cohen_kappa_score, f1_score

Y_pred_train = pipeline.predict(X_train)
Y_pred_test = pipeline.predict(X_test)
Y_poba_train = pipeline.predict_proba(X_train)
Y_proba_test = pipeline.predict_proba(X_test)

res = {
    'accuracy_score_train': accuracy_score(y_train, Y_pred_train),
    'accuracy_score_test': accuracy_score(y_test, Y_pred_test),
    'cohen_kappa_train': cohen_kappa_score(y_train, Y_pred_train),
    'cohen_kappa_test': cohen_kappa_score(y_test, Y_pred_test),
    'log_loss_train': log_loss(y_train, Y_poba_train),
    'log_loss_test': log_loss(y_test, Y_proba_test),
    'f1_score_train': f1_score(y_train, Y_pred_train),
    'f1_score_test': f1_score(y_test, Y_pred_test)
}
res

{'accuracy_score_train': 0.9535918829666736,
 'accuracy_score_test': 0.9391870333737476,
 'cohen_kappa_train': np.float64(0.9071850646049883),
 'cohen_kappa_test': np.float64(0.8783672481285048),
 'log_loss_train': 0.13162511761136184,
 'log_loss_test': 0.16775049490086205,
 'f1_score_train': 0.9519768881551796,
 'f1_score_test': 0.9370333663693425}

In [318]:
y_hat = pipeline.predict(assessment)
print(accuracy_score(outcome, y_hat))
print(confusion_matrix(outcome, y_hat))
print(classification_report(outcome, y_hat))

0.9152842291528422
[[3938   62]
 [ 321  200]]
              precision    recall  f1-score   support

           0       0.92      0.98      0.95      4000
           1       0.76      0.38      0.51       521

    accuracy                           0.92      4521
   macro avg       0.84      0.68      0.73      4521
weighted avg       0.91      0.92      0.90      4521



In [None]:
[3882  118]
 [ 163  358]

In [306]:
y_hat = pipeline.predict(X_test)
print(accuracy_score(y_test, y_hat))
print(confusion_matrix(y_test, y_hat))
print(classification_report(y_test, y_hat))

0.9496522238529769
[[7514  328]
 [ 461 7368]]
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      7842
           1       0.96      0.94      0.95      7829

    accuracy                           0.95     15671
   macro avg       0.95      0.95      0.95     15671
weighted avg       0.95      0.95      0.95     15671



In [307]:
from sklearn.model_selection import cross_validate
scoring = ['accuracy', 'f1', 'precision', 'recall', 'roc_auc', 'neg_log_loss', 'neg_brier_score']
d3_dict = cross_validate(pipeline,
                         X, y,
                         cv=5,
                         scoring = scoring,
                         return_train_score = True)

In [308]:
pd.DataFrame(d3_dict)

Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy,test_f1,train_f1,test_precision,train_precision,test_recall,train_recall,test_roc_auc,train_roc_auc,test_neg_log_loss,train_neg_log_loss,test_neg_brier_score,train_neg_brier_score
0,0.336466,0.086467,0.673792,0.998915,0.535442,0.998914,0.929631,0.999904,0.376005,0.997926,0.879997,0.999993,-1.213905,-0.007343,-0.281928,-0.001205
1,0.381656,0.085367,0.894646,0.967248,0.904528,0.966968,0.826919,0.975332,0.998213,0.958745,0.995224,0.996004,-0.268624,-0.086257,-0.08072,-0.024943
2,1.153425,0.124886,0.880671,0.967264,0.893216,0.967033,0.808289,0.973886,0.998086,0.960276,0.99697,0.99609,-0.245031,-0.085001,-0.080794,-0.024651
3,0.362722,0.094414,0.698934,0.972959,0.76818,0.972792,0.624561,0.978841,0.997575,0.966817,0.986728,0.997193,-0.721744,-0.072946,-0.226641,-0.020854
4,0.379058,0.086865,0.684046,0.980649,0.759625,0.980494,0.612992,0.988425,0.998468,0.972688,0.946568,0.998401,-1.237388,-0.054344,-0.268453,-0.015104


In [310]:

param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.1, 0.01, 0.001],
    "subsample": [0.8, 1.0],
    # "colsample_bytree": [0.8, 1.0],
    # "min_child_weight": [1, 3, 5],
    # "gamma": [0, 1, 5],
    # "reg_alpha": [0, 1, 10],
    # "reg_lambda": [0, 1, 10],
}

grid_cv = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring = 'roc_auc',
    cv = 5,
    verbose = 1,
    n_jobs = -1)


grid_cv.fit(X_train, y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


ValueError: Invalid parameter 'learning_rate' for estimator Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['age', 'default', 'balance',
                                                   'housing', 'loan',
                                                   'campaign', 'pdays',
                                                   'previous', 'day_sine',
                                                   'day_cosine', 'month_sine',
                                                   'month_cosine',
                                                   'duration']),
                                                 ('cat',
                                                  OneHotEncoder(drop='first'),
                                                  ['job', 'marital',
                                                   'education', 'contact',
                                                   'poutcome'])])),
                ('model',
                 XGBClassifier(base_score...
                               feature_types=None, gamma=None, grow_policy=None,
                               importance_type=None,
                               interaction_constraints=None, learning_rate=None,
                               max_bin=None, max_cat_threshold=None,
                               max_cat_to_onehot=None, max_delta_step=None,
                               max_depth=None, max_leaves=None,
                               min_child_weight=None, missing=nan,
                               monotone_constraints=None, multi_strategy=None,
                               n_estimators=None, n_jobs=None,
                               num_parallel_tree=None, random_state=None, ...))]). Valid parameters are: ['memory', 'steps', 'transform_input', 'verbose'].

In [129]:
# def preprocessing_pipeline(dataframe):
#   binary_features = [
#                       'pdays',
#                       'default',
#                       'housing',
#                       'loan'
#                     ]
#   numeric_features = [
#                       'age',
#                       'balance',
#                       # 'duration',
#                       'campaign',
#                       'previous'
#                       ]
#   time_features = [
#                     'day_sine',
#                    'day_cosine',
#                    'month_sine',
#                    'month_cosine'
#                    ]
#   categorical_features = [
#                           'job',
#                           'marital',
#                           'education',
#                           'contact',
#                           'poutcome'
#                           ]

#   dataframe = dataframe[binary_features+numeric_features+time_features+categorical_features]

#   preprocessor = ColumnTransformer(transformers=[
#       ('num', StandardScaler(), numeric_features),
#       ('cat', OneHotEncoder(), categorical_features)
#   ])

#   dataframe_transformed = preprocessor.fit_transform(dataframe)

#   ohe_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
#   final_columns = numeric_features + list(ohe_feature_names)
#   X_train_preprocessed_df = pd.DataFrame(dataframe_transformed, columns=final_columns)

#   X_train_preprocessed_df = pd.concat([X_train_preprocessed_df,dataframe[time_features].reset_index(drop = True)], axis = 1)
#   X_train_preprocessed_df = pd.concat([X_train_preprocessed_df,dataframe[binary_features].reset_index(drop = True)], axis = 1)


#   return X_train_preprocessed_df

In [130]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

def preprocessing_pipeline(df):
    """
    Preprocess the input DataFrame by scaling numeric features, one-hot encoding categorical features,
    and preserving time and binary features unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        The input DataFrame containing all necessary columns.

    Returns
    -------
    df_preprocessed : pandas.DataFrame
        A DataFrame with transformed numeric and categorical features, along with the original time and binary features.
        The columns are ordered as: [scaled numeric features, one-hot encoded categorical features, time features, binary features].
    """

    # Define feature groups
    binary_features = [
                      # 'pdays',
                      'default',
                      'housing',
                      'loan'
                    ]
    numeric_features = [
                      'age',
                      'balance',
                      # 'duration',
                      'campaign',
                      'previous'
                      ]
    time_features = [
                    'day_sine',
                   'day_cosine',
                   'month_sine',
                   'month_cosine'
                   ]
    categorical_features = [
                          'job',
                          'marital',
                          'education',
                          'contact',
                          # 'poutcome'
                          ]



    # Select and copy only the necessary columns
    selected_columns = binary_features + numeric_features + time_features + categorical_features

    df = df[selected_columns].copy()

    # Create a ColumnTransformer to scale numeric features and one-hot encode categorical features
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(), categorical_features)
        ]
    )

    # Apply transformations on numeric and categorical features
    transformed_array = preprocessor.fit_transform(df)

    # Retrieve one-hot encoded feature names
    cat_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)

    # Build DataFrame from the transformed numeric and categorical features
    transformed_feature_names = numeric_features + list(cat_feature_names)
    df_transformed = pd.DataFrame(transformed_array, columns=transformed_feature_names, index=df.index)

    # Concatenate the unchanged time and binary features
    df_preprocessed = pd.concat([df_transformed, df[time_features], df[binary_features]], axis=1)

    return df_preprocessed


In [131]:
X_train_processed = preprocessing_pipeline(X_train)
X_test_processed = preprocessing_pipeline(X_test)



In [132]:
# from sklearn import ensemble as en
# import boruta as bt
# x_train_n = X_train_processed.values
# y_train_n = y_train.values.ravel()

# # Define model
# et = en.ExtraTreesClassifier( n_estimators=250, random_state=0, n_jobs=-1 )

# # Define boruta
# boruta = bt.BorutaPy( et, n_estimators='auto', verbose=2, random_state=42 ).fit( x_train_n, y_train_n  )

In [133]:
# cols_selected = boruta.support_.tolist()
# X_train_processed = X_train_processed[X_train_processed.iloc[:, cols_selected].columns.to_list()]
# X_test_processed = X_test_processed[X_train_processed.columns]

In [134]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB

# model = BernoulliNB()
# model = KNeighborsClassifier()
# model = lgb.LGBMClassifier(learning_rate=0.09,max_depth=-5,random_state=42)
# model = RandomForestClassifier()
# model = XGBClassifier()
# model = XGBClassifier(colsample_bytree = 0.5,
#                       gamma = 0.0,
#                       learning_rate = 0.1,
#                       max_depth = 12,
#                       reg_alpha = 1,
#                       reg_lambda = 1e-5)
# 'colsample_bytree': 0.5, 'gamma': 0.0, 'learning_rate': 0.1, 'max_depth': 12, 'reg_alpha': 1, 'reg_lambda': 1e-05

# model = LogisticRegression()
# model = svm.SVC()
model.fit(X_train_processed, y_train)
y_hat = model.predict(X_test_processed)
print(accuracy_score(y_test, y_hat))
print(confusion_matrix(y_test, y_hat))

0.8897086501543326
[[11461   302]
 [ 1163   357]]


In [136]:
assessment = pd.read_csv('bank.csv', sep = ';')
outcome = assessment['y'].replace({'yes':1,'no':0})

  outcome = assessment['y'].replace({'yes':1,'no':0})


In [137]:



import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

def preprocessing_pipeline(df):
    """
    Preprocess the input DataFrame by scaling numeric features, one-hot encoding categorical features,
    and preserving time and binary features unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        The input DataFrame containing all necessary columns.

    Returns
    -------
    df_preprocessed : pandas.DataFrame
        A DataFrame with transformed numeric and categorical features, along with the original time and binary features.
        The columns are ordered as: [scaled numeric features, one-hot encoded categorical features, time features, binary features].
    """
    # df = df[(df['balance']<10000) & (df['duration']<1800)]
    df['pdays'] = df['pdays'].apply(lambda value: 0 if value==-1 else 1 )
    df['default'] = df['default'].apply(lambda value: 1 if value =='yes' else 0)
    df['housing'] = df['housing'].apply(lambda value: 1 if value =='yes' else 0)
    df['loan'] = df['loan'].apply(lambda value: 1 if value =='yes' else 0)
    df['y']= df['y'].apply(lambda value: 1 if value =='yes' else 0)
    df['month'] = df['month'].replace({'may':5,
                                'jun':6,
                                'jul':7,
                                'aug':8,
                                'oct':10,
                                'nov':11,
                                'dec':12,
                                'jan':1,
                                'feb':2,
                                'mar':3,
                                'apr':4,
                                'sep':9})

    df['day_sine'] = df['day'].apply(lambda value: np.sin(2*value*np.pi/31))
    df['day_cosine'] = df['day'].apply(lambda value: np.cos(2*value*np.pi/31))

    df['month_sine'] = df['month'].apply(lambda value: np.sin(2*value*np.pi/12))
    df['month_cosine'] = df['month'].apply(lambda value: np.cos(2*value*np.pi/12))

    # Define feature groups
    binary_features = [
                      # 'pdays',
                      'default',
                      'housing',
                      'loan'
                    ]
    numeric_features = [
                      'age',
                      'balance',
                      # 'duration',
                      'campaign',
                      'previous'
                      ]
    time_features = [
                    'day_sine',
                   'day_cosine',
                   'month_sine',
                   'month_cosine'
                   ]
    categorical_features = [
                          'job',
                          'marital',
                          'education',
                          'contact',
                          # 'poutcome'
                          ]



    # Select and copy only the necessary columns
    selected_columns = binary_features + numeric_features + time_features + categorical_features

    df = df[selected_columns].copy()

    # Create a ColumnTransformer to scale numeric features and one-hot encode categorical features
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(), categorical_features)
        ]
    )

    # Apply transformations on numeric and categorical features
    transformed_array = preprocessor.fit_transform(df)

    # Retrieve one-hot encoded feature names
    cat_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)

    # Build DataFrame from the transformed numeric and categorical features
    transformed_feature_names = numeric_features + list(cat_feature_names)
    df_transformed = pd.DataFrame(transformed_array, columns=transformed_feature_names, index=df.index)

    # Concatenate the unchanged time and binary features
    df_preprocessed = pd.concat([df_transformed, df[time_features], df[binary_features]], axis=1)

    return df_preprocessed


In [138]:
assessment = preprocessing_pipeline(assessment)

  df['month'] = df['month'].replace({'may':5,


In [139]:
y_hat_assessment = model.predict(assessment)

In [140]:
print(accuracy_score(outcome, y_hat_assessment))
print(confusion_matrix(outcome, y_hat_assessment))

0.8942711789427118
[[3886  114]
 [ 364  157]]


In [89]:
outcome

Unnamed: 0,y
0,no
1,no
2,no
3,no
4,no
...,...
4516,no
4517,no
4518,no
4519,no


In [65]:
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, STATUS_OK, space_eval

# Space
space = {
    'learning_rate': hp.choice('learning_rate', [0.0001,0.001, 0.01, 0.1, 1]),
    'max_depth' : hp.choice('max_depth', range(3,21,3)),
    'gamma' : hp.choice('gamma', [i/10.0 for i in range(0,5)]),
    'colsample_bytree' : hp.choice('colsample_bytree', [i/10.0 for i in range(3,10)]),
    'reg_alpha' : hp.choice('reg_alpha', [1e-5, 1e-2, 0.1, 1, 10, 100]),
    'reg_lambda' : hp.choice('reg_lambda', [1e-5, 1e-2, 0.1, 1, 10, 100])
}

# Set up the k-fold cross-validation
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)


# Objective function
def objective(params):
  xgboost = XGBClassifier(seed=0, **params)
  score = cross_val_score(estimator=xgboost,
                            X=X_train_processed,
                            y=y_train,
                            cv=kfold,
                            scoring='average_precision',
                            n_jobs=-1).mean()
  # Loss is negative score
  loss = - score
  # Dictionary with information for evaluation
  return {'loss': loss, 'params': params, 'status': STATUS_OK}

# Optimize
best = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 48, trials = Trials())

# Print the values of the best parameters
print(space_eval(space, best))

100%|██████████| 48/48 [00:59<00:00,  1.24s/trial, best loss: -0.9797753986687633]
{'colsample_bytree': 0.5, 'gamma': 0.0, 'learning_rate': 0.1, 'max_depth': 12, 'reg_alpha': 1, 'reg_lambda': 1e-05}


In [14]:


# Define the parameter grid
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.1, 0.01, 0.001],
    "subsample": [0.8, 1.0],
    # "colsample_bytree": [0.8, 1.0],
    # "min_child_weight": [1, 3, 5],
    # "gamma": [0, 1, 5],
    # "reg_alpha": [0, 1, 10],
    # "reg_lambda": [0, 1, 10],
}

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator = model,
    param_grid = param_grid,
    scoring = "roc_auc",
    cv = 5,
    verbose = 1,
    n_jobs = -1
)

# Fit the grid search
final = grid_search.fit(X_train_processed, y_train)
y_test = final.predict(X_test_processed)
print(accuracy_score(y_test, y_hat))
print(confusion_matrix(y_test, y_hat))

# Print the best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)


Fitting 5 folds for each of 54 candidates, totalling 270 fits
0.990589475269141
[[12756    41]
 [   84   402]]
Best parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300, 'subsample': 1.0}
Best cross-validation score: 0.7936587791330002
