In [None]:
# Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import set_config; set_config(display='diagram')
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import MinMaxScaler, RobustScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import SelectPercentile, VarianceThreshold, SelectFromModel, mutual_info_classif

In [37]:
#Retrieving data
df_bank = pd.read_csv("../data/bank-full.csv",sep=';', header=0)
df_bank.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [38]:
#Creating X and y variables
X = df_bank.drop(['y', 'duration'], axis=1)
y = df_bank['y']

In [40]:
y = y.map({'no':0,'yes':1})

In [44]:
# Step 1: Split into temp (train + val) and test
X_temp, X_test, y_temp, y_test = train_test_split(X, y,
                                                  test_size=0.20,
                                                  stratify=y,
                                                  random_state=42)

# Step 2: Split temp into train and val
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp,
                                                  test_size=0.25,
                                                  stratify=y_temp,
                                                  random_state=42)

In [46]:
from sklearn.base import BaseEstimator, TransformerMixin

class CyclicalMonthEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, column_name='month'):
        self.column_name = column_name

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        month_map = {
            'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4,
            'may': 5, 'jun': 6, 'jul': 7, 'aug': 8,
            'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
        }

        months = X[self.column_name].str.lower().str[:3].map(month_map)
        sin_month = np.sin(2 * np.pi * months / 12)
        cos_month = np.cos(2 * np.pi * months / 12)
        return np.c_[sin_month, cos_month]

    def get_feature_names_out(self, input_features=None):
        return [f"{self.column_name}_sin", f"{self.column_name}_cos"]


In [48]:
# Step 1: Encode binary values
# 1.1 Select features
categorical_features = ['contact','default','education','housing','job','loan','marital','poutcome']

# 1.2 Select method
categorical_transformer = OneHotEncoder(drop='if_binary',
                    sparse_output=False,
                    handle_unknown='ignore')

# Step 2: Encode numerical values
#2.1 Select features
# Get all int64 columns
all_numerical = make_column_selector(dtype_include='int64')(X_train)

# Exclude 'month'
numerical_features = [col for col in all_numerical if col != 'month']

#2.1 Select method
num_transformer = Pipeline([
    ('robust_scaler', RobustScaler())
])

#Step 3:Include the Cyclical Transformer
month_transformer = CyclicalMonthEncoder(column_name='month')

# Step 4: Use the ColumnTransformer to bring all features into 1 table
preprocessor = ColumnTransformer([
    ('categorical_features', categorical_transformer, categorical_features),
    ('num_transformer', num_transformer, numerical_features),
    ('month_transformer', month_transformer,['month'])],
    remainder='passthrough')


In [49]:
preprocessor

0,1,2
,transformers,"[('categorical_features', ...), ('num_transformer', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'if_binary'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,column_name,'month'


In [50]:
X_train_preprocessed = preprocessor.fit_transform(X_train)

In [51]:
X_train_preprocessed

array([[ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00, ...,
         0.00000000e+00,  1.22464680e-16, -1.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00, ...,
         0.00000000e+00,  1.22464680e-16, -1.00000000e+00],
       [ 1.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00, -8.66025404e-01, -5.00000000e-01],
       ...,
       [ 1.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  5.00000000e-01,  8.66025404e-01],
       [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00, ...,
         0.00000000e+00,  5.00000000e-01, -8.66025404e-01],
       [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00, ...,
         0.00000000e+00,  1.22464680e-16, -1.00000000e+00]],
      shape=(27126, 37))

In [52]:
feature_names = preprocessor.get_feature_names_out()

In [53]:
X_train_df = pd.DataFrame(X_train_preprocessed, columns=feature_names)
X_train_df

Unnamed: 0,categorical_features__contact_cellular,categorical_features__contact_telephone,categorical_features__contact_unknown,categorical_features__default_yes,categorical_features__education_primary,categorical_features__education_secondary,categorical_features__education_tertiary,categorical_features__education_unknown,categorical_features__housing_yes,categorical_features__job_admin.,...,categorical_features__poutcome_success,categorical_features__poutcome_unknown,num_transformer__age,num_transformer__balance,num_transformer__day,num_transformer__campaign,num_transformer__pdays,num_transformer__previous,month_transformer__month_sin,month_transformer__month_cos
0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.266667,1.496211,-0.307692,-0.5,0.0,0.0,1.224647e-16,-1.000000
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.066667,0.164110,-0.538462,-0.5,0.0,0.0,1.224647e-16,-1.000000
2,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.000000,2.491961,0.692308,2.0,0.0,0.0,-8.660254e-01,-0.500000
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.200000,-0.096101,0.846154,0.0,0.0,0.0,-8.660254e-01,-0.500000
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,-0.533333,0.130845,0.538462,1.0,0.0,0.0,5.000000e-01,-0.866025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27121,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,-0.400000,0.799852,-0.538462,-0.5,93.0,3.0,8.660254e-01,0.500000
27122,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.533333,-0.228424,0.076923,-0.5,0.0,0.0,1.224647e-16,-1.000000
27123,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.133333,0.382924,1.076923,0.0,0.0,0.0,5.000000e-01,0.866025
27124,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,-0.133333,-0.024395,1.076923,0.0,0.0,0.0,5.000000e-01,-0.866025


In [54]:
# Define each model pipeline
logreg_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state= 42))
])


In [55]:
logreg_pipeline

0,1,2
,steps,"[('preprocessing', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('categorical_features', ...), ('num_transformer', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'if_binary'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,column_name,'month'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [56]:
logreg_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessing', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('categorical_features', ...), ('num_transformer', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'if_binary'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,column_name,'month'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [57]:
y_pred = logreg_pipeline.predict(X_val)


In [1]:
#I am testing

In [None]:
#accuracy, balanced_accuracy, F1, F1 Beta (assign more weight to precision or recall), ROC-AUC curve
#ideally create a function to be able to recycle these metrics
#we will need to decide which metric is best for the business