In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
# from ydata_profiling import ProfileReport
from utils import get_uniq_items, evaluate_model_performance


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, precision_recall_fscore_support

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.compose import (ColumnTransformer, 
                             make_column_selector, 
                             make_column_selector,)

from sklearn.pipeline import (FunctionTransformer, 
                              FeatureUnion, 
                              TransformerMixin, make_pipeline,)

from sklearn.base import BaseEstimator, TransformerMixin

In [5]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [6]:
from sklearn import datasets

In [7]:
path_data = '/Users/pinaki/Downloads/ML_datasets/'
file_data = path_data + 'Bank Marketing.csv'
file_data

'/Users/pinaki/Downloads/ML_datasets/Bank Marketing.csv'

In [8]:
QUANTILES = [0.01, 0.03, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.97, 0.99]

In [9]:
# convert 'unknown' to None
def fill_unknown_w_na(X_raw, cols:list):
    X = X_raw.copy()
    print(f"inside fill_unknown func; type of X: {type(X)} | shape of X: {X.shape} | cols: {cols}")
    for col in cols:
        filt_unknown = (X[col]=='unknown')
        if filt_unknown.sum()>0:  # condition returns data
            X.loc[filt_unknown, col] = None
    return X

# Data Read

In [10]:
df = pd.read_csv(file_data)

df.shape
df.head(3)

(45211, 18)

Unnamed: 0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,Class
0,1,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,1
1,2,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,1
2,3,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,1


In [11]:
X = df.drop(['Class'], axis=1)
y = df['Class']

X.shape
y.shape

(45211, 17)

(45211,)

In [12]:
y.isna().sum()

0

In [13]:
y.value_counts()
y.value_counts(normalize=True)

1    39922
2     5289
Name: Class, dtype: int64

1    0.883015
2    0.116985
Name: Class, dtype: float64

In [14]:
# Convert 'unknown' values to None
# X_new = X.copy()
X = X.drop(['id'], axis=1)
X =  fill_unknown_w_na(X, X.columns.to_list())

X.isna().sum()

inside fill_unknown func; type of X: <class 'pandas.core.frame.DataFrame'> | shape of X: (45211, 16) | cols: ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16']


V1         0
V2       288
V3         0
V4      1857
V5         0
V6         0
V7         0
V8         0
V9     13020
V10        0
V11        0
V12        0
V13        0
V14        0
V15        0
V16    36959
dtype: int64

In [15]:
X.describe(QUANTILES).T

Unnamed: 0,count,mean,std,min,1%,3%,5%,10%,25%,50%,75%,90%,95%,97%,99%,max
V1,45211.0,40.93621,10.618762,18.0,23.0,26.0,27.0,29.0,33.0,39.0,48.0,56.0,59.0,60.0,71.0,95.0
V6,45211.0,1362.272058,3044.765829,-8019.0,-627.0,-322.0,-172.0,0.0,72.0,448.0,1428.0,3574.0,5768.0,7777.9,13164.9,102127.0
V10,45211.0,15.806419,8.322476,1.0,2.0,2.0,3.0,5.0,8.0,16.0,21.0,28.0,29.0,30.0,31.0,31.0
V12,45211.0,258.16308,257.527812,0.0,11.0,22.0,35.0,58.0,103.0,180.0,319.0,548.0,751.0,914.7,1269.0,4918.0
V13,45211.0,2.763841,3.098021,1.0,1.0,1.0,1.0,1.0,1.0,2.0,3.0,5.0,8.0,10.0,16.0,63.0
V14,45211.0,40.197828,100.128746,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,185.0,317.0,349.0,370.0,871.0
V15,45211.0,0.580323,2.303441,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,5.0,8.9,275.0


In [16]:
dict_uniq_items = get_uniq_items(X, 15)
dict_uniq_items

{'V1': [77],
 'V2': ['management',
  'technician',
  'entrepreneur',
  'blue-collar',
  None,
  'retired',
  'admin.',
  'services',
  'self-employed',
  'unemployed',
  'housemaid',
  'student'],
 'V3': ['married', 'single', 'divorced'],
 'V4': ['tertiary', 'secondary', None, 'primary'],
 'V5': ['no', 'yes'],
 'V6': [7168],
 'V7': ['yes', 'no'],
 'V8': ['no', 'yes'],
 'V9': [None, 'cellular', 'telephone'],
 'V10': [31],
 'V11': ['may',
  'jun',
  'jul',
  'aug',
  'oct',
  'nov',
  'dec',
  'jan',
  'feb',
  'mar',
  'apr',
  'sep'],
 'V12': [1573],
 'V13': [48],
 'V14': [559],
 'V15': [41],
 'V16': [None, 'failure', 'other', 'success']}

In [17]:
# {idx: col for idx, col in enumerate(X.columns.to_list())}
cols_all_names = {
    # '0': 'id',
    '1': 'V1',
    '2': 'job_type',
    '3': 'marital_status',
    '4': 'education_level',
    '5': 'V5',
    '6': 'V6',
    '7': 'V7',
    '8': 'V8',
    '9': 'phone_type',
    '10': 'days_of_month',
    '11': 'month',
    '12': 'V12',
    '13': 'V13',
    '14': 'V14',
    '15': 'V15',
    '16': 'prev_loan_status'
    
}
cols_all_names

{'1': 'V1',
 '2': 'job_type',
 '3': 'marital_status',
 '4': 'education_level',
 '5': 'V5',
 '6': 'V6',
 '7': 'V7',
 '8': 'V8',
 '9': 'phone_type',
 '10': 'days_of_month',
 '11': 'month',
 '12': 'V12',
 '13': 'V13',
 '14': 'V14',
 '15': 'V15',
 '16': 'prev_loan_status'}

In [18]:
cols_bool = ['V5', 'V7', 'V8']
cols_bool

cols_catg = [k for k,v in dict_uniq_items.items() if (len(v)>1)]
cols_catg = [col for col in cols_catg if col not in cols_bool]
cols_catg

cols_num = [col for col in dict_uniq_items.keys() if col not in cols_bool + cols_catg]
cols_num = [col for col in cols_num if col not in ['id']]
cols_num

['V5', 'V7', 'V8']

['V2', 'V3', 'V4', 'V9', 'V11', 'V16']

['V1', 'V6', 'V10', 'V12', 'V13', 'V14', 'V15']

dict_unknown_cnt = {}
for col in cols_catg:
    dict_unknown_cnt[col] = (X[col]=='unknown').sum()
    
dict_unknown_cnt

class TransformUnknownToNone(BaseEstimator, TransformerMixin):
    
    def __init__(self, func, cols:list):
        self.func = func
        self.cols = cols
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return self.func(X, self.cols)

X_new = fill_unknown_w_na(X, cols_catg)
X_new.shape
X_new.isna().sum().to_dict()

## Transformations needed
- Drop Unique (`ID`) column
- Replace Unknown to NA (`categorical`)
- Imputation of missing values (`Numerical`, `Categorical`)
- Labeling values (`binary`)
- OHE the values (`categorical`)
- Scaling the values (`numerical`)


## Important difference between `ColumnTransformer` & `Pipeline`
In ColumnTransformer, all the steps proceed independently. Each step use the same input_data as its input.
<br>In Pipeline, steps are performed sequentially. However, there is no provision to provide target columns where the transformations need to be carried out.
<br>We want to have the best of both worlds, and avoid the cons of both. To achieve that, we mix and match the two concepts.

In [19]:
# We can only pass column indices to numpy arrays & not column names
def get_col_indices(cols_all:list, cols:list):
    return [idx for idx, col in enumerate(cols_all) if col in cols]

In [20]:
cols_all = X.columns.to_list()
cols_all

cols_catg_idx = get_col_indices(cols_all, cols_catg)
cols_num_idx = get_col_indices(cols_all, cols_num)
cols_bool_idx = get_col_indices(cols_all, cols_bool)

cols_catg_idx
cols_num_idx
cols_bool_idx

['V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16']

[1, 2, 3, 8, 10, 15]

[0, 5, 9, 11, 12, 13, 14]

[4, 6, 7]

In [21]:
# ct_drop_cols = ColumnTransformer([
#     ('drop_cols', 'drop', [0]),
# ], remainder='passthrough')

# pipe_rmv_unknwn = Pipeline([
#     # ('unknown_remover', TransformUnknownToNone(fill_unknown_w_na, cols_catg)),
#     ('drop_id', ct_drop_cols),
# ])

In [22]:
ct_impute = ColumnTransformer(transformers=[
    ('impute_catg', SimpleImputer(strategy='most_frequent'), cols_catg_idx), 
    ('impute_numeric', SimpleImputer(strategy='mean'), cols_num_idx), 
], remainder='passthrough')

ct_encod = ColumnTransformer(transformers=[
    # ('label_binary', LabelEncoder(), cols_bool_idx),  # cannot be used in column transformers as it is meant for target encoding
    ('onehot', OneHotEncoder(handle_unknown='ignore'), cols_bool_idx + cols_catg_idx), 
    ('scaling', MinMaxScaler(clip=True), cols_num_idx)
], remainder='drop')

In [23]:
pipe_transfm = Pipeline([
    # ('step0', pipe_rmv_unknwn),
    ('impute', ct_impute),
    ('encode', ct_encod),
])

In [24]:
pipe_lr = Pipeline([
    ('preprocessing', pipe_transfm),
    ('model', LogisticRegression()),
])

pipe_lr

In [25]:
cols_bool_idx + cols_catg_idx

[4, 6, 7, 1, 2, 3, 8, 10, 15]

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

X_train.shape, y_train.shape
X_test.shape, y_test.shape

((36168, 16), (36168,))

((9043, 16), (9043,))

In [27]:
def evaluate_model_performance(pipe, X_train, y_train, X_test, y_test):
    pipe.fit(X_train, y_train)
    
    y_train_pred = pipe.predict(X_train)
    y_test_pred = pipe.predict(X_test)
    
    print(f"\nModel performance on TRAINING data")
    ConfusionMatrixDisplay(confusion_matrix(y_train, y_train_pred)).plot.show()
    print(f"\nModel performance on VALIDATION data")
    ConfusionMatrixDisplay(confusion_matrix(y_test, y_test_pred)).plot().show()
    

In [28]:
evaluate_model_performance(pipe_lr, X_train, y_train, X_test, y_test)

ValueError: could not convert string to float: 'management'