In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split, learning_curve, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix,ConfusionMatrixDisplay, precision_recall_fscore_support, precision_score, recall_score, roc_curve, auc, precision_recall_curve
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from imblearn.under_sampling import RandomUnderSampler



In [3]:
df = pd.read_csv('bank-additional.csv', sep=';')

In [5]:
print(f"Dataset Size: {df.shape}")
print("\nFirst 5 rows:")
df.head()

Dataset Size: (4119, 21)

First 5 rows:


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,30,blue-collar,married,basic.9y,no,yes,no,cellular,may,fri,...,2,999,0,nonexistent,-1.8,92.893,-46.2,1.313,5099.1,no
1,39,services,single,high.school,no,no,no,telephone,may,fri,...,4,999,0,nonexistent,1.1,93.994,-36.4,4.855,5191.0,no
2,25,services,married,high.school,no,yes,no,telephone,jun,wed,...,1,999,0,nonexistent,1.4,94.465,-41.8,4.962,5228.1,no
3,38,services,married,basic.9y,no,unknown,unknown,telephone,jun,fri,...,3,999,0,nonexistent,1.4,94.465,-41.8,4.959,5228.1,no
4,47,admin.,married,university.degree,no,yes,no,cellular,nov,mon,...,1,999,0,nonexistent,-0.1,93.2,-42.0,4.191,5195.8,no


In [7]:
# Rename columns
df.rename(columns={
    'age': 'Age',
    'job': 'Job',
    'marital': 'Marital',
    'education': 'Education',
    'default': 'Credit',
    'housing': 'HousingLoan',
    'loan': 'PersonalLoan',
    'contact': 'ContactCommunicationType',
    'month': 'LastContactMonth',
    'day_of_week': 'LastContactDayOfWeek',
    'duration': 'CallDuration',
    'campaign': 'CampaignContacts',
    'pdays': 'PreviousContactDays',
    'previous': 'PreviousCampaignContacts',
    'poutcome': 'PreviousCampaignOutcome',
    'emp.var.rate': 'EmploymentVarRate',
    'cons.price.idx': 'ConsumerPriceIndex',
    'cons.conf.idx': 'ConsumerConfidenceIndex',
    'euribor3m': 'Euribor3M',
    'nr.employed': 'NumberOfEmployees',
    'y': 'SubscribedTermDeposit'
}, inplace=True)

# Display basic information about the dataset
print("\nDataset Information:")
df.info()



Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4119 entries, 0 to 4118
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       4119 non-null   int64  
 1   Job                       4119 non-null   object 
 2   Marital                   4119 non-null   object 
 3   Education                 4119 non-null   object 
 4   Credit                    4119 non-null   object 
 5   HousingLoan               4119 non-null   object 
 6   PersonalLoan              4119 non-null   object 
 7   ContactCommunicationType  4119 non-null   object 
 8   LastContactMonth          4119 non-null   object 
 9   LastContactDayOfWeek      4119 non-null   object 
 10  CallDuration              4119 non-null   int64  
 11  CampaignContacts          4119 non-null   int64  
 12  PreviousContactDays       4119 non-null   int64  
 13  PreviousCampaignContacts  4119 non-null  

In [11]:
#data cleaning
# Check for outliers in numerical columns
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns

print("\nChecking for outliers in numerical columns...")
for col in numerical_columns:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    outliers = ((df[col] < lower_bound) | (df[col] > upper_bound)).sum()
    print(f"{col}: {outliers} outliers detected")


Checking for outliers in numerical columns...
Age: 39 outliers detected
CallDuration: 291 outliers detected
CampaignContacts: 235 outliers detected
PreviousContactDays: 160 outliers detected
PreviousCampaignContacts: 596 outliers detected
EmploymentVarRate: 0 outliers detected
ConsumerPriceIndex: 0 outliers detected
ConsumerConfidenceIndex: 43 outliers detected
Euribor3M: 0 outliers detected
NumberOfEmployees: 0 outliers detected


In [13]:
# Fill or remove missing data
df.fillna("unknown", inplace=True) # Fill missing values with "unknown"
df = df.replace('unknown', np.nan) # Mark "unknown" values As missing data
df = df.replace('nonexistent', np.nan) # Mark "unknown" values as missing data

In [23]:
# Check for missing values
print("\nMissing values per column:")
missing_values = df.isna().sum()
print(missing_values[missing_values > 0])



Missing values per column:
Series([], dtype: int64)


In [21]:
# Show empty values
missing_values = df.isnull().sum()

# Show empty columns
missing_values = missing_values[missing_values > 0]

print("Empty columns and values: ")
print(missing_values)


Empty columns and values: 
Series([], dtype: int64)


In [17]:
# There are too much missing values
df.drop('PreviousCampaignOutcome', axis=1, inplace=True)

In [19]:
# Impute missing values for categorical columns
for col in df.select_dtypes(include=['object']).columns:
    if df[col].isna().sum() > 0:
        # Replace NaN with the most frequent value
        most_frequent = df[col].value_counts().index[0]
        print(f"Imputing missing values in {col} with most frequent value: {most_frequent}")
        df[col].fillna(most_frequent, inplace=True)

Imputing missing values in Job with most frequent value: admin.
Imputing missing values in Marital with most frequent value: married
Imputing missing values in Education with most frequent value: university.degree
Imputing missing values in Credit with most frequent value: no
Imputing missing values in HousingLoan with most frequent value: yes
Imputing missing values in PersonalLoan with most frequent value: no


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(most_frequent, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(most_frequent, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alway