In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv("bank.csv", sep=';', header=0)

# Display the first few rows
data


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,33,services,married,secondary,no,-333,yes,no,cellular,30,jul,329,5,-1,0,unknown,no
4517,57,self-employed,married,tertiary,yes,-3313,yes,yes,unknown,9,may,153,1,-1,0,unknown,no
4518,57,technician,married,secondary,no,295,no,no,cellular,19,aug,151,11,-1,0,unknown,no
4519,28,blue-collar,married,secondary,no,1137,no,no,cellular,6,feb,129,4,211,3,other,no


In [2]:
from sklearn.preprocessing import LabelEncoder

# Convert categorical columns to numeric
categorical_columns = data.select_dtypes(include=['object']).columns
label_encoders = {}

for col in categorical_columns:
    if col != 'y':  # We don't need to encode the target variable here
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])
        label_encoders[col] = le

# Define features and target variable
X = data.drop('y', axis=1)
y = data['y']

# Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [3]:
pip install imbalanced-learn


Collecting imbalanced-learn
  Using cached imbalanced_learn-0.12.3-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.12.3
Note: you may need to restart the kernel to use updated packages.




In [4]:
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import AdaBoostClassifier
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def evaluate_model(X, y, sampler, n_splits=10):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    accuracies = []
    reports = []
    confusion_matrices = []

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Apply sampling
        X_train_resampled, y_train_resampled = sampler.fit_resample(X_train, y_train)
        
        # Initialize and fit AdaBoost
        adaboost = AdaBoostClassifier(n_estimators=50, random_state=42)
        adaboost.fit(X_train_resampled, y_train_resampled)
        
        # Predict on the test set
        y_pred = adaboost.predict(X_test)
        
        # Evaluate
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        conf_matrix = confusion_matrix(y_test, y_pred)
        
        accuracies.append(accuracy)
        reports.append(report)
        confusion_matrices.append(conf_matrix)
    
    avg_accuracy = sum(accuracies) / n_splits
    avg_conf_matrix = np.round(sum(confusion_matrices) / n_splits).astype(int)
    avg_report = reports[0]  # You may want to aggregate reports in a more sophisticated manner
    
    return avg_accuracy, avg_report, avg_conf_matrix

In [5]:
# Define samplers
smote = SMOTE(random_state=42)
adasyn = ADASYN(random_state=42)

# Evaluate with SMOTE
accuracy_smote, report_smote, conf_matrix_smote = evaluate_model(X, y, smote, n_splits=10)

# Evaluate with ADASYN
accuracy_adasyn, report_adasyn, conf_matrix_adasyn = evaluate_model(X, y, adasyn, n_splits=10)

# Print results
print("SMOTE + AdaBoost")
print(f"Average Accuracy: {accuracy_smote}")
print("Classification Report:")
print(report_smote)
print("Confusion Matrix:")
print(conf_matrix_smote)

print("\nADASYN + AdaBoost")
print(f"Average Accuracy: {accuracy_adasyn}")
print("Classification Report:")
print(report_adasyn)
print("Confusion Matrix:")
print(conf_matrix_adasyn)



SMOTE + AdaBoost
Average Accuracy: 0.8447283596085097
Classification Report:
              precision    recall  f1-score   support

          no       0.94      0.85      0.90       400
         yes       0.36      0.62      0.46        53

    accuracy                           0.83       453
   macro avg       0.65      0.74      0.68       453
weighted avg       0.88      0.83      0.85       453

Confusion Matrix:
[[349  51]
 [ 19  33]]

ADASYN + AdaBoost
Average Accuracy: 0.8396418175779953
Classification Report:
              precision    recall  f1-score   support

          no       0.94      0.85      0.89       400
         yes       0.34      0.58      0.43        53

    accuracy                           0.82       453
   macro avg       0.64      0.72      0.66       453
weighted avg       0.87      0.82      0.84       453

Confusion Matrix:
[[347  53]
 [ 19  33]]
