This notebook is designed for tabular classification tasks with pandas and scikit-learn.
It is a simple example of how to use pandas and scikit-learn to build a classification model using a tabular dataset. The code includes data preprocessing, model training, and evaluation steps.
The dataset used in this example is the bank marketing dataset from the UCI Machine Learning Repository. The dataset contains information about a bank's marketing campaign and whether or not a customer subscribed to a term deposit.
The goal is to predict whether a customer will subscribe to a term deposit based on their demographic and behavioral features.

In [None]:
# Import necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix, 
                                roc_auc_score, roc_curve,auc)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder,LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings
from pathlib import Path
warnings.filterwarnings('ignore')

In [None]:
# Resolve the dataset location: <parent of the working directory>/dataset/bank.csv
notebook_path = Path.cwd()
file_path = notebook_path.parent.joinpath('dataset', 'bank.csv')
print(f"Loading data from: {file_path}")
# The file is semicolon-delimited and its first row holds the column names
data = pd.read_csv(file_path, sep=';', header=0)

In [None]:
def rename_n_change(data):
    """Return a new frame with clearer column names and categorical dtypes.

    The input frame is not modified: renaming and dtype conversion both
    produce new objects, so calling this function repeatedly (for example
    when a notebook cell is re-run) never alters the caller's original data.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns ``marital``, ``default``, ``housing``,
        ``loan`` and ``y`` (or their already-renamed equivalents), plus
        ``education``, ``job``, ``contact``, ``month`` and ``day``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the five columns renamed and ten columns
        converted to the ``category`` dtype.

    Raises
    ------
    KeyError
        If one of the columns to convert is missing after renaming.
    """
    # Rename columns for better readability; names absent from the frame are skipped
    renamed_columns = {
        'marital': 'marital_status',
        'default': 'credit_default',
        'housing': 'housing_loan',
        'loan': 'personal_loan',
        'y': 'response',
    }
    # Columns stored as the pandas 'category' dtype ('day' is included here too)
    categorical_columns = [
        'response',
        'marital_status',
        'education',
        'job',
        'contact',
        'month',
        'day',
        'credit_default',
        'housing_loan',
        'personal_loan',
    ]
    renamed = data.rename(columns=renamed_columns)
    return renamed.astype({column: 'category' for column in categorical_columns})
# Rebind `data` to the frame returned by rename_n_change (renamed columns, category dtypes)
data=rename_n_change(data)

In [None]:
# Percentage of rows per 'poutcome' value (missing values are counted as their own group)
data['poutcome'].value_counts(dropna=False).div(len(data)).mul(100)

In [None]:
# Drop poutcome with more than 80% unknown values.
# Rebinding `data` (instead of inplace=True) together with errors='ignore' keeps this
# cell re-runnable: a second execution no longer raises KeyError once the column is gone.
data = data.drop(columns='poutcome', errors='ignore')

Determine whether the data is imbalanced

In [None]:
# Bar chart of how many rows fall into each class of the target column
fig, ax = plt.subplots()
response_counts = data['response'].value_counts()
response_counts.plot(kind='bar', color=['#FF9999', '#66B3FF'], rot=0, ax=ax)
ax.set_title('Distribution of Response Variable')
ax.set_xlabel('Response')
ax.set_ylabel('Count')
plt.show()

Exploratory Data Analysis with Seaborn
Visualize the data with Seaborn to understand the distribution of features and the relationship between features
and the target variable.

In [None]:
# sns.pairplot is a figure-level function: it builds its own figure (a PairGrid),
# so a preceding plt.figure(...) call never sized the plot and only left an extra
# empty figure behind. It has therefore been removed.
sns.set_style('whitegrid')
sns.pairplot(data, hue='response')
plt.show()

In [None]:
# Bar chart of the housing-loan value counts on an explicitly created axes
fig, ax = plt.subplots(figsize=(8, 6))
data['housing_loan'].value_counts().plot(kind='bar', ax=ax)
ax.set(title='Distribution of Housing Loan', xlabel='Housing Loan', ylabel='Count')
plt.show()

In [None]:
# Correlation matrix for numeric features
numeric_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
# 'day' is converted to the category dtype by rename_n_change, so cast every selected
# column to float explicitly; .corr() then no longer depends on how the installed
# pandas version treats non-numeric columns.
numeric_ft = data[numeric_columns].astype(float)
corr_matrix = numeric_ft.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()


Assign features and labels

In [None]:
# Target column on one side, every remaining column as features on the other
y = data['response']
X = data.drop(columns='response')
print(X.shape, y.shape, sep='\n')

In [None]:
# 80% of the rows go to training, the remainder to testing; stratifying on y and
# fixing the seed makes the split class-balanced and repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    stratify=y,
    random_state=78,
)

Apply Preprocessing to Data

In [None]:
# Column groups consumed by the preprocessing step: one list per kind of transformation
numeric_features = [
    'age',
    'balance',
    'day',
    'duration',
    'campaign',
    'pdays',
    'previous',
]
categorical_features = [
    'job',
    'marital_status',
    'education',
    'month',
    'housing_loan',
    'personal_loan',
    'contact',
    'credit_default',
]

In [None]:
# Standardize the numeric columns and one-hot encode the categorical ones;
# handle_unknown='ignore' lets the encoder accept categories it did not see while fitting
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
pre_processor = ColumnTransformer(transformers=[numeric_step, categorical_step])
# Wrap the transformer in a single-step pipeline (preprocessing only, no classifier attached)
pre_pipeline = Pipeline(steps=[('preprocessor', pre_processor)])

In [None]:
# Encode the target labels as integers; the encoder learns its classes from the
# training labels and the same mapping is then applied to the test labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit(y_train).transform(y_train)
y_test = label_encoder.transform(y_test)

In [None]:
# Learn the preprocessing parameters from the training split only,
# then apply the fitted pipeline to both splits (the test split is never used for fitting)
pre_pipeline.fit(X_train)
X_train = pre_pipeline.transform(X_train)
X_test = pre_pipeline.transform(X_test)

In [None]:
# Shapes of the transformed training and test matrices, one per line
print(X_train.shape, X_test.shape, sep='\n')


In [None]:
# Oversample the minority class in the training data only.
# SMOTE draws synthetic samples at random; seeding it (same seed as the train/test
# split) makes the resampled training set identical on every run.
smote = SMOTE(random_state=78)
X_train,y_train = smote.fit_resample(X_train, y_train) # type: ignore

Model Training and Evaluation

In [None]:
# Instantiate the candidate classifiers (they are fitted in the evaluation loop).
# Every estimator with a stochastic component gets the same seed as the
# train/test split so the reported metrics are reproducible between runs.
logreg=LogisticRegression(class_weight= 'balanced')
dtree=DecisionTreeClassifier(random_state=78)
rforest=RandomForestClassifier(class_weight= 'balanced',n_estimators=100,random_state=78)
gbm=GradientBoostingClassifier(random_state=78)
gnb=GaussianNB()
knn=KNeighborsClassifier()
xgb=XGBClassifier(random_state=78)
lgbm=LGBMClassifier(class_weight='balanced',random_state=78)

In [None]:
# Fit each candidate model on the training data, score it on the held-out test set
# and draw its ROC curve on one shared figure.
models = {"Logistic Regression": logreg, "Decision Tree": dtree, "Random Forest": rforest, "Gradient Boosting": gbm, "Gaussian Naive Bayes": gnb, "K Neighbors": knn, "XGBoost": xgb, "LightGBM": lgbm}
# Collect one record per model and build the DataFrame once after the loop; growing a
# frame with pd.concat on every iteration is quadratic and, starting from an empty
# frame, leaves the metric columns with the object dtype.
records = []
plt.figure(figsize=(10, 8))
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Probability of the positive class (second column) drives the ROC metrics
    y_pred_proba = model.predict_proba(X_test)[:,1]
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    records.append({'Model': name, 'Accuracy': accuracy, 'ROC_AUC_Score': roc_auc})
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')
results = pd.DataFrame(records, columns=['Model', 'Accuracy', 'ROC_AUC_Score'])
# Diagonal reference line: the ROC curve of a random classifier
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Rank the models from highest to lowest ROC AUC, renumber the rows, and show the table
results = results.sort_values('ROC_AUC_Score', ascending=False).reset_index(drop=True)
print(results)

In [None]:
# Persist every fitted model together with the preprocessing pipeline and the label
# encoder, one .pkl file per object.
# joblib.dump does not create missing directories, so make sure the target folder
# exists before writing (a fresh checkout would otherwise fail here).
model_dir = Path('../saved_models')
model_dir.mkdir(parents=True, exist_ok=True)
artifacts = {
    'logreg_model': logreg,
    'dtree_model': dtree,
    'rforest_model': rforest,
    'gbm_model': gbm,
    'gnb_model': gnb,
    'knn_model': knn,
    'xgb_model': xgb,
    'lgbm_model': lgbm,
    'pre_pipeline': pre_pipeline,
    'label_encoder': label_encoder,
}
for stem, artifact in artifacts.items():
    joblib.dump(artifact, model_dir / f'{stem}.pkl')

In [None]:
# Reload the persisted artifacts. Only load pickle files you trust: unpickling can
# execute arbitrary code. These files are the ones written by the save cell.

# Preprocessing objects
pre_pipeline_ = joblib.load('../saved_models/pre_pipeline.pkl')
label_encoder_ = joblib.load('../saved_models/label_encoder.pkl')

# Linear, probabilistic and neighbour-based models
logreg_ = joblib.load('../saved_models/logreg_model.pkl')
gnb_ = joblib.load('../saved_models/gnb_model.pkl')
knn_ = joblib.load('../saved_models/knn_model.pkl')

# Tree-based models
dtree_ = joblib.load('../saved_models/dtree_model.pkl')
rforest_ = joblib.load('../saved_models/rforest_model.pkl')
gbm_ = joblib.load('../saved_models/gbm_model.pkl')
xgb_ = joblib.load('../saved_models/xgb_model.pkl')
lgbm_ = joblib.load('../saved_models/lgbm_model.pkl')

In [None]:
# Make predictions on new data: a single hand-written customer record.
# NOTE(review): the values must use the vocabulary of the training file. In the UCI
# bank.csv data, education is one of primary/secondary/tertiary/unknown and pdays is -1
# when the client was not contacted before; 'university.degree' and 999 belong to the
# "bank-additional" variant of the dataset. Because the encoder was built with
# handle_unknown='ignore', an unseen category would be silently encoded as all zeros
# instead of raising. Confirm against data['education'].unique() if the file differs.
new_data = pd.DataFrame({
    'age': [30],
    'balance': [1000],
    'day': [15],
    'duration': [200],
    'campaign': [1],
    'pdays': [-1],
    'previous': [0],
    'job': ['admin.'],
    'contact': ['cellular'],
    'marital_status': ['single'],
    'education': ['tertiary'],
    'month': ['may'],
    'housing_loan': ['yes'],
    'personal_loan': ['no'],
    'credit_default': ['no']
})

In [None]:
# Run the new record through the reloaded preprocessing pipeline and label the
# resulting columns with the pipeline's output feature names
feature_names = pre_pipeline_.get_feature_names_out() # type: ignore
transformed_row = pre_pipeline_.transform(new_data) # type: ignore
new_d = pd.DataFrame(transformed_row, columns=feature_names)

In [None]:

# Score the preprocessed record with every reloaded model
logreg_pred = logreg_.predict(new_d)
dtree_pred = dtree_.predict(new_d)
rforest_pred = rforest_.predict(new_d)
gbm_pred = gbm_.predict(new_d)
gnb_pred = gnb_.predict(new_d)
knn_pred = knn_.predict(new_d)
xgb_pred = xgb_.predict(new_d)
lgbm_pred = lgbm_.predict(new_d)

# Report each prediction, one line per model, in the order the models were scored
labelled_predictions = [
    ("Logistic Regression Prediction: ", logreg_pred),
    ("Decision Tree Prediction: ", dtree_pred),
    ("Random Forest Prediction: ", rforest_pred),
    ("Gradient Boosting Prediction: ", gbm_pred),
    ("Gaussian Naive Bayes Prediction: ", gnb_pred),
    ("K Neighbors Prediction: ", knn_pred),
    ("XGBoost Prediction: ", xgb_pred),
    ("LightGBM Prediction: ", lgbm_pred),
]
for message, prediction in labelled_predictions:
    print(message, prediction)