In [None]:
# Errol Ian Ave Acosta
# CGPT | Google Colab
# Data Science with Python
# February 15, 2025

# Target Variable = suscribed-term-deposited

# This template covers an end-to-end ML workflow 🚀:
# data loading, preprocessing, model training, evaluation, and saving models.

In [None]:
# Mount Google Drive
# Colab-only step: exposes Drive under /content/drive, which is where the
# model-saving cell further down writes its pickle file.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [None]:
# Load Dataset
# Reads the CSV from the Colab runtime's local /content directory (not from
# the mounted Drive); presumably the file has to be uploaded to the session
# before this cell runs — TODO confirm.
df = pd.read_csv('/content/bank-marketing.csv')

In [None]:
# Data Preprocessing
# Treat '?' placeholders as missing values so that they are counted and
# handled by the steps below.
df = df.replace('?', np.nan)

# Check for missing values. This runs AFTER the '?' conversion (otherwise the
# placeholders are not counted) and is printed explicitly, because a bare
# expression in the middle of a cell is never displayed.
print(df.isnull().sum())

# Drop rows with any missing value (alternative: df.fillna(value)).
# The explicit copy avoids SettingWithCopyWarning on the column assignments below.
df = df.dropna().copy()

# Encode categorical variables (LabelEncoder comes from the imports cell).
# One fitted encoder is kept per column in `label_encoders`, so encoded values
# (including the target) can be mapped back with
# label_encoders[col].inverse_transform(...).
label_encoders = {}
for col in df.select_dtypes(include='object').columns:
    labelencoder = LabelEncoder()
    df[col] = labelencoder.fit_transform(df[col])
    label_encoders[col] = labelencoder

# Define features and target variable
X = df.drop('suscribed-term-deposited', axis=1)  # Features
y = df['suscribed-term-deposited']               # Target variable

In [None]:
# Train-Test Split
# 70/30 split with a fixed seed. stratify=y keeps the class proportions of the
# target the same in the train and test sets, so the evaluation is not skewed
# by an unlucky draw of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Feature Scaling
# Learn the scaling statistics from the training split only, then apply the
# same fitted transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Model Training
# Logistic Regression
# fit() returns the estimator itself, so construction and training are chained.
log_model = LogisticRegression().fit(X_train, y_train)
log_preds = log_model.predict(X_test)  # class predictions for the test split

In [None]:
# Random Forest Classifier
# Fixed seed (same value as the train/test split) so the fitted forest, its
# predictions and the feature importances plotted later are reproducible
# under Restart & Run All.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)

In [None]:
# Support Vector Machine
# Default SVC trained on the scaled training split; fit() returns the
# estimator, so it is chained onto the constructor.
svc_model = SVC().fit(X_train, y_train)
svc_preds = svc_model.predict(X_test)  # class predictions for the test split

In [None]:
# Model Evaluation
# Report every trained model, not only logistic regression, so the
# predictions computed in the training cells are actually compared.
for model_name, model_preds in [
    ('Logistic Regression', log_preds),
    ('Random Forest', rf_preds),
    ('Support Vector Machine', svc_preds),
]:
    print('=== {} ==='.format(model_name))
    print(accuracy_score(y_test, model_preds))           # Accuracy
    print(confusion_matrix(y_test, model_preds))         # Confusion Matrix
    print(classification_report(y_test, model_preds))    # Classification Report

In [None]:
# Save Trained Model
import joblib
joblib.dump(log_model, '/content/drive/My Drive/log_model.pkl')
# The model was trained on standardized features, so the fitted scaler is
# persisted next to it; without the scaler the saved model cannot be applied
# to new, unscaled data in a later session.
joblib.dump(scaler, '/content/drive/My Drive/log_model_scaler.pkl')

In [None]:
# Load Trained Model Later
# Relies on `joblib` having been imported by the save cell above.
# NOTE(review): joblib.load unpickles the file, so only load artifacts you trust.
loaded_model = joblib.load('/content/drive/My Drive/log_model.pkl')
# X_test is the already standardized test matrix from the scaling cell; raw
# feature rows must go through the same fitted scaler before predict().
predictions = loaded_model.predict(X_test)

In [None]:
# Visualization Outputs
# Confusion Matrix Heatmap
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Counts of actual vs. predicted classes for the reloaded model's predictions.
cm = confusion_matrix(y_test, predictions)
class_labels = ['No', 'Yes']

# Explicit figure/axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# ROC Curve
from sklearn.metrics import roc_curve, roc_auc_score

# Model to plot. It must expose predict_proba: log_model and rf_model do,
# while SVC only does when it is constructed with probability=True.
roc_model = log_model

# Positive-class probabilities, used for BOTH the curve and the AUC.
# Scoring the AUC on hard class predictions would collapse the ranking to a
# single threshold and report a value that disagrees with the plotted curve.
y_scores = roc_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_scores)
roc_auc = roc_auc_score(y_test, y_scores)

plt.figure(figsize=(6,4))
plt.plot(fpr, tpr, label="AUC = {:.3f}".format(roc_auc))
plt.plot([0, 1], [0, 1], linestyle='--')  # chance-level reference line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

In [None]:
# Feature Importance (for tree based models)
# Works only with estimators that expose `feature_importances_`
# (here the random forest).
model = rf_model
feature_importances = pd.Series(model.feature_importances_, index=X.columns)

# Horizontal bar chart of the ten largest importances.
top_features = feature_importances.nlargest(10)
ax = top_features.plot.barh()
ax.set_title('Top 10 Feature Importances')
plt.show()