In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from collections import Counter

# --- PART 1: Exploratory Data Analysis (EDA) ---
def perform_eda():
    """Draw the exploratory plots for ``bank-full.csv`` and save each as a PNG.

    The semicolon-separated CSV is read from the relative path
    ``bank-full.csv``. Eight figures are then produced one after another
    (target balance, age/duration histograms, duration, job, a 2x2 grid of
    categorical features, previous-campaign outcome and balance against the
    target, plus a numeric correlation heatmap). Every figure is written to
    the current directory and closed before the next one is started.

    Progress is reported with ``print``. Nothing is returned, and a missing
    file or any other exception is reported on stdout rather than raised.
    """
    def _save_and_close(filename):
        # Persist the active figure, release it, then announce the file.
        plt.savefig(filename)
        plt.close()
        print(f"Generated {filename}")

    print("--- 1. Starting Exploratory Data Analysis (EDA) ---")
    try:
        df = pd.read_csv("bank-full.csv", sep=';')
        print("Data loaded successfully for EDA.")

        # 1.1 How balanced is the target?
        plt.figure(figsize=(6, 4))
        sns.countplot(data=df, x='y')
        plt.title('Distribution of Target Variable (y)')
        plt.xlabel('Subscribed a Term Deposit')
        plt.ylabel('Count')
        _save_and_close('target_distribution.png')

        # 1.2 Histograms of two numeric features, side by side
        _, (age_ax, duration_ax) = plt.subplots(1, 2, figsize=(12, 5))
        sns.histplot(df['age'], bins=30, kde=True, ax=age_ax)
        age_ax.set_title('Distribution of Age')
        sns.histplot(df['duration'], bins=50, kde=True, ax=duration_ax)
        duration_ax.set_title('Distribution of Contact Duration (seconds)')
        # Cap the x-axis so the long right tail does not flatten the plot.
        duration_ax.set_xlim(0, 2000)
        plt.tight_layout()
        _save_and_close('numeric_distributions.png')

        # 1.3 Contact duration split by outcome
        plt.figure(figsize=(8, 6))
        sns.boxplot(data=df, x='y', y='duration')
        plt.title('Contact Duration vs Subscription (y)')
        plt.ylim(0, 1500)
        _save_and_close('duration_vs_target.png')

        # 1.4 Job type split by outcome, most frequent job first
        plt.figure(figsize=(12, 7))
        jobs_by_frequency = df['job'].value_counts().index
        sns.countplot(data=df, x='job', hue='y', order=jobs_by_frequency)
        plt.title('Job Type vs Subscription (y)')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        _save_and_close('job_vs_target.png')

        # 1.5 Pairwise correlation of the numeric columns
        plt.figure(figsize=(10, 7))
        sns.heatmap(
            df.select_dtypes(include=np.number).corr(),
            annot=True,
            fmt='.2f',
            cmap='coolwarm',
        )
        plt.title('Correlation Heatmap')
        plt.tight_layout()
        _save_and_close('correlation_heatmap.png')

        # 1.6 Four categorical features against the target in a 2x2 grid
        fig, axes = plt.subplots(2, 2, figsize=(14, 10))
        fig.suptitle('Categorical Features vs Subscription (y)', fontsize=16)
        grid_columns = ('marital', 'education', 'housing', 'loan')
        for column, axis in zip(grid_columns, axes.flat):
            sns.countplot(data=df, x=column, hue='y', ax=axis)
        # Only the education panel has its tick labels rotated.
        axes[0, 1].tick_params(axis='x', rotation=30)
        # Leave headroom at the top for the figure-level title.
        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
        _save_and_close('categorical_eda_1.png')

        # 1.7 Outcome of the previous campaign against the target
        plt.figure(figsize=(8, 5))
        sns.countplot(
            data=df,
            x='poutcome',
            hue='y',
            order=['unknown', 'failure', 'other', 'success'],
        )
        plt.title('Previous Campaign Outcome vs Subscription (y)')
        plt.tight_layout()
        _save_and_close('poutcome_vs_target.png')

        # 1.8 Account balance split by outcome
        plt.figure(figsize=(8, 6))
        sns.boxplot(data=df, x='y', y='balance')
        plt.title('Average Yearly Balance vs Subscription (y)')
        plt.ylim(-2000, 8000)
        plt.tight_layout()
        _save_and_close('balance_vs_target.png')

        print("--- EDA Finished ---")

    except FileNotFoundError:
        print("Error: bank-full.csv not found.")
    except Exception as e:
        print(f"An error occurred during EDA: {e}")


# --- PART 2: Custom Components (From Scratch) ---

class MyStandardScaler:
    """Standardize features to zero mean and unit variance, from scratch.

    After ``fit`` the instance holds:
        mean_: per-column mean of the fitted data.
        std_:  per-column population standard deviation (``np.std`` with
               its default ``ddof=0``); zero entries are replaced by 1.0.
    """
    def __init__(self):
        self.mean_ = None
        self.std_ = None

    def fit(self, X):
        """Learn the per-column mean and standard deviation of ``X``.

        Args:
            X: array-like reduced along axis 0 (rows are samples).

        Returns:
            The scaler itself, so calls can be chained.
        """
        self.mean_ = np.mean(X, axis=0)
        std = np.std(X, axis=0)
        # A constant column has std 0. Use a scale of 1.0 so the column is
        # only centred; a tiny epsilon would multiply any unseen value that
        # differs from the training constant by ~1e8.
        self.std_ = np.where(std == 0, 1.0, std)
        return self

    def transform(self, X):
        """Return ``(X - mean_) / std_`` using the fitted statistics.

        Raises:
            ValueError: if the scaler has not been fitted yet.
        """
        if self.mean_ is None or self.std_ is None:
            raise ValueError("Fit scaler before transforming.")
        return (X - self.mean_) / self.std_

    def fit_transform(self, X):
        """Fit on ``X`` and return the standardized ``X``."""
        return self.fit(X).transform(X)


def train_test_split_scratch(X, y, test_size=0.2, random_state=None):
    """Shuffle the samples and split them into train and test parts.

    Args:
        X: features, a DataFrame or array-like with samples on axis 0.
        y: labels, a Series/DataFrame or array-like with as many entries
            as ``X`` has rows.
        test_size: fraction of the samples (between 0 and 1) put in the
            test part; the test count is ``int(n_samples * test_size)``.
        random_state: optional seed. The same seed always gives the same
            split.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as NumPy arrays.

    Raises:
        ValueError: if ``X`` and ``y`` disagree in length or ``test_size``
            is outside ``[0, 1]``.
    """
    if isinstance(X, pd.DataFrame):
        X = X.values
    if isinstance(y, (pd.Series, pd.DataFrame)):
        y = y.values
    # Plain lists/tuples are accepted too; fancy indexing below needs arrays.
    X = np.asarray(X)
    y = np.asarray(y)

    n_samples = X.shape[0]
    if y.shape[0] != n_samples:
        raise ValueError(
            f"X and y have different lengths: {n_samples} vs {y.shape[0]}"
        )
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be within [0, 1], got {test_size}")

    if random_state is None:
        shuffled_indices = np.random.permutation(n_samples)
    else:
        # A private RandomState yields exactly the permutation that
        # np.random.seed(random_state) followed by np.random.permutation
        # would, but leaves the caller's global RNG state untouched.
        rng = np.random.RandomState(random_state)
        shuffled_indices = rng.permutation(n_samples)

    n_test = int(n_samples * test_size)
    n_train = n_samples - n_test

    train_indices = shuffled_indices[:n_train]
    test_indices = shuffled_indices[n_train:]

    return X[train_indices], X[test_indices], y[train_indices], y[test_indices]

class MyLogisticRegression:
    """Binary logistic regression trained with batch gradient descent.

    Args:
        learning_rate: step size applied to each gradient update.
        n_iterations: number of full-batch updates performed by ``fit``.
        verbose: when true, the cross-entropy cost is printed every 100
            iterations.

    After ``fit`` the instance holds ``weights`` (one per feature) and
    ``bias``.
    """
    def __init__(self, learning_rate=0.01, n_iterations=1000, verbose=False):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.verbose = verbose
        self.weights = None
        self.bias = None

    def _sigmoid(self, z):
        """Logistic function; logits are clipped so ``np.exp`` cannot overflow."""
        z_clipped = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z_clipped))

    def fit(self, X, y):
        """Fit weights and bias to ``X`` (n_samples, n_features) and 0/1 labels ``y``.

        Returns:
            The model itself, so calls can be chained.

        Raises:
            ValueError: if ``X`` has no rows or ``y`` has a different length.
        """
        # Coerce once up front: object-dtype input (e.g. arrays taken from a
        # mixed-dtype DataFrame) would otherwise make np.exp fail.
        X = np.asarray(X, dtype=float)
        # Flatten so an (n, 1) label column cannot broadcast A - y to (n, n).
        y = np.asarray(y, dtype=float).ravel()

        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("Cannot fit on an empty dataset.")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X and y have different lengths: {n_samples} vs {y.shape[0]}"
            )

        self.weights = np.zeros(n_features)
        self.bias = 0.0
        epsilon = 1e-8  # keeps log() away from 0 when a probability saturates

        for i in range(self.n_iterations):
            z = np.dot(X, self.weights) + self.bias
            A = self._sigmoid(z)

            # The cost is only reported, never used by the update, so it is
            # computed just for the iterations that print it.
            if self.verbose and i % 100 == 0:
                cost = -(1/n_samples) * np.sum(y * np.log(A + epsilon) + (1 - y) * np.log(1 - A + epsilon))
                print(f"Iteration {i} | Cost: {cost:.4f}")

            # Gradient of the mean cross-entropy w.r.t. weights and bias.
            dw = (1 / n_samples) * np.dot(X.T, (A - y))
            db = (1 / n_samples) * np.sum(A - y)

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

        return self

    def predict_proba(self, X):
        """Return the predicted probability of class 1 for each row of ``X``.

        Raises:
            ValueError: if the model has not been fitted yet.
        """
        if self.weights is None or self.bias is None:
            raise ValueError("Fit model before predicting.")
        X = np.asarray(X, dtype=float)
        z = np.dot(X, self.weights) + self.bias
        return self._sigmoid(z)

    def predict(self, X, threshold=0.5):
        """Return 0/1 labels: 1 where the probability is at least ``threshold``."""
        return (self.predict_proba(X) >= threshold).astype(int)


# --- Evaluation Metrics ---

def confusion_matrix_scratch(y_true, y_pred):
    """Count the binary confusion-matrix cells for 0/1 labels.

    Args:
        y_true: ground-truth labels (array-like of 0/1).
        y_pred: predicted labels (array-like of 0/1), same length.

    Returns:
        ``(tp, tn, fp, fn)``. Note the order: it is not the sklearn layout.

    Raises:
        ValueError: if the two inputs do not contain the same number of labels.
    """
    # Coerce and flatten first. A plain list compared with `== 1` yields a
    # single False (every count would silently be 0), and an (n,) vs (n, 1)
    # pair would broadcast to an n-by-n matrix.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred have different lengths: "
            f"{y_true.shape[0]} vs {y_pred.shape[0]}"
        )

    tp = np.sum((y_true == 1) & (y_pred == 1))
    tn = np.sum((y_true == 0) & (y_pred == 0))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    return tp, tn, fp, fn

def accuracy_scratch(y_true, y_pred):
    """Return the share of counted samples that were classified correctly.

    The denominator is the sum of the four confusion-matrix cells; when that
    sum is zero the result is 0.
    """
    tp, tn, fp, fn = confusion_matrix_scratch(y_true, y_pred)
    n_counted = tp + tn + fp + fn
    if n_counted > 0:
        return (tp + tn) / n_counted
    return 0

def precision_scratch(y_true, y_pred):
    """Return precision, ``tp / (tp + fp)``.

    Returns 0.0 when nothing was predicted positive, so the ratio is
    undefined.
    """
    tp, _tn, fp, _fn = confusion_matrix_scratch(y_true, y_pred)
    predicted_positive = tp + fp
    # Guard the empty case explicitly instead of adding an epsilon to the
    # denominator, which would bias every result (a perfect score would
    # never be exactly 1.0).
    if predicted_positive == 0:
        return 0.0
    return tp / predicted_positive

def recall_scratch(y_true, y_pred):
    """Return recall, ``tp / (tp + fn)``.

    Returns 0.0 when there are no actual positives, so the ratio is
    undefined.
    """
    tp, _tn, _fp, fn = confusion_matrix_scratch(y_true, y_pred)
    actual_positive = tp + fn
    # Guard the empty case explicitly instead of adding an epsilon to the
    # denominator, which would bias every result (a perfect score would
    # never be exactly 1.0).
    if actual_positive == 0:
        return 0.0
    return tp / actual_positive

def f1_score_scratch(y_true, y_pred):
    """Return the F1 score, the harmonic mean of precision and recall.

    Returns 0.0 when precision and recall are both zero.
    """
    prec = precision_scratch(y_true, y_pred)
    rec = recall_scratch(y_true, y_pred)
    combined = prec + rec
    # Guard the degenerate case explicitly rather than padding the
    # denominator with an epsilon, which would bias every result.
    if combined == 0:
        return 0.0
    return 2 * (prec * rec) / combined


# --- PART 3: Full Pipeline ---

def main_pipeline() -> None:
    """Run the 'from scratch' workflow: load, preprocess, train, evaluate.

    Steps, in order:
      1. Read ``bank-full.csv`` (semicolon-separated). On any load error the
         error is printed and the function returns early.
      2. Map the yes/no columns ``default``, ``housing``, ``loan`` and ``y``
         to 1/0 and one-hot encode the remaining object-dtype columns with
         ``drop_first=True``.
      3. Split 80/20 with ``train_test_split_scratch`` (``random_state=42``).
      4. Standardize the originally numeric columns with ``MyStandardScaler``,
         fitted on the training rows only.
      5. Train ``MyLogisticRegression`` (learning rate 0.01, 1000 iterations,
         verbose) and print the confusion matrix, accuracy, precision, recall
         and F1 score on the test rows.

    Everything is reported through ``print``; nothing is returned.
    """
    print("\n--- 2. Starting 'From Scratch' ML Pipeline ---")

    try:
        df = pd.read_csv("bank-full.csv", sep=';')
        print("Data loaded successfully for pipeline.")
    except Exception as e:
        print(f"Error loading data: {e}")
        return

    print("Starting preprocessing...")
    binary_cols = ['default', 'housing', 'loan', 'y']
    # Column groups are captured BEFORE any encoding, so `numeric_cols` holds
    # only the columns that were numeric in the raw file (not the 0/1 columns
    # created below).
    categorical_cols = df.select_dtypes(include=['object']).columns.drop(binary_cols)
    numeric_cols = df.select_dtypes(include=np.number).columns.tolist()

    # Values other than 'yes'/'no' are not in the mapping (pandas maps them
    # to NaN).
    for col in binary_cols:
        df[col] = df[col].map({'yes': 1, 'no': 0})
    print(f"Mapped binary columns to 1/0: {binary_cols}")

    # drop_first=True drops one dummy per category.
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
    print(f"One-hot encoded columns: {categorical_cols.tolist()}")

    X = df.drop('y', axis=1)
    y = df['y']

    X_train_df, X_test_df, y_train, y_test = train_test_split_scratch(X, y, test_size=0.2, random_state=42)

    # The split returns plain arrays; wrap them again so columns can be
    # selected by name for scaling.
    X_train_df = pd.DataFrame(X_train_df, columns=X.columns)
    X_test_df = pd.DataFrame(X_test_df, columns=X.columns)
    print(f"Train set shape: {X_train_df.shape}, Test set shape: {X_test_df.shape}")

    print("Scaling numeric features 'from scratch'...")
    cols_to_scale = [col for col in numeric_cols if col in X_train_df.columns]
    print(f"Columns to be scaled: {cols_to_scale}")

    # Force the columns to a numeric dtype before scaling.
    # NOTE(review): presumably needed because the frames rebuilt from the
    # split's raw arrays can come back as object dtype; confirm.
    # errors='coerce' turns anything unparseable into NaN instead of raising.
    X_train_df[cols_to_scale] = X_train_df[cols_to_scale].apply(pd.to_numeric, errors='coerce')
    X_test_df[cols_to_scale] = X_test_df[cols_to_scale].apply(pd.to_numeric, errors='coerce')

    # Fit on the training rows only, then apply the same statistics to both
    # parts so no test-set information leaks into the scaling.
    scaler = MyStandardScaler()
    scaler.fit(X_train_df[cols_to_scale].values)
    X_train_df[cols_to_scale] = scaler.transform(X_train_df[cols_to_scale].values)
    X_test_df[cols_to_scale] = scaler.transform(X_test_df[cols_to_scale].values)
    print("Scaling applied successfully.")

    # Convert every remaining column (dummies, binary flags) to float so the
    # model receives a purely numeric matrix; the original author noted this
    # as the fix for an exp() failure during training.
    X_train_df = X_train_df.apply(pd.to_numeric, errors='coerce').astype(float)
    X_test_df = X_test_df.apply(pd.to_numeric, errors='coerce').astype(float)

    print("\n--- Training Logistic Regression ---")
    model = MyLogisticRegression(learning_rate=0.01, n_iterations=1000, verbose=True)
    model.fit(X_train_df.values, y_train.astype(float))  # ensure y is also float
    print("Model training complete.")

    print("\n--- Evaluating Model ---")
    # predict() is called with its default threshold.
    y_pred = model.predict(X_test_df.values)
    tp, tn, fp, fn = confusion_matrix_scratch(y_test, y_pred)
    acc = accuracy_scratch(y_test, y_pred)
    prec = precision_scratch(y_test, y_pred)
    rec = recall_scratch(y_test, y_pred)
    f1 = f1_score_scratch(y_test, y_pred)

    print("\nConfusion Matrix:")
    print(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")
    print("\nMetrics:")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1 Score:  {f1:.4f}")

    print("\n--- Pipeline Complete ---")


# --- Run Everything ---
if __name__ == "__main__":
    # Runs only when the module's __name__ is "__main__", not when it is
    # imported. The EDA plots are produced first, then the model is trained
    # and evaluated.
    perform_eda()
    main_pipeline()


--- 1. Starting Exploratory Data Analysis (EDA) ---
Data loaded successfully for EDA.
Generated target_distribution.png
Generated numeric_distributions.png
Generated duration_vs_target.png
Generated job_vs_target.png
Generated correlation_heatmap.png
Generated categorical_eda_1.png
Generated poutcome_vs_target.png
Generated balance_vs_target.png
--- EDA Finished ---

--- 2. Starting 'From Scratch' ML Pipeline ---
Data loaded successfully for pipeline.
Starting preprocessing...
Mapped binary columns to 1/0: ['default', 'housing', 'loan', 'y']
One-hot encoded columns: ['job', 'marital', 'education', 'contact', 'month', 'poutcome']
Train set shape: (36169, 42), Test set shape: (9042, 42)
Scaling numeric features 'from scratch'...
Columns to be scaled: ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
Scaling applied successfully.

--- Training Logistic Regression ---
Iteration 0 | Cost: 0.6931
Iteration 100 | Cost: 0.4263
Iteration 200 | Cost: 0.3529
Iteration 300 | C