# In [9]:
#Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, ParameterGrid
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.tree import plot_tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.model_selection import ParameterGrid

# In [10]:
# Defining the class
class ModelValidator:
    """Tune, compare and report on a logistic regression and a decision tree.

    The validator holds a single train/test split. ``check_log_reg`` and
    ``check_desc_tree`` each try every combination of a fixed hyperparameter
    grid, keep the three best candidates and fit the winner on the training
    data. The plotting methods, ``feature_importance`` and ``scores`` then
    report on whichever tuned model reached the higher test accuracy; when the
    two accuracies are equal, every method picks the logistic regression.

    NOTE: candidates are ranked by accuracy on the *test* split (the last
    element of each result tuple), so the test metrics reported for the
    winner are optimistically biased. Use a separate hold-out set if an
    unbiased estimate is required.
    """

    # Display names used in plot titles and as DataFrame column headers.
    _LR_LABEL = 'Logistic Regression'
    _DT_LABEL = 'Decision Tree'

    def __init__(self, X_train, y_train, X_test, y_test):
        """Store the data split and initialise every result attribute to None.

        Args:
            X_train: Training features. ``feature_importance`` reads
                ``X_train.columns``, so a pandas DataFrame is expected there.
            y_train: Training labels.
            X_test: Test features.
            y_test: Test labels.
        """
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        # Fold scores / mean of the most recent ``cross_validate`` call.
        self.cv_results = None
        self.cv_mean = None
        # Cross-validated accuracy of each tuned model (set by ``scores``).
        self.cv_top_lr = None
        self.cv_top_dt = None
        # Search results: one-element list with the best entry, and the top 3.
        # Each entry is ``(params, test_accuracy)`` or, when the search ran
        # with ``cv_now=True``, ``(params, cv_accuracy, test_accuracy)``.
        self.top_hp_lr = None
        self.top_3_lr = None
        self.top_hp_dt = None
        self.top_3_dt = None
        # Best estimators, fitted on the training split.
        self.top_model_lr = None
        self.top_model_dt = None

    def cross_validate(self, model, kfolds=10):
        """Cross-validate ``model`` on the training split.

        Args:
            model: Estimator passed to ``cross_val_score``.
            kfolds: Value forwarded as ``cv``.

        Returns:
            The mean of the fold scores. The individual fold scores and their
            mean are also stored in ``cv_results`` and ``cv_mean``.
        """
        self.cv_results = cross_val_score(model, self.X_train, self.y_train, cv=kfolds)
        self.cv_mean = np.mean(self.cv_results)
        return self.cv_mean

    def _grid_search(self, estimator_cls, grid, cv_now):
        """Evaluate every combination in ``grid`` and rank them by test accuracy.

        Returns:
            ``(top_three, top_one, best_model)`` where the first two are
            slices of the ranked result list and ``best_model`` is a new
            estimator with the winning parameters, fitted on the training
            split.

        Raises:
            ValueError: If every combination in the grid was rejected.
        """
        ranked = []
        last_error = None
        for params in ParameterGrid(grid):
            try:
                # A fresh estimator per combination keeps candidates independent.
                candidate = estimator_cls(random_state=42).set_params(**params)
                candidate.fit(self.X_train, self.y_train)
                test_accuracy = accuracy_score(self.y_test, candidate.predict(self.X_test))
                if cv_now:
                    entry = (params, self.cross_validate(candidate), test_accuracy)
                else:
                    entry = (params, test_accuracy)
            except ValueError as error:
                # The grids deliberately contain combinations the estimator
                # rejects (e.g. a penalty the chosen solver does not support);
                # those are skipped rather than treated as failures.
                last_error = error
                continue
            ranked.append(entry)

        if not ranked:
            raise ValueError(
                f"No valid hyperparameter combination found for {estimator_cls.__name__}"
            ) from last_error

        # The sort is stable, so equally accurate candidates keep grid order.
        ranked.sort(key=lambda entry: entry[-1], reverse=True)
        best_model = estimator_cls(random_state=42).set_params(**ranked[0][0])
        best_model.fit(self.X_train, self.y_train)
        return ranked[:3], ranked[:1], best_model

    def check_log_reg(self, cv_now=False):
        """Search the logistic-regression grid and store the best results.

        Sets ``top_3_lr``, ``top_hp_lr`` and ``top_model_lr``.

        Args:
            cv_now: When True, each candidate is also cross-validated and the
                result entries become ``(params, cv_accuracy, test_accuracy)``.

        Raises:
            ValueError: If no combination in the grid could be fitted.
        """
        log_reg_hyperp = {
            'C': [0.01, 0.1, 1.0, 10, 100, 1000],
            'penalty': ['l1', 'l2', 'elasticnet'],
            'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky'],
            'tol': [1e-5, 1e-4, 1e-3],
            'random_state': [42],  # fixed seed for reproducibility
            'max_iter': [1000, 10000],
        }
        self.top_3_lr, self.top_hp_lr, self.top_model_lr = self._grid_search(
            LogisticRegression, log_reg_hyperp, cv_now
        )

    def check_desc_tree(self, cv_now=False):
        """Search the decision-tree grid and store the best results.

        Sets ``top_3_dt``, ``top_hp_dt`` and ``top_model_dt``.

        Args:
            cv_now: When True, each candidate is also cross-validated and the
                result entries become ``(params, cv_accuracy, test_accuracy)``.

        Raises:
            ValueError: If no combination in the grid could be fitted.
        """
        decision_tree_hyperp = {
            'criterion': ['gini', 'entropy', 'log_loss'],
            'max_depth': range(1, 5),
            'min_samples_split': range(2, 4),
            'min_samples_leaf': range(1, 4),
            'random_state': [42],  # fixed seed for reproducibility
        }
        self.top_3_dt, self.top_hp_dt, self.top_model_dt = self._grid_search(
            DecisionTreeClassifier, decision_tree_hyperp, cv_now
        )

    def _require_search(self):
        """Raise a clear error unless both hyperparameter searches have run."""
        if not self.top_hp_lr or not self.top_hp_dt:
            raise RuntimeError(
                "Run check_log_reg() and check_desc_tree() before requesting results"
            )

    def _tree_is_best(self):
        """Return True when the tuned tree beats the tuned regression on test accuracy."""
        self._require_search()
        # Strict comparison: a tie goes to the logistic regression everywhere.
        return self.top_hp_lr[0][-1] < self.top_hp_dt[0][-1]

    def _best_model(self):
        """Return ``(fitted_model, display_name)`` for the better tuned model."""
        if self._tree_is_best():
            return self.top_model_dt, self._DT_LABEL
        return self.top_model_lr, self._LR_LABEL

    def plot_confusion_matrix(self):
        """Plot the test-set confusion matrix of the better tuned model."""
        model, label = self._best_model()
        ConfusionMatrixDisplay.from_estimator(model, self.X_test, self.y_test)
        plt.title(f"Confusion Matrix for {label}")

    def plot_roc_curve(self):
        """Plot the test-set ROC curve of the better tuned model."""
        model, label = self._best_model()
        RocCurveDisplay.from_estimator(model, self.X_test, self.y_test)
        plt.title(f"ROC Curve for {label}")

    def feature_importance(self, n=10):
        """Return the ``n`` most influential features of the better tuned model.

        For the decision tree the weights are ``feature_importances_``; for
        the logistic regression they are the absolute values of the first row
        of ``coef_``.

        Args:
            n: Maximum number of features to return.

        Returns:
            A list of ``(weight, column_name)`` tuples, largest weight first.
        """
        use_tree = self._tree_is_best()
        model, _ = self._best_model()
        weights = model.feature_importances_ if use_tree else abs(model.coef_[0])
        ranked = list(zip(weights, self.X_train.columns))
        ranked.sort(key=lambda pair: pair[0], reverse=True)
        return ranked[:n]

    def _cv_accuracy(self, top_entry, model):
        """Return the cross-validated accuracy that belongs to ``top_entry``.

        A three-element entry already carries the CV score from the search.
        A two-element entry only holds test accuracy, so the score is
        computed now instead of mislabelling test accuracy as a CV result.
        """
        if len(top_entry) == 3:
            return top_entry[1]
        return self.cross_validate(model)

    def _metric_column(self, model, label, cv_score):
        """Build a one-column DataFrame of train/test metrics for ``model``."""
        train_pred = model.predict(self.X_train)
        test_pred = model.predict(self.X_test)
        scorers = (
            ('Accuracy', accuracy_score),
            ('Recall', recall_score),
            ('Precision', precision_score),
            ('F1', f1_score),
        )
        metrics = {}
        for name, scorer in scorers:
            metrics[f'{name} train'] = round(scorer(self.y_train, train_pred), 3)
            metrics[f'{name} test'] = round(scorer(self.y_test, test_pred), 3)
        metrics['CV results'] = round(cv_score, 3)
        return pd.DataFrame(list(metrics.values()), index=list(metrics), columns=[label])

    def scores(self, both_models=False):
        """Return accuracy, recall, precision, F1 and CV accuracy as a DataFrame.

        Recall, precision and F1 are computed with scikit-learn's default
        arguments. Also sets ``cv_top_lr`` and ``cv_top_dt``.

        Args:
            both_models: When True, return one column per model; otherwise
                return only the column of the better tuned model.

        Returns:
            A DataFrame indexed by metric name.
        """
        self._require_search()
        self.cv_top_lr = self._cv_accuracy(self.top_hp_lr[0], self.top_model_lr)
        self.cv_top_dt = self._cv_accuracy(self.top_hp_dt[0], self.top_model_dt)

        decision_tree_df = self._metric_column(self.top_model_dt, self._DT_LABEL, self.cv_top_dt)
        logistic_regression_df = self._metric_column(self.top_model_lr, self._LR_LABEL, self.cv_top_lr)

        if both_models:
            return pd.concat([decision_tree_df, logistic_regression_df], axis=1)
        return decision_tree_df if self._tree_is_best() else logistic_regression_df

    def plot_tree(self):
        """Draw the tuned decision tree on a new 8x8 inch figure."""
        f, ax = plt.subplots(figsize=(8, 8))
        # Inside a method the bare name resolves to the module-level
        # ``plot_tree`` imported from sklearn.tree, not to this method.
        plot_tree(self.top_model_dt, ax=ax)
        plt.title('Decision Tree Classifier')
