In [18]:
#Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, ParameterGrid
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.tree import plot_tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, roc_curve, auc, RocCurveDisplay
from sklearn.model_selection import ParameterGrid
from sklearn.utils import resample
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [20]:
# Defining the class
class ModelValidator:
    """Tune, compare and visualise a logistic regression and a decision tree.

    The validator holds one train/test split. The ``check_*`` methods run an
    exhaustive search over a fixed hyperparameter grid, keep the best
    combinations and store a model refitted with the winning parameters. The
    remaining methods report on, or plot, those stored models.

    Note: candidates are ranked by their score on the test split (the last
    element of every result tuple), so the test metrics reported afterwards
    are optimistic estimates.

    Each result entry is ``(params, test_score)`` when tuning ran with
    ``cv_now=False`` and ``(params, cv_score, test_score)`` otherwise.
    """

    def __init__(self, X_train, y_train, X_test, y_test):
        """Store the data split and initialise every result attribute to None."""
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        # Results of the most recent cross_validate() call.
        self.cv_results = None
        self.cv_mean = None
        # Cross-validation score of the best model of each family (set by scores()).
        self.cv_top_lr = None
        self.cv_top_dt = None
        # Logistic regression tuned for accuracy (set by check_log_reg()).
        self.top_hp_lr = None
        self.top_3_lr = None
        self.top_model_lr = None
        # Decision tree tuned for accuracy (set by check_desc_tree()).
        self.top_hp_dt = None
        self.top_3_dt = None
        self.top_model_dt = None
        # Logistic regression tuned for precision (set by check_precision()).
        self.top_hp_precision = None
        self.top_3_precision = None
        self.top_model_precision = None

    def cross_validate(self, model, kfolds=10, scoring=None):
        """Cross-validate ``model`` on the training data and return the mean score.

        Parameters
        ----------
        model : estimator
            Estimator passed to ``cross_val_score``.
        kfolds : int, default 10
            Value forwarded as ``cv``.
        scoring : str or callable, optional
            Value forwarded as ``scoring``; ``None`` keeps the estimator's
            default scorer.

        Returns
        -------
        float
            Mean of the per-fold scores. The per-fold scores and their mean are
            also stored on ``self.cv_results`` and ``self.cv_mean``.
        """
        self.cv_results = cross_val_score(
            model, self.X_train, self.y_train, cv=kfolds, scoring=scoring
        )
        self.cv_mean = np.mean(self.cv_results)
        return self.cv_mean

    def check_multicollinearity(self):
        """Return the variance inflation factor of every training feature.

        Returns
        -------
        pandas.DataFrame
            Columns ``"VIF Factor"`` and ``"features"``, sorted by descending VIF.
        """
        X_train = self.X_train
        design = X_train.values
        vif = pd.DataFrame()
        vif["VIF Factor"] = [
            variance_inflation_factor(design, i) for i in range(X_train.shape[1])
        ]
        vif["features"] = X_train.columns
        return vif.sort_values('VIF Factor', ascending=False)

    @staticmethod
    def downsample(X_train, y_train, target_column, random_state=42):
        """Balance a binary (0/1) data set by downsampling the larger class.

        Parameters
        ----------
        X_train : pandas.DataFrame
            Training data. It may already contain ``target_column``; if it does
            not, ``y_train`` is attached under that name (matched by position).
        y_train : array-like or None
            Target values; only used when ``target_column`` is missing from
            ``X_train``.
        target_column : str
            Name of the target column.
        random_state : int, default 42
            Seed forwarded to ``sklearn.utils.resample``.

        Returns
        -------
        (pandas.DataFrame, pandas.Series)
            Downsampled features (without the target column) and target.

        Raises
        ------
        KeyError
            If ``target_column`` is not in ``X_train`` and ``y_train`` is None.
        """
        if target_column in X_train.columns:
            data = X_train
        elif y_train is None:
            raise KeyError(target_column)
        else:
            # Attach by position so a differing index on y_train cannot misalign rows.
            data = X_train.assign(**{target_column: np.asarray(y_train)})

        class_zero = data[data[target_column] == 0]
        class_one = data[data[target_column] == 1]

        # Downsample whichever class is larger; sampling without replacement
        # cannot draw more rows than the class contains.
        if len(class_zero) >= len(class_one):
            majority_class, minority_class = class_zero, class_one
        else:
            majority_class, minority_class = class_one, class_zero

        majority_downsampled = resample(
            majority_class,
            replace=False,
            n_samples=len(minority_class),
            random_state=random_state,
        )

        downsampled_data = pd.concat([majority_downsampled, minority_class])
        X_downsampled = downsampled_data.drop(target_column, axis=1)
        y_downsampled = downsampled_data[target_column]
        return X_downsampled, y_downsampled

    def _grid_search(self, model, param_grid, metric, cv_now, cv_scoring=None):
        """Score every combination in ``param_grid`` and return them best-first.

        ``model`` is reconfigured in place for each combination, fitted on the
        training split and scored with ``metric(y_test, predictions)``. When
        ``cv_now`` is true the cross-validated score (using ``cv_scoring``) is
        recorded as well. Combinations rejected with ``ValueError`` (for
        example an unsupported penalty/solver pairing) are skipped.

        Raises
        ------
        ValueError
            If no combination could be fitted and scored.
        """
        results = []
        for params in ParameterGrid(param_grid):
            try:
                model.set_params(**params)
                model.fit(self.X_train, self.y_train)
                test_score = metric(self.y_test, model.predict(self.X_test))
                if cv_now:
                    cv_score = self.cross_validate(model, scoring=cv_scoring)
                    results.append((params, cv_score, test_score))
                else:
                    results.append((params, test_score))
            except ValueError:
                # Deliberate best-effort: invalid combinations are ignored.
                continue

        if not results:
            raise ValueError(
                "No hyperparameter combination could be fitted and scored."
            )

        # The test-set score is always the last element of the tuple.
        results.sort(key=lambda entry: entry[-1], reverse=True)
        return results

    def check_precision(self, cv_now=False):
        """Tune a logistic regression for precision.

        Stores the three best entries in ``top_3_precision``, the best one in
        ``top_hp_precision`` and a model refitted with the best parameters in
        ``top_model_precision``.

        Parameters
        ----------
        cv_now : bool, default False
            Also record the cross-validated precision of every combination.
        """
        precision_hyperp = {
            'C': [0.001, 0.01, 0.1, 1.0, 10],
            'penalty': ['l2'],
            'solver': ['lbfgs', 'liblinear', 'newton-cg'],
            'tol': [1e-5, 1e-4, 1e-3],
            'random_state': [42],
            'max_iter': [1000, 10000]
        }

        ranked = self._grid_search(
            LogisticRegression(random_state=42),
            precision_hyperp,
            precision_score,
            cv_now,
            cv_scoring='precision',
        )

        self.top_3_precision = ranked[:3]
        self.top_hp_precision = ranked[:1]
        best_params = self.top_hp_precision[0][0]
        self.top_model_precision = (
            LogisticRegression(random_state=42)
            .set_params(**best_params)
            .fit(self.X_train, self.y_train)
        )

    def check_log_reg(self, cv_now=False):
        """Tune a logistic regression for accuracy.

        Stores the three best entries in ``top_3_lr``, the best one in
        ``top_hp_lr`` and a model refitted with the best parameters in
        ``top_model_lr``.

        Parameters
        ----------
        cv_now : bool, default False
            Also record the cross-validated accuracy of every combination.
        """
        log_reg_hyperp = {
            'C': [0.01, 0.1, 1.0, 10, 100, 1000],
            'penalty': ['l1', 'l2', 'elasticnet'],
            'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky'],
            'tol': [1e-5, 1e-4, 1e-3],
            # NOTE(review): 420 differs from the 42 used elsewhere in this class; confirm intent.
            'random_state': [420],
            'max_iter': [1000, 10000]
        }

        ranked = self._grid_search(
            LogisticRegression(), log_reg_hyperp, accuracy_score, cv_now
        )

        self.top_3_lr = ranked[:3]
        self.top_hp_lr = ranked[:1]
        best_params = self.top_hp_lr[0][0]
        self.top_model_lr = (
            LogisticRegression()
            .set_params(**best_params)
            .fit(self.X_train, self.y_train)
        )

    def check_desc_tree(self, cv_now=False):
        """Tune a decision tree for accuracy.

        Stores the three best entries in ``top_3_dt``, the best one in
        ``top_hp_dt`` and a model refitted with the best parameters in
        ``top_model_dt``.

        Parameters
        ----------
        cv_now : bool, default False
            Also record the cross-validated accuracy of every combination.
        """
        decision_tree_hyperp = {
            'criterion': ['gini', 'entropy', 'log_loss'],
            'max_depth': range(1, 8),
            'min_samples_split': range(2, 4),
            'min_samples_leaf': range(1, 4),
            'random_state': [42],
        }

        ranked = self._grid_search(
            DecisionTreeClassifier(random_state=42),
            decision_tree_hyperp,
            accuracy_score,
            cv_now,
        )

        self.top_3_dt = ranked[:3]
        self.top_hp_dt = ranked[:1]
        best_params = self.top_hp_dt[0][0]
        self.top_model_dt = (
            DecisionTreeClassifier(random_state=42)
            .set_params(**best_params)
            .fit(self.X_train, self.y_train)
        )

    def plot_confusion_matrix(self):
        """Plot the test-set confusion matrices of both tuned models side by side.

        Requires ``check_log_reg`` and ``check_desc_tree`` to have been run.
        """
        fig, axes = plt.subplots(1, 2, figsize=(12, 5))
        panels = (
            (axes[0], self.top_model_lr, "Confusion Matrix for Logistic Regression"),
            (axes[1], self.top_model_dt, "Confusion Matrix for Decision Tree"),
        )
        for ax, model, title in panels:
            # Passing ax keeps each matrix in its own panel of the shared figure.
            ConfusionMatrixDisplay.from_predictions(
                self.y_test,
                model.predict(self.X_test),
                display_labels=['Class 0', 'Class 1'],
                cmap=plt.cm.Blues,
                values_format='d',
                ax=ax,
            )
            ax.set_title(title)

        fig.tight_layout()
        plt.show()

    def plot_roc_curve(self):
        """Plot the test-set ROC curves of both tuned models on one figure.

        Requires ``check_log_reg`` and ``check_desc_tree`` to have been run.
        """
        lr_scores = self.top_model_lr.predict_proba(self.X_test)[:, 1]
        dt_scores = self.top_model_dt.predict_proba(self.X_test)[:, 1]

        fpr_lr, tpr_lr, _ = roc_curve(self.y_test, lr_scores)
        roc_auc_lr = auc(fpr_lr, tpr_lr)

        fpr_dt, tpr_dt, _ = roc_curve(self.y_test, dt_scores)
        roc_auc_dt = auc(fpr_dt, tpr_dt)

        plt.figure(figsize=(8, 6))
        plt.plot(fpr_lr, tpr_lr, color='darkorange', lw=2,
                 label='Logistic Regression ROC curve (area = {:.2f})'.format(roc_auc_lr))
        plt.plot(fpr_dt, tpr_dt, color='green', lw=2,
                 label='Decision Tree ROC curve (area = {:.2f})'.format(roc_auc_dt))
        # Diagonal reference line.
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic (ROC) Curve')
        plt.legend(loc='lower right')
        plt.show()

    def feature_importance(self, n=10):
        """Return the ``n`` most important features of the better tuned model.

        The decision tree is used when its best test score is strictly higher
        than the logistic regression's; otherwise the logistic regression is
        used. Importance is ``feature_importances_`` for the tree and the
        absolute value of the first row of ``coef_`` for the regression.

        Parameters
        ----------
        n : int, default 10
            Maximum number of features to return.

        Returns
        -------
        list of (float, str)
            ``(importance, feature_name)`` pairs, most important first. Names
            come from ``X_train.columns`` when available and are generated as
            ``feature_<index>`` otherwise.
        """
        if self.top_hp_lr[0][-1] < self.top_hp_dt[0][-1]:
            top_model = self.top_model_dt
        else:
            top_model = self.top_model_lr

        # Pick the attribute from the model, not from the type of X_train.
        if hasattr(top_model, 'feature_importances_'):
            importances = top_model.feature_importances_
        else:
            importances = np.abs(top_model.coef_[0])

        if hasattr(self.X_train, 'columns'):
            feature_names = list(self.X_train.columns)
        else:
            feature_names = ['feature_{}'.format(i) for i in range(len(importances))]

        ranked = sorted(
            zip(importances, feature_names), key=lambda pair: pair[0], reverse=True
        )
        return ranked[:n]

    @staticmethod
    def _cv_score(entry):
        """Return the cross-validation score of a result entry, or NaN if absent."""
        # Entries only hold a CV score when tuning ran with cv_now=True.
        return entry[1] if len(entry) == 3 else np.nan

    def _metric_summary(self, model, cv_score, include_precision):
        """Build the ordered metric dictionary for one fitted model."""
        y_hat_train = model.predict(self.X_train)
        y_hat_test = model.predict(self.X_test)

        summary = {
            'Accuracy train': round(accuracy_score(self.y_train, y_hat_train), 3),
            'Accuracy test': round(accuracy_score(self.y_test, y_hat_test), 3),
            'Recall train': round(recall_score(self.y_train, y_hat_train), 3),
            'Recall test': round(recall_score(self.y_test, y_hat_test), 3),
            'F1 train': round(f1_score(self.y_train, y_hat_train), 3),
            'F1 test': round(f1_score(self.y_test, y_hat_test), 3),
            'CV results': round(cv_score, 3)
        }
        if include_precision:
            # Precision of this model itself, so each column describes one model.
            summary['Precision train'] = round(precision_score(self.y_train, y_hat_train), 3)
            summary['Precision test'] = round(precision_score(self.y_test, y_hat_test), 3)
        return summary

    def scores(self, both_models=False, include_precision=True):
        """Return a DataFrame of train/test metrics for the tuned models.

        Requires ``check_log_reg`` and ``check_desc_tree`` to have been run.

        Parameters
        ----------
        both_models : bool, default False
            Return one column per model. When False, only the model with the
            higher best test score is returned (logistic regression on ties).
        include_precision : bool, default True
            Append ``Precision train`` / ``Precision test`` rows.

        Returns
        -------
        pandas.DataFrame
            Metrics as rows; columns ``'Decision Tree'`` and/or
            ``'Logistic Regression'``. ``'CV results'`` is NaN when tuning ran
            without cross-validation.
        """
        self.cv_top_lr = self._cv_score(self.top_hp_lr[0])
        self.cv_top_dt = self._cv_score(self.top_hp_dt[0])

        dt_metrics = self._metric_summary(self.top_model_dt, self.cv_top_dt, include_precision)
        lr_metrics = self._metric_summary(self.top_model_lr, self.cv_top_lr, include_precision)

        decision_tree_df = pd.DataFrame(
            list(dt_metrics.values()), index=list(dt_metrics.keys()), columns=['Decision Tree']
        )
        logistic_regression_df = pd.DataFrame(
            list(lr_metrics.values()), index=list(lr_metrics.keys()), columns=['Logistic Regression']
        )

        if both_models:
            return pd.concat([decision_tree_df, logistic_regression_df], axis=1)
        if self.top_hp_lr[0][-1] < self.top_hp_dt[0][-1]:
            return decision_tree_df
        return logistic_regression_df

    def plot_tree(self):
        """Draw the tuned decision tree stored in ``top_model_dt``."""
        f, ax = plt.subplots(figsize=(8, 8))
        # Inside the method body this name is the module-level sklearn function.
        plot_tree(self.top_model_dt, ax=ax)
        ax.set_title('Decision Tree Classifier')