<div id="container" style="position:relative;">
<div style="float:left"><h1> Capstone Project: Diabetes risk predictor based on health indicators</h1></div>
<div style="float:left"><h2> Notebook 4: Modelling - Logistic Regression models</h2></div>
<div style="float:left"><h2> Created by Diego Villanueva</h2></div>
</div>

This notebook includes logistic regression models that use the unmodified and the oversampled training datasets.

## Table of contents

4a. Modelling (Logistic Regression)

        4a.1 Load data files

        4a.2 Function that will get different datasets as inputs and run the logistic regression

### Imports

In [None]:
# data manipulation
import numpy as np
import pandas as pd

# plotting
import matplotlib.pyplot as plt
import seaborn as sns

# modelling
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report

# Scaling data
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

# PCA
from sklearn.decomposition import PCA

# File saving
import joblib 

# Temporary directories for caching
from tempfile import mkdtemp

# Cross-validation
from sklearn.model_selection import cross_val_score

# Pipeline building
from sklearn.pipeline import Pipeline

# Grid search
from sklearn.model_selection import GridSearchCV

### 4a.1 Load data files

In [None]:
# Training ("remainder") split exactly as produced by preprocessing (unmodified)
X_rem, y_rem = (
    joblib.load("../data/Preprocessed_data/X_rem.pkl"),
    joblib.load("../data/Preprocessed_data/y_rem.pkl"),
)

# Oversampled version of the training split
# NOTE(review): unlike the other files, these two paths carry no ".pkl"
# extension - confirm the file names on disk.
X_rem_os, y_rem_os = (
    joblib.load("../data/Preprocessed_data/X_rem_oversampled"),
    joblib.load("../data/Preprocessed_data/y_rem_oversampled"),
)

# Hold-out test split: features are not scaled, target is unmodified
X_test, y_test = (
    joblib.load("../data/Preprocessed_data/X_test.pkl"),
    joblib.load("../data/Preprocessed_data/y_test.pkl"),
)

### 4a.2 Function that will get different datasets as inputs and run the logistic regression

In [None]:
def _log_reg_param_grid(c_values, random_state):
    """
    Builds the list of parameter grids searched by ``log_reg``.

    Four grids are produced, in this order: l1 without PCA, l1 with PCA,
    l2 without PCA, l2 with PCA.

    :param c_values: List of C values to try for the logistic regression.
    :param random_state: Random state passed to every LogisticRegression.
    :return: List of dictionaries usable as ``param_grid`` in GridSearchCV.
    """
    # Fractions of explained variance that PCA is asked to retain
    pca_variance_targets = [0.95, 0.9, 0.85, 0.8]

    def scaler_options(allow_none):
        # Fresh scaler instances for every grid; "no scaling" is only offered
        # in the grids that skip PCA.
        options = [StandardScaler(), MinMaxScaler(), RobustScaler()]
        return [None] + options if allow_none else options

    def make_model(penalty):
        # n_jobs is not set on the model: the surrounding grid search is
        # already run with n_jobs=-1.
        if penalty == 'l1':
            # The default 'lbfgs' solver rejects the l1 penalty, which made
            # every l1 candidate fail during the search. 'saga' supports l1.
            return LogisticRegression(penalty='l1', solver='saga',
                                      random_state=random_state, max_iter=10000)
        # l2 is LogisticRegression's default penalty
        return LogisticRegression(solver='lbfgs',
                                  random_state=random_state, max_iter=10000)

    param_grid = []
    for penalty in ('l1', 'l2'):
        # Without PCA
        param_grid.append({'scaler': scaler_options(allow_none=True),
                           'dim_reducer': [None],
                           'model': [make_model(penalty)],
                           'model__C': c_values})
        # With PCA
        param_grid.append({'scaler': scaler_options(allow_none=False),
                           'dim_reducer': [PCA()],
                           'dim_reducer__n_components': pca_variance_targets,
                           'model': [make_model(penalty)],
                           'model__C': c_values})
    return param_grid


def log_reg(X_train, y_train, X_test, y_test, c_values=None, cv=5, verbose=1, random_state=1):
    """
    Fits a logistic regression model using grid search with cross-validation.

    The search runs over a scaler -> (optional) PCA -> LogisticRegression
    pipeline and compares l1 and l2 penalties. The accuracies of the best
    model on the training and test data are printed and returned.

    :param X_train: Training data (features)
    :param y_train: Training data (target variable)
    :param X_test: Test data (features)
    :param y_test: Test data (target variable)
    :param c_values: List of C values to try in grid search. Default is a predefined range.
    :param cv: Number of cross-validation folds. Default is 5.
    :param verbose: Verbosity level for grid search. Default is 1.
    :param random_state: Random state for reproducibility. Default is 1.
    :return: Dictionary with keys 'best_estimator', 'accuracy_train' and 'accuracy_test'.
    """
    # Imported locally so the function carries its own clean-up dependency
    from shutil import rmtree

    # C values that will be used if no attribute provided
    if c_values is None:
        c_values = [.0001, .001, .1, 1, 10, 100, 1000]

    # Create a directory that will be used to cache the pipeline results
    cachedir = mkdtemp()

    try:
        # Setup Pipeline (the steps are placeholders overridden by the grid)
        my_pipeline = Pipeline([('scaler', StandardScaler()),
                                ('dim_reducer', PCA()),
                                ('model', LogisticRegression())],
                               memory=cachedir)

        # Parameter grid
        log_reg_param_grid = _log_reg_param_grid(c_values, random_state)

        # Instantiate logistic regression with grid search
        logreg_gs = GridSearchCV(my_pipeline, param_grid=log_reg_param_grid,
                                 cv=cv, n_jobs=-1, verbose=verbose)

        # Fit the logistic regression with grid search
        fitted_logreg_gs = logreg_gs.fit(X_train, y_train)

        # Accuracy of the optimized model
        accuracy_remainder = fitted_logreg_gs.score(X_train, y_train)
        accuracy_test = fitted_logreg_gs.score(X_test, y_test)

        best_estimator = fitted_logreg_gs.best_estimator_
        # The cache directory is deleted below, so the returned pipeline
        # must not keep pointing at it.
        best_estimator.set_params(memory=None)
    finally:
        # Remove the temporary cache so repeated calls do not pile up
        # directories on disk.
        rmtree(cachedir, ignore_errors=True)

    print("Accuracies of the optimized model:")
    print(f"The best logistic regression's accuracy on the training set: {accuracy_remainder}")
    print(f"The best logistic regression's accuracy on the test set: {accuracy_test}")

    # Return the best estimator and accuracy scores
    return {
        'best_estimator': best_estimator,
        'accuracy_train': accuracy_remainder,
        'accuracy_test': accuracy_test
    }

In [None]:
# Example usage:
# results = log_reg(X_rem, y_rem, X_test, y_test)