# Algorithm: Logistic Regression
Acknowledgement: Most of the code was referenced from Prof. Mafas Raheem's AML lab-6.2.1.
## Model Validation Strategy
Three models are evaluated on two different datasets, as described below (six evaluations altogether).
### Standard dataset (without class balancing)
- Base
- Grid-Search-CV
- Random-Search-CV
### Oversampled dataset (with class balancing)
- Base
- Grid-Search-CV
- Random-Search-CV
### Conclusion (for all evaluations)
- accuracy chart

In [None]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from numpy import arange
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

import custom_libs.evaluation_helper as ev
import models.uci_heart_disease_dataset as uci

# Warnings are left enabled so that errors and warnings stay visible. To silence them, uncomment the line below (requires `import warnings`).
# warnings.filterwarnings("ignore")

In [None]:
# Read the preprocessed Cleveland file; it has no header row of its own here, so the column
# names are supplied from the dataset module (original note: 299 records with 14 features).
csv_path = uci.UCIHeartDiseaseDataFile.cleveland_preprocessed
column_names = uci.get_standard_features()
data = pd.read_csv(csv_path, names=column_names)
print(f'Data shape: {data.shape}.')
data.head(3)

# Set Models' Hyperparameters and Cross-Validation Strategy

In [None]:
# Verbose variable for log inspection.
verbose = False

# Search space shared by the grid search and the random search.
# NOTE(review): not every solver / penalty / multi_class combination is accepted by
# LogisticRegression (e.g. 'newton-cg' and 'lbfgs' do not support the 'l1' penalty); such
# candidates are expected to fail to fit and be scored with the search's error score.
# NOTE(review): the grid is the product of all lists below, so every extra value multiplies
# the number of fits performed by the grid search.
parameters = {
    # The seed of the pseudo random number generator which is used while shuffling the data.
    'random_state': arange(0, 100, 1),
    # Inverse regularization strength, C = 1/λ: smaller values mean stronger regularization.
    # The previous arange(0.0001, 10, 10) used a step as large as the whole range and therefore
    # produced the single value 0.0001, so C was never actually searched. A log-spaced grid
    # covers the same bounds: 0.0001, 0.001, 0.01, 0.1, 1 and 10.
    'C': np.logspace(-4, 1, 6),
    # Optimization algorithm.
    'solver': ['liblinear', 'newton-cg', 'lbfgs', 'saga'],
    # Penalization (regularization) norm.
    'penalty': ['l1', 'l2'],
    # Strategy for handling the class labels.
    'multi_class': ['auto', 'ovr', 'multinomial'],
}

# Cross-validation splitter: a stratified K-fold whose settings all come from the
# evaluation helper module, so every search in this notebook shares the same folds setup.
cv_settings = {
    'n_splits': ev.kfold_n_split,
    'shuffle': ev.kfold_shuffle,
    'random_state': ev.random_state,
}
skfolds = StratifiedKFold(**cv_settings)

# Start from an empty accuracy store; each evaluation adds one entry and the
# conclusion section charts them.
ev.accuracies = dict()

# Base Model (Function)

In [None]:
def build_and_validate_base_model(x_train, x_test, y_train, y_test, test_name):
    """Fit a logistic regression with default hyperparameters and report on the test set.

    Args:
        x_train: Training features.
        x_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        test_name: Label of this evaluation; only printed.

    Returns:
        Whatever ``ev.display_validation_report`` returns (the accuracy, according to
        the notebook's usage of the result).
    """
    print(f'\nEvaluation name: {test_name}.')

    # Only the seed and verbosity are set explicitly; everything else stays at library defaults.
    base_model = LogisticRegression(random_state=ev.random_state, verbose=verbose)
    base_model.fit(x_train, y_train)

    # Show the full set of hyperparameters the fitted estimator ended up with.
    print(f'Params         :{ base_model.get_params()}.')

    # Score the held-out data and hand everything to the helper, which displays the
    # classification report, confusion matrix and ROC curve.
    predictions = base_model.predict(x_test)
    return ev.display_validation_report(y_test, predictions, x_test, base_model)

# Grid Search CV (Function)

In [None]:
def build_and_validate_with_grid_search_cv(x_train, x_test, y_train, y_test, test_name):
    """Tune a logistic regression with an exhaustive grid search and report on the test set.

    The search uses the module-level ``parameters`` grid and the ``skfolds``
    cross-validation splitter, and ranks candidates by accuracy.

    Args:
        x_train: Training features used for the cross-validated search.
        x_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        test_name: Label of this evaluation; only printed.

    Returns:
        Whatever ``ev.display_validation_report`` returns (the accuracy, according to
        the notebook's usage of the result).
    """
    print(f'\nEvaluation name: {test_name}.')

    # Exhaustive search over the shared grid, cross-validated with the stratified folds.
    search = GridSearchCV(
        estimator=LogisticRegression(verbose=verbose, random_state=ev.random_state),
        param_grid=parameters,
        scoring='accuracy',
        cv=skfolds,
    )
    search.fit(x_train, y_train)

    # The best score is the mean cross-validation score obtained on the training set.
    print(f'Best params         :{search.best_params_}.')
    print(f'Best score (*mean)  :{search.best_score_}.')

    # Predict on the test set through the fitted search object, then let the helper display
    # the classification report, confusion matrix and ROC curve.
    test_predictions = search.predict(x_test)
    return ev.display_validation_report(y_test, test_predictions, x_test, search)

# Random Search CV (Function)

In [None]:
def build_and_validate_with_random_search_cv(x_train, x_test, y_train, y_test, test_name):
    """Tune a logistic regression with a randomized search and report on the test set.

    Candidates are sampled from the module-level ``parameters`` space, cross-validated
    with ``skfolds`` and ranked by accuracy.

    Args:
        x_train: Training features used for the cross-validated search.
        x_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        test_name: Label of this evaluation; only printed.

    Returns:
        Whatever ``ev.display_validation_report`` returns (the accuracy, according to
        the notebook's usage of the result).
    """
    print(f'\nEvaluation name: {test_name}.')

    log_reg = LogisticRegression(verbose=verbose, random_state=ev.random_state)

    # Build Random-search with parameters and stratified k fold strategy for cross-validation.
    # The search is seeded like every other randomized step in this notebook; without a
    # seed the sampled candidates (and therefore the reported scores) change on every run.
    rand_scv = RandomizedSearchCV(log_reg, parameters, scoring='accuracy', cv=skfolds,
                                  random_state=ev.random_state)

    # Train and discover the model's best hyperparameters among the sampled candidates.
    rand_scv.fit(x_train, y_train)

    # Display the best hyperparameters and score. The best score is mean of CV scores for train-set.
    print(f'Best params         :{rand_scv.best_params_}.')
    print(f'Best score (*mean)  :{rand_scv.best_score_}.')

    # Predict using the trained model on the test-set.
    y_pred = rand_scv.predict(x_test)

    # Calls the evaluation helper module to display classification-report, confusion-matrix
    # and ROC curve, and returns its result.
    return ev.display_validation_report(y_test, y_pred, x_test, rand_scv)

# Evaluation with Standard Dataset (without class balancing)

### Data Preparation

In [None]:
# Separate the predictors from the target column.
target_column = uci.UCIHeartDiseaseData.target
X = data.drop(target_column, axis=1)
y = data[target_column]

# The evaluation helper scales the features and performs the train/test split
# (described by the original notes as a stratified split).
X_train, X_test, Y_train, Y_test = ev.scale_and_split(X, y)

# Count the labels once per partition; the counts feed both the printout and the chart.
entire_counts = Counter(y)
train_counts = Counter(Y_train)
test_counts = Counter(Y_test)

print('Class counter (stratified):')
print(f'Entire-set  : {entire_counts}.')
print(f'Train-set   : {train_counts}.')
print(f'Test-set    : {test_counts}.')

# One row per partition: [name, number of 0 labels, number of 1 labels].
std_y_data = [
    [set_name, counts.get(0), counts.get(1)]
    for set_name, counts in (('Entire', entire_counts),
                             ('Train', train_counts),
                             ('Test', test_counts))
]

pd.DataFrame(std_y_data, columns=['Set', '0', '1']).set_index('Set').plot.bar();

In [None]:
# Evaluation: Standard Base - run the base model on the standard split and record its result.
evaluation = ev.EvaluationType.std_base
acc = build_and_validate_base_model(X_train, X_test, Y_train, Y_test, test_name=evaluation)
ev.accuracies[evaluation] = acc

In [None]:
# Evaluation: Standard Grid Search CV - tune on the standard split and record the result.
evaluation = ev.EvaluationType.std_grid_search_cv
acc = build_and_validate_with_grid_search_cv(X_train, X_test, Y_train, Y_test, test_name=evaluation)
ev.accuracies[evaluation] = acc

In [None]:
# Evaluation: Standard Random Search CV - tune on the standard split and record the result.
evaluation = ev.EvaluationType.std_random_search_cv
acc = build_and_validate_with_random_search_cv(X_train, X_test, Y_train, Y_test, test_name=evaluation)
ev.accuracies[evaluation] = acc

In [None]:
# Show every result recorded so far (the standard-dataset evaluations at this point).
recorded = ev.accuracies
print(f'Accuracies : {recorded}.')

# Evaluation with Oversampled Dataset (with class balancing)

### Data Preparation and Oversampling

In [None]:
# Prepare features and target variables from original data frame.
X = data.drop(uci.UCIHeartDiseaseData.target, axis=1)
y = data[uci.UCIHeartDiseaseData.target]

# Scale and split FIRST, so that the held-out test partition contains only real records.
Xb_train, Xb_test, Yb_train, Yb_test = ev.scale_and_split(X, y)

# Perform over-sampling with SMOTE on the training partition only. Oversampling before the
# split lets synthetic samples interpolated from test records leak into the training data
# and puts synthetic rows into the test set, which inflates the reported accuracy.
Xb_train, Yb_train = SMOTE(random_state=ev.random_state).fit_resample(Xb_train, Yb_train)

# Recombined view (balanced train + untouched test) kept under the original names for the
# comparison chart below.
# NOTE(review): assumes ev.scale_and_split returns array-likes with the same columns as X.
X_b = pd.DataFrame(np.vstack([np.asarray(Xb_train), np.asarray(Xb_test)]), columns=X.columns)
y_b = pd.Series(np.concatenate([np.asarray(Yb_train), np.asarray(Yb_test)]), name=y.name)

# Display the counter for '1' and '0' per partition: the train-set is balanced by SMOTE,
# the test-set keeps its original class distribution.
print('Class counter (stratified):')
print(f'Entire-set  : {Counter(y_b)}.')
print(f'Train-set   : {Counter(Yb_train)}.')
print(f'Test-set    : {Counter(Yb_test)}.')

os_y_data = [['Entire', Counter(y_b).get(0), Counter(y_b).get(1)],
              ['Train', Counter(Yb_train).get(0), Counter(Yb_train).get(1)],
              ['Test', Counter(Yb_test).get(0), Counter(Yb_test).get(1)]]

pd.DataFrame(os_y_data, columns=['Set', '0', '1']).set_index('Set').plot.bar();

In [None]:
# Class totals of the standard target versus the oversampled target, charted side by side.
standard_counts = Counter(y)
oversampled_counts = Counter(y_b)

# One row per dataset: [name, number of 0 labels, number of 1 labels].
target_set_data = [
    ['Standard', standard_counts.get(0), standard_counts.get(1)],
    ['Oversampled', oversampled_counts.get(0), oversampled_counts.get(1)],
]

pd.DataFrame(target_set_data, columns=['Set','0','1']).set_index('Set').plot.bar();

In [None]:
# Evaluation: Over-sampled Base - run the base model on the oversampled split and record its result.
evaluation = ev.EvaluationType.os_base
acc = build_and_validate_base_model(Xb_train, Xb_test, Yb_train, Yb_test, test_name=evaluation)
ev.accuracies[evaluation] = acc

In [None]:
# Evaluation: Over-sampled Grid Search CV - tune on the oversampled split and record the result.
evaluation = ev.EvaluationType.os_grid_search_cv
acc = build_and_validate_with_grid_search_cv(Xb_train, Xb_test, Yb_train, Yb_test, test_name=evaluation)
ev.accuracies[evaluation] = acc

In [None]:
# Evaluation: Over-sampled Random Search CV - tune on the oversampled split and record the result.
evaluation = ev.EvaluationType.os_random_search_cv
acc = build_and_validate_with_random_search_cv(Xb_train, Xb_test, Yb_train, Yb_test, test_name=evaluation)
ev.accuracies[evaluation] = acc

In [None]:
# Show every result recorded so far (standard and oversampled evaluations at this point).
recorded = ev.accuracies
print(f'Accuracies : {recorded}.')

# Conclusion

In [None]:
def add_labels(x_pos, y_pos, delta):
    """Write each value of ``y_pos`` as a text label on the current matplotlib axes.

    Args:
        x_pos: Sequence with one entry per bar group; only the number of entries is used.
        y_pos: Bar heights; each is used both as the label text and as its vertical position.
        delta: Horizontal offset added to the group index to centre the label over its bar.
    """
    for group_index, _ in enumerate(x_pos):
        height = y_pos[group_index]
        plt.text(group_index + delta, height, height, ha='center')

# Tick labels of the three evaluated model types ('GridSCV' matches the abbreviation used
# in the rest of this notebook; it was previously misspelled 'GridCSV').
acc_model_type = ['Base', 'GridSCV', 'RandSCV']
bar_width = 0.35
# One x position per model type; the two bars of a group sit half a bar width either side.
x = np.arange(len(acc_model_type))

# Bar Plot size
plt.figure(figsize = (7,7))

# Set standard bar type and values.
acc_standard_data = [
    ev.accuracies[ev.EvaluationType.std_base],
    ev.accuracies[ev.EvaluationType.std_grid_search_cv],
    ev.accuracies[ev.EvaluationType.std_random_search_cv]
]
# Set oversampled bar type and values.
acc_oversampled_data = [
    ev.accuracies[ev.EvaluationType.os_base],
    ev.accuracies[ev.EvaluationType.os_grid_search_cv],
    ev.accuracies[ev.EvaluationType.os_random_search_cv]
]
plt.bar(x - bar_width / 2, acc_standard_data, bar_width, label='Standard', color='skyblue')
plt.bar(x + bar_width / 2, acc_oversampled_data, bar_width, label='Oversampled', color='lightgreen')

# Display value on the bar chart, using the same horizontal offsets as the bars.
add_labels(acc_model_type,acc_standard_data,-(bar_width/2))
add_labels(acc_model_type,acc_oversampled_data, bar_width/2)

# Adding labels and title
plt.title('Model Accuracy Comparison')
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.ylabel('Accuracy (%)')
plt.xticks(x, acc_model_type)
# The two series are the standard and the oversampled dataset, so the legend is titled
# 'Dataset' (it was previously titled 'Regions').
plt.legend(title='Dataset', loc='lower right')
plt.show();

# For debugging values.
print(f'Model types             : {acc_model_type}.')
print(f'Standard accuracies     : {acc_standard_data}.')
print(f'Oversampled accuracies  : {acc_oversampled_data}.')
print(f'Accuracies (dict)       : {ev.accuracies}.')

In [None]:
# Author's rationale: the grid search is treated as the more thorough and stable of the two
# searches, so the score reported for this algorithm is the better of its two grid-search
# results (standard data versus oversampled data).
grid_accuracies = [
    ev.accuracies[evaluation_type]
    for evaluation_type in (ev.EvaluationType.std_grid_search_cv,
                            ev.EvaluationType.os_grid_search_cv)
]

print(f'Grid accuracies     : {grid_accuracies}.')

# Publish the score under the 'LR' key for comparison with other algorithms.
best_grid_accuracy = max(grid_accuracies)
ev.global_accuracies['LR'] = best_grid_accuracy

print(f'Highest accuracy    : {ev.global_accuracies}.')