<a href="https://colab.research.google.com/github/dgambone3/CSC4850-Machine-Learning/blob/main/ML_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import sklearn 
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# metrics
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import ConfusionMatrixDisplay


# processing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate


# models
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import ComplementNB, MultinomialNB 
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Location of the BRFSS 2015 diabetes health-indicators CSV.
# NOTE(review): the path looks like a mounted Google Drive in Colab; the mount
# itself is not performed in this notebook -- confirm before running.
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/ML_Project/Diabetes Indicators Dataset/diabetes_012_health_indicators_BRFSS2015.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Features: every column after the first, cast to int.
X = df.iloc[:, 1:].astype(int)
# Target: the first column, cast to int and flattened to a 1-D array.
y = df.iloc[:, :1].astype(int).values.ravel()

# Helper Functions


*   Learning Curves
*   Best Polynomial



### Helper function to get learning curve plot
#### Predicts the target on the first n% of the dataset for n = 1, ..., 100 and records the error of each prediction; these errors are the points of the learning curve plot.

In [None]:
def get_learning_curve(model, X, y, title):
  """Compute the points of a learning curve for a model.

  For every whole percentage from 1 to 100, the leading rows of ``X`` that
  make up that percentage (and the matching leading entries of ``y``) are
  passed to ``model.predict`` and the mean squared error between the true and
  predicted values is recorded. The model is not fitted here; only ``predict``
  is called.

  Parameters
  ----------
  model : object with a ``predict`` method
  X : feature table, sliced with ``.iloc``
  y : true target values, sliced positionally
  title : accepted for the caller's convenience; not used in the body

  Returns
  -------
  pandas.DataFrame with columns ``percent`` (1..100) and ``scores`` (the MSE
  obtained at that percentage).
  """
  percentages = list(range(1, 101))
  errors = []
  for pct in percentages:
    fraction = pct / 100
    # leading slice of the data for this percentage
    subset_X = X.iloc[0:int(len(X) * fraction)]
    subset_y = y[0:int(len(y) * fraction)]
    predicted = model.predict(subset_X)
    errors.append(MSE(y_true=subset_y, y_pred=predicted))
  return pd.DataFrame({'percent': percentages, 'scores': errors})

### Helper function to find best polynomial for linear regression

In [None]:
def best_poly(X_train, y_train, X_test, y_test):
  """Pick the polynomial degree that scores best for linear regression.

  For each candidate degree a pipeline of MinMaxScaler -> PolynomialFeatures
  -> LinearRegression is fitted on the training split and scored on the test
  split with the pipeline's ``score`` method. Each degree and its score are
  printed.

  Parameters
  ----------
  X_train, y_train : training features and targets used to fit each pipeline
  X_test, y_test : held-out features and targets used to score each pipeline

  Returns
  -------
  int
      The candidate degree with the highest score (the first one on ties).
  """
  degrees = [1, 2]
  scores = []
  print('   Polynomial Scores')
  for deg in degrees:
    lin_pipe = Pipeline([('scaler', MinMaxScaler()),
                         ("polynomial_features", PolynomialFeatures(degree=deg,
                                                                    include_bias=False)),
                         ("linear_regression", LinearRegression())])
    # Fit on the training split that was passed in. The previous version
    # fitted on the notebook-level X, y (the full dataset), which let the
    # test rows leak into the model being scored on them.
    lin_pipe.fit(X_train, y_train)
    score = lin_pipe.score(X_test, y_test)
    scores.append(score)
    print(f'Degree: {deg}  Score: {score}')
  # Return the degree itself rather than "position + 1", so the result stays
  # correct if the candidate list is ever changed.
  return degrees[scores.index(max(scores))]

### Split original data into three separate ratios
#### Splitting the data outside of the loop for cohesion. This ensures the same data points are used for each split across all the models. 
#### Initialize dataframe labels and plot colors



In [None]:
# All three train/test splits are created once, up front, so that every model
# is trained and evaluated on exactly the same rows for a given ratio.
split_options = dict(shuffle=True, random_state=1234)

# 50/50
X55_train, X55_test, y55_train, y55_test = train_test_split(
    X, y, train_size=.5, test_size=.5, **split_options)
# 70/30
X73_train, X73_test, y73_train, y73_test = train_test_split(
    X, y, train_size=.7, test_size=.3, **split_options)
# 80/20
X82_train, X82_test, y82_train, y82_test = train_test_split(
    X, y, train_size=.8, test_size=.2, **split_options)

# (features, target) pairs per split, indexed in the same order as the ratios
train = [
    (X55_train, y55_train),
    (X73_train, y73_train),
    (X82_train, y82_train),
]
test = [
    (X55_test, y55_test),
    (X73_test, y73_test),
    (X82_test, y82_test),
]

# plot colours, one per split: dark shade for training, light shade for testing
train_colors = ['navy', 'green', 'firebrick']
test_colors = ['skyblue', 'palegreen', 'salmon']

#### Initialize models

In [None]:
# Each entry pairs the title shown in tables/plots with the estimator it names.
# `titles` and `models` are derived from this list, so they stay in the same order.
model_registry = [
    ('Decision Tree Classifier', DecisionTreeClassifier(criterion='entropy', splitter='best')),
    ('Perceptron', Perceptron(class_weight='balanced')),
    ('Compliment Naive Bayes', ComplementNB()),
    ('Multinomial Naive Bayes', MultinomialNB()),
    ('Logistic Regression', LogisticRegression(class_weight='balanced')),
    ('Linear Regression', LinearRegression()),
    ('SVM - Linear', LinearSVC(class_weight='balanced', dual=False)),
    ('SVM - RBF', SVC(kernel='rbf', decision_function_shape='ovr')),
    ('Gradient Boost', GradientBoostingClassifier()),
    ('Muti-Layer Perceptron', MLPClassifier(max_iter=500, hidden_layer_sizes=10)),
    ('Regularilized Linear Regression', SGDRegressor(loss='squared_error', penalty='l2')),
    ('Lasso Linear Regression', Lasso(selection='random')),
    ('k-Nearest Neighbors', KNeighborsClassifier(weights='distance')),
    # plain LinearRegression placeholder under the 'Polynomial' title
    ('Linear Regression with Optimal Polynomial', LinearRegression()),
]
titles = [name for name, _ in model_registry]
models = [estimator for _, estimator in model_registry]

# (train fraction, test fraction) of each split, used for labels on the output
splits = [(0.5, 0.5),
          (0.7, 0.3),
          (0.8, 0.2)]

# row labels for the per-fold metrics dataframe: 'Fold 1' ... 'Fold 10'
index = [f'Fold {fold}' for fold in range(1, 11)]

### Pipeline

This cell holds a nested loop which performs multiple steps: it initializes the models, computes their metrics, outputs learning curves, etc. The metrics of every fold are output for each split.


1. for each model
*   scale the data
2. for each split on the model
*   fit the pipeline

*   10-fold cross-validation

*   calculate metrics for model performance on test data

*   generate and output learning curves with one plot for each model, and all splits on that one plot. 













In [None]:
# Evaluate every model on every split.
#
# Results are accumulated in two plain lists and turned into dataframes once,
# after the loops:
#   split_records -> splits_df : one row per (model, split), used to pick the best split
#   fold_frames   -> big_df    : the 10 fold-metric rows of every (model, split)
# The 'Model' and 'Split' labels are attached to each fold frame before it is
# stored. The previous version wrote them afterwards with chained assignment
# (big_df['Split'].iloc[a:b] = ...) at hand-computed row offsets, which is not
# guaranteed to write through to big_df and is a no-op under pandas
# Copy-on-Write.
split_records = []
fold_frames = []

for title, model in zip(titles, models):  # for each model in model list
    # pipeline: scale every feature to [0, 1] with MinMaxScaler, then the model
    pipe = make_pipeline(MinMaxScaler(), model)
    # one figure per model; the learning curves of all splits share its axes
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    for j, split in enumerate(splits):  # for each data split
        # train/test data for this split, from the lists built in an earlier cell
        X_train, y_train = train[j]
        X_test, y_test = test[j]
        # e.g. '70/30'; reused in the table caption and the plot legend
        split_label = f'{int(split[0] * 100)}/{int(split[1]*100)}'

        # only the 'Polynomial' entry gets a polynomial-features pipeline,
        # rebuilt per split with the degree chosen by best_poly
        if 'Polynomial' in title:
            poly = best_poly(X_train, y_train, X_test, y_test)
            pipe = make_pipeline(MinMaxScaler(),
                                 PolynomialFeatures(degree=poly),
                                 LinearRegression())
        # fit the pipeline on the training data
        pipe.fit(X_train, y_train)

        # 10-fold cross validation on the training data with specific scores
        cv = cross_validate(pipe,
                            X_train,
                            y_train,
                            scoring=['accuracy',
                                     'precision_weighted',
                                     'recall_weighted',
                                     'f1_weighted',
                                     'r2',
                                     'neg_mean_squared_error'],
                            cv=10)
        # predictions and mean squared error on the held-out test set
        pred = pipe.predict(X_test)
        test_error = MSE(y_test, pred)

        # Per-fold metrics. CV reports negated MSE (so that higher is better,
        # like the other scores); the absolute value is stored so that a lower
        # 'Error' is better.
        fold_df = pd.DataFrame({'Accuracy': cv['test_accuracy'],
                                'Precision': cv['test_precision_weighted'],
                                'Recall': cv['test_recall_weighted'],
                                'F1-Score': cv['test_f1_weighted'],
                                'R2': cv['test_r2'],
                                'Error': abs(cv['test_neg_mean_squared_error'])},
                               index=index)

        # Summary row for this split: the lowest fold error, and its absolute
        # gap to the test error ('Generalization'). The true and predicted
        # test labels are kept for the confusion matrices.
        best_fold_error = fold_df['Error'].min()
        split_records.append({'Model': title,
                              'Split': split,
                              'Error': best_fold_error,
                              'Generalization': abs(best_fold_error - test_error),
                              'true': y_test,
                              'pred': pred})

        # labelled copy of the fold metrics for the combined dataframe
        labelled = fold_df.reset_index(drop=True)
        labelled.insert(0, 'Model', title)
        labelled.insert(1, 'Split', str(split))
        fold_frames.append(labelled)

        # display the fold metrics with a caption
        fold_disp = fold_df.style.set_caption(f'Fold Metrics for {title} with {split_label} Split')
        display(fold_disp)

        # learning curves: error of the fitted pipeline on growing portions of
        # the training data and of the test data
        train_scores = get_learning_curve(pipe, X_train, y_train, title)
        test_scores = get_learning_curve(pipe, X_test, y_test, title)
        ax.plot(train_scores['percent'],
                train_scores['scores'],
                color=train_colors[j],
                label=f'{split_label} Training')
        ax.plot(test_scores['percent'],
                test_scores['scores'],
                color=test_colors[j],
                label=f'{split_label} Validation')

    # label the figure once, after the curves of all splits have been drawn
    ax.set_xlabel('Sample Size')
    ax.set_ylabel('Error')
    ax.set_title(f'Learning Curve for {title}')
    ax.legend(loc='best')
    fig.tight_layout(pad=1.5)
    fig.show()

# dataframe with all split information, will be used to pick the best splits
splits_df = pd.DataFrame(split_records,
                         columns=['Model',
                                  'Split',
                                  'Error',
                                  'Generalization',
                                  'true',
                                  'pred'])
# dataframe with the fold metrics of every model on every split
if fold_frames:
    big_df = pd.concat(fold_frames, ignore_index=True)
else:
    big_df = pd.DataFrame(columns=['Model', 'Split'])
    


At this point, splits_df holds the metrics for the best fold for each split, so there are three rows for every model, one for each split (50/50, 70/30, 80/20), for a total of 42 rows (14 models * 3 splits).

The following cell selects the *best split* based on all the best folds for each model, which was selected in the previous cell. The dataframe groups by model name, and selects the split for each model based on which of the three has the lowest error. 

In [None]:
# For every model (kept in order of first appearance), find the row label of
# the split with the lowest error, then pull those rows out of splits_df.
lowest_error_rows = splits_df.groupby('Model', sort=False)['Error'].idxmin()
best = splits_df.loc[lowest_error_rows]
display(best)

#### Generating Confusion Matrix
This cell generates and outputs the confusion matrices for all applicable models. If 'Linear Regression' is in the model name, the model is skipped, as confusion matrices are not applicable to continuous values. 

In [None]:
# Draw a confusion matrix for the best split of every model.
for _, best_row in best.iterrows():
  model_name = best_row['Model']
  # Models with 'Linear Regression' in the title predict continuous values,
  # for which a confusion matrix cannot be computed, so they are skipped.
  if 'Linear Regression' in model_name:
    # Report the skipped model by the name stored in its own row. The previous
    # version printed titles[b], which is only right while `best` happens to
    # be ordered exactly like `titles`.
    print('Cannot calculate Confusion Matrix for Regression Problems - ', model_name)
    continue
  ConfusionMatrixDisplay.from_predictions(y_true=best_row['true'],
                                          y_pred=best_row['pred'],
                                          cmap='RdPu')
  plt.title(f'Confusion Matrix for {model_name}')
  plt.show()

Selecting columns to display for the dataframe with all the best models. This dataframe holds one row for each model, indicating the best split, for each model, based on the best fold for that split, with the lowest MSE.

In [None]:
# The true/predicted label arrays were only needed for the confusion matrices;
# remove them and renumber the rows from 0 so the table displays cleanly.
best = best.drop(columns=['true', 'pred']).reset_index(drop=True)

disp_best = best.style.set_caption('Best Split for Each Model')
display(disp_best)

Display big_df with every metric for every model, for every split. 

In [None]:
# Show big_df in full: row and column truncation is lifted only inside this
# `with` block, so the notebook's display settings are unchanged afterwards.
show_everything = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*show_everything):
    display(big_df)