In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import preprocessing
from sklearn.metrics import r2_score
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Turn off pandas' chained-assignment warning for the whole session.
pd.options.mode.chained_assignment = None

# Dataset location, relative to the working directory. The file name is also
# inspected further down to decide which column is the prediction target.
infile = "archive/diabetes_binary_5050split_health_indicators_BRFSS2015.csv"
df = pd.read_csv(infile)

In [None]:
# Show all the columns in this file.
# Only the last expression of a cell is echoed automatically, so the column
# index has to be printed explicitly; as a bare expression it was discarded.
print(df.columns)
# Show the first 5 rows in this file (echoed as the cell's last expression).
df.head()

In [None]:
# One-hot encode the GenHlth column; all other columns are carried over as-is.
df2 = pd.get_dummies(df, columns=["GenHlth"])

# Columns grouped by the scaler that is applied to them.
columns_standard_scaling = ["Age", "Education", "Income"]
columns_minmax_scaling = ["BMI", "MentHlth", "PhysHlth"]

# The scaled columns stay floating point; every remaining column (including
# the freshly created dummy columns) is cast to int.
float_list = [*columns_minmax_scaling, *columns_standard_scaling]
int_list = df2.columns.difference(float_list)
df2[int_list] = df2[int_list].astype(int)

# NOTE(review): both scalers are fitted on the complete frame, i.e. before the
# train/test split made in a later cell, so test rows contribute to the
# scaling statistics. Confirm this is acceptable for the reported metrics.
standard_scaler = StandardScaler()
minmax_scaler = MinMaxScaler()

df2[columns_standard_scaling] = standard_scaler.fit_transform(
    df2[columns_standard_scaling]
)
df2[columns_minmax_scaling] = minmax_scaler.fit_transform(
    df2[columns_minmax_scaling]
)

df2.head()

In [None]:
# Train/test split, optionally followed by SMOTE oversampling of the training part.
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Files whose name contains "012" use the Diabetes_012 column as target;
# everything else uses Diabetes_binary.
target = "Diabetes_012" if "012" in infile else "Diabetes_binary"

y = df2[target]
X = df2.drop(columns=[target])

# oversampling == 0 -> plain stratified 80/20 split of the whole frame.
# any other value   -> same stratified split, then SMOTE on the training part only.
oversampling = 0
if oversampling != 0:
    X_train_noresample, X_test, y_train_noresample, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )
    smote = SMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train_noresample, y_train_noresample)
else:
    traindf, testdf = train_test_split(
        df2, test_size=0.2, random_state=42, stratify=df2[target]
    )
    X_train, y_train = traindf.drop(columns=[target]), traindf[target]
    X_test, y_test = testdf.drop(columns=[target]), testdf[target]

# Optional train/validation split (currently disabled; it relies on `traindf`,
# which only exists when oversampling == 0):
#
# traindf, valdf = train_test_split(traindf, test_size=0.2, random_state=15, stratify=traindf[target])
# X_train = traindf.drop(columns=[target])
# y_train = traindf[target]
# X_val = valdf.drop(columns=[target])
# y_val = valdf[target]

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import (accuracy_score,roc_auc_score,precision_score,recall_score,f1_score,precision_recall_fscore_support)
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import label_binarize

def grid_search_for_classifier(model, param_grid, X_train, y_train):
    """Run a 5-fold grid search for ``model`` and return the winning estimator.

    Args:
        model: Unfitted estimator handed to ``GridSearchCV``.
        param_grid: Hyper-parameter grid forwarded to ``GridSearchCV``.
        X_train: Training features used for the search.
        y_train: Training labels used for the search.

    Returns:
        ``best_estimator_`` of the fitted search object.

    Side effects:
        Prints the best parameter combination (plus the search's own
        ``verbose=2`` progress output).
    """
    search_options = {"param_grid": param_grid, "verbose": 2, "cv": 5, "n_jobs": -1}
    searcher = GridSearchCV(model, **search_options)

    # The search only ever sees the training set.
    searcher.fit(X_train, y_train)
    print(searcher.best_params_)

    # The search object already exposes the winning estimator, so the input
    # `model` does not need its parameters set by hand.
    return searcher.best_estimator_

def evaluate_classifier(model, X_test, y_test):
    """Print test-set metrics for a classifier and hand the classifier back.

    The model is used as given (it is not fitted here); it only needs to
    provide ``predict``.

    Args:
        model: Classifier used to predict ``X_test``.
        X_test: Test features.
        y_test: True test labels.

    Returns:
        The same ``model`` object that was passed in.

    Side effects:
        Prints the confusion matrix followed by accuracy and the
        support-weighted F1, precision and recall.
    """
    predictions = model.predict(X_test)

    print("Confusion matrix: ")
    print(confusion_matrix(y_test, predictions))

    accuracy = accuracy_score(y_test, predictions)
    # The fourth element (support) is not reported.
    weighted = precision_recall_fscore_support(y_test, predictions, average='weighted')
    weighted_precision, weighted_recall, weighted_f1 = weighted[:3]

    print("Accuracy: {}, F1_Weighted: {}, Precision: {}, Recall: {}".format(accuracy, weighted_f1, weighted_precision, weighted_recall))
    return model

In [None]:
from sklearn.linear_model import LogisticRegression
import pickle

# Define the parameter grid for GridSearchCV.
# A list of sub-grids is used because not every solver supports every penalty:
# 'sag' only handles the l2 penalty, so crossing it with 'l1' in a single grid
# creates candidates whose fits fail and are scored as NaN with warnings.
param_grid = [
    {
        'C': [0.1, 1, 10],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga'],
        "random_state": [42],
    },
    {
        'C': [0.1, 1, 10],
        'penalty': ['l2'],
        'solver': ['sag'],
        "random_state": [42],
    },
]

# Initialize Logistic Regression model
log_reg = LogisticRegression()

# Tune on the training split, then report metrics on the held-out test split.
best_model = grid_search_for_classifier(log_reg, param_grid, X_train, y_train)
final_model = evaluate_classifier(best_model, X_test, y_test)

# Optional: persist the evaluated model.
# with open('logreg-binary_imbalanced.pkl', 'wb') as file:
#     pickle.dump(final_model, file)

In [None]:
from sklearn.svm import SVC
import pickle

# Define the parameter grid for GridSearchCV.
# `gamma` is a kernel coefficient for the rbf/poly kernels and is ignored by
# the linear kernel, so it is only searched where it has an effect. A single
# crossed grid would cross-validate every linear candidate twice with
# identical results.
param_grid = [
    {
        'C': [0.1, 1],
        'kernel': ['linear'],
        "random_state": [42],
    },
    {
        'C': [0.1, 1],
        'kernel': ['rbf', 'poly'],
        'gamma': ['scale', 'auto'],
        "random_state": [42],
    },
]

# Initialize SVM model
svm = SVC()

# Tune on the training split, then report metrics on the held-out test split.
best_model = grid_search_for_classifier(svm, param_grid, X_train, y_train)
final_model = evaluate_classifier(best_model, X_test, y_test)

# Optional: persist the evaluated model.
# with open('svm-012.pkl', 'wb') as file:
#     pickle.dump(final_model, file)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
def search_best_svm(kernel, C_search_space, gamma, random_state):
    """Pick the best C for an SVC by 5-fold cross-validation and fit the final model.

    Uses the notebook-level ``X_train`` / ``y_train`` as training data.

    Args:
        kernel: Kernel type passed to ``SVC``.
        C_search_space: Iterable of candidate C values; must not be empty.
        gamma: ``gamma`` setting passed to ``SVC`` (for both the
            cross-validated candidates and the final model).
        random_state: Random state passed to ``SVC``.

    Returns:
        Tuple ``(model, best_C)``: the SVC fitted on the training set with the
        best C, and that C value.

    Raises:
        ValueError: If ``C_search_space`` is empty, or if no candidate yields
            a comparable (non-NaN) cross-validation score.
    """
    candidates = list(C_search_space)
    if not candidates:
        raise ValueError("C_search_space must contain at least one value")

    best_score = -np.inf
    best_C = None
    for C in candidates:
        candidate = SVC(kernel=kernel, C=C, gamma=gamma, random_state=random_state)

        # Mean accuracy over 5 folds of the training set.
        scores = cross_val_score(candidate, X_train, y_train, cv=5, n_jobs=-1)
        score = np.mean(scores)

        # Strict '>' keeps the first C on ties; a NaN score never wins.
        if score > best_score:
            best_score = score
            best_C = C
        print(f"C: {C} Avg Cross Val Score: {np.round(score, 4)}")

    if best_C is None:
        raise ValueError("no C value produced a valid cross-validation score")

    print(f"Best C: {best_C}")

    # The final model must use the same gamma that was cross-validated;
    # leaving it out silently falls back to SVC's default gamma.
    model = SVC(kernel=kernel, C=best_C, gamma=gamma, random_state=random_state)
    model.fit(X_train, y_train)
    return model, best_C

In [None]:
C_values = [0.1, 1]
kernel_types = ['linear', 'rbf', 'poly']
gammas = ['scale', 'auto']

# For every kernel: tune C once per gamma setting, score the returned model on
# the test set, then print one summary line per gamma.
for kernel_type in kernel_types:
    best_Cs, accuracies = [], []
    for gamma in gammas:
        model, C = search_best_svm(kernel_type, C_values, gamma, 42)
        y_pred = model.predict(X_test)
        best_Cs.append(C)
        accuracies.append(accuracy_score(y_test, y_pred))
    for gamma, best_C, accuracy in zip(gammas, best_Cs, accuracies):
        print("For %s, the best model had C_value of %f, gamma of %s, and accuracy of %f" % (kernel_type, best_C, gamma, accuracy))