In [2]:
#M13D03A03
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostRegressor

#M14D03A02
import warnings
warnings.filterwarnings("ignore")

<summary> Data details
<details>

### --- Main element --- 
Diabetes_012

---
### --- Binary elements ---
HighBP  
HighChol  
CholCheck  
Smoker  
Stroke  
HeartDiseaseorAttack  
PhysActivity  
Fruits  
Veggies  
HvyAlcoholConsump  
AnyHealthcare  
NoDocbcCost  
DiffWalk  
Sex &rarr; (0 = female, 1 = male)
  
---
### --- Integer Elements ---
BMI - 12-98
  
GenHlth (Self reported value)  
&emsp;1 = excellent  
&emsp;2 = very good  
&emsp;3 = good  
&emsp;4 = fair  
&emsp;5 = poor  
  
MentHlth - Mental Health // # of "not good days" in the last 30   
  
PhysHlth - Physical Health // # of "not good days" in the last 30  
   
Age [1-13] - (filled in using CDC <a href="https://www.cdc.gov/nchs/data/statnt/statnt20.pdf" target="_blank">Master List</a>, grouping 18-19, 20-24) [* indicated field given with origional data]  
&emsp;1 = 18-24 *  
&emsp;2 - 25-29  
&emsp;3 - 30-34  
&emsp;4 - 35-39  
&emsp;5 - 40-44  
&emsp;6 - 45-49  
&emsp;7 - 50-54  
&emsp;8 - 55-59  
&emsp;9 - 60-64 *  
&emsp;10- 65-69  
&emsp;11- 70-74  
&emsp;12- 75-79  
&emsp;13- 80 or older *  
  
Education [1-6]  
&emsp;1 = Never attended school or only kindergarten  
&emsp;2 = Grades 1 through 8 (Elementary)  
&emsp;3 = Grades 9 through 11 (Some high school)  
&emsp;4 = Grade 12 or GED (High school graduate)  
&emsp;5 = College 1 year to 3 years (Some college or technical school)  
&emsp;6 = College 4 years or more (College graduate) 
  
Income [1-8]  
&emsp;1 = less than $10,000  
&emsp;5 = less than $35,000  
&emsp;8 = $75,000 or more  

[https://www.markdownguide.org/hacks/#:~:text=Basically%2C%20every%20in%20your,sentence%20of%20my%20indented%20paragraph.]:#


# Clustering check  (KMEans)  --------------------------------------------------

In [39]:
# M11D01A05

# Inputs : df, [0, 1], cluster_max_range
# Set all random states to 1
#2nd input, 0 = no elbo plot, 1 = return elbow plot
# returns the best # of clusters 

def kmeans_cluster_check(df, plot, cluster_max_range):
    """
    Retures most probably # of Clusters, using KMeans.

    :param df: Dataframe to be used for KMeanse check.
    :param plot: [0 = No Elbow Plot]  [1 = Return Elbow Plot].
    :param cluster_max_range: Max value to be used for KMeanse run.
    :return: The highest % change key (not value) in a cluster check as an integer.
    """ 
    inertia_values = []
    k_values = list(range(1,11))
    for k in k_values:
        model = KMeans(n_clusters=k, n_init='auto', random_state=1)
        model.fit(df)
        inertia_values.append(model.inertia_)
    # display(inertia_values)

    if plot == 1:
        elbow_data = {"k": k_values, "inertia": inertia_values}
        df_elbow = pd.DataFrame(elbow_data)
        df_elbow.plot.line(
            x="k",
            y="inertia",
            xticks=k_values
        )

    #M11D01A04
    delta={}
    delta_numbers = k_values
    delta_values = inertia_values
    for d in range(1, len(inertia_values)):
        percentage_decrease = (delta_values[d-1] - delta_values[d]) / delta_values[d-1] * 100
        # print(f"Percentage decrease from k={delta_numbers[d-1]} to k={delta_numbers[d]}: {percentage_decrease:.2f}%")
        delta.update({d:percentage_decrease})
    # print(delta)
    return max(delta, key=delta.get)+1

### Application element - modle / fit / predict / append / Splitting used df into clusters
# cluster_model = KMeans(n_clusters=2, n_init='auto', random_state=23)
# cluster_model.fit(df_used)
# cluster = cluster_model.predict(df_used)
# df_used["cluster"] = cluster
# df_0 = df_used[df_used["cluster"] == 0]
# df_1 = df_used[df_used["cluster"] == 1]

# Logistic Regression ----------------------------------------------------------

In [31]:
def lr_run(data, iteration, random_st):
    """
    Performs Logistic Regression on Dataframe. Using the solvers of 
    'saga', 'sag', 'newton-cholesky', 'newton-cg', 'liblinear', and 'lbfgs'

    :param data: Dataframe to be used for KMeanse check.
    :param iteration: Max iteration for the Logistic Regresion model.
    :param random_st: Random State for the Logistic Regresion model.
    :return: Will cycle through the solver types and returning the closest Test/Train match.
    """ 
    X_train, X_test, y_train, y_test = data
    #lr_solver_types = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
    lr_solver_types = ['saga', 'sag', 'newton-cholesky', 'newton-cg', 'liblinear', 'lbfgs']    
    # lbfgs = default
    lr_threshold = 0
    print("Start LR Run")
    for solvers in lr_solver_types:
        #print(f"----{solvers}-----")
        lr_model = LogisticRegression(max_iter=iteration, random_state=random_st, solver=solvers)
        lr_model.fit(X_train, y_train)
        train = lr_model.score(X_train, y_train)
        test = lr_model.score(X_test, y_test)
        if lr_threshold == 0:
            lr_threshold = abs(test-train)
            preferred_lr_solver = solvers
            run_test = test
            run_train = train
            #print("initial")
        if lr_threshold > abs(test-train):
            preferred_lr_solver = solvers
            run_train = train
            run_test = test
            #print("improving threshold")
        #else:
            #print("Leaving threshold alone")
    display(f"-------- Logistic Regression // Solver : {preferred_lr_solver} --------")
    print('Train Accuracy: %.3f' % run_train)
    print('Test Accuracy: %.3f' % run_test)
    print("--------------------------------------------------------------------------")





# Support Vector Machine ------------------------------------------------------

In [32]:
### Get clusters from K-Means, and ron on them individually
# Rebuild with scaled data - relavent with binary data?


svm_kernel_types = ['linear', 'poly', 'rbf', 'sigmoid']
#rbf = default
# "precomputed" removed from list

svm_model = SVR(kernel='poly')
# svm_model.fit(X_train, y_train)
# print('Train Accuracy: %.3f' % svm_model.score(X_train, y_train))
# print('Test Accuracy: %.3f' % svm_model.score(X_test, y_test))

# K Nearest Neighbor -----------------------------------------------------------

In [33]:
#M13D02A01,02
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

# Switch to appended list?


def knn_run(data):
    """
    Performs K Nearest Neighbor analysis on Dataframe, using the neighbor counts of 5, 7, 9, and 11. 

    :param data: Dataframe to be used for KMeanse check.
    :return: Will cycle through the neighbor counts and returns the closest Test/Train match.
    """ 
    X_train, X_test, y_train, y_test = data
    knn_number_of_neighbors = [5, 7, 9, 11]
    knn_threshold = 0
    print("Start KNN Run")
    for n_o_ns in knn_number_of_neighbors:
        # print(f"---- #{n_o_ns}-----")
        knn_model = KNeighborsClassifier(n_neighbors = n_o_ns)
        knn_model.fit(X_train, y_train)
        train = knn_model.score(X_train, y_train)
        test = knn_model.score(X_test, y_test)
        if knn_threshold == 0:
            lr_threshold = abs(test-train)
            preferred_knn_number = n_o_ns
            run_test = test
            run_train = train
            # print("initial")
        if knn_threshold > abs(test-train):
            preferred_knn_number = n_o_ns
            run_train = train
            run_test = test
            # print("improving threshold")
        #else:
            # print("Leaving threshold alone")
    display(f"--------K Nearest Neighbors Run // Number {preferred_knn_number} --------")
    print('Train Accuracy: %.3f' % run_train)
    print('Test Accuracy: %.3f' % run_test)
    print("--------------------------------------------------------------------------")


# Random Forest ----------------------------------------------------------------

In [34]:
# M14D03A02
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

def rf_run(data):
    """
    Performs Random Forest analysis on Dataframe, using GridSearchCV. 
        "n_estimators": [100, 150, 200],  
        "max_depth" : [3, 5],  
        "random_state": [1]  

    :param data: Dataframe to be used for KMeanse check.
    :return: Will cycle through the Grid and return the closest Test/Train match.
    """ 
    print("Starting RF run")
    X_train, X_test, y_train, y_test = data
    from sklearn.model_selection import GridSearchCV
    rf_grid_model = RandomForestClassifier()
    rf_search_grid = {
        "n_estimators": [100, 150, 200],
        "max_depth" : [3, 5],
        "random_state": [1]
    }
    grid_rf_run = GridSearchCV(rf_grid_model, rf_search_grid, verbose=0)
    grid_rf_run.fit(X_train, y_train)
    print(f"------- Randome Forest Run // Parameters : {grid_rf_run.best_params_}")
    print('Train Accuracy: %.3f' % grid_rf_run.best_estimator_.score(X_train, y_train))
    print('Test Accuracy: %.3f' % grid_rf_run.best_estimator_.score(X_test, y_test))
    print("--------------------------------------------------------------------------")




# Main Run ---------------------------------------------------------------------

In [3]:
# https://archive.ics.uci.edu/dataset/891/cdc+diabetes+health+indicators

file_path_01 = "diabetes_012_health_indicators_BRFSS2015.csv"
file_path_02 = "diabetes_binary_health_indicators_BRFSS2015.csv"
file_path_03 = "diabetes_binary_5050split_health_indicators_BRFSS2015.csv"

file_path_used = file_path_03

df = pd.read_csv(file_path_used)
# for col in df.columns:
#     display(df[col].value_counts())

# Data processing elements : M13D03A02 // Unnneeded


# Data Switches ----------------------------------------------------------------

In [24]:
df_used = df.copy()


# df_used = df_used.drop(columns="Sex")
# df_used = df_used.drop(columns="Age")

# Results in 61% / 58%  62% -----------------------------------
# df_used = df_used.drop(columns="HighBP")
# df_used = df_used.drop(columns="BMI")
# df_used = df_used.drop(columns="Smoker")
# df_used = df_used.drop(columns="Stroke")
# df_used = df_used.drop(columns="HeartDiseaseorAttack")
# df_used = df_used.drop(columns="DiffWalk")


# Results in 72% -----------------------------------
# df_used = df_used.drop(columns="HvyAlcoholConsump")
# df_used = df_used.drop(columns="PhysActivity")
# df_used = df_used.drop(columns="GenHlth")
# df_used = df_used.drop(columns="MentHlth")
# df_used = df_used.drop(columns="PhysHlth")

# Results in 74% -----------------------------------
df_used = df_used.drop(columns="Education")
df_used = df_used.drop(columns="Income")

df_used = df_used.drop(columns="AnyHealthcare")
df_used = df_used.drop(columns="NoDocbcCost")

df_used = df_used.drop(columns="Fruits")
df_used = df_used.drop(columns="Veggies")

# Results in 74% -----------------------------------------------
df_used = df_used.drop(columns="CholCheck")
df_used = df_used.drop(columns="HighChol")

df_used.head()

Unnamed: 0,Diabetes_binary,HighBP,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age
0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,1.0,4.0
1,0.0,1.0,26.0,1.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,12.0
2,0.0,0.0,26.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,10.0,0.0,1.0,13.0
3,0.0,1.0,28.0,1.0,0.0,0.0,1.0,0.0,3.0,0.0,3.0,0.0,1.0,11.0
4,0.0,0.0,29.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0


# Splitting Clusters -----------------------------------------------------------

In [147]:
kmeans_cluster_check(df_used, 0, 5)

2

In [40]:
cluster_model = KMeans(n_clusters=2, n_init='auto', random_state=23)
cluster_model.fit(df_used)
cluster = cluster_model.predict(df_used)
df_used["cluster"] = cluster
df_0 = df_used[df_used["cluster"] == 0]
df_1 = df_used[df_used["cluster"] == 1]
# df_1

### Simulations run on seperate clusters (8/24/24), with no decernable change to final prediction values.
### Clusters left intact, but disconnected in the next step to allow the origional df_used to pass through to the final simulation.

In [36]:
# df_used = df_0

# Split ------------------------------------------------------------------------

In [41]:
if file_path_used == file_path_01:
    y = df_used["Diabetes_012"]
    X = df_used.copy().drop(columns="Diabetes_012")
else:
    y = df_used["Diabetes_binary"]
    X = df_used.copy().drop(columns="Diabetes_binary")

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=23)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=23)

# data = [X_train, X_test, y_train, y_test]

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


data = [X_train_scaled, X_test_scaled, y_train, y_test]



# AdaBoost - (Not included) ---------------------------------------------------- 

In [7]:
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html

print("Starting Ada run")
X_train, X_test, y_train, y_test = data
from sklearn.model_selection import GridSearchCV

ada_sub_rf_model = RandomForestClassifier()
ada_grid_model = AdaBoostRegressor(estimator=ada_sub_rf_model)
ada_search_grid = {
    #"estimator": "sklearn.tree.RandomForestClassifier",
    "n_estimators": [50], # [100, 150, 200]
    "random_state": [1],
    "learning_rate": [0.1, 1.0] # [0.01, 0.1, 1.0]
}
grid_ada_run = GridSearchCV(ada_grid_model, ada_search_grid, verbose=0, cv=2)
grid_ada_run.fit(X_train, y_train)

grid_ada_run_best = grid_ada_run.best_estimator_

print('Train Accuracy: %.3f' % grid_ada_run.score(X_train, y_train))
print('Test Accuracy: %.3f' % grid_ada_run.score(X_test, y_test))
#  M13D03A04 -- defining the general function

Starting Ada run
Train Accuracy: 0.745
Test Accuracy: -0.163


# Analysis ---------------------------------------------------------------------

In [38]:
lr_run(data, 200, 23)
# aprox. 2 min
knn_run(data)
# aprox. 3 min
rf_run(data)
# aprox. 3 min

Start LR Run


'-------- Logistic Regression // Solver : saga --------'

Train Accuracy: 0.739
Test Accuracy: 0.744
--------------------------------------------------------------------------
Start KNN Run


'--------K Nearest Neighbors Run // Number 11 --------'

Train Accuracy: 0.766
Test Accuracy: 0.723
--------------------------------------------------------------------------
Starting RF run
------- Randome Forest Run // Parameters : {'max_depth': 5, 'n_estimators': 150, 'random_state': 1}
Train Accuracy: 0.738
Test Accuracy: 0.740
--------------------------------------------------------------------------


In [None]:
### Model 1 // 6min 31sec
# Start LR Run
# '-------- Logistic Regression // Solver : liblinear --------'
# Train Accuracy: 0.846
# Test Accuracy: 0.847
# (w/o chart)
#   Train Accuracy: 0.846
#   Test Accuracy: 0.847
# (w/o chart, Cholesterol)
#   Train Accuracy: 0.845
#   Test Accuracy: 0.846
# --------------------------------------------------------------------------
# Start KNN Run
# '--------K Nearest Neighbors Run // Number 11 --------'
# Train Accuracy: 0.857
# Test Accuracy: 0.841
# (w/o chart)
#   Train Accuracy: 0.856
#   Test Accuracy: 0.841
# (w/o chart, Cholesterol)
#   Train Accuracy: 0.856
#   Test Accuracy: 0.841
# --------------------------------------------------------------------------
# Starting RF run
# ------- Randome Forest Run // Parameters : {'max_depth': 5, 'n_estimators': 150, 'random_state': 1}
# Train Accuracy: 0.844
# Test Accuracy: 0.845
# (w/o chart, estimators @ 200)
#   Train Accuracy: 0.844
#   Test Accuracy: 0.845
# (w/o chart, Cholesterol)
#   Train Accuracy: 0.844
#   Test Accuracy: 0.845
# --------------------------------------------------------------------------


In [None]:
### Model 2 // 6min 4sec
# Start LR Run
# '-------- Logistic Regression // Solver : saga --------'
# Train Accuracy: 0.863
# Test Accuracy: 0.865
# --------------------------------------------------------------------------
# Start KNN Run
# '--------K Nearest Neighbors Run // Number 11 --------'
# Train Accuracy: 0.873
# Test Accuracy: 0.858
# --------------------------------------------------------------------------
# Starting RF run
# ------- Randome Forest Run // Parameters : {'max_depth': 5, 'n_estimators': 150, 'random_state': 1}
# Train Accuracy: 0.861
# Test Accuracy: 0.862
# --------------------------------------------------------------------------

In [None]:
### Model 3 // 1min 4sec
# Start LR Run
# '-------- Logistic Regression // Solver : lbfgs --------'
# Train Accuracy: 0.747
# Test Accuracy: 0.749
# --------------------------------------------------------------------------
# Start KNN Run
# '--------K Nearest Neighbors Run // Number 11 --------'
# Train Accuracy: 0.765
# Test Accuracy: 0.722
# --------------------------------------------------------------------------
# Starting RF run
# ------- Randome Forest Run // Parameters : {'max_depth': 5, 'n_estimators': 200, 'random_state': 1}
# Train Accuracy: 0.744
# Test Accuracy: 0.743
# --------------------------------------------------------------------------

In [None]:
#  M13D03A04 -- defining the general function