In [3]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Read in csv
data = pd.read_csv('data/diabetic_data_formatted.csv')


# Remove the columns that have ~0.5 or more '?'
data.drop(['weight', 'medical_specialty'], axis=1, inplace=True)

# Replace <30 or >30 days readmission to YES
data['readmitted'] = data['readmitted'].replace(1, 0)
data['readmitted'] = data['readmitted'].replace(0, 0)

# Select target column to predict
X = data.drop(columns=['readmitted'])
y = data['readmitted']


# Get the unique class names from the target variable
class_names = ['YES', 'NO']

X_encoded = X
y_encoded = y

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.25, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_reduced = X.sample(frac=0.02, random_state=42) 
y_reduced = y[X_reduced.index]

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_reduced, y_reduced, test_size=0.25, random_state=42)

models = {
    "Random Forest": RandomForestClassifier(
        n_estimators=80,
        criterion='gini',
        max_depth=50,
        class_weight='balanced',
        min_samples_leaf=20,
        max_features='sqrt',
        bootstrap = False,
        verbose=0,
        ccp_alpha=0.0,
        max_leaf_nodes=None,
        max_samples=None,
        min_impurity_decrease=0.0,
        monotonic_cst=None,
        oob_score=False,
        random_state=None,
        warm_start=False,
        min_samples_split=2
    ),
    "Neural Network": MLPClassifier(hidden_layer_sizes=(100, 1),alpha=0.1, max_iter=200),
    "SVM": SVC(C=1, class_weight=None, gamma='scale', kernel='rbf') 
}

results = {
    "Model": [],
    "Accuracy": [],
    "Precision": [],
    "Recall": [],
    "F1 Score": [],
    "Execution Time": []
}
for model_name, model in models.items():
    start = time.time()
    if model_name == "Linear Regression":
        continue

    if model_name in ["Perceptron", "Neural Network"]:
        model.fit(X_train_scaled, y_train)
    elif model_name == "SVM":
          model.fit(X_train_r, y_train_r)
    else:
        model.fit(X_train, y_train)


    if model_name == "SVM":
        y_pred = model.predict(X_test_r)
        y_test = y_test_r
    else:
        y_pred = model.predict(X_test_scaled if model_name in ["Perceptron", "Neural Network"] else X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    end = time.time()

    results["Model"].append(model_name)
    results["Accuracy"].append(accuracy)
    results["Precision"].append(precision)
    results["Recall"].append(recall)
    results["F1 Score"].append(f1)
    results["Execution Time"].append(abs(start-end))


results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="Accuracy", ascending=False)

print(results_df)

            Model  Accuracy  Precision    Recall  F1 Score  Execution Time
0   Random Forest  0.645711   0.645168  0.645711  0.645330       10.773701
1  Neural Network  0.630146   0.628876  0.630146  0.628291       49.099970
2             SVM  0.582150   0.507498  0.582150  0.437021        0.193229
