In [2]:
import pandas as pd
import numpy as np
import pickle, os
import seaborn as sns
import matplotlib.pyplot as plt

# Utils
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.compose import ColumnTransformer

# Models
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [4]:
DATA_PATH = 'data/cleaned_combined_gender.csv'
MODELS_PATH = 'models/diabetes_a1c/'
RESULTS_PATH = 'results/diabetes/'
os.makedirs(MODELS_PATH, exist_ok = True)
os.makedirs(RESULTS_PATH, exist_ok = True)

vars = pd.read_csv('variables.csv')
cat_cols = vars[vars['Variable Type'].fillna('-').str.contains('Cat')]['Variable Common Name'].values

In [5]:
df = pd.read_csv(DATA_PATH)

df = df[df['Has diabetes'].isin([1, 2])] # Keep only those with 'Has diabtes' in [1, 2] (Yes / No)
df = df.dropna(subset=['Has diabetes']) # y cannot have any missing values
assert df.isna().sum().sum() == 0

X = df[['Glycohemoglobin (%)  (AIC)']]
y = (df['Has diabetes'] == 1).astype('category')

cat_cols = [col for col in cat_cols if col in X.columns]
num_cols = [col for col in X.columns if col not in cat_cols]

X = pd.get_dummies(X, columns = cat_cols, drop_first = True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42, stratify = y)

In [6]:
pipe_sel_from_model = lambda model, param_grid: GridSearchCV(

    make_pipeline(
        # Standardize numerical features
        ColumnTransformer(transformers = [('num', StandardScaler(), num_cols)], remainder = 'passthrough'),
        SelectFromModel(RandomForestClassifier(n_jobs = -1, random_state = 42)), # select most important features
        model
    ),

    # Search for the best no. of features to select and model hyperparameters
    param_grid = param_grid,

    # Use 5-fold cross validation
    cv = 5,

    # Refit the model with the best parameters on all the data
    # (will be stored in the best_estimator_ attribute)
    refit = True,

    # Use area under roc curve to evaluate best params
    scoring = 'roc_auc'
)

In [7]:
max_features = [1]
model_and_params = [
    (
        DummyClassifier(strategy = 'most_frequent'),
        {
            'selectfrommodel__max_features': max_features,
        }
    ),
    (
        DecisionTreeClassifier(random_state = 42),
        {
            'selectfrommodel__max_features': max_features,
            'decisiontreeclassifier__max_depth': [None, 10, 100],
            'decisiontreeclassifier__min_samples_split': [2, 4],
            'decisiontreeclassifier__min_samples_leaf': [1, 2],
        }
    ),
    (
        RandomForestClassifier(n_jobs = -1, random_state = 42),
        {
            'selectfrommodel__max_features': max_features,
            'randomforestclassifier__n_estimators': [100, 500, 1000, 2000],
            'randomforestclassifier__max_depth': [None, 10, 100],
            'randomforestclassifier__min_samples_split': [2, 4],
            'randomforestclassifier__min_samples_leaf': [1, 2],
        }
    ),
    (
        LogisticRegression(max_iter = 1000, random_state = 42),
        {
            'selectfrommodel__max_features': max_features,
            'logisticregression__C': [0.1, 0.5, 1, 10],
            'logisticregression__class_weight': [None, 'balanced']
        }
    ),
    (
        KNeighborsClassifier(),
        {
            'selectfrommodel__max_features': max_features,
            'kneighborsclassifier__n_neighbors': [1, 3, 5, 10, 50, 100, 200],
        }
    ),
    (
        LinearDiscriminantAnalysis(),
        {
            'selectfrommodel__max_features': max_features,
        }
    ),
    (
        QuadraticDiscriminantAnalysis(),
        {
            'selectfrommodel__max_features': max_features,
        }
    )
]

In [10]:
for model, param_grid in model_and_params:

    print(f'{model.__class__.__name__} ...')

    p = pipe_sel_from_model(model, param_grid)
    p.fit(X_train, y_train)
    print('Best params:', p.best_params_)

    # Save the GridSearchCV object
    with open(os.path.join(MODELS_PATH, f'{model.__class__.__name__}.pkl'), 'wb') as f:
        pickle.dump(p, f)

DummyClassifier ...
Best params: {'selectfrommodel__max_features': 1}
DecisionTreeClassifier ...
Best params: {'decisiontreeclassifier__max_depth': None, 'decisiontreeclassifier__min_samples_leaf': 2, 'decisiontreeclassifier__min_samples_split': 2, 'selectfrommodel__max_features': 1}
RandomForestClassifier ...
Best params: {'randomforestclassifier__max_depth': 10, 'randomforestclassifier__min_samples_leaf': 2, 'randomforestclassifier__min_samples_split': 2, 'randomforestclassifier__n_estimators': 100, 'selectfrommodel__max_features': 1}
LogisticRegression ...
Best params: {'logisticregression__C': 0.1, 'logisticregression__class_weight': None, 'selectfrommodel__max_features': 1}
KNeighborsClassifier ...
Best params: {'kneighborsclassifier__n_neighbors': 200, 'selectfrommodel__max_features': 1}
LinearDiscriminantAnalysis ...
Best params: {'selectfrommodel__max_features': 1}
QuadraticDiscriminantAnalysis ...
Best params: {'selectfrommodel__max_features': 1}


In [9]:
results = []

for model_name in os.listdir(MODELS_PATH):

    # Load the GridSearchCV object
    with open(os.path.join(MODELS_PATH, model_name), 'rb') as f:
        grid_cv = pickle.load(f)

    # Columns selected by the SelectFromModel
    selected_cols = [c for c, s in zip(X_train.columns, grid_cv.best_estimator_[1].get_support()) if s]
    
    # Get the best score and parameters
    best_cv_score = grid_cv.best_score_
    best_params = grid_cv.best_params_
    preds = grid_cv.best_estimator_[-1].predict(StandardScaler().fit_transform(X_test[selected_cols]))
    test_score = roc_auc_score(y_test, preds)
    selected_features = grid_cv.best_estimator_.named_steps['selectfrommodel'].get_support().sum()

    results.append((model_name.split('.')[0], best_cv_score, test_score, selected_features))

results = pd.DataFrame(results, columns = ['Model', 'Best CV Score', 'Test Score', 'No. of selected vars']).sort_values('Test Score', ascending = False)
results.to_csv(os.path.join(RESULTS_PATH, 'results_diabetes_a1c.csv'), index = False)
results

Unnamed: 0,Model,Best CV Score,Test Score,No. of selected vars
3,RandomForestClassifier,0.919454,0.85214,1
4,DecisionTreeClassifier,0.917767,0.85214,1
0,KNeighborsClassifier,0.924843,0.826932,1
2,LogisticRegression,0.927405,0.766905,1
6,QuadraticDiscriminantAnalysis,0.921412,0.766905,1
5,LinearDiscriminantAnalysis,0.927405,0.725238,1
1,DummyClassifier,0.5,0.5,1
