# ADS 504 Team 1 Project
## Modeling Techniques Notebook

This notebook implements the modeling pipeline for predicting comorbidities in Type II diabetes respondents from the 2023 BRFSS dataset.

### Table of Contents
1. Setup
2. Data Loading & Preprocessing
3. Dimensionality Reduction & Feature Selection
4. Baseline & Simple Classifiers
5. Ensemble & Kernel Methods
6. Hyperparameter Tuning
7. Neural Networks & Deep Learning
8. Save Trained Models

In [1]:

# Section 1: Setup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

import joblib
import warnings, os, json
warnings.filterwarnings('ignore')
RANDOM_STATE = 42



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\darre\miniconda3\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\darre\miniconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\darre\miniconda3\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start
    self.io_loop.start()
  File "c:\Users\darre\miniconda3\Lib\site-

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\darre\miniconda3\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\darre\miniconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\darre\miniconda3\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start
    self.io_loop.start()
  File "c:\Users\darre\miniconda3\Lib\site-

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\darre\miniconda3\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\darre\miniconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\darre\miniconda3\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start
    self.io_loop.start()
  File "c:\Users\darre\miniconda3\Lib\site-

AttributeError: _ARRAY_API not found

ImportError: numpy.core.multiarray failed to import

In [1]:

# Section 2: Data Loading & Preprocessing
df = pd.read_csv('processed_data/diabetes_df.csv')
df_engineering = pd.read_csv('diabetes_df_engineered.csv')
print(f"Shape: {df.shape}")

# Identify target variable
TARGET = 'GENHLTH'  # <-- update with actual target column  #todo: replace with actual target column name
y = df[TARGET]
X = df.drop(columns=[TARGET])

# Drop columns with >95% missing
missing_pct = X.isna().mean()
drop_cols = missing_pct[missing_pct > 0.95].index.tolist()
print(f"Dropping {len(drop_cols)} cols with >95% missing")
X = X.drop(columns=drop_cols)

# Numeric / categorical split
num_cols = X.select_dtypes(include=['float64','int64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object','category']).columns.tolist()

from sklearn import impute
numeric_transformer = Pipeline(steps=[('imputer',  impute.SimpleImputer(strategy='median')),
                                      ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[('imputer', impute.SimpleImputer(strategy='most_frequent')),
                                          ('ohe', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE)
print(X_train.shape, X_test.shape)


NameError: name 'pd' is not defined

#### 3. Dimensionality Reduction & Feature Selection

In [19]:

from sklearn.decomposition import PCA
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

pca = PCA(n_components=0.95, random_state=RANDOM_STATE)
X_train_pca = pca.fit_transform(X_train_transformed)
X_test_pca = pca.transform(X_test_transformed)
print('PCA reduced shape:', X_train_pca.shape)


PCA reduced shape: (16055, 82)


#### 4. Baseline & Simple Classifiers

In [20]:
models = {
    'log_reg': LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
    'perceptron': Perceptron(random_state=RANDOM_STATE),
    'knn': KNeighborsClassifier(),
    'dtree': DecisionTreeClassifier(random_state=RANDOM_STATE)
}

results = {}
for name, model in models.items():
    clf = Pipeline(steps=[('pre', preprocessor),
                         ('model', model)])
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_proba = clf.predict_proba(X_test)[:,1] if hasattr(clf, 'predict_proba') else None
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba) if y_proba is not None else np.nan
    print(f"{name}: acc={acc:.3f}, auc={auc:.3f}")
    results[name] = {'acc': acc, 'auc': auc}
    joblib.dump(clf, f'model_{name}.joblib')

with open('model_results.json', 'w') as f:
    json.dump(results, f)


ValueError: multi_class must be in ('ovo', 'ovr')

#### 5. Ensemble & Kernel Methods

In [None]:

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

ensemble_models = {
    'rf': RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE, n_jobs=-1),
    'gbc': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'ada': AdaBoostClassifier(random_state=RANDOM_STATE),
    'xgb': XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss'),
    'svm_rbf': SVC(kernel='rbf', probability=True, random_state=RANDOM_STATE)
}

for name, model in ensemble_models.items():
    clf = Pipeline(steps=[('pre', preprocessor),
                         ('model', model)])
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_proba = clf.predict_proba(X_test)[:,1]
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    print(f"{name}: acc={acc:.3f}, auc={auc:.3f}")
    results[name] = {'acc': acc, 'auc': auc}
    joblib.dump(clf, f'model_{name}.joblib')

with open('model_results.json', 'w') as f:
    json.dump(results, f)


#### 6. Hyperparameter Tuning

In [None]:

param_grid = {
    'model__n_estimators': [100, 300, 500],
    'model__max_depth': [None, 5, 10]
}
rf_pipeline = Pipeline(steps=[('pre', preprocessor),
                             ('model', RandomForestClassifier(random_state=RANDOM_STATE))])
grid = GridSearchCV(rf_pipeline, param_grid, cv=3, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train, y_train)
print('Best RF params:', grid.best_params_, 'AUC:', grid.best_score_)
joblib.dump(grid.best_estimator_, 'model_rf_tuned.joblib')


#### 7. Neural Networks & Deep Learning

In [None]:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.metrics import AUC

X_train_np = preprocessor.fit_transform(X_train).toarray()
X_test_np = preprocessor.transform(X_test).toarray()

model_nn = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_np.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])
model_nn.compile(optimizer='adam',
                 loss='binary_crossentropy',
                 metrics=[AUC(name='auc')])
history = model_nn.fit(X_train_np, y_train, epochs=10, batch_size=256,
                       validation_split=0.2, verbose=0)
nn_auc = model_nn.evaluate(X_test_np, y_test, verbose=0)[1]
print('Neural Net AUC:', nn_auc)
model_nn.save('model_nn.h5')
results['nn'] = {'acc': np.nan, 'auc': nn_auc}

with open('model_results.json', 'w') as f:
    json.dump(results, f)


#### 8. Save Trained Models

In [None]:

pd.DataFrame(results).T.sort_values('auc', ascending=False)
