# SAMSHA MH-CLD

## Environment

In [22]:
# setting the random seed for reproducibility
import random
random.seed(493)

import os

# for manipulating dataframes
import pandas as pd

import pickle
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, make_scorer, accuracy_score
from sklearn.metrics import roc_auc_score, roc_curve

# For visualizations
import matplotlib.pyplot as plt

In [2]:
# to print out all the outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [3]:
# Read the Parquet file back into a DataFrame
df = pd.read_csv('../data/ml/2022_balanced.csv')

In [4]:
df.dtypes

AGE            int64
RACE           int64
GENDER         int64
MH1            int64
MH2            int64
MH3            int64
MARSTAT        int64
SMISED         int64
EMPLOY         int64
VETERAN        int64
LIVARAG        int64
TRAUSTREFLG    int64
ANXIETYFLG     int64
ADHDFLG        int64
CONDUCTFLG     int64
DELIRDEMFLG    int64
BIPOLARFLG     int64
DEPRESSFLG     int64
ODDFLG         int64
PDDFLG         int64
PERSONFLG      int64
SCHIZOFLG      int64
ALCSUBFLG      int64
OTHERDISFLG    int64
STATEFIP       int64
REGION         int64
dtype: object

In [5]:
df.head()

Unnamed: 0,AGE,RACE,GENDER,MH1,MH2,MH3,MARSTAT,SMISED,EMPLOY,VETERAN,LIVARAG,TRAUSTREFLG,ANXIETYFLG,ADHDFLG,CONDUCTFLG,DELIRDEMFLG,BIPOLARFLG,DEPRESSFLG,ODDFLG,PDDFLG,PERSONFLG,SCHIZOFLG,ALCSUBFLG,OTHERDISFLG,STATEFIP,REGION
0,1,3,2,999,999,999,999,2,999,999,999,0,0,0,0,0,0,0,0,0,0,0,0,0,34,1
1,8,6,2,999,999,999,999,1,999,999,999,0,0,0,0,0,0,0,0,0,0,0,0,0,6,4
2,3,5,2,1,2,999,999,2,999,999,2,1,1,0,0,0,0,0,0,0,0,0,0,0,27,2
3,2,6,2,13,999,999,999,2,999,999,999,0,0,0,0,0,0,0,0,0,0,0,0,1,6,4
4,7,5,2,999,999,999,3,999,999,2,999,0,0,0,0,0,0,0,0,0,0,0,0,0,47,3


In [6]:
# Splitting features and target
X = df.drop('ALCSUBFLG', axis=1)
y = df['ALCSUBFLG']

In [7]:
df.columns

Index(['AGE', 'RACE', 'GENDER', 'MH1', 'MH2', 'MH3', 'MARSTAT', 'SMISED',
       'EMPLOY', 'VETERAN', 'LIVARAG', 'TRAUSTREFLG', 'ANXIETYFLG', 'ADHDFLG',
       'CONDUCTFLG', 'DELIRDEMFLG', 'BIPOLARFLG', 'DEPRESSFLG', 'ODDFLG',
       'PDDFLG', 'PERSONFLG', 'SCHIZOFLG', 'ALCSUBFLG', 'OTHERDISFLG',
       'STATEFIP', 'REGION'],
      dtype='object')

In [8]:
# Identify binary categorical features (columns ending with FLG)
binary_features = [col for col in X.columns if col.endswith('FLG')]

# Identify other categorical features
categorical_features = [col for col in X.columns if col not in binary_features]

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_features),  # Encode categorical columns
        ('binary', 'passthrough', binary_features)  # Pass binary features as-is
    ]
)


In [9]:
# Models to evaluate
models = {
    'Logistic Regression': LogisticRegression(),
    'Naive Bayes': GaussianNB(),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

In [10]:
# DataFrame to track metrics
metrics_tally = pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall'])

In [11]:
# Train/test split for final evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=493, stratify=y)

In [12]:
# Cross-validation setup
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=493)

In [13]:
# Evaluate each model
for model_name, model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    
    # Cross-validation scores
    accuracy = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='accuracy').mean()
    
    # Custom scoring for precision and recall
    precision = cross_val_score(pipeline, X_train, y_train, cv=kfold,
                                 scoring=make_scorer(precision_score)).mean()
    recall = cross_val_score(pipeline, X_train, y_train, cv=kfold,
                              scoring=make_scorer(recall_score)).mean()
    
    # Append results to the DataFrame using pd.concat
    metrics_tally = pd.concat([
        metrics_tally,
        pd.DataFrame([{
            'Model': model_name,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall
        }])
    ], ignore_index=True)

  metrics_tally = pd.concat([
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



In [14]:
metrics_tally

Unnamed: 0,Model,Accuracy,Precision,Recall
0,Logistic Regression,1.0,1.0,1.0
1,Naive Bayes,0.841685,0.759522,1.0
2,KNN,0.98697,0.979323,0.994947
3,Decision Tree,1.0,1.0,1.0
4,Random Forest,1.0,1.0,1.0
5,XGBoost,1.0,1.0,1.0


In [15]:
# Final evaluation on the test set for the best model (optional)
# Example with KNN
best_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier())
])
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

final_accuracy = accuracy_score(y_test, y_pred)
final_precision = precision_score(y_test, y_pred)
final_recall = recall_score(y_test, y_pred)

print(f"Final Test Set Performance (Logistic Regression):\n"
      f"Accuracy: {final_accuracy}, Precision: {final_precision}, Recall: {final_recall}")

Final Test Set Performance (Logistic Regression):
Accuracy: 0.9879433315688296, Precision: 0.9812196897882922, Recall: 0.9949293734154292


In [19]:
with open("../models/2022_balanced_knn_model.pkl", 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Predict probabilities for the test set
y_prob = best_model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Calculate AUC-ROC
auc_score = roc_auc_score(y_test, y_prob)
print(f"AUC-ROC Score: {auc_score}")

# Compute ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', label=f'ROC Curve (AUC = {auc_score:.2f})')
plt.plot([0, 1], [0, 1], color='red', linestyle='--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.grid()
plt.show()