# SAMHSA MH-CLD

## Environment

In [1]:
# Seed the random number generators for reproducibility.
# random.seed() only affects Python's built-in `random` module; scikit-learn
# objects left at random_state=None draw from NumPy's global generator, so that
# generator is seeded with the same value as well.
import random

import numpy as np

random.seed(493)
np.random.seed(493)

import os

# for manipulating dataframes
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, make_scorer, accuracy_score

from xgboost import XGBClassifier

In [2]:
# Show the value of every expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Lift pandas' display limits on column count, row count and cell width
for display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [3]:
# Load the CSV extract into a DataFrame
df = pd.read_csv('../data/2020_to_2022.csv')

In [4]:
# Inspect the dtype pandas inferred for each column
df.dtypes

_year          int64
AGE            int64
RACE           int64
GENDER         int64
MH1            int64
MH2            int64
MH3            int64
SUB            int64
MARSTAT        int64
SMISED         int64
SAP            int64
EMPLOY         int64
DETNLF         int64
VETERAN        int64
LIVARAG        int64
TRAUSTREFLG    int64
ANXIETYFLG     int64
ADHDFLG        int64
CONDUCTFLG     int64
DELIRDEMFLG    int64
BIPOLARFLG     int64
DEPRESSFLG     int64
ODDFLG         int64
PDDFLG         int64
PERSONFLG      int64
SCHIZOFLG      int64
ALCSUBFLG      int64
OTHERDISFLG    int64
STATEFIP       int64
REGION         int64
_caseid        int64
dtype: object

In [5]:
# Preview the first five rows of the loaded data
df.head()

Unnamed: 0,_year,AGE,RACE,GENDER,MH1,MH2,MH3,SUB,MARSTAT,SMISED,SAP,EMPLOY,DETNLF,VETERAN,LIVARAG,TRAUSTREFLG,ANXIETYFLG,ADHDFLG,CONDUCTFLG,DELIRDEMFLG,BIPOLARFLG,DEPRESSFLG,ODDFLG,PDDFLG,PERSONFLG,SCHIZOFLG,ALCSUBFLG,OTHERDISFLG,STATEFIP,REGION,_caseid
0,2020,14,3,1,5,11,-9,-9,1,1,1,-9,-9,-9,3,0,0,0,0,1,0,0,0,0,0,1,0,0,1,3,20200000001
1,2020,1,1,1,13,2,3,-9,1,2,2,-9,-9,-9,-9,0,1,1,0,0,0,0,0,0,0,0,0,1,1,3,20200000002
2,2020,3,6,2,1,2,-9,-9,-9,2,2,5,5,-9,2,1,1,0,0,0,0,0,0,0,0,0,0,0,1,3,20200000003
3,2020,12,1,1,11,10,-9,5,-9,1,1,2,-9,-9,3,0,0,0,0,0,0,0,0,0,1,1,0,0,1,3,20200000004
4,2020,8,5,1,10,-9,-9,10,-9,1,1,1,-9,-9,3,0,0,0,0,0,0,0,0,0,1,0,0,0,1,3,20200000005


In [6]:
# Remove the _caseid column from the working frame
df = df.drop(columns=['_caseid'])

In [7]:
# Preview again to confirm that _caseid is no longer present
df.head()

Unnamed: 0,_year,AGE,RACE,GENDER,MH1,MH2,MH3,SUB,MARSTAT,SMISED,SAP,EMPLOY,DETNLF,VETERAN,LIVARAG,TRAUSTREFLG,ANXIETYFLG,ADHDFLG,CONDUCTFLG,DELIRDEMFLG,BIPOLARFLG,DEPRESSFLG,ODDFLG,PDDFLG,PERSONFLG,SCHIZOFLG,ALCSUBFLG,OTHERDISFLG,STATEFIP,REGION
0,2020,14,3,1,5,11,-9,-9,1,1,1,-9,-9,-9,3,0,0,0,0,1,0,0,0,0,0,1,0,0,1,3
1,2020,1,1,1,13,2,3,-9,1,2,2,-9,-9,-9,-9,0,1,1,0,0,0,0,0,0,0,0,0,1,1,3
2,2020,3,6,2,1,2,-9,-9,-9,2,2,5,5,-9,2,1,1,0,0,0,0,0,0,0,0,0,0,0,1,3
3,2020,12,1,1,11,10,-9,5,-9,1,1,2,-9,-9,3,0,0,0,0,0,0,0,0,0,1,1,0,0,1,3
4,2020,8,5,1,10,-9,-9,10,-9,1,1,1,-9,-9,3,0,0,0,0,0,0,0,0,0,1,0,0,0,1,3


In [8]:
# Separate the prediction target (ALCSUBFLG) from the predictor columns.
# NOTE(review): MH1-MH3, SUB and SAP stay in X; confirm against the codebook
# that none of them encodes the target, to rule out leakage.
y = df['ALCSUBFLG']
X = df.drop(columns=['ALCSUBFLG'])

In [9]:
# List df's columns; df itself still contains the target ALCSUBFLG
df.columns

Index(['_year', 'AGE', 'RACE', 'GENDER', 'MH1', 'MH2', 'MH3', 'SUB', 'MARSTAT',
       'SMISED', 'SAP', 'EMPLOY', 'DETNLF', 'VETERAN', 'LIVARAG',
       'TRAUSTREFLG', 'ANXIETYFLG', 'ADHDFLG', 'CONDUCTFLG', 'DELIRDEMFLG',
       'BIPOLARFLG', 'DEPRESSFLG', 'ODDFLG', 'PDDFLG', 'PERSONFLG',
       'SCHIZOFLG', 'ALCSUBFLG', 'OTHERDISFLG', 'STATEFIP', 'REGION'],
      dtype='object')

In [17]:
# Identify binary categorical features (columns ending with FLG)
binary_features = [col for col in X.columns if col.endswith('FLG')]

# Every remaining column is treated as a categorical code
categorical_features = [col for col in X.columns if col not in binary_features]

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        # One-hot encode the coded columns. handle_unknown='ignore' makes a code
        # that was absent from the rows used for fitting encode as all zeros at
        # transform time instead of raising, so a rare code that appears only in
        # a validation fold or in the test set cannot abort scoring.
        ('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), categorical_features),
        ('binary', 'passthrough', binary_features)  # Pass binary features as-is
    ]
)


In [18]:
# Candidate classifiers, keyed by the display name used in the results table.
# The tree-based models get an explicit random_state (the same seed used for the
# train/test split and the CV folds): random.seed() does not reach their internal
# random number generators, so without it their scores can change between runs.
models = {
    'Logistic Regression': LogisticRegression(),
    'Naive Bayes': GaussianNB(),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=493),
    'Random Forest': RandomForestClassifier(random_state=493),
    # use_label_encoder is no longer passed: current XGBoost releases do not use
    # it and only emit a warning when it is supplied.
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=493)
}

In [19]:
# Empty results table: one row per model will hold its name and scores
metrics_tally = pd.DataFrame(
    columns=[
        'Model',
        'Accuracy',
        'Precision',
        'Recall',
    ],
)

In [20]:
# Hold out 20% of the rows for final evaluation, stratifying on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=493,
)

In [21]:
# Five shuffled, class-stratified folds with a fixed seed for cross-validation
kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=493,
)

In [None]:
# Evaluate each model with cross-validation on the training split.
# All three metrics are scored from the same fitted folds in one cross_validate
# call, so every model is fitted once per fold rather than once per fold per
# metric as with three separate cross_val_score calls.
scoring = {
    'Accuracy': 'accuracy',
    'Precision': make_scorer(precision_score),
    'Recall': make_scorer(recall_score),
}

metric_rows = []
for model_name, model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    cv_results = cross_validate(pipeline, X_train, y_train, cv=kfold, scoring=scoring)

    # cross_validate stores each scorer's per-fold values under 'test_<name>'.
    # A fold whose fit failed is scored as NaN, which propagates into the mean,
    # so a NaN in the table signals that at least one fold did not fit.
    row = {'Model': model_name}
    for metric_name in scoring:
        row[metric_name] = cv_results[f'test_{metric_name}'].mean()
    metric_rows.append(row)

# Build the results table once from the collected rows. This avoids growing a
# DataFrame with pd.concat inside the loop and makes the cell safe to re-run
# (results are replaced instead of appended a second time).
metrics_tally = pd.DataFrame(metric_rows, columns=['Model', 'Accuracy', 'Precision', 'Recall'])

1 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Dd\OneDrive\Documents\_github\samhsa-mh-cld\v312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Dd\OneDrive\Documents\_github\samhsa-mh-cld\v312\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Dd\OneDrive\Documents\_github\samhsa-mh-cld\v312\Lib\site-packages\sklearn\pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)

In [None]:
# Display the per-model cross-validation summary
metrics_tally