2,500,000 Sample

In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_val_predict
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler

In [2]:
##Data Exploration and Preprocessing

In [3]:
data= pd.read_csv('sample2.csv')

In [4]:
df = data.drop(columns=['CASEID', 'CMPSERVICE', 'IJSSERVICE', 
                        'OPISERVICE','RTCSERVICE', 'SPHSERVICE', 'YEAR',
                        'TRAUSTREFLG', 'ANXIETYFLG', 
                        'ADHDFLG', 'CONDUCTFLG', 'DELIRDEMFLG', 
                        'BIPOLARFLG', 'DEPRESSFLG', 'ODDFLG', 
                        'PDDFLG', 'PERSONFLG', 'SCHIZOFLG', 
                        'ALCSUBFLG', 'OTHERDISFLG',
                        'DIVISION', 'REGION'])

In [5]:
def label_risk(row):
    if row['SMISED'] == 1:
        return 1
    if row['SMISED'] == 2:
        return 1
    if row['SMISED'] == 3:
        return 0
    else:
        return float('NaN')

In [6]:
df['mh_risk'] = df.apply(label_risk, axis=1)
df = df.dropna()
df = df.astype('int64')
df = df.drop(columns=['SMISED'])

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2343299 entries, 0 to 2499999
Data columns (total 18 columns):
 #   Column    Dtype
---  ------    -----
 0   AGE       int64
 1   EDUC      int64
 2   ETHNIC    int64
 3   RACE      int64
 4   GENDER    int64
 5   MH1       int64
 6   MH2       int64
 7   MH3       int64
 8   SUB       int64
 9   MARSTAT   int64
 10  SAP       int64
 11  EMPLOY    int64
 12  DETNLF    int64
 13  VETERAN   int64
 14  LIVARAG   int64
 15  NUMMHS    int64
 16  STATEFIP  int64
 17  mh_risk   int64
dtypes: int64(18)
memory usage: 339.7 MB


In [8]:
X = df.drop(columns=['mh_risk'])
X.head()

Unnamed: 0,AGE,EDUC,ETHNIC,RACE,GENDER,MH1,MH2,MH3,SUB,MARSTAT,SAP,EMPLOY,DETNLF,VETERAN,LIVARAG,NUMMHS,STATEFIP
0,8,-9,3,-9,1,-9,-9,-9,-9,-9,2,-9,-9,-9,-9,0,6
1,7,5,4,5,2,11,-9,-9,-9,1,2,4,-9,2,2,1,24
2,14,5,4,5,2,10,13,-9,8,4,1,5,1,2,2,2,18
3,3,4,4,5,1,7,-9,-9,-9,-9,-9,4,-9,-9,2,1,20
4,11,5,4,5,2,6,13,-9,-9,4,2,-9,-9,2,2,2,34


In [9]:
y = df['mh_risk'].values

In [10]:
##Split dataset into train and test data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

In [11]:
scaler = StandardScaler().fit(X_train)
X_scaled = scaler.transform(X_train)

In [12]:
model_list = ['DecisionTree','RandomForest','Logistic Regression', 'Gradient Boosting Regression', 'Artificial Neural Network' ]
score=[]

In [13]:
def get_metrics(y_test, y_pred):
    print("F1 score:", f1_score(y_test, y_pred, average='binary'))
    print("ROC AUC:", roc_auc_score(y_test, y_pred))
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("Confusion Matrix:", confusion_matrix(y_test, y_pred))

In [14]:
#DT
dt_clf = DecisionTreeClassifier(max_leaf_nodes=10, random_state=0, criterion='entropy')
dt_clf.fit(X_train, y_train)
y_pred=pd.DataFrame(dt_clf.predict_proba(X_test))[1].values
score.append(roc_auc_score(y_test,y_pred))

np.array(score)

array([0.7706861])

In [15]:
#DT Metrics
y_pred_dt = cross_val_predict(dt_clf, X_test, y_test, cv=5)
get_metrics(y_test, y_pred_dt)

F1 score: 0.8580258786270785
ROC AUC: 0.6140974442973779
Accuracy: 0.7700465155976615
Precision: 0.7897237169802628
Recall: 0.9392613529844397
Confusion Matrix: [[ 35234  86711]
 [ 21059 325656]]


In [16]:
# RF
rf_clf = RandomForestClassifier()      
rf_clf.fit(X_train,y_train)
y_pred=pd.DataFrame(rf_clf.predict_proba(X_test))[1].values

score.append(roc_auc_score(y_test,y_pred))

np.array(score)

array([0.7706861, 0.9301789])

In [17]:
#RF Metrics
y_pred_rf = cross_val_predict(rf_clf, X_test, y_test, cv=5)
get_metrics(y_test, y_pred_rf)

F1 score: 0.9127869968030736
ROC AUC: 0.8136644547531631
Accuracy: 0.8687385311313105
Precision: 0.8975993308239231
Recall: 0.9284974691028655
Confusion Matrix: [[ 85219  36726]
 [ 24791 321924]]


In [18]:
#LR
lr_clf = LogisticRegression(solver='lbfgs', max_iter=1000)
lr_clf.fit(X_scaled,y_train)
y_pred=pd.DataFrame(lr_clf.predict_proba(X_test))[1].values
score.append(roc_auc_score(y_test,y_pred))

np.array(score)



array([0.7706861 , 0.9301789 , 0.60677017])

In [19]:
#Evaluate LR
y_pred_lr = cross_val_predict(lr_clf, X_test, y_test, cv=5)
get_metrics(y_test, y_pred_lr)

F1 score: 0.8485110470701249
ROC AUC: 0.5015719304619223
Accuracy: 0.7375368070669569
Precision: 0.7404103038053582
Recall: 0.9935739728595533
Confusion Matrix: [[  1167 120778]
 [  2228 344487]]


In [20]:
gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train)

y_pred=pd.DataFrame(gbr.predict(X_test))
score.append(roc_auc_score(y_test,y_pred))

np.array(score)

array([0.7706861 , 0.9301789 , 0.60677017, 0.88707568])

In [21]:
#Evaluate gbr
y_pred_gbr = cross_val_predict(gbr, X_test, y_test, cv=5)
get_metrics(y_test, y_pred_gbr)

ValueError: Classification metrics can't handle a mix of binary and continuous targets

GBR not suitable for evaluation

In [22]:
#ANN
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()
model.add(Dense(32, activation ="relu"))
model.add(Dense(8, activation="relu"))

#Output Layer
model.add(Dense(1))

model.compile(loss="mse", optimizer="Adam", metrics=["mse"])
model.summary()

In [None]:
history = model.fit(X_train, y_train, epochs=30, validation_split=0.1)
pred_y = model.predict(X_test)
score.append(roc_auc_score(y_test,pred_y))

np.array(score)

Epoch 1/30
[1m52725/52725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 245us/step - loss: 0.2075 - mse: 0.2075 - val_loss: 0.1299 - val_mse: 0.1299
Epoch 2/30
[1m52725/52725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 244us/step - loss: 0.1299 - mse: 0.1299 - val_loss: 0.1253 - val_mse: 0.1253
Epoch 3/30
[1m52725/52725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 244us/step - loss: 0.1261 - mse: 0.1261 - val_loss: 0.1251 - val_mse: 0.1251
Epoch 4/30
[1m52725/52725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 243us/step - loss: 0.1234 - mse: 0.1234 - val_loss: 0.1229 - val_mse: 0.1229
Epoch 5/30
[1m52725/52725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 244us/step - loss: 0.1221 - mse: 0.1221 - val_loss: 0.1210 - val_mse: 0.1210
Epoch 6/30
[1m52725/52725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 244us/step - loss: 0.1210 - mse: 0.1210 - val_loss: 0.1198 - val_mse: 0.1198
Epoch 7/30
[1m52725/52725[0m [32m━━━━━━━━━━

In [None]:
#Evaluate ann
y_pred_ann = cross_val_predict(model, X_test, y_test, cv=5)
get_metrics(y_test, y_pred_ann)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.rcParams['figure.figsize']=20,8
sns.set_style('darkgrid')
ax = sns.barplot(x=model_list, y=score, palette = "husl", saturation =2.0)
plt.xlabel('Classifier Models', fontsize = 20 )
plt.ylabel('Probability', fontsize = 20)
plt.title('AUC-ROC of different Classifier Models', fontsize = 20)
plt.xticks(fontsize = 12, horizontalalignment = 'center', rotation = 8)
plt.yticks(fontsize = 12)
for i in ax.patches:
    width, height = i.get_width(), i.get_height()
    x, y = i.get_xy() 
    ax.annotate(f'{round(height,2)}%', (x + width/2, y + height*1.02), ha='center', fontsize = 'x-large')
plt.show()