In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("../preprocessed/30j_maln_risk_preprocessed_ohe.csv")

df.head()

Unnamed: 0,low_PA,hosp_pastyr,sig_hlth_decln,func_diffhigh,low_edu,age_cat,mmorb_q3,memory_prob,mobility_lim,hlth_insur,...,poor_appetite,diff_eating,FSDAD,PROTDEN,maln_risk,RIAGENDR,RXDCOUNT,MHpro_pastyr,MH_Depressed,MH_lossofinterest
0,1.0,0.0,1.0,1.0,0.0,4,1,0.0,1.0,1.0,...,0.0,0.0,1.0,2.120103,0.0,1,5.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,4,0,0.0,1.0,1.0,...,0.0,0.0,1.0,2.26269,0.0,1,5.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0,1.0,3,1,0.0,2.0,1.0,...,0.0,0.0,1.0,1.209179,0.0,2,5.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,4,0,0.0,1.0,1.0,...,0.0,0.0,1.0,1.335815,0.0,1,3.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,3,1,0.0,1.0,1.0,...,0.0,0.0,1.0,3.448622,0.0,1,1.0,0.0,0.0,0.0


In [3]:
df.shape

(3722, 28)

In [4]:
df.isnull().sum().sum()

0

In [5]:
df

Unnamed: 0,low_PA,hosp_pastyr,sig_hlth_decln,func_diffhigh,low_edu,age_cat,mmorb_q3,memory_prob,mobility_lim,hlth_insur,...,poor_appetite,diff_eating,FSDAD,PROTDEN,maln_risk,RIAGENDR,RXDCOUNT,MHpro_pastyr,MH_Depressed,MH_lossofinterest
0,1.0,0.0,1.0,1.0,0.0,4,1,0.0,1.0,1.0,...,0.0,0.0,1.0,2.120103,0.0,1,5.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,4,0,0.0,1.0,1.0,...,0.0,0.0,1.0,2.262690,0.0,1,5.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0,1.0,3,1,0.0,2.0,1.0,...,0.0,0.0,1.0,1.209179,0.0,2,5.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,4,0,0.0,1.0,1.0,...,0.0,0.0,1.0,1.335815,0.0,1,3.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,3,1,0.0,1.0,1.0,...,0.0,0.0,1.0,3.448622,0.0,1,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3717,0.0,0.0,0.0,0.0,0.0,2,1,0.0,1.0,1.0,...,0.0,0.0,1.0,2.850099,0.0,1,1.0,0.0,0.0,1.0
3718,0.0,0.0,0.0,0.0,0.0,1,0,0.0,1.0,1.0,...,0.0,0.0,1.0,2.499464,0.0,1,7.0,0.0,1.0,0.0
3719,1.0,0.0,0.0,1.0,0.0,3,1,0.0,2.0,1.0,...,1.0,0.0,1.0,4.493410,0.0,2,5.0,1.0,0.0,0.0
3720,0.0,0.0,0.0,0.0,0.0,3,0,0.0,1.0,1.0,...,0.0,0.0,1.0,2.726736,0.0,1,4.0,0.0,0.0,0.0


In [6]:
df['maln_risk'].value_counts()

0.0    2871
1.0     490
2.0     361
Name: maln_risk, dtype: int64

In [7]:
# downsampling 
from imblearn.under_sampling import RandomUnderSampler

X_und, y_und = df.drop(columns='maln_risk'), df['maln_risk']
rus = RandomUnderSampler(random_state=42)
X_und, y_und = rus.fit_resample(X_und, y_und)

y_und.value_counts()

0.0    361
1.0    361
2.0    361
Name: maln_risk, dtype: int64

### ML modelling

In [8]:
from sklearn import svm, metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, KFold, train_test_split, cross_val_predict, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier

import warnings
from sklearn.exceptions import ConvergenceWarning

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [9]:
def estimate_acc(classifiers, X, y, stratified_k_fold=False,
                conv_to_series=False):
    X_loc, y_loc = X, y

    if stratified_k_fold:
        cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
    else:
        cv = KFold(n_splits=10, shuffle=True, random_state=7)
    results = pd.DataFrame(columns=['clf', 'acc', 'std', 'class0_acc', 'class1_acc', 'class_2acc'])
    
    for i, clf in enumerate(classifiers):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=ConvergenceWarning) # ignore warns from Logreg
            scores = cross_val_score(clf, X_loc, y_loc, scoring='accuracy', cv=cv)

            y_pred = cross_val_predict(clf, X_loc, y_loc, cv=cv)
            results.loc[i] = (clf, accuracy_score(y_loc, y_pred), np.std(scores),
                              accuracy_score(y_loc[y_loc == 0.0], y_pred[y_loc == 0.0]),
                              accuracy_score(y_loc[y_loc == 1.0], y_pred[y_loc == 1.0]),
                              accuracy_score(y_loc[y_loc == 2.0], y_pred[y_loc == 2.0])
                              )
        
    return results

In [10]:
classifiers = (LogisticRegression(multi_class='multinomial', max_iter=2000),
               svm.SVC(),
               RandomForestClassifier(),
               GaussianNB(),
               DecisionTreeClassifier(), 
               LinearDiscriminantAnalysis(),
               KNeighborsClassifier())

estimate_acc(classifiers, X_und, y_und, stratified_k_fold=True)

Unnamed: 0,clf,acc,std,class0_acc,class1_acc,class_2acc
0,"LogisticRegression(max_iter=2000, multi_class=...",0.423823,0.037366,0.526316,0.263158,0.481994
1,SVC(),0.369344,0.031057,0.548476,0.462604,0.096953
2,RandomForestClassifier(),0.418283,0.030072,0.421053,0.360111,0.473684
3,GaussianNB(),0.390582,0.024427,0.698061,0.127424,0.34626
4,DecisionTreeClassifier(),0.394275,0.059942,0.376731,0.373961,0.432133
5,LinearDiscriminantAnalysis(),0.420129,0.041238,0.520776,0.254848,0.484765
6,KNeighborsClassifier(),0.397969,0.033005,0.551247,0.357341,0.285319


### Hyperparameters tuning

In [11]:
grid_params = {'log_reg': {"C":np.logspace(-3,3,7), "penalty":["l2"]},
               'svm': {'C': [0.1, 1, 10, 100, 1000],  
                       'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
                       'kernel': ['rbf']},
               'rf': {'n_estimators': [200, 500], 'max_features': ['sqrt', 'log2'], 
                      'max_depth' : [4,5,6,7,8], 'criterion' :['gini', 'entropy']},
               'gnb': {'var_smoothing': np.logspace(0,-9, num=100)},
               'dt': {'max_features': ['sqrt', 'log2'], 'ccp_alpha': [0.1, .01, .001], 
                      'max_depth' : [5, 6, 7, 8, 9], 'criterion' :['gini', 'entropy']},
               'lda': {'solver': ['svd', 'lsqr', 'eigen']},
               'knn': {'n_neighbors': list(range(1, 31))}}

In [12]:
grid_params_vals = list(grid_params.values())

In [13]:
def estimate_acc_gridcv(classifiers, X, y, grid_params_vals):
    
    X_loc, y_loc = X, y
    
    X_train, X_test, y_train, y_test = train_test_split(X_loc, y_loc, test_size=.2)


    skf_grid_search = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
    
    results = pd.DataFrame(columns=['clf', 'mean_acc', 'mean_std', 'val_acc', 
                                    'val_class0', 'val_class1', 'val_class2'])
    
    feature_importances = pd.DataFrame(X.columns)
    
    for i, clf in enumerate(classifiers):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=ConvergenceWarning) # ignore warns from Logreg
            best_clf = GridSearchCV(estimator=clf, param_grid=grid_params_vals[i], 
                                    cv=skf_grid_search, verbose=True, n_jobs=-1)
            
            best_clf.fit(X_train, y_train)
            mean_test_acc, mean_test_std = (pd.DataFrame(best_clf.cv_results_)[pd.DataFrame(best_clf.cv_results_)['rank_test_score'] == 1][['mean_test_score', 'std_test_score']].iloc[0, 0], 
                                 pd.DataFrame(best_clf.cv_results_)[pd.DataFrame(best_clf.cv_results_)['rank_test_score'] == 1][['mean_test_score', 'std_test_score']].iloc[0, 1])
            
            y_pred = best_clf.predict(X_test)
            
            val_acc = accuracy_score(y_pred, y_test)
            
            class_0_val_score, class_1_val_score, class_2_val_score = (accuracy_score(y_test[y_test == 0.0], 
                                                                                      y_pred[y_test == 0.0]),
                                                                       accuracy_score(y_test[y_test == 1.0], 
                                                                                      y_pred[y_test == 1.0]),
                                                                       accuracy_score(y_test[y_test == 2.0],
                                                                                      y_pred[y_test == 2.0]))
            
                
            results.loc[i] = (clf, mean_test_acc, mean_test_std, val_acc, class_0_val_score, class_1_val_score, class_2_val_score)
            
            clf = best_clf.best_estimator_
            
            if hasattr(clf, 'feature_importances_'):
                importance = clf.feature_importances_
                feature_importances[clf.__class__.__name__] = importance
            elif hasattr(clf, 'coef_'):
                importance = np.abs(clf.coef_[0])
                feature_importances[clf.__class__.__name__] = importance
                
    
    return results, feature_importances

In [14]:
y_und.value_counts()

0.0    361
1.0    361
2.0    361
Name: maln_risk, dtype: int64

In [15]:
classifiers = (LogisticRegression(max_iter=2000),
               svm.SVC(),
               RandomForestClassifier(),
               GaussianNB(),
               DecisionTreeClassifier(),
               LinearDiscriminantAnalysis(),
               KNeighborsClassifier())

res, imps = estimate_acc_gridcv(classifiers, X_und, y_und, grid_params_vals)

Fitting 10 folds for each of 7 candidates, totalling 70 fits
Fitting 10 folds for each of 25 candidates, totalling 250 fits
Fitting 10 folds for each of 40 candidates, totalling 400 fits
Fitting 10 folds for each of 100 candidates, totalling 1000 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 3 candidates, totalling 30 fits
Fitting 10 folds for each of 30 candidates, totalling 300 fits


In [16]:
res

Unnamed: 0,clf,mean_acc,mean_std,val_acc,val_class0,val_class1,val_class2
0,LogisticRegression(max_iter=2000),0.424873,0.040562,0.460829,0.513158,0.304348,0.555556
1,SVC(),0.433053,0.046656,0.474654,0.565789,0.318841,0.527778
2,RandomForestClassifier(),0.436474,0.067549,0.465438,0.447368,0.26087,0.680556
3,GaussianNB(),0.414421,0.04434,0.456221,0.684211,0.144928,0.513889
4,DecisionTreeClassifier(),0.400682,0.054582,0.419355,0.486842,0.318841,0.444444
5,LinearDiscriminantAnalysis(),0.42371,0.053902,0.474654,0.526316,0.318841,0.569444
6,KNeighborsClassifier(),0.424873,0.062078,0.382488,0.460526,0.246377,0.430556


# Neural Networks

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_und, y_und, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Build the model
model = tf.keras.models.Sequential([
    tf.keras.layers.InputLayer(input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')  # Assuming binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

# Make predictions
predictions = model.predict(X_test)