## **Dependencies**

In [13]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC 
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, DotProduct
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

from imblearn.combine import SMOTETomek # because our data is unbalanced

random_state = 42
np.random.seed(random_state)


## **Data Processing**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir('/content/drive/MyDrive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
dat_nurse = pd.read_csv('./prepared_1061.csv')

In [None]:
# drop na (outcome na)
dat_nurse = dat_nurse[dat_nurse['outcome'].notnull()]
# train-test-split
train_dat, test_dat = train_test_split(dat_nurse, test_size = 0.15,  stratify=dat_nurse['outcome'], random_state=random_state)

In [None]:
# organizing data
outcome_col = train_dat.columns[3]
text_col = train_dat.columns[-8:]

train_dat_outcome, test_dat_outcome = train_dat.loc[:, outcome_col], test_dat.loc[:, outcome_col]
train_dat_text, test_dat_text = train_dat.loc[:, text_col], test_dat.loc[:, text_col]

In [None]:
train_y = train_dat_outcome.to_numpy()
test_y = test_dat_outcome.to_numpy()

train/val/test shape:  (901, 7926) (159, 7926)


## **Modelling**

In [15]:
params = {
    'LogisticRegressionCV' : {
        'penalty' : ['l2']
    },
    'SVC' : {
        'kernel' : ['linear', 'poly', 'rbf'],
        'degree' : [3, 4, 5]
    },
    'GaussianProcessClassifier' : { # https://stackoverflow.com/questions/62755556/gaussian-process-regression-hyparameter-optimisation-using-python-grid-search
            "kernel": [RBF(l) for l in np.logspace(-1, 1, 20)]
        },
    'XGBClassifier' : { # got from some towardsdatascience article i forgot
            'gamma': [0,0.1,0.2,0.4,0.8,1.6,3.2,6.4,12.8,25.6,51.2,102.4, 200],
            'learning_rate': [0.01, 0.03, 0.06, 0.1, 0.15, 0.2, 0.25, 0.300000012, 0.4, 0.5, 0.6, 0.7],
            'max_depth': [5,6,7,8,9,10,11,12,13,14],
            'n_estimators': [50,65,80,100,115,130,150],
            'reg_alpha': [0,0.1,0.2,0.4,0.8,1.6,3.2,6.4,12.8,25.6,51.2,102.4,200],
            'reg_lambda': [0,0.1,0.2,0.4,0.8,1.6,3.2,6.4,12.8,25.6,51.2,102.4,200],
            'eval_metric': ['auc'],
            'objective': ['binary:logistic']
    },
    'RandomForestClassifier': {
        'n_estimators': [100],
        'min_samples_split': [2],
        'min_samples_leaf': [2],
        'max_depth': [30],
        'class_weight': ['balanced'],
        'bootstrap': [False]
    }
}

In [16]:
def fit(x, y, parameters, n_iter, random_state = 42):
    """
    Randomized hyperparameter tuning for fitting models

    parameters:
        n_iter -- number of parameter settings sampled
    """
    print("Training LogisticRegressionCV:")
    lr = LogisticRegressionCV(max_iter = int(1e6))
    rs_lr = RandomizedSearchCV(lr, parameters['LogisticRegressionCV'], n_iter = n_iter, random_state=random_state)
    rs_lr.fit(x, y) 

    print("Training SVC:")
    svc = SVC(probability = True)
    rs_svc = RandomizedSearchCV(svc, parameters['SVC'], n_iter = n_iter, random_state=random_state)
    rs_svc.fit(x, y)

    print("Training GaussianProcessClassifier:")
    gp = GaussianProcessClassifier()
    rs_gp = RandomizedSearchCV(gp, parameters['GaussianProcessClassifier'], n_iter = n_iter, random_state=random_state)
    rs_gp.fit(x, y)

    print("Training XGBClassifier:")
    xgb = XGBClassifier()
    rs_xgb = RandomizedSearchCV(xgb, parameters['XGBClassifier'], n_iter = n_iter, random_state=random_state)
    rs_xgb.fit(x, y) 

    print("Training Random Forest:")
    rf = RandomForestClassifier()
    rs_rf = RandomizedSearchCV(rf, parameters['RandomForestClassifier'], n_iter = n_iter, random_state=random_state)
    rs_rf.fit(x, y) 

    models = [rs_lr, rs_svc, rs_gp, rs_xgb, rs_rf]
    return models 


In [17]:
def evaluate(x, y, models):
    scores = {
        'ACC': [], 
        'AUC': [],
        'CMAT': []
    }
    for model in models:
        y_pred = model.predict(x)
        y_proba = model.predict_proba(x)[:, 1]
        scores['ACC'].append(accuracy_score(y, y_pred))
        scores['AUC'].append(roc_auc_score(y, y_proba))
        scores['CMAT'].append(confusion_matrix(y, y_pred))
        
    return scores 

In [None]:
evaluate(test_x, test_y, models)

{'ACC': [0.84375, 0.8541666666666666, 0.8541666666666666, 0.84375],
 'AUC': [0.7020905923344947,
  0.7168989547038327,
  0.7134146341463414,
  0.6114982578397212],
 'CMAT': [array([[81,  1],
         [14,  0]]),
  array([[82,  0],
         [14,  0]]),
  array([[82,  0],
         [14,  0]]),
  array([[81,  1],
         [14,  0]])]}

## **word2vec (w/o imbalance handling)**

In [18]:
import pandas as pd
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

In [19]:
train_processed_text = train_dat['TEXT'].apply(simple_preprocess)

In [20]:
word2vec_model = Word2Vec(sentences = train_processed_text, vector_size = 200, window = 5, min_count = 1)

In [21]:
def document_vector(doc, model):
    doc_vector = []
    num_words = 0
    for word in doc:
        if word in model.wv:
            doc_vector.append(model.wv[word])
            num_words += 1
    if num_words > 0:
        doc_vector = np.mean(doc_vector, axis=0)
    else:
        doc_vector = np.zeros(model.vector_size)
    return doc_vector

In [22]:
train_word2vec_repr = train_processed_text.apply(lambda x: document_vector(x, word2vec_model))

In [23]:
train_repr = np.vstack(train_word2vec_repr)

In [None]:
word2vec_models = fit(train_repr, train_y, params, n_iter = 20, random_state = random_state)

In [25]:
for model in word2vec_models:
    print(model.best_estimator_, model.best_params_, model.best_score_)

LogisticRegressionCV(max_iter=1000000) {'penalty': 'l2'} 0.8690362185389808
SVC(kernel='linear', probability=True) {'kernel': 'linear', 'degree': 3} 0.8690362185389808
GaussianProcessClassifier(kernel=RBF(length_scale=0.1)) {'kernel': RBF(length_scale=0.1)} 0.8690362185389808
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='auc', feature_types=None,
              gamma=0, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.6, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=8, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=50, n_jobs=None, num_parallel_tree=None,
              predictor=None, ra

In [26]:
test_processed_text = test_dat['TEXT'].apply(simple_preprocess)
test_word2vec_repr = test_processed_text.apply(lambda x: document_vector(x, word2vec_model))
test_repr = np.vstack(test_word2vec_repr)

In [27]:
test_repr.shape

(159, 200)

In [28]:
evaluate(test_repr, test_y, word2vec_models)

{'ACC': [0.8679245283018868,
  0.8679245283018868,
  0.8679245283018868,
  0.8679245283018868,
  0.8742138364779874],
 'AUC': [0.5013802622498275,
  0.5100069013112492,
  0.4998274672187716,
  0.5833333333333334,
  0.5628019323671497],
 'CMAT': [array([[138,   0],
         [ 21,   0]]),
  array([[138,   0],
         [ 21,   0]]),
  array([[138,   0],
         [ 21,   0]]),
  array([[138,   0],
         [ 21,   0]]),
  array([[138,   0],
         [ 20,   1]])]}

## **word2vec (w/ Imbalance Handling)**

In [29]:
import numpy as np
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [30]:
under_sampling_strategy = 0.2
over_sampling_strategy = 1.0

resampling_pipeline = Pipeline([
    ('under', RandomUnderSampler(sampling_strategy=under_sampling_strategy)),
    ('over', RandomOverSampler(sampling_strategy=over_sampling_strategy))
])

In [31]:
train_repr_resamp, train_y_resamp = resampling_pipeline.fit_resample(np.array(train_repr), np.array(train_y))

In [None]:
word2vec_models_oversamp = fit(train_repr_resamp, train_y_resamp, params, n_iter = 30, random_state = random_state)

In [35]:
evaluate(test_repr, test_y, word2vec_models_oversamp)

{'ACC': [0.5849056603773585,
  0.42138364779874216,
  0.8679245283018868,
  0.8050314465408805,
  0.8553459119496856],
 'AUC': [0.5255348516218081,
  0.5227743271221532,
  0.5,
  0.5800552104899931,
  0.5407177363699103],
 'CMAT': [array([[88, 50],
         [16,  5]]),
  array([[53, 85],
         [ 7, 14]]),
  array([[138,   0],
         [ 21,   0]]),
  array([[126,  12],
         [ 19,   2]]),
  array([[136,   2],
         [ 21,   0]])]}

### try penalization

In [36]:
from sklearn.metrics import make_scorer, confusion_matrix

def custom_scorer(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()
    
    # Adjust the weight according to your preference
    false_negative_weight = 5
    
    score = tp - false_negative_weight * fn
    return score
  
custom_scorer = make_scorer(custom_scorer, greater_is_better=True)

In [37]:
def fit_penalized(x, y, parameters, n_iter, random_state = 42):
    print("Training GaussianProcessClassifier:")
    gp = GaussianProcessClassifier()
    rs_gp = RandomizedSearchCV(gp, parameters['GaussianProcessClassifier'], n_iter = n_iter, random_state=random_state, scoring=custom_scorer)
    rs_gp.fit(x, y)

    print("Training XGBClassifier:")
    xgb = XGBClassifier()
    parameters['XGBClassifier']['scale_pos_weight'] = [5] 
    rs_xgb = RandomizedSearchCV(xgb, parameters['XGBClassifier'], n_iter = n_iter, random_state=random_state)
    rs_xgb.fit(x, y) 

    models = [rs_gp, rs_xgb]
    return models 

In [None]:
word2vec_models_penal = fit_penalized(train_repr_resamp, train_y_resamp, params, n_iter = 30, random_state = random_state)

In [40]:
evaluate(test_repr, test_y, word2vec_models_penal)

{'ACC': [0.8679245283018868, 0.7672955974842768],
 'AUC': [0.5, 0.48895790200138023],
 'CMAT': [array([[138,   0],
         [ 21,   0]]),
  array([[121,  17],
         [ 20,   1]])]}