In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

In [2]:
csv_path = os.path.join("..", "..", "data","LINKED_DATA", "TSR_EHR", "TSR_1_CLEANED.csv")
tsr_1 = pd.read_csv(csv_path)
tsr_1.head()

Unnamed: 0,height_nm,weight_nm,edu_id,pro_id,opc_id,ih_fl,ivtpamg_nm,hospitalised_time,nivtpa_id,nivtpa1_fl,...,nihs_6br_out,nihs_7_out,nihs_8_out,nihs_9_out,nihs_10_out,nihs_11_out,total_out,SexName,Age,mrs_tx_1
0,150.0,49.0,2,1,3,0,0.0,16.0,1,0,...,0,0,0,0,0,0,8,0,66.0,4
1,153.0,62.0,3,1,3,0,0.0,8.0,0,999,...,0,1,1,0,1,0,4,0,67.0,1
2,152.0,62.0,3,1,2,0,0.0,4.0,0,999,...,0,0,1,0,0,0,1,0,69.0,1
3,148.0,56.0,2,1,2,0,0.0,5.0,0,999,...,0,0,1,0,0,0,2,0,71.0,0
4,152.0,56.0,4,1,2,0,0.0,3.0,1,0,...,0,0,0,0,0,0,0,0,71.0,0


In [3]:
tsr_1_input = tsr_1.drop(["mrs_tx_1"], axis=1)
tsr_1_input[tsr_1_input == "N"] = 0
tsr_1_input[tsr_1_input == "Y"] = 1
tsr_1_input = tsr_1_input.astype("float64")
tsr_1_input = np.array(tsr_1_input.values)

# 6 classes

In [4]:
tsr_1_output = tsr_1.mrs_tx_1
tsr_1_output = tsr_1_output.astype("float64")
tsr_1_output = np.array(tsr_1_output.values)

## SVM

In [5]:
svc = CalibratedClassifierCV(LinearSVC(penalty = "l2", dual=False, loss = "squared_hinge", C = 1, multi_class = "ovr", 
                                       random_state = 19)) 
svc_scores = cross_val_score(svc,tsr_1_input,tsr_1_output,cv = 10, scoring='accuracy')
print(svc_scores)
print(svc_scores.mean(), svc_scores.std())



[0.53777778 0.60296296 0.60148148 0.5837037  0.59259259 0.59555556
 0.64592593 0.6162963  0.64444444 0.49851632]
0.5919257061215519 0.042684698077643066




In [6]:
svc.fit(tsr_1_input,tsr_1_output)
svc_predict =svc.predict_proba(tsr_1_input)
print(svc_predict)



[[2.94226221e-03 6.94550526e-02 2.04117201e-02 1.51821604e-01
  6.46611286e-01 1.08758076e-01]
 [1.49927560e-01 4.58113836e-01 2.58483116e-01 5.64449406e-02
  7.04024524e-02 6.62809536e-03]
 [1.61699064e-01 5.26369628e-01 1.90600135e-01 4.70856222e-02
  6.72523119e-02 6.99323950e-03]
 ...
 [5.74485032e-05 5.11739154e-03 1.14039250e-02 1.71036376e-02
  3.20906142e-02 9.34226983e-01]
 [5.25512810e-05 7.34149835e-03 2.09117323e-03 1.02760247e-01
  2.92274883e-01 5.95479647e-01]
 [3.29482781e-04 3.06614793e-02 1.28105606e-02 9.42863665e-02
  3.64832020e-01 4.97080091e-01]]




In [7]:
svc_pred = cross_val_predict(svc,tsr_1_input,tsr_1_output,cv = 10)
confusion_matrix(tsr_1_output, svc_pred)



array([[ 274,  219,   23,   30,   10,    2],
       [ 115,  881,  166,   58,   45,    4],
       [  20,  339,  345,  109,  112,    7],
       [   7,   64,  156,  224,  377,   27],
       [   1,   27,   39,  116,  980,  337],
       [   1,    8,   12,   15,  308, 1291]], dtype=int64)

## RF

In [8]:
rf = CalibratedClassifierCV(RandomForestClassifier(criterion = "gini", n_estimators = 15, bootstrap=True, random_state = 19,
                                                  max_features = 0.8)) 
rf_scores = cross_val_score(rf,tsr_1_input,tsr_1_output,cv = 10, scoring='accuracy')
print(rf_scores)
print(rf_scores.mean(), rf_scores.std())

[0.69481481 0.76592593 0.72296296 0.7037037  0.74074074 0.75555556
 0.80148148 0.79407407 0.78222222 0.64094955]
0.7402431036377622 0.047813339288996774


In [9]:
rf.fit(tsr_1_input,tsr_1_output)
rf_predict =rf.predict_proba(tsr_1_input)
print(rf_predict)

[[0.02161392 0.03984294 0.03374814 0.06501049 0.80230888 0.03747563]
 [0.03740463 0.82121215 0.04082079 0.03206861 0.04079358 0.02770024]
 [0.02284936 0.84885682 0.03493287 0.03083941 0.03628596 0.02623558]
 ...
 [0.0197908  0.03658606 0.03069316 0.02950689 0.03468654 0.84873654]
 [0.0197908  0.03658606 0.03069316 0.02950689 0.03468654 0.84873654]
 [0.02123273 0.03912224 0.03308034 0.03169159 0.58416506 0.29070806]]


In [10]:
rf_pred = cross_val_predict(rf,tsr_1_input,tsr_1_output,cv = 10)
confusion_matrix(tsr_1_output, rf_pred)

array([[ 348,  149,   41,   15,    4,    1],
       [  79,  848,  271,   47,   17,    7],
       [   5,   92,  646,  155,   31,    3],
       [   0,   15,   63,  583,  183,   11],
       [   2,   13,   21,  104, 1204,  156],
       [   1,    5,   10,   17,  235, 1367]], dtype=int64)

## XGBoost

In [None]:
xgb = CalibratedClassifierCV(XGBClassifier(booster = "gbtree", objective="multi:softprob", eval_metric = "auc", 
                                            use_label_encoder = False, random_state = 19)) 
xgb_scores = cross_val_score(xgb,tsr_1_input,tsr_1_output,cv = 10)
print(xgb_scores)
print(xgb_scores.mean(), xgb_scores.std())

In [None]:
xgb.fit(tsr_1_input,tsr_1_output)
xgb_predict =xgb.predict_proba(tsr_1_input)
print(xgb_predict)

In [None]:
xgb_pred = cross_val_predict(xgb,tsr_1_input,tsr_1_output,cv = 10)
confusion_matrix(tsr_1_output, xgb_pred)

# 2 classes

In [None]:
tsr_1_output[(tsr_1_output == 0)|(tsr_1_output == 1)|(tsr_1_output == 2)] = 0
tsr_1_output[(tsr_1_output == 3)|(tsr_1_output == 4)|(tsr_1_output == 5)] = 1

## SVM

In [None]:
svc2 = CalibratedClassifierCV(LinearSVC(penalty = "l2", dual=False, loss = "squared_hinge", C = 1, multi_class = "ovr", 
                                       random_state = 19)) 
svc_scores2 = cross_val_score(svc2,tsr_1_input,tsr_1_output,cv = 10, scoring='accuracy')
print(svc_scores2)
print(svc_scores2.mean(), svc_scores2.std())

In [None]:
svc2.fit(tsr_1_input,tsr_1_output)
svc_predict2 =svc2.predict_proba(tsr_1_input)
print(svc_predict2)

In [None]:
svc_pred2 = cross_val_predict(svc2,tsr_1_input,tsr_1_output,cv = 10)
confusion_matrix(tsr_1_output, svc_pred2)

## RF

In [None]:
rf2 = CalibratedClassifierCV(RandomForestClassifier(criterion = "gini", n_estimators = 15, bootstrap=True, random_state = 19,
                                                  max_features = 0.8)) 
rf_scores2 = cross_val_score(rf2,tsr_1_input,tsr_1_output,cv = 10, scoring='accuracy')
print(rf_scores2)
print(rf_scores2.mean(), rf_scores2.std())

In [None]:
rf2.fit(tsr_1_input,tsr_1_output)
rf_predict2 =rf2.predict_proba(tsr_1_input)
print(rf_predict2)

In [None]:
rf_pred2 = cross_val_predict(rf2,tsr_1_input,tsr_1_output,cv = 10)
confusion_matrix(tsr_1_output, rf_pred2)

## XGBoost

In [None]:
xgb2 = CalibratedClassifierCV(XGBClassifier(booster = "gbtree", objective="binary:logistic", eval_metric = "auc", 
                                            use_label_encoder = False, random_state = 19)) 
xgb_scores2 = cross_val_score(xgb2,tsr_1_input,tsr_1_output,cv = 10)
print(xgb_scores2)
print(xgb_scores2.mean(), xgb_scores2.std())

In [None]:
xgb2.fit(tsr_1_input,tsr_1_output)
xgb_predict2 =xgb2.predict_proba(tsr_1_input)
print(xgb_predict2)

In [None]:
xgb_pred2 = cross_val_predict(xgb2,tsr_1_input,tsr_1_output,cv = 10)
confusion_matrix(tsr_1_output, xgb_pred2)

# Summary

## Mean & Std

In [None]:
svc_mean = np.array([svc_scores.mean(), svc_scores.std(), svc_scores[0], svc_scores[1], svc_scores[2], svc_scores[3],
                     svc_scores[4], svc_scores[5], svc_scores[6], svc_scores[7], svc_scores[8], svc_scores[9]])
rf_mean = np.array([rf_scores.mean(), rf_scores.std(), rf_scores[0], rf_scores[1], rf_scores[2], rf_scores[3],rf_scores[4], 
                    rf_scores[5], rf_scores[6], rf_scores[7], rf_scores[8], rf_scores[9]])
xgb_mean = np.array([xgb_scores.mean(), xgb_scores.std(), xgb_scores[0], xgb_scores[1], xgb_scores[2], xgb_scores[3],
                     xgb_scores[4], xgb_scores[5], xgb_scores[6], xgb_scores[7], xgb_scores[8], xgb_scores[9]])
svc_mean2 = np.array([svc_scores2.mean(), svc_scores2.std(), svc_scores2[0], svc_scores2[1], svc_scores2[2], svc_scores2[3],
                     svc_scores2[4], svc_scores2[5], svc_scores2[6], svc_scores2[7], svc_scores2[8], svc_scores2[9]])
rf_mean2 = np.array([rf_scores2.mean(), rf_scores2.std(), rf_scores2[0], rf_scores2[1], rf_scores2[2], rf_scores2[3],rf_scores2[4], 
                    rf_scores2[5], rf_scores2[6], rf_scores2[7], rf_scores2[8], rf_scores2[9]])
xgb_mean2 = np.array([xgb_scores2.mean(), xgb_scores2.std(), xgb_scores2[0], xgb_scores2[1], xgb_scores2[2], xgb_scores2[3],
                     xgb_scores2[4], xgb_scores2[5], xgb_scores2[6], xgb_scores2[7], xgb_scores2[8], xgb_scores2[9]])

In [None]:
tsr_1_mean = pd.DataFrame([svc_mean, rf_mean, xgb_mean, svc_mean2, rf_mean2, xgb_mean2]).T
tsr_1_mean.index = ["Mean", "Std", "mean_1", "mean_2", "mean_3", "mean_4", "mean_5", "mean_6", "mean_7", "mean_8", "mean_9", "mean_10"]
tsr_1_mean.columns = ["svc", 'rf', 'xgb', 'svc2', 'rf2', 'xgb2']

In [None]:
csv_save = os.path.join(".", "tsr_1_mean.csv")
tsr_1_mean.to_csv(csv_save, index = True)

## Predicted Probability

In [None]:
svc_predict_0 = svc_predict[:, 0]
svc_predict_1 = svc_predict[:, 1]
svc_predict_2 = svc_predict[:, 2]
svc_predict_3 = svc_predict[:, 3]
svc_predict_4 = svc_predict[:, 4]
svc_predict_5 = svc_predict[:, 5]

rf_predict_0 = rf_predict[:, 0]
rf_predict_1 = rf_predict[:, 1]
rf_predict_2 = rf_predict[:, 2]
rf_predict_3 = rf_predict[:, 3]
rf_predict_4 = rf_predict[:, 4]
rf_predict_5 = rf_predict[:, 5]

xgb_predict_0 = xgb_predict[:, 0]
xgb_predict_1 = xgb_predict[:, 1]
xgb_predict_2 = xgb_predict[:, 2]
xgb_predict_3 = xgb_predict[:, 3]
xgb_predict_4 = xgb_predict[:, 4]
xgb_predict_5 = xgb_predict[:, 5]

svc_predict2_0 = svc_predict2[:, 0]
svc_predict2_1 = svc_predict2[:, 1]

rf_predict2_0 = rf_predict2[:, 0]
rf_predict2_1 = rf_predict2[:, 1]

xgb_predict2_0 = xgb_predict2[:, 0]
xgb_predict2_1 = xgb_predict2[:, 1]

In [None]:
tsr_1_pred_prob = pd.DataFrame([svc_predict_0, svc_predict_1, svc_predict_2, svc_predict_3, svc_predict_4, svc_predict_5, 
                               rf_predict_0, rf_predict_1, rf_predict_2, rf_predict_3, rf_predict_4, rf_predict_5,
                               xgb_predict_0, xgb_predict_1, xgb_predict_2, xgb_predict_3, xgb_predict_4, xgb_predict_5,
                               svc_predict2_0, svc_predict2_1, rf_predict2_0, rf_predict2_1, xgb_predict2_0, xgb_predict2_1]).T
tsr_1_pred_prob.columns = ["svc_predict_0", "svc_predict_1", "svc_predict_2", "svc_predict_3", "svc_predict_4", "svc_predict_5", 
                           "rf_predict_0", "rf_predict_1", "rf_predict_2", "rf_predict_3", "rf_predict_4", "rf_predict_5",
                           "xgb_predict_0", "xgb_predict_1", "xgb_predict_2", 'xgb_predict_3', "xgb_predict_4", "xgb_predict_5",
                           "svc_predict2_0", "svc_predict2_1", "rf_predict2_0", "rf_predict2_1", "xgb_predict2_0", "xgb_predict2_1"]

In [None]:
csv_save2 = os.path.join(".", "tsr_1_pred_prob.csv")
tsr_1_pred_prob.to_csv(csv_save2, index = False)