In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

In [2]:
csv_path = os.path.join("..", "..", "data","LINKED_DATA", "TSR_EHR", "TSR_3_CLEANED.csv")
tsr_3 = pd.read_csv(csv_path)
tsr_3.head()

Unnamed: 0,height_nm,weight_nm,edu_id,pro_id,opc_id,ih_fl,ivtpamg_nm,hospitalised_time,nivtpa_id,nivtpa1_fl,...,nihs_7_out,nihs_8_out,nihs_9_out,nihs_10_out,nihs_11_out,total_out,SexName,Age,mrs_tx_1,mrs_tx_3
0,153.0,62.0,3,1,3,0,0.0,8.0,0,999,...,1,1,0,1,0,4,0,67.0,1,1
1,152.0,62.0,3,1,2,0,0.0,4.0,0,999,...,0,1,0,0,0,1,0,69.0,1,0
2,148.0,56.0,2,999,2,0,0.0,5.0,0,999,...,0,1,0,0,0,2,0,71.0,0,0
3,152.0,56.0,4,1,2,0,0.0,3.0,1,0,...,0,0,0,0,0,0,0,71.0,0,0
4,160.0,60.0,2,1,3,0,0.0,4.0,0,999,...,0,0,0,0,0,4,0,62.0,3,3


In [3]:
tsr_3_input = tsr_3.drop(["mrs_tx_3"], axis=1)
tsr_3_input[tsr_3_input == "N"] = 0
tsr_3_input[tsr_3_input == "Y"] = 1
tsr_3_input = tsr_3_input.astype("float64")
tsr_3_input = np.array(tsr_3_input.values)

tsr_3_input_nomrs = tsr_3.drop(["mrs_tx_3", "mrs_tx_1"], axis=1)
tsr_3_input_nomrs[tsr_3_input_nomrs == "N"] = 0
tsr_3_input_nomrs[tsr_3_input_nomrs == "Y"] = 1
tsr_3_input_nomrs = tsr_3_input_nomrs.astype("float64")
tsr_3_input_nomrs = np.array(tsr_3_input_nomrs.values)

# 6 classes

In [4]:
tsr_3_output = tsr_3.mrs_tx_1
tsr_3_output = tsr_3_output.astype("float64")
tsr_3_output = np.array(tsr_3_output.values)

## SVM

In [5]:
svr = LinearSVR(epsilon = 0, dual=False, loss = "squared_epsilon_insensitive", C = 1, random_state = 19)
svr_scores = cross_val_score(svr,tsr_3_input,tsr_3_output,cv = 10, scoring='r2')
print(svr_scores)
print("Mean of R^2:", svr_scores.mean())
print("Std of R^2:", svr_scores.std())

[0.93599297 0.88191515 0.98220603 0.94713455 0.97598876 0.9974574
 0.99732306 0.97888391 0.99782017 0.95298329]
Mean of R^2: 0.9647705287103797
Std of R^2: 0.03463024653217385


In [6]:
acc_svr = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_3_input, tsr_3_output, test_size=0.3, random_state=i)
    svr.fit(X_train,y_train)
    svr_predict = svr.predict(X_test)
    print(svr_predict)
    svr_predict = np.round(svr_predict)
    print(svr_predict)
    accuracy = (y_test == svr_predict).sum() / len(svr_predict)
    acc_svr.append(accuracy)

print("Accuracy of RF:", acc_svr)
print("Mean of Accuracy of RF:", sum(acc_svr)/10)

[1.81219831 4.16162267 4.09291963 ... 4.94111497 2.82881127 0.72678814]
[2. 4. 4. ... 5. 3. 1.]
[3.38338982 1.0594635  5.04025154 ... 4.84764864 1.00523699 4.92438384]
[3. 1. 5. ... 5. 1. 5.]
[3.02866767 2.95381755 1.00970974 ... 2.98747512 4.01255912 4.79703189]
[3. 3. 1. ... 3. 4. 5.]
[5.07386239 1.09319273 0.07159221 ... 1.03178868 0.98747465 3.37266104]
[5. 1. 0. ... 1. 1. 3.]
[4.06904158 0.21523287 1.01682325 ... 1.91402104 0.98058005 0.81341916]
[4. 0. 1. ... 2. 1. 1.]
[1.0133584  1.01218503 1.98074178 ... 5.03213721 5.00022239 4.02010366]
[1. 1. 2. ... 5. 5. 4.]
[2.19067028 2.90904776 2.11010689 ... 4.12780531 4.01344941 0.35555861]
[2. 3. 2. ... 4. 4. 0.]
[1.76763495 4.06612526 2.63206654 ... 3.23125624 3.7764842  1.35613802]
[2. 4. 3. ... 3. 4. 1.]
[3.03144217 3.02794382 3.09843735 ... 0.9556929  2.11406468 1.2449477 ]
[3. 3. 3. ... 1. 2. 1.]
[0.93712542 4.03116493 0.09213964 ... 0.90258993 5.06425417 1.11036969]
[1. 4. 0. ... 1. 5. 1.]
Accuracy of RF: [0.9340813464235624, 0.8

In [7]:
svr_pred = cross_val_predict(svr,tsr_3_input,tsr_3_output,cv = 10)
svr_pred = np.round(svr_pred)
confusion_matrix(tsr_3_output, svr_pred)

array([[ 362,  135,    2,    0,    1,    0,    0],
       [   0, 1079,   63,    2,    0,    0,    0],
       [   0,   22,  718,   31,    0,    0,    0],
       [   0,    0,   30,  620,   17,    0,    0],
       [   0,    0,    1,   69,  876,   13,    0],
       [   0,    0,    1,    2,   60,  645,    2],
       [   0,    0,    0,    0,    0,    0,    0]], dtype=int64)

In [8]:
svr_pred[svr_pred == -1] = 0
confusion_matrix(tsr_3_output, svr_pred)

array([[ 362,  135,    2,    0,    1,    0,    0],
       [   0, 1079,   63,    2,    0,    0,    0],
       [   0,   22,  718,   31,    0,    0,    0],
       [   0,    0,   30,  620,   17,    0,    0],
       [   0,    0,    1,   69,  876,   13,    0],
       [   0,    0,    1,    2,   60,  645,    2],
       [   0,    0,    0,    0,    0,    0,    0]], dtype=int64)

In [9]:
svr1 = LinearSVR(epsilon = 0, dual=False, loss = "squared_epsilon_insensitive", C = 1, random_state = 19)
svr_scores1 = cross_val_score(svr1,tsr_3_input_nomrs,tsr_3_output,cv = 10, scoring='r2')
print(svr_scores1)
print("Mean of R^2:", svr_scores1.mean())
print("Std of R^2:", svr_scores1.std())

[0.73874921 0.78776362 0.82635448 0.79226118 0.82764897 0.8086996
 0.8045052  0.76058387 0.82346778 0.63811033]
Mean of R^2: 0.780814423056179
Std of R^2: 0.05490724101725365


In [10]:
acc_svr1 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_3_input_nomrs, tsr_3_output, test_size=0.3, random_state=i)
    svr1.fit(X_train,y_train)
    svr1_predict = svr1.predict(X_test)
    print(svr1_predict)
    svr1_predict = np.round(svr1_predict)
    print(svr1_predict)
    accuracy = (y_test == svr1_predict).sum() / len(svr1_predict)
    acc_svr1.append(accuracy)

print("Accuracy of RF:", acc_svr1)
print("Mean of Accuracy of RF:", sum(acc_svr1)/10)

[1.1735284  4.36123954 4.6735321  ... 4.99675659 2.65038567 2.01561096]
[1. 4. 5. ... 5. 3. 2.]
[3.74612928 1.07237627 5.11016476 ... 4.88099336 1.12759842 4.87281336]
[4. 1. 5. ... 5. 1. 5.]
[2.65697394 2.62152103 0.70418455 ... 2.36460611 4.0067122  3.80602598]
[3. 3. 1. ... 2. 4. 4.]
[4.99436057 1.06377284 0.64417782 ... 1.20042081 0.9595066  4.83344397]
[5. 1. 1. ... 1. 1. 5.]
[4.04909962 1.04322479 1.15638136 ... 1.14814745 0.90543619 0.63120805]
[4. 1. 1. ... 1. 1. 1.]
[1.83696534 0.7875255  3.00716647 ... 4.38843787 4.56990554 2.5614561 ]
[2. 1. 3. ... 4. 5. 3.]
[2.7083566  2.26172224 2.05001075 ... 4.11588021 4.54199933 1.98515245]
[3. 2. 2. ... 4. 5. 2.]
[2.5422221  4.21617348 4.6414552  ... 3.38468121 3.40469313 2.87369442]
[3. 4. 5. ... 3. 3. 3.]
[3.06512465 3.19830206 3.57080404 ... 0.93972824 1.83106507 2.02166228]
[3. 3. 4. ... 1. 2. 2.]
[0.76225063 4.38814067 0.74780256 ... 0.71741721 5.07668247 1.59847951]
[1. 4. 1. ... 1. 5. 2.]
Accuracy of RF: [0.5371669004207573, 0.5

In [11]:
svr1.fit(tsr_3_input_nomrs,tsr_3_output)
svr_predict1 =svr1.predict(tsr_3_input_nomrs)
print(svr_predict1)
svr_predict1 = np.round(svr_predict1)
print(svr_predict1)
print("Accuracy of RF:", (tsr_3_output == svr_predict1).sum() / len(svr_predict1))

[1.17417686 0.94851839 0.84817572 ... 5.3016844  4.39889774 4.00838992]
[1. 1. 1. ... 5. 4. 4.]
Accuracy of RF: 0.5611450221006103


In [12]:
svr_pred1 = cross_val_predict(svr1,tsr_3_input_nomrs,tsr_3_output,cv = 10)
svr_pred1 = np.round(svr_pred1)
confusion_matrix(tsr_3_output, svr_pred1)

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  2,   0,  19, 407,  59,  12,   0,   1,   0],
       [  0,   0,   3, 879, 217,  33,  10,   2,   0],
       [  1,   1,   0, 298, 363,  94,  13,   1,   0],
       [  0,   0,   0,  24, 194, 377,  64,   8,   0],
       [  0,   0,   1,   8,  28, 247, 543, 132,   0],
       [  0,   0,   2,   4,   5,  25, 196, 459,  19],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0]], dtype=int64)

In [13]:
svr_pred1[svr_pred1 == -1] = 0
svr_pred1[svr_pred1 == 6] = 5
confusion_matrix(tsr_3_output, svr_pred1)

array([[  0,   0,   0,   0,   0,   0,   0],
       [  2,  19, 407,  59,  12,   0,   1],
       [  0,   3, 879, 217,  33,  10,   2],
       [  1,   1, 298, 363,  94,  13,   1],
       [  0,   0,  24, 194, 377,  64,   8],
       [  0,   1,   8,  28, 247, 543, 132],
       [  0,   2,   4,   5,  25, 196, 478]], dtype=int64)

## RF

In [14]:
rfr = RandomForestRegressor(criterion = "mse", n_estimators = 15, bootstrap=True, random_state = 19,max_features = 0.8)
rfr_scores = cross_val_score(rfr,tsr_3_input,tsr_3_output,cv = 10, scoring='r2')
print(rfr_scores)
print("Mean of R^2:", rfr_scores.mean())
print("Std of R^2:", rfr_scores.std())

[0.9999604  0.99997409 0.99998647 0.99999664 0.99998574 0.99998577
 0.9999849  0.99998505 0.99999245 0.99997482]
Mean of R^2: 0.9999826326079451
Std of R^2: 9.827182229605757e-06


In [15]:
acc_rfr = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_3_input, tsr_3_output, test_size=0.3, random_state=i)
    rfr.fit(X_train,y_train)
    rfr_predict = rfr.predict(X_test)
    print(rfr_predict)
    rfr_predict = np.round(rfr_predict)
    print(rfr_predict)
    accuracy = (y_test == rfr_predict).sum() / len(rfr_predict)
    acc_rfr.append(accuracy)

print("Accuracy of RF:", acc_rfr)
print("Mean of Accuracy of RF:", sum(acc_rfr)/10)

[2. 4. 4. ... 5. 3. 0.]
[2. 4. 4. ... 5. 3. 0.]
[3. 1. 5. ... 5. 1. 5.]
[3. 1. 5. ... 5. 1. 5.]
[3.         3.         1.         ... 3.06666667 4.         5.        ]
[3. 3. 1. ... 3. 4. 5.]
[5. 1. 0. ... 1. 1. 3.]
[5. 1. 0. ... 1. 1. 3.]
[4. 0. 1. ... 2. 1. 1.]
[4. 0. 1. ... 2. 1. 1.]
[1. 1. 2. ... 5. 5. 4.]
[1. 1. 2. ... 5. 5. 4.]
[2. 3. 2. ... 4. 4. 0.]
[2. 3. 2. ... 4. 4. 0.]
[1. 4. 1. ... 3. 4. 0.]
[1. 4. 1. ... 3. 4. 0.]
[3. 3. 3. ... 1. 2. 1.]
[3. 3. 3. ... 1. 2. 1.]
[1. 4. 0. ... 1. 5. 1.]
[1. 4. 0. ... 1. 5. 1.]
Accuracy of RF: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
Mean of Accuracy of RF: 1.0


In [16]:
rfr_pred = cross_val_predict(rfr,tsr_3_input,tsr_3_output,cv = 10)
rfr_pred = np.round(rfr_pred)
confusion_matrix(tsr_3_output, rfr_pred)

array([[ 500,    0,    0,    0,    0,    0],
       [   0, 1144,    0,    0,    0,    0],
       [   0,    0,  771,    0,    0,    0],
       [   0,    0,    0,  667,    0,    0],
       [   0,    0,    0,    0,  959,    0],
       [   0,    0,    0,    0,    0,  710]], dtype=int64)

In [17]:
rfr1 = RandomForestRegressor(criterion = "mse", n_estimators = 15, bootstrap=True, random_state = 19,max_features = 0.8)
rfr_scores1 = cross_val_score(rfr1,tsr_3_input_nomrs,tsr_3_output,cv = 10, scoring='r2')
print(rfr_scores1)
print("Mean of R^2:", rfr_scores1.mean())
print("Std of R^2:", rfr_scores1.std())

[0.79848646 0.83124349 0.86398329 0.8160926  0.84065532 0.85591027
 0.8449297  0.7813848  0.89386159 0.81143728]
Mean of R^2: 0.8337984798534303
Std of R^2: 0.03161128500485248


In [18]:
acc_rfr1 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_3_input_nomrs, tsr_3_output, test_size=0.3, random_state=i)
    rfr1.fit(X_train,y_train)
    rfr1_predict = rfr1.predict(X_test)
    print(rfr1_predict)
    rfr1_predict = np.round(rfr1_predict)
    print(rfr1_predict)
    accuracy = (y_test == rfr1_predict).sum() / len(rfr1_predict)
    acc_rfr1.append(accuracy)

print("Accuracy of RF:", acc_rfr1)
print("Mean of Accuracy of RF:", sum(acc_rfr1)/10)

[1.8        4.2        4.26666667 ... 5.         2.53333333 0.2       ]
[2. 4. 4. ... 5. 3. 0.]
[3.6        1.         4.86666667 ... 5.         0.93333333 4.93333333]
[4. 1. 5. ... 5. 1. 5.]
[2.93333333 2.33333333 0.8        ... 2.8        4.06666667 4.4       ]
[3. 2. 1. ... 3. 4. 4.]
[5.         1.66666667 0.2        ... 1.         1.         4.6       ]
[5. 2. 0. ... 1. 1. 5.]
[3.93333333 1.         0.86666667 ... 1.66666667 1.06666667 0.2       ]
[4. 1. 1. ... 2. 1. 0.]
[1.66666667 1.         2.86666667 ... 5.         4.93333333 3.        ]
[2. 1. 3. ... 5. 5. 3.]
[2.33333333 2.46666667 2.13333333 ... 4.         4.         2.06666667]
[2. 2. 2. ... 4. 4. 2.]
[2.86666667 3.8        4.46666667 ... 2.86666667 4.         2.53333333]
[3. 4. 4. ... 3. 4. 3.]
[3.2        2.73333333 3.66666667 ... 0.93333333 1.86666667 1.53333333]
[3. 3. 4. ... 1. 2. 2.]
[1.06666667 4.2        0.4        ... 0.86666667 5.         1.        ]
[1. 4. 0. ... 1. 5. 1.]
Accuracy of RF: [0.685133239831697, 0.67

In [19]:
rfr_pred1 = cross_val_predict(rfr1,tsr_3_input_nomrs,tsr_3_output,cv = 10)
rfr_pred1 = np.round(rfr_pred1)
confusion_matrix(tsr_3_output, rfr_pred1)

array([[296, 160,  31,  11,   1,   1],
       [ 76, 779, 235,  42,  10,   2],
       [  2, 125, 505, 118,  21,   0],
       [  0,  21, 125, 431,  87,   3],
       [  0,   9,  16, 167, 720,  47],
       [  0,   2,   4,  16, 178, 510]], dtype=int64)

## XGBoost

In [20]:
xgbr = XGBRegressor(n_estimators  = 15, objective="reg:squarederror", random_state = 19) 
xgbr_scores = cross_val_score(xgbr,tsr_3_input,tsr_3_output,cv = 10, scoring='r2')
print(xgbr_scores)
print("Mean of R^2:", xgbr_scores.mean())
print("Std of R^2:", xgbr_scores.std())

[0.99993267 0.99994881 0.99994648 0.99995066 0.99994196 0.99994731
 0.99994212 0.99993815 0.99993076 0.99991742]
Mean of R^2: 0.9999396339808422
Std of R^2: 9.726589976168125e-06


In [21]:
acc_xgbr = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_3_input, tsr_3_output, test_size=0.3, random_state=i)
    xgbr.fit(X_train,y_train)
    xgbr_predict = xgbr.predict(X_test)
    print(xgbr_predict)
    xgbr_predict = np.round(xgbr_predict)
    print(xgbr_predict)
    accuracy = (y_test == xgbr_predict).sum() / len(xgbr_predict)
    acc_xgbr.append(accuracy)

print("Accuracy of RF:", acc_xgbr)
print("Mean of Accuracy of RF:", sum(acc_xgbr)/10)

[1.9927903e+00 3.9832261e+00 3.9832261e+00 ... 4.9783568e+00 2.9879732e+00
 2.4178824e-03]
[2. 4. 4. ... 5. 3. 0.]
[2.9879718  0.99760693 4.9783516  ... 4.9783516  0.99760693 4.9783516 ]
[3. 1. 5. ... 5. 1. 5.]
[2.9879608  2.9879608  0.99760723 ... 2.9879608  3.9832263  4.978364  ]
[3. 3. 1. ... 3. 4. 5.]
[4.9783549e+00 9.9760765e-01 2.4186620e-03 ... 9.9760765e-01 9.9760765e-01
 2.9879653e+00]
[5. 1. 0. ... 1. 1. 3.]
[3.9832242e+00 2.4197432e-03 9.9760729e-01 ... 1.9927958e+00 9.9760729e-01
 9.9760729e-01]
[4. 0. 1. ... 2. 1. 1.]
[0.997607  0.997607  1.9927949 ... 4.9783487 4.9783487 3.983221 ]
[1. 1. 2. ... 5. 5. 4.]
[1.9927950e+00 2.9879689e+00 1.9927950e+00 ... 3.9832242e+00 3.9832242e+00
 2.4178824e-03]
[2. 3. 2. ... 4. 4. 0.]
[9.9760717e-01 3.9832234e+00 9.9760717e-01 ... 2.9879682e+00 3.9832234e+00
 2.4172552e-03]
[1. 4. 1. ... 3. 4. 0.]
[2.9879742 2.9879742 2.9879742 ... 0.9976067 1.9927931 0.9976067]
[3. 3. 3. ... 1. 2. 1.]
[9.9760669e-01 3.9832263e+00 2.4166447e-03 ... 9.9760

In [22]:
xgbr_pred = cross_val_predict(xgbr,tsr_3_input,tsr_3_output,cv = 10)
xgbr_pred = np.round(xgbr_pred)
confusion_matrix(tsr_3_output, xgbr_pred)

array([[ 500,    0,    0,    0,    0,    0],
       [   0, 1144,    0,    0,    0,    0],
       [   0,    0,  771,    0,    0,    0],
       [   0,    0,    0,  667,    0,    0],
       [   0,    0,    0,    0,  959,    0],
       [   0,    0,    0,    0,    0,  710]], dtype=int64)

In [23]:
xgbr1 = RandomForestRegressor(criterion = "mse", n_estimators = 15, bootstrap=True, random_state = 19,max_features = 0.8)
xgbr_scores1 = cross_val_score(xgbr1,tsr_3_input_nomrs,tsr_3_output,cv = 10, scoring='r2')
print(xgbr_scores1)
print("Mean of R^2:", xgbr_scores1.mean())
print("Std of R^2:", xgbr_scores1.std())

[0.79848646 0.83124349 0.86398329 0.8160926  0.84065532 0.85591027
 0.8449297  0.7813848  0.89386159 0.81143728]
Mean of R^2: 0.8337984798534303
Std of R^2: 0.03161128500485248


In [24]:
acc_xgbr1 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_3_input_nomrs, tsr_3_output, test_size=0.3, random_state=i)
    xgbr1.fit(X_train,y_train)
    xgbr1_predict = xgbr1.predict(X_test)
    print(xgbr1_predict)
    xgbr1_predict = np.round(xgbr1_predict)
    print(xgbr1_predict)
    accuracy = (y_test == xgbr1_predict).sum() / len(xgbr1_predict)
    acc_xgbr1.append(accuracy)

print("Accuracy of RF:", acc_xgbr1)
print("Mean of Accuracy of RF:", sum(acc_xgbr1)/10)

[1.8        4.2        4.26666667 ... 5.         2.53333333 0.2       ]
[2. 4. 4. ... 5. 3. 0.]
[3.6        1.         4.86666667 ... 5.         0.93333333 4.93333333]
[4. 1. 5. ... 5. 1. 5.]
[2.93333333 2.33333333 0.8        ... 2.8        4.06666667 4.4       ]
[3. 2. 1. ... 3. 4. 4.]
[5.         1.66666667 0.2        ... 1.         1.         4.6       ]
[5. 2. 0. ... 1. 1. 5.]
[3.93333333 1.         0.86666667 ... 1.66666667 1.06666667 0.2       ]
[4. 1. 1. ... 2. 1. 0.]
[1.66666667 1.         2.86666667 ... 5.         4.93333333 3.        ]
[2. 1. 3. ... 5. 5. 3.]
[2.33333333 2.46666667 2.13333333 ... 4.         4.         2.06666667]
[2. 2. 2. ... 4. 4. 2.]
[2.86666667 3.8        4.46666667 ... 2.86666667 4.         2.53333333]
[3. 4. 4. ... 3. 4. 3.]
[3.2        2.73333333 3.66666667 ... 0.93333333 1.86666667 1.53333333]
[3. 3. 4. ... 1. 2. 2.]
[1.06666667 4.2        0.4        ... 0.86666667 5.         1.        ]
[1. 4. 0. ... 1. 5. 1.]
Accuracy of RF: [0.685133239831697, 0.67

In [25]:
xgbr_pred1 = cross_val_predict(xgbr1,tsr_3_input_nomrs,tsr_3_output,cv = 10)
xgbr_pred1 = np.round(xgbr_pred1)
confusion_matrix(tsr_3_output, xgbr_pred1)

array([[296, 160,  31,  11,   1,   1],
       [ 76, 779, 235,  42,  10,   2],
       [  2, 125, 505, 118,  21,   0],
       [  0,  21, 125, 431,  87,   3],
       [  0,   9,  16, 167, 720,  47],
       [  0,   2,   4,  16, 178, 510]], dtype=int64)

# 2 classes

In [26]:
tsr_3_output[(tsr_3_output == 0)|(tsr_3_output == 1)|(tsr_3_output == 2)] = 0
tsr_3_output[(tsr_3_output == 3)|(tsr_3_output == 4)|(tsr_3_output == 5)] = 1

## SVM

In [27]:
svr2 = LinearSVR(epsilon = 0, dual=False, loss = "squared_epsilon_insensitive", C = 1, random_state = 19)
svr_scores2 = cross_val_score(svr2,tsr_3_input,tsr_3_output,cv = 10, scoring='r2')
print(svr_scores2)
print("Mean of R^2:", svr_scores2.mean())
print("Std of R^2:", svr_scores2.std())

[0.7589501  0.79709672 0.8141315  0.84887988 0.84468708 0.81412863
 0.8021148  0.81704023 0.82336342 0.73357385]
Mean of R^2: 0.805396621020947
Std of R^2: 0.0338244105018299


In [28]:
acc_svr2 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_3_input, tsr_3_output, test_size=0.3, random_state=i)
    svr2.fit(X_train,y_train)
    svr2_predict = svr2.predict(X_test)
    print(svr2_predict)
    svr2_predict = np.round(svr2_predict)
    print(svr2_predict)
    accuracy = (y_test == svr2_predict).sum() / len(svr2_predict)
    acc_svr2.append(accuracy)

print("Accuracy of RF:", acc_svr2)
print("Mean of Accuracy of RF:", sum(acc_svr2)/10)

[ 0.23611926  0.9712239   0.81288732 ...  1.03343586  0.55910823
 -0.01203918]
[ 0.  1.  1. ...  1.  1. -0.]
[ 0.8067015  -0.00724352  0.99331772 ...  0.93200034  0.08783345
  1.10443544]
[ 1. -0.  1. ...  1.  0.  1.]
[0.68581401 0.6647161  0.01802136 ... 0.55138426 1.02241464 0.99163912]
[1. 1. 0. ... 1. 1. 1.]
[ 0.96710198 -0.01641694 -0.13294279 ...  0.0197862   0.00812979
  0.59347997]
[ 1. -0. -0. ...  0.  0.  1.]
[ 1.12097202 -0.23640088  0.03203904 ...  0.19088423  0.0222277
  0.06750609]
[ 1. -0.  0. ...  0.  0.  0.]
[ 0.10071383 -0.03440139  0.54762919 ...  1.12393684  1.23776756
  0.82754951]
[ 0. -0.  1. ...  1.  1.  1.]
[ 0.55157737  0.77834565  0.39282915 ...  1.08894962  1.05350519
 -0.09036225]
[ 1.  1.  0. ...  1.  1. -0.]
[ 0.16824873  1.07137133 -0.11712027 ...  0.72444743  0.7437035
  0.03326514]
[ 0.  1. -0. ...  1.  1.  0.]
[ 0.73189605  0.66212623  0.79853585 ... -0.00518656  0.24957945
  0.20801508]
[ 1.  1.  1. ... -0.  0.  0.]
[-0.04499815  0.93219313 -0.085912

In [29]:
svr_pred2 = cross_val_predict(svr2,tsr_3_input,tsr_3_output,cv = 10)
svr_pred2 = np.round(svr_pred2)
confusion_matrix(tsr_3_output, svr_pred2)

array([[2344,   71],
       [  75, 2261]], dtype=int64)

In [30]:
svr3 = LinearSVR(epsilon = 0, dual=False, loss = "squared_epsilon_insensitive", C = 1, random_state = 19)
svr_scores3 = cross_val_score(svr3,tsr_3_input_nomrs,tsr_3_output,cv = 10, scoring='r2')
print(svr_scores3)
print("Mean of R^2:", svr_scores3.mean())
print("Std of R^2:", svr_scores3.std())

[0.60849137 0.66896247 0.7379421  0.73399559 0.75512915 0.69983834
 0.71975856 0.67443361 0.7712184  0.53487333]
Mean of R^2: 0.6904642932258542
Std of R^2: 0.06888057174630827


In [31]:
acc_svr3 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_3_input_nomrs, tsr_3_output, test_size=0.3, random_state=i)
    svr3.fit(X_train,y_train)
    svr3_predict = svr3.predict(X_test)
    print(svr3_predict)
    svr3_predict = np.round(svr3_predict)
    print(svr3_predict)
    accuracy = (y_test == svr3_predict).sum() / len(svr3_predict)
    acc_svr3.append(accuracy)

print("Accuracy of RF:", acc_svr3)
print("Mean of Accuracy of RF:", sum(acc_svr3)/10)

[0.09532237 1.01960851 0.89062228 ... 0.98227168 0.49392201 0.32688418]
[0. 1. 1. ... 1. 0. 0.]
[0.92840957 0.01824745 1.01593597 ... 0.91492884 0.09675799 1.05647176]
[1. 0. 1. ... 1. 0. 1.]
[ 0.59167347  0.60152147 -0.02258882 ...  0.38719733  1.02554124
  0.75110607]
[ 1.  1. -0. ...  0.  1.  1.]
[ 0.97136848  0.0230034  -0.00712386 ...  0.04878741 -0.01608171
  0.94002416]
[ 1.  0. -0. ...  0. -0.  1.]
[ 1.15690061 -0.00321766  0.03244312 ...  0.04728446  0.01959241
 -0.05195042]
[ 1. -0.  0. ...  0.  0. -0.]
[ 0.22832572 -0.07954947  0.73150427 ...  1.0502451   1.16232615
  0.58297104]
[ 0. -0.  1. ...  1.  1.  1.]
[0.71354149 0.59680723 0.41053762 ... 1.12013295 1.13389168 0.29730149]
[1. 1. 0. ... 1. 1. 0.]
[0.53608568 1.12715687 0.70087427 ... 0.80845272 0.64282905 0.65037663]
[1. 1. 1. ... 1. 1. 1.]
[0.7397978  0.67504728 0.88574125 ... 0.01025207 0.24641167 0.3947984 ]
[1. 1. 1. ... 0. 0. 0.]
[-0.06910215  1.02285185  0.07593238 ... -0.03802618  1.04045996
  0.22018794]
[-0. 

In [32]:
svr_pred3 = cross_val_predict(svr3,tsr_3_input_nomrs,tsr_3_output,cv = 10)
svr_pred3 = np.round(svr_pred3)
confusion_matrix(tsr_3_output, svr_pred3)

array([[2220,  195],
       [ 238, 2098]], dtype=int64)

## RF

In [33]:
rfr2 = RandomForestRegressor(criterion = "mse", n_estimators = 15, bootstrap=True, random_state = 19,max_features = 0.8)
rfr_scores2 = cross_val_score(rfr2,tsr_3_input,tsr_3_output,cv = 10, scoring='r2')
print(rfr_scores2)
print("Mean of R^2:", rfr_scores2.mean())
print("Std of R^2:", rfr_scores2.std())

[1.         1.         1.         0.99996156 1.         1.
 1.         1.         1.         1.        ]
Mean of R^2: 0.9999961563048739
Std of R^2: 1.1531085378579942e-05


In [34]:
acc_rfr2 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_3_input, tsr_3_output, test_size=0.3, random_state=i)
    rfr2.fit(X_train,y_train)
    rfr2_predict = rfr2.predict(X_test)
    print(rfr2_predict)
    rfr2_predict = np.round(rfr2_predict)
    print(rfr2_predict)
    accuracy = (y_test == rfr2_predict).sum() / len(rfr2_predict)
    acc_rfr2.append(accuracy)

print("Accuracy of RF:", acc_rfr2)
print("Mean of Accuracy of RF:", sum(acc_rfr2)/10)

[0. 1. 1. ... 1. 1. 0.]
[0. 1. 1. ... 1. 1. 0.]
[1. 0. 1. ... 1. 0. 1.]
[1. 0. 1. ... 1. 0. 1.]
[1. 1. 0. ... 1. 1. 1.]
[1. 1. 0. ... 1. 1. 1.]
[1. 0. 0. ... 0. 0. 1.]
[1. 0. 0. ... 0. 0. 1.]
[1. 0. 0. ... 0. 0. 0.]
[1. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 1. 1. 1.]
[0. 0. 0. ... 1. 1. 1.]
[0. 1. 0. ... 1. 1. 0.]
[0. 1. 0. ... 1. 1. 0.]
[0. 1. 0. ... 1. 1. 0.]
[0. 1. 0. ... 1. 1. 0.]
[1. 1. 1. ... 0. 0. 0.]
[1. 1. 1. ... 0. 0. 0.]
[0. 1. 0. ... 0. 1. 0.]
[0. 1. 0. ... 0. 1. 0.]
Accuracy of RF: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
Mean of Accuracy of RF: 1.0


In [35]:
rf_pred2 = cross_val_predict(rfr2,tsr_3_input,tsr_3_output,cv = 10)
rf_pred2 = np.round(rf_pred2)
confusion_matrix(tsr_3_output, rf_pred2)

array([[2415,    0],
       [   0, 2336]], dtype=int64)

In [36]:
rfr3 = RandomForestRegressor(criterion = "mse", n_estimators = 15, bootstrap=True, random_state = 19,max_features = 0.8)
rfr_scores3 = cross_val_score(rfr3,tsr_3_input_nomrs,tsr_3_output,cv = 10, scoring='r2')
print(rfr_scores3)
print("Mean of R^2:", rfr_scores3.mean())
print("Std of R^2:", rfr_scores3.std())

[0.67450974 0.71671835 0.83315047 0.7694936  0.77265994 0.74529902
 0.75116166 0.77124322 0.84322987 0.62267818]
Mean of R^2: 0.7500144060955003
Std of R^2: 0.06323867093793825


In [37]:
acc_rfr3 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_3_input_nomrs, tsr_3_output, test_size=0.3, random_state=i)
    rfr3.fit(X_train,y_train)
    rfr3_predict = rfr3.predict(X_test)
    print(rfr3_predict)
    rfr3_predict = np.round(rfr3_predict)
    print(rfr3_predict)
    accuracy = (y_test == rfr3_predict).sum() / len(rfr3_predict)
    acc_rfr3.append(accuracy)

print("Accuracy of RF:", acc_rfr3)
print("Mean of Accuracy of RF:", sum(acc_rfr3)/10)

[0.         1.         1.         ... 1.         0.73333333 0.2       ]
[0. 1. 1. ... 1. 1. 0.]
[0.93333333 0.         1.         ... 0.93333333 0.         1.        ]
[1. 0. 1. ... 1. 0. 1.]
[0.66666667 0.66666667 0.         ... 0.46666667 1.         1.        ]
[1. 1. 0. ... 0. 1. 1.]
[1.         0.06666667 0.         ... 0.         0.         1.        ]
[1. 0. 0. ... 0. 0. 1.]
[1.         0.         0.         ... 0.         0.13333333 0.        ]
[1. 0. 0. ... 0. 0. 0.]
[0.06666667 0.         0.53333333 ... 1.         1.         0.73333333]
[0. 0. 1. ... 1. 1. 1.]
[0.53333333 0.6        0.53333333 ... 0.93333333 1.         0.06666667]
[1. 1. 1. ... 1. 1. 0.]
[0.73333333 1.         0.93333333 ... 1.         0.86666667 0.8       ]
[1. 1. 1. ... 1. 1. 1.]
[0.8        0.8        1.         ... 0.         0.         0.13333333]
[1. 1. 1. ... 0. 0. 0.]
[0. 1. 0. ... 0. 1. 0.]
[0. 1. 0. ... 0. 1. 0.]
Accuracy of RF: [0.923562412342216, 0.9130434782608695, 0.9277699859747546, 0.9172510518

In [38]:
rfr_pred3 = cross_val_predict(rfr3,tsr_3_input_nomrs,tsr_3_output,cv = 10)
rfr_pred3 = np.round(rfr_pred3)
confusion_matrix(tsr_3_output, rfr_pred3)

array([[2184,  231],
       [ 128, 2208]], dtype=int64)

## XGBoost

In [39]:
xgbr2 = XGBRegressor(n_estimators  = 15, objective="reg:squarederror", random_state = 19) 
xgbr_scores2 = cross_val_score(xgbr2,tsr_3_input,tsr_3_output,cv = 10, scoring='r2')
print(xgbr_scores2)
print("Mean of R^2:", xgbr_scores2.mean())
print("Std of R^2:", xgbr_scores2.std())

[0.99997726 0.99997712 0.99997712 0.99997671 0.99997732 0.99997719
 0.99997694 0.9999773  0.99997729 0.99997552]
Mean of R^2: 0.999976976946966
Std of R^2: 5.183694326287556e-07


In [40]:
acc_xgbr2 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_3_input, tsr_3_output, test_size=0.3, random_state=i)
    xgbr2.fit(X_train,y_train)
    xgbr2_predict = xgbr2.predict(X_test)
    print(xgbr2_predict)
    xgbr2_predict = np.round(xgbr2_predict)
    print(xgbr2_predict)
    accuracy = (y_test == xgbr2_predict).sum() / len(xgbr2_predict)
    acc_xgbr2.append(accuracy)

print("Accuracy of RF:", acc_xgbr2)
print("Mean of Accuracy of RF:", sum(acc_xgbr2)/10)

[0.00238295 0.997617   0.997617   ... 0.997617   0.997617   0.00238295]
[0. 1. 1. ... 1. 1. 0.]
[0.997617   0.00238294 0.997617   ... 0.997617   0.00238294 0.997617  ]
[1. 0. 1. ... 1. 0. 1.]
[0.99761695 0.99761695 0.00238284 ... 0.99761695 0.99761695 0.99761695]
[1. 1. 0. ... 1. 1. 1.]
[0.9976169  0.00238283 0.00238283 ... 0.00238283 0.00238283 0.9976169 ]
[1. 0. 0. ... 0. 0. 1.]
[0.9976168  0.00238279 0.00238279 ... 0.00238279 0.00238279 0.00238279]
[1. 0. 0. ... 0. 0. 0.]
[0.00238271 0.00238271 0.00238271 ... 0.99761677 0.99761677 0.99761677]
[0. 0. 0. ... 1. 1. 1.]
[0.00238285 0.99761695 0.00238285 ... 0.99761695 0.99761695 0.00238285]
[0. 1. 0. ... 1. 1. 0.]
[0.00238284 0.9976169  0.00238284 ... 0.9976169  0.9976169  0.00238284]
[0. 1. 0. ... 1. 1. 0.]
[0.997617   0.997617   0.997617   ... 0.00238296 0.00238296 0.00238296]
[1. 1. 1. ... 0. 0. 0.]
[0.00238287 0.99761695 0.00238287 ... 0.00238287 0.99761695 0.00238287]
[0. 1. 0. ... 0. 1. 0.]
Accuracy of RF: [1.0, 1.0, 1.0, 1.0, 1.0

In [41]:
xgbr_pred2 = cross_val_predict(xgbr2,tsr_3_input,tsr_3_output,cv = 10)
xgbr_pred2 = np.round(xgbr_pred2)
confusion_matrix(tsr_3_output, xgbr_pred2)

array([[2415,    0],
       [   0, 2336]], dtype=int64)

In [42]:
xgbr3 = XGBRegressor(n_estimators  = 15, objective="reg:squarederror", random_state = 19) 
xgbr_scores3 = cross_val_score(xgbr3,tsr_3_input_nomrs,tsr_3_output,cv = 10, scoring='r2')
print(xgbr_scores3)
print("Mean of R^2:", xgbr_scores3.mean())
print("Std of R^2:", xgbr_scores3.std())

[0.71489409 0.74967602 0.84586303 0.76303676 0.75537171 0.7370609
 0.7565685  0.74812115 0.8265216  0.64069671]
Mean of R^2: 0.7537810472258741
Std of R^2: 0.05338066926479723


In [43]:
acc_xgbr3 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_3_input_nomrs, tsr_3_output, test_size=0.3, random_state=i)
    xgbr3.fit(X_train,y_train)
    xgbr3_predict = xgbr3.predict(X_test)
    print(xgbr3_predict)
    xgbr3_predict = np.round(xgbr3_predict)
    print(xgbr3_predict)
    accuracy = (y_test == xgbr3_predict).sum() / len(xgbr3_predict)
    acc_xgbr3.append(accuracy)

print("Accuracy of RF:", acc_xgbr3)
print("Mean of Accuracy of RF:", sum(acc_xgbr3)/10)

[0.00792131 1.0044457  1.0019529  ... 0.9636441  0.8538594  0.13691925]
[0. 1. 1. ... 1. 1. 0.]
[0.88034075 0.004326   0.99378794 ... 1.0189118  0.02138103 0.99096495]
[1. 0. 1. ... 1. 0. 1.]
[0.8065104  0.93387276 0.01854321 ... 0.74943817 1.0093267  1.0002041 ]
[1. 1. 0. ... 1. 1. 1.]
[ 1.0001733   0.00920923  0.0204541  ... -0.0088036   0.02456101
  0.98055446]
[ 1.  0.  0. ... -0.  0.  1.]
[ 1.0093838   0.01070464  0.01351931 ...  0.03159231  0.14472161
 -0.00214082]
[ 1.  0.  0. ...  0.  0. -0.]
[0.04267944 0.028213   0.47242048 ... 1.0064255  0.9921817  0.5672429 ]
[0. 0. 0. ... 1. 1. 1.]
[0.8015002  0.87240255 0.15583947 ... 1.0009992  0.99464756 0.17079748]
[1. 1. 0. ... 1. 1. 0.]
[0.85767436 1.0181955  0.89285475 ... 0.6202606  1.0485386  0.62295234]
[1. 1. 1. ... 1. 1. 1.]
[0.7777098  0.9181845  0.98620963 ... 0.0296317  0.02039381 0.31853446]
[1. 1. 1. ... 0. 0. 0.]
[ 0.00138025  0.9879035   0.00309756 ... -0.00814977  0.84304124
  0.08859486]
[ 0.  1.  0. ... -0.  1.  0.]
A

In [44]:
xgbr_pred3 = cross_val_predict(xgbr3,tsr_3_input_nomrs,tsr_3_output,cv = 10)
xgbr_pred3 = np.round(xgbr_pred3)
confusion_matrix(tsr_3_output, xgbr_pred3)

array([[2187,  228],
       [ 141, 2195]], dtype=int64)

# Summary

## Mean & Std

In [45]:
svr_mean = np.array([svr_scores.mean(), svr_scores.std(), svr_scores[0], svr_scores[1], svr_scores[2], svr_scores[3],
                     svr_scores[4], svr_scores[5], svr_scores[6], svr_scores[7], svr_scores[8], svr_scores[9]])
rfr_mean = np.array([rfr_scores.mean(), rfr_scores.std(), rfr_scores[0], rfr_scores[1], rfr_scores[2], rfr_scores[3],rfr_scores[4], 
                    rfr_scores[5], rfr_scores[6], rfr_scores[7], rfr_scores[8], rfr_scores[9]])
xgbr_mean = np.array([xgbr_scores.mean(), xgbr_scores.std(), xgbr_scores[0], xgbr_scores[1], xgbr_scores[2], xgbr_scores[3],
                     xgbr_scores[4], xgbr_scores[5], xgbr_scores[6], xgbr_scores[7], xgbr_scores[8], xgbr_scores[9]])
svr_mean2 = np.array([svr_scores2.mean(), svr_scores2.std(), svr_scores2[0], svr_scores2[1], svr_scores2[2], svr_scores2[3],
                     svr_scores2[4], svr_scores2[5], svr_scores2[6], svr_scores2[7], svr_scores2[8], svr_scores2[9]])
rfr_mean2 = np.array([rfr_scores2.mean(), rfr_scores2.std(), rfr_scores2[0], rfr_scores2[1], rfr_scores2[2], rfr_scores2[3],rfr_scores2[4], 
                    rfr_scores2[5], rfr_scores2[6], rfr_scores2[7], rfr_scores2[8], rfr_scores2[9]])
xgbr_mean2 = np.array([xgbr_scores2.mean(), xgbr_scores2.std(), xgbr_scores2[0], xgbr_scores2[1], xgbr_scores2[2], xgbr_scores2[3],
                     xgbr_scores2[4], xgbr_scores2[5], xgbr_scores2[6], xgbr_scores2[7], xgbr_scores2[8], xgbr_scores2[9]])

In [46]:
tsr_3_mean = pd.DataFrame([svr_mean, rfr_mean, xgbr_mean, svr_mean2, rfr_mean2, xgbr_mean2]).T
tsr_3_mean.index = ["Mean", "Std", "R^2_1", "R^2_2", "R^2_3", "R^2_4", "R^2_5", "R^2_6", "R^2_7", "R^2_8", "R^2_9", "R^2_10"]
tsr_3_mean.columns = ["svr", 'rfr', 'xgbr', 'svr2', 'rfr2', 'xgbr2']

In [47]:
csv_save = os.path.join(".", "tsr_3_mean_regression.csv")
tsr_3_mean.to_csv(csv_save, index = True)