In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

In [2]:
csv_path = os.path.join("..", "..", "data","LINKED_DATA", "TSR_EHR", "TSR_6_CLEANED.csv")
tsr_6 = pd.read_csv(csv_path)
tsr_6.head()

Unnamed: 0,height_nm,weight_nm,edu_id,pro_id,opc_id,ih_fl,ivtpamg_nm,hospitalised_time,nivtpa_id,nivtpa1_fl,...,nihs_8_out,nihs_9_out,nihs_10_out,nihs_11_out,total_out,SexName,Age,mrs_tx_1,mrs_tx_3,mrs_tx_6
0,153.0,62.0,3,1,3,0,0.0,8.0,0,999,...,1,0,1,0,4,0,67.0,1,1,1
1,152.0,62.0,3,1,2,0,0.0,4.0,0,999,...,1,0,0,0,1,0,69.0,1,0,0
2,148.0,56.0,2,1,2,0,0.0,5.0,0,999,...,1,0,0,0,2,0,71.0,0,0,0
3,152.0,56.0,4,1,2,0,0.0,3.0,1,0,...,0,0,0,0,0,0,71.0,0,0,0
4,160.0,60.0,2,1,3,0,0.0,4.0,0,999,...,0,0,0,0,4,0,62.0,3,3,3


In [3]:
tsr_6_input = tsr_6.drop(["mrs_tx_6"], axis=1)
tsr_6_input[tsr_6_input == "N"] = 0
tsr_6_input[tsr_6_input == "Y"] = 1
tsr_6_input = tsr_6_input.astype("float64")
tsr_6_input = np.array(tsr_6_input.values)

tsr_6_input_nomrs = tsr_6.drop(["mrs_tx_6", "mrs_tx_3", "mrs_tx_1"], axis=1)
tsr_6_input_nomrs[tsr_6_input_nomrs == "N"] = 0
tsr_6_input_nomrs[tsr_6_input_nomrs == "Y"] = 1
tsr_6_input_nomrs = tsr_6_input_nomrs.astype("float64")
tsr_6_input_nomrs = np.array(tsr_6_input_nomrs.values)

# 6 classes

In [4]:
tsr_6_output = tsr_6.mrs_tx_1
tsr_6_output = tsr_6_output.astype("float64")
tsr_6_output = np.array(tsr_6_output.values)

## SVM

In [5]:
svr = LinearSVR(epsilon = 0, dual=False, loss = "squared_epsilon_insensitive", C = 1, random_state = 19)
svr_scores = cross_val_score(svr,tsr_6_input,tsr_6_output,cv = 10, scoring='r2')
print(svr_scores)
print("Mean of R^2:", svr_scores.mean())
print("Std of R^2:", svr_scores.std())

[0.97298991 0.97765583 0.9822613  0.9043261  0.99057859 0.97934303
 0.97305357 0.98671394 0.96758493 0.92350859]
Mean of R^2: 0.9658015800762796
Std of R^2: 0.02705812134930812


In [6]:
acc_svr = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_6_input, tsr_6_output, test_size=0.3, random_state=i)
    svr.fit(X_train,y_train)
    svr_predict = svr.predict(X_test)
    #print(svr_predict)
    svr_predict = np.round(svr_predict)
    #print(svr_predict)
    accuracy = (y_test == svr_predict).sum() / len(svr_predict)
    acc_svr.append(accuracy)

print("Accuracy of RF:", acc_svr)
print("Mean of Accuracy of RF:", sum(acc_svr)/10)

Accuracy of RF: [0.9735152487961477, 0.9157303370786517, 0.9614767255216693, 0.9791332263242376, 0.9791332263242376, 0.8731942215088283, 0.9454253611556982, 0.9646869983948636, 0.8940609951845907, 0.8234349919743178]
Mean of Accuracy of RF: 0.9309791332263243


In [7]:
svr_pred = cross_val_predict(svr,tsr_6_input,tsr_6_output,cv = 10)
svr_pred = np.round(svr_pred)
confusion_matrix(tsr_6_output, svr_pred)

array([[   0,    0,    0,    0,    0,    0,    0,    0],
       [   1,  402,   55,    8,    0,    0,    0,    0],
       [   0,    1, 1010,   55,    1,    0,    0,    0],
       [   0,    1,   21,  642,   19,    0,    0,    0],
       [   0,    0,    0,   32,  556,    8,    0,    0],
       [   0,    0,    0,    0,   57,  750,    9,    0],
       [   0,    0,    0,    0,    5,   47,  469,    2],
       [   0,    0,    0,    0,    0,    0,    0,    0]], dtype=int64)

In [8]:
svr_pred[svr_pred == -1] = 0
confusion_matrix(tsr_6_output, svr_pred)

array([[ 403,   55,    8,    0,    0,    0,    0],
       [   1, 1010,   55,    1,    0,    0,    0],
       [   1,   21,  642,   19,    0,    0,    0],
       [   0,    0,   32,  556,    8,    0,    0],
       [   0,    0,    0,   57,  750,    9,    0],
       [   0,    0,    0,    5,   47,  469,    2],
       [   0,    0,    0,    0,    0,    0,    0]], dtype=int64)

In [9]:
svr1 = LinearSVR(epsilon = 0, dual=False, loss = "squared_epsilon_insensitive", C = 1, random_state = 19)
svr_scores1 = cross_val_score(svr1,tsr_6_input_nomrs,tsr_6_output,cv = 10, scoring='r2')
print(svr_scores1)
print("Mean of R^2:", svr_scores1.mean())
print("Std of R^2:", svr_scores1.std())

[0.67045177 0.78329337 0.81744363 0.79407382 0.81889027 0.79818859
 0.83967165 0.74176129 0.75511674 0.64201807]
Mean of R^2: 0.7660909189303222
Std of R^2: 0.06180736630616617


In [10]:
acc_svr1 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_6_input_nomrs, tsr_6_output, test_size=0.3, random_state=i)
    svr1.fit(X_train,y_train)
    svr1_predict = svr1.predict(X_test)
    #print(svr1_predict)
    svr1_predict = np.round(svr1_predict)
    #print(svr1_predict)
    accuracy = (y_test == svr1_predict).sum() / len(svr1_predict)
    acc_svr1.append(accuracy)

print("Accuracy of RF:", acc_svr1)
print("Mean of Accuracy of RF:", sum(acc_svr1)/10)

Accuracy of RF: [0.5184590690208668, 0.5457463884430177, 0.557784911717496, 0.5666131621187801, 0.5232744783306581, 0.5762439807383628, 0.5642054574638844, 0.579454253611557, 0.5617977528089888, 0.5473515248796148]
Mean of Accuracy of RF: 0.5540930979133225


In [11]:
svr_pred1 = cross_val_predict(svr1,tsr_6_input_nomrs,tsr_6_output,cv = 10)
svr_pred1 = np.round(svr_pred1)
confusion_matrix(tsr_6_output, svr_pred1)

array([[  0,   0,   0,   0,   0,   0,   0,   0],
       [  1,  33, 362,  58,  11,   0,   1,   0],
       [  1,   8, 813, 207,  30,   7,   1,   0],
       [  1,   3, 281, 296,  92,  10,   0,   0],
       [  0,   0,  23, 183, 330,  55,   5,   0],
       [  0,   0,   8,  32, 227, 447, 102,   0],
       [  0,   1,   5,   6,  24, 140, 339,   8],
       [  0,   0,   0,   0,   0,   0,   0,   0]], dtype=int64)

In [12]:
svr_pred1[svr_pred1 == 6] = 5
confusion_matrix(tsr_6_output, svr_pred1)

array([[  0,   0,   0,   0,   0,   0,   0],
       [  1,  33, 362,  58,  11,   0,   1],
       [  1,   8, 813, 207,  30,   7,   1],
       [  1,   3, 281, 296,  92,  10,   0],
       [  0,   0,  23, 183, 330,  55,   5],
       [  0,   0,   8,  32, 227, 447, 102],
       [  0,   1,   5,   6,  24, 140, 347]], dtype=int64)

## RF

In [13]:
rfr = RandomForestRegressor(criterion = "mse", n_estimators = 15, bootstrap=True, random_state = 19,max_features = 0.8)
rfr_scores = cross_val_score(rfr,tsr_6_input,tsr_6_output,cv = 10, scoring='r2')
print(rfr_scores)
print("Mean of R^2:", rfr_scores.mean())
print("Std of R^2:", rfr_scores.std())

[0.99996182 0.99996745 0.99998849 0.99991848 0.9999752  0.99996659
 0.99991892 0.99985967 0.99998559 0.99999551]
Mean of R^2: 0.9999537729717906
Std of R^2: 4.0187223876986936e-05


In [14]:
acc_rfr = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_6_input, tsr_6_output, test_size=0.3, random_state=i)
    rfr.fit(X_train,y_train)
    rfr_predict = rfr.predict(X_test)
    #print(rfr_predict)
    rfr_predict = np.round(rfr_predict)
    #print(rfr_predict)
    accuracy = (y_test == rfr_predict).sum() / len(rfr_predict)
    acc_rfr.append(accuracy)

print("Accuracy of RF:", acc_rfr)
print("Mean of Accuracy of RF:", sum(acc_rfr)/10)

Accuracy of RF: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
Mean of Accuracy of RF: 1.0


In [15]:
rfr_pred = cross_val_predict(rfr,tsr_6_input,tsr_6_output,cv = 10)
rfr_pred = np.round(rfr_pred)
confusion_matrix(tsr_6_output, rfr_pred)

array([[ 466,    0,    0,    0,    0,    0],
       [   0, 1067,    0,    0,    0,    0],
       [   0,    0,  683,    0,    0,    0],
       [   0,    0,    0,  596,    0,    0],
       [   0,    0,    0,    0,  816,    0],
       [   0,    0,    0,    0,    0,  523]], dtype=int64)

In [16]:
rfr1 = RandomForestRegressor(criterion = "mse", n_estimators = 15, bootstrap=True, random_state = 19,max_features = 0.8)
rfr_scores1 = cross_val_score(rfr1,tsr_6_input_nomrs,tsr_6_output,cv = 10, scoring='r2')
print(rfr_scores1)
print("Mean of R^2:", rfr_scores1.mean())
print("Std of R^2:", rfr_scores1.std())

[0.78461641 0.83812407 0.85364225 0.80021598 0.83401323 0.82829245
 0.88028419 0.75923783 0.8452423  0.79136409]
Mean of R^2: 0.8215032792722285
Std of R^2: 0.03482763312237961


In [17]:
acc_rfr1 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_6_input_nomrs, tsr_6_output, test_size=0.3, random_state=i)
    rfr1.fit(X_train,y_train)
    rfr1_predict = rfr1.predict(X_test)
    #print(rfr1_predict)
    rfr1_predict = np.round(rfr1_predict)
    #print(rfr1_predict)
    accuracy = (y_test == rfr1_predict).sum() / len(rfr1_predict)
    acc_rfr1.append(accuracy)

print("Accuracy of RF:", acc_rfr1)
print("Mean of Accuracy of RF:", sum(acc_rfr1)/10)

Accuracy of RF: [0.666131621187801, 0.6613162118780096, 0.6966292134831461, 0.6597110754414125, 0.6677367576243981, 0.6717495987158909, 0.7014446227929374, 0.6837881219903692, 0.6902086677367576, 0.6717495987158909]
Mean of Accuracy of RF: 0.6770465489566614


In [18]:
rfr_pred1 = cross_val_predict(rfr1,tsr_6_input_nomrs,tsr_6_output,cv = 10)
rfr_pred1 = np.round(rfr_pred1)
confusion_matrix(tsr_6_output, rfr_pred1)

array([[282, 140,  26,  17,   0,   1],
       [ 54, 757, 205,  44,   5,   2],
       [  2, 132, 435, 102,  12,   0],
       [  0,  14, 130, 378,  72,   2],
       [  0,   7,  16, 149, 599,  45],
       [  0,   3,   2,  12, 146, 360]], dtype=int64)

## XGBoost

In [19]:
xgbr = XGBRegressor(n_estimators  = 15, objective="reg:squarederror", random_state = 19) 
xgbr_scores = cross_val_score(xgbr,tsr_6_input,tsr_6_output,cv = 10, scoring='r2')
print(xgbr_scores)
print("Mean of R^2:", xgbr_scores.mean())
print("Std of R^2:", xgbr_scores.std())

[0.99993897 0.99995103 0.99994665 0.99995117 0.99994241 0.99994807
 0.99994522 0.99994435 0.99993649 0.99991847]
Mean of R^2: 0.999942283896733
Std of R^2: 9.123852646233461e-06


In [20]:
acc_xgbr = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_6_input, tsr_6_output, test_size=0.3, random_state=i)
    xgbr.fit(X_train,y_train)
    xgbr_predict = xgbr.predict(X_test)
    #print(xgbr_predict)
    xgbr_predict = np.round(xgbr_predict)
    #print(xgbr_predict)
    accuracy = (y_test == xgbr_predict).sum() / len(xgbr_predict)
    acc_xgbr.append(accuracy)

print("Accuracy of RF:", acc_xgbr)
print("Mean of Accuracy of RF:", sum(acc_xgbr)/10)

Accuracy of RF: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
Mean of Accuracy of RF: 1.0


In [21]:
xgbr_pred = cross_val_predict(xgbr,tsr_6_input,tsr_6_output,cv = 10)
xgbr_pred = np.round(xgbr_pred)
confusion_matrix(tsr_6_output, xgbr_pred)

array([[ 466,    0,    0,    0,    0,    0],
       [   0, 1067,    0,    0,    0,    0],
       [   0,    0,  683,    0,    0,    0],
       [   0,    0,    0,  596,    0,    0],
       [   0,    0,    0,    0,  816,    0],
       [   0,    0,    0,    0,    0,  523]], dtype=int64)

In [22]:
xgbr1 = RandomForestRegressor(criterion = "mse", n_estimators = 15, bootstrap=True, random_state = 19,max_features = 0.8)
xgbr_scores1 = cross_val_score(xgbr1,tsr_6_input_nomrs,tsr_6_output,cv = 10, scoring='r2')
print(xgbr_scores1)
print("Mean of R^2:", xgbr_scores1.mean())
print("Std of R^2:", xgbr_scores1.std())

[0.78461641 0.83812407 0.85364225 0.80021598 0.83401323 0.82829245
 0.88028419 0.75923783 0.8452423  0.79136409]
Mean of R^2: 0.8215032792722285
Std of R^2: 0.03482763312237961


In [23]:
acc_xgbr1 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_6_input_nomrs, tsr_6_output, test_size=0.3, random_state=i)
    xgbr1.fit(X_train,y_train)
    xgbr1_predict = xgbr1.predict(X_test)
    #print(xgbr1_predict)
    xgbr1_predict = np.round(xgbr1_predict)
    #print(xgbr1_predict)
    accuracy = (y_test == xgbr1_predict).sum() / len(xgbr1_predict)
    acc_xgbr1.append(accuracy)

print("Accuracy of RF:", acc_xgbr1)
print("Mean of Accuracy of RF:", sum(acc_xgbr1)/10)

Accuracy of RF: [0.666131621187801, 0.6613162118780096, 0.6966292134831461, 0.6597110754414125, 0.6677367576243981, 0.6717495987158909, 0.7014446227929374, 0.6837881219903692, 0.6902086677367576, 0.6717495987158909]
Mean of Accuracy of RF: 0.6770465489566614


In [24]:
xgbr_pred1 = cross_val_predict(xgbr1,tsr_6_input_nomrs,tsr_6_output,cv = 10)
xgbr_pred1 = np.round(xgbr_pred1)
confusion_matrix(tsr_6_output, xgbr_pred1)

array([[282, 140,  26,  17,   0,   1],
       [ 54, 757, 205,  44,   5,   2],
       [  2, 132, 435, 102,  12,   0],
       [  0,  14, 130, 378,  72,   2],
       [  0,   7,  16, 149, 599,  45],
       [  0,   3,   2,  12, 146, 360]], dtype=int64)

# 2 classes

In [25]:
tsr_6_output[(tsr_6_output == 0)|(tsr_6_output == 1)|(tsr_6_output == 2)] = 0
tsr_6_output[(tsr_6_output == 3)|(tsr_6_output == 4)|(tsr_6_output == 5)] = 1

## SVM

In [26]:
svr2 = LinearSVR(epsilon = 0, dual=False, loss = "squared_epsilon_insensitive", C = 1, random_state = 19)
svr_scores2 = cross_val_score(svr2,tsr_6_input,tsr_6_output,cv = 10, scoring='r2')
print(svr_scores2)
print("Mean of R^2:", svr_scores2.mean())
print("Std of R^2:", svr_scores2.std())

[0.727862   0.81356089 0.82028994 0.8450266  0.82888114 0.82684768
 0.78140753 0.80205846 0.80881142 0.73299616]
Mean of R^2: 0.7987741823736992
Std of R^2: 0.03779395446884126


In [27]:
acc_svr2 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_6_input, tsr_6_output, test_size=0.3, random_state=i)
    svr2.fit(X_train,y_train)
    svr2_predict = svr2.predict(X_test)
    #print(svr2_predict)
    svr2_predict = np.round(svr2_predict)
    #print(svr2_predict)
    accuracy = (y_test == svr2_predict).sum() / len(svr2_predict)
    acc_svr2.append(accuracy)

print("Accuracy of RF:", acc_svr2)
print("Mean of Accuracy of RF:", sum(acc_svr2)/10)

Accuracy of RF: [0.9646869983948636, 0.9582664526484751, 0.9815409309791332, 0.963884430176565, 0.9542536115569823, 0.9807383627608347, 0.9823434991974318, 0.9502407704654896, 0.9502407704654896, 0.9590690208667737]
Mean of Accuracy of RF: 0.9645264847512038


In [28]:
svr_pred2 = cross_val_predict(svr2,tsr_6_input,tsr_6_output,cv = 10)
svr_pred2 = np.round(svr_pred2)
confusion_matrix(tsr_6_output, svr_pred2)

array([[   0,    0,    0],
       [   2, 2147,   67],
       [   0,   71, 1864]], dtype=int64)

In [29]:
svr_pred2[svr_pred2 == -1] = 0
svr_pred2[svr_pred2 == 2] = 1
confusion_matrix(tsr_6_output, svr_pred2)

array([[2149,   67],
       [  71, 1864]], dtype=int64)

In [30]:
svr3 = LinearSVR(epsilon = 0, dual=False, loss = "squared_epsilon_insensitive", C = 1, random_state = 19)
svr_scores3 = cross_val_score(svr3,tsr_6_input_nomrs,tsr_6_output,cv = 10, scoring='r2')
print(svr_scores3)
print("Mean of R^2:", svr_scores3.mean())
print("Std of R^2:", svr_scores3.std())

[0.56391089 0.66895285 0.72511638 0.73974173 0.7378679  0.71606311
 0.71009548 0.66075518 0.735074   0.53413458]
Mean of R^2: 0.6791712103358022
Std of R^2: 0.07030762706798502


In [31]:
acc_svr3 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_6_input_nomrs, tsr_6_output, test_size=0.3, random_state=i)
    svr3.fit(X_train,y_train)
    svr3_predict = svr3.predict(X_test)
    #print(svr3_predict)
    svr3_predict = np.round(svr3_predict)
    #print(svr3_predict)
    accuracy = (y_test == svr3_predict).sum() / len(svr3_predict)
    acc_svr3.append(accuracy)

print("Accuracy of RF:", acc_svr3)
print("Mean of Accuracy of RF:", sum(acc_svr3)/10)

Accuracy of RF: [0.9149277688603531, 0.8940609951845907, 0.9101123595505618, 0.9012841091492777, 0.9093097913322632, 0.9245585874799358, 0.9197431781701445, 0.9141252006420546, 0.9085072231139647, 0.913322632423756]
Mean of Accuracy of RF: 0.9109951845906903


In [32]:
svr3.fit(tsr_6_input_nomrs,tsr_6_output)
svr_predict3 =svr3.predict(tsr_6_input_nomrs)
print(svr_predict3)
svr_predict3 = np.round(svr_predict3)
print(svr_predict3)
print("Accuracy of RF:", (tsr_6_output == svr_predict3).sum() / len(svr_predict3))

[ 0.01243191 -0.02560191  0.00387048 ...  1.23686719  0.21027683
  0.10932557]
[ 0. -0.  0. ...  1.  0.  0.]
Accuracy of RF: 0.9159238737653578


In [33]:
svr_pred3 = cross_val_predict(svr3,tsr_6_input_nomrs,tsr_6_output,cv = 10)
svr_pred3 = np.round(svr_pred3)
confusion_matrix(tsr_6_output, svr_pred3)

array([[2034,  182],
       [ 223, 1712]], dtype=int64)

## RF

In [34]:
rfr2 = RandomForestRegressor(criterion = "mse", n_estimators = 15, bootstrap=True, random_state = 19,max_features = 0.8)
rfr_scores2 = cross_val_score(rfr2,tsr_6_input,tsr_6_output,cv = 10, scoring='r2')
print(rfr_scores2)
print("Mean of R^2:", rfr_scores2.mean())
print("Std of R^2:", rfr_scores2.std())

[1.         1.         1.         0.99986738 1.         1.
 1.         1.         1.         1.        ]
Mean of R^2: 0.9999867382481705
Std of R^2: 3.9785255488455905e-05


In [35]:
acc_rfr2 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_6_input, tsr_6_output, test_size=0.3, random_state=i)
    rfr2.fit(X_train,y_train)
    rfr2_predict = rfr2.predict(X_test)
    #print(rfr2_predict)
    rfr2_predict = np.round(rfr2_predict)
    #print(rfr2_predict)
    accuracy = (y_test == rfr2_predict).sum() / len(rfr2_predict)
    acc_rfr2.append(accuracy)

print("Accuracy of RF:", acc_rfr2)
print("Mean of Accuracy of RF:", sum(acc_rfr2)/10)

Accuracy of RF: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
Mean of Accuracy of RF: 1.0


In [36]:
rf_pred2 = cross_val_predict(rfr2,tsr_6_input,tsr_6_output,cv = 10)
rf_pred2 = np.round(rf_pred2)
confusion_matrix(tsr_6_output, rf_pred2)

array([[2216,    0],
       [   0, 1935]], dtype=int64)

In [37]:
rfr3 = RandomForestRegressor(criterion = "mse", n_estimators = 15, bootstrap=True, random_state = 19,max_features = 0.8)
rfr_scores3 = cross_val_score(rfr3,tsr_6_input_nomrs,tsr_6_output,cv = 10, scoring='r2')
print(rfr_scores3)
print("Mean of R^2:", rfr_scores3.mean())
print("Std of R^2:", rfr_scores3.std())

[0.61955993 0.74186447 0.83176085 0.76283567 0.78815204 0.73151756
 0.75826261 0.72771581 0.82543574 0.62756759]
Mean of R^2: 0.7414672274832117
Std of R^2: 0.06800337265622568


In [38]:
acc_rfr3 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_6_input_nomrs, tsr_6_output, test_size=0.3, random_state=i)
    rfr3.fit(X_train,y_train)
    rfr3_predict = rfr3.predict(X_test)
    #print(rfr3_predict)
    rfr3_predict = np.round(rfr3_predict)
    #print(rfr3_predict)
    accuracy = (y_test == rfr3_predict).sum() / len(rfr3_predict)
    acc_rfr3.append(accuracy)

print("Accuracy of RF:", acc_rfr3)
print("Mean of Accuracy of RF:", sum(acc_rfr3)/10)

Accuracy of RF: [0.9325842696629213, 0.9085072231139647, 0.9237560192616372, 0.9093097913322632, 0.9269662921348315, 0.9365971107544141, 0.9261637239165329, 0.9213483146067416, 0.9245585874799358, 0.9093097913322632]
Mean of Accuracy of RF: 0.9219101123595503


In [39]:
rfr_pred3 = cross_val_predict(rfr3,tsr_6_input_nomrs,tsr_6_output,cv = 10)
rfr_pred3 = np.round(rfr_pred3)
confusion_matrix(tsr_6_output, rfr_pred3)

array([[2009,  207],
       [ 121, 1814]], dtype=int64)

## XGBoost

In [40]:
xgbr2 = XGBRegressor(n_estimators  = 15, objective="reg:squarederror", random_state = 19) 
xgbr_scores2 = cross_val_score(xgbr2,tsr_6_input,tsr_6_output,cv = 10, scoring='r2')
print(xgbr_scores2)
print("Mean of R^2:", xgbr_scores2.mean())
print("Std of R^2:", xgbr_scores2.std())

[0.99997719 0.99997704 0.99997708 0.99997658 0.99997729 0.99997701
 0.99997684 0.9999771  0.99997714 0.99997599]
Mean of R^2: 0.999976926155173
Std of R^2: 3.6484455271521257e-07


In [41]:
acc_xgbr2 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_6_input, tsr_6_output, test_size=0.3, random_state=i)
    xgbr2.fit(X_train,y_train)
    xgbr2_predict = xgbr2.predict(X_test)
    #print(xgbr2_predict)
    xgbr2_predict = np.round(xgbr2_predict)
    #print(xgbr2_predict)
    accuracy = (y_test == xgbr2_predict).sum() / len(xgbr2_predict)
    acc_xgbr2.append(accuracy)

print("Accuracy of RF:", acc_xgbr2)
print("Mean of Accuracy of RF:", sum(acc_xgbr2)/10)

Accuracy of RF: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
Mean of Accuracy of RF: 1.0


In [42]:
xgbr_pred2 = cross_val_predict(xgbr2,tsr_6_input,tsr_6_output,cv = 10)
xgbr_pred2 = np.round(xgbr_pred2)
confusion_matrix(tsr_6_output, xgbr_pred2)

array([[2216,    0],
       [   0, 1935]], dtype=int64)

In [43]:
xgbr3 = XGBRegressor(n_estimators  = 15, objective="reg:squarederror", random_state = 19) 
xgbr_scores3 = cross_val_score(xgbr3,tsr_6_input_nomrs,tsr_6_output,cv = 10, scoring='r2')
print(xgbr_scores3)
print("Mean of R^2:", xgbr_scores3.mean())
print("Std of R^2:", xgbr_scores3.std())

[0.66684919 0.7667298  0.81267796 0.76909761 0.77169132 0.7166649
 0.78748053 0.68365637 0.83661724 0.61305075]
Mean of R^2: 0.7424515666316938
Std of R^2: 0.06675556623211351


In [44]:
acc_xgbr3 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_6_input_nomrs, tsr_6_output, test_size=0.3, random_state=i)
    xgbr3.fit(X_train,y_train)
    xgbr3_predict = xgbr3.predict(X_test)
    #print(xgbr3_predict)
    xgbr3_predict = np.round(xgbr3_predict)
    #print(xgbr3_predict)
    accuracy = (y_test == xgbr3_predict).sum() / len(xgbr3_predict)
    acc_xgbr3.append(accuracy)

print("Accuracy of RF:", acc_xgbr3)
print("Mean of Accuracy of RF:", sum(acc_xgbr3)/10)

Accuracy of RF: [0.9269662921348315, 0.9101123595505618, 0.9229534510433387, 0.9077046548956661, 0.9189406099518459, 0.9333868378812199, 0.92776886035313, 0.9157303370786517, 0.9197431781701445, 0.9052969502407705]
Mean of Accuracy of RF: 0.9188603531300161


In [45]:
xgbr_pred3 = cross_val_predict(xgbr3,tsr_6_input_nomrs,tsr_6_output,cv = 10)
xgbr_pred3 = np.round(xgbr_pred3)
confusion_matrix(tsr_6_output, xgbr_pred3)

array([[2008,  208],
       [ 118, 1817]], dtype=int64)

# Summary

## Mean & Std

In [46]:
svr_mean = np.array([svr_scores.mean(), svr_scores.std(), svr_scores[0], svr_scores[1], svr_scores[2], svr_scores[3],
                     svr_scores[4], svr_scores[5], svr_scores[6], svr_scores[7], svr_scores[8], svr_scores[9]])
rfr_mean = np.array([rfr_scores.mean(), rfr_scores.std(), rfr_scores[0], rfr_scores[1], rfr_scores[2], rfr_scores[3],rfr_scores[4], 
                    rfr_scores[5], rfr_scores[6], rfr_scores[7], rfr_scores[8], rfr_scores[9]])
xgbr_mean = np.array([xgbr_scores.mean(), xgbr_scores.std(), xgbr_scores[0], xgbr_scores[1], xgbr_scores[2], xgbr_scores[3],
                     xgbr_scores[4], xgbr_scores[5], xgbr_scores[6], xgbr_scores[7], xgbr_scores[8], xgbr_scores[9]])
svr_mean2 = np.array([svr_scores2.mean(), svr_scores2.std(), svr_scores2[0], svr_scores2[1], svr_scores2[2], svr_scores2[3],
                     svr_scores2[4], svr_scores2[5], svr_scores2[6], svr_scores2[7], svr_scores2[8], svr_scores2[9]])
rfr_mean2 = np.array([rfr_scores2.mean(), rfr_scores2.std(), rfr_scores2[0], rfr_scores2[1], rfr_scores2[2], rfr_scores2[3],rfr_scores2[4], 
                    rfr_scores2[5], rfr_scores2[6], rfr_scores2[7], rfr_scores2[8], rfr_scores2[9]])
xgbr_mean2 = np.array([xgbr_scores2.mean(), xgbr_scores2.std(), xgbr_scores2[0], xgbr_scores2[1], xgbr_scores2[2], xgbr_scores2[3],
                     xgbr_scores2[4], xgbr_scores2[5], xgbr_scores2[6], xgbr_scores2[7], xgbr_scores2[8], xgbr_scores2[9]])

In [47]:
tsr_6_mean = pd.DataFrame([svr_mean, rfr_mean, xgbr_mean, svr_mean2, rfr_mean2, xgbr_mean2]).T
tsr_6_mean.index = ["Mean", "Std", "R^2_1", "R^2_2", "R^2_3", "R^2_4", "R^2_5", "R^2_6", "R^2_7", "R^2_8", "R^2_9", "R^2_10"]
tsr_6_mean.columns = ["svr", 'rfr', 'xgbr', 'svr2', 'rfr2', 'xgbr2']

In [48]:
csv_save = os.path.join(".", "tsr_6_mean_regression.csv")
tsr_6_mean.to_csv(csv_save, index = True)