## Version Description:
 In this version - we predict just the __CLR no. of holes__ (`R610_HS1, R611_HS1, R612_HS1, R613_HS1`)in TZ6 using XGB Regressor.<br>For training the model we use the artificially synthesized data from **CycleGAN**.

 * The input features considered for the prediction are `CAO_LH_SumFlow`,`CAO_RH_SumFlow`,`LAO_LH_SumFlow`,`LAO_RH_SumFlow`,`MIXP`,`AMBP`,`AMBT`,`R600_HD` and the CAOR Restrictors (`R620` and `R621`).

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
from functools import reduce

#Set some numpy print options for displaying numpy arrays to fit maximum width of cell
np.set_printoptions(precision=3, edgeitems=30, linewidth=1000,formatter=dict(float=lambda x: "%.6g" % x)) 

# Disable Warnings for chained assignments Eg:Setting with Copy Warning
pd.options.mode.chained_assignment = None 

from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split,cross_val_score,KFold
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'svg' 
plt.style.use('seaborn-whitegrid')

In [4]:
# List of Available Style Sheets
# plt.style.available

### Loading Data

In [5]:
input_features = ['CAOLH_SumFlow','CAORH_SumFlow','LAOLH_SumFlow','LAORH_SumFlow','MIXP','AMBP','AMBT']
dependent_restrictors  = ['R600_HD','R620_HS1','R620_HS2','R620_HS3','R621_HS1','R621_HS2','R621_HS3']
output_variables = ['R610_HS1','R611_HS1','R612_HS1','R613_HS1']
usecols = ['HoV']+input_features+dependent_restrictors+output_variables

LTR_df = pd.read_csv('../data/TZ6_dataset.csv', usecols = usecols)[usecols]
HoVs_LTR = LTR_df["HoV"].tolist()
LTR_df.iloc[:,0:11]

Unnamed: 0,HoV,CAOLH_SumFlow,CAORH_SumFlow,LAOLH_SumFlow,LAORH_SumFlow,MIXP,AMBP,AMBT,R600_HD,R620_HS1,R620_HS2
0,A1,217.197005,222.443249,133.373448,136.693697,2600.0,101401.6,299.386667,148,75,38
1,A1,217.248929,223.698078,132.780168,135.684064,2600.0,101576.3,298.448667,149,75,38
2,A2,229.778639,226.83174,140.650844,143.718662,2606.1928,102136.6035,297.109024,152,75,38
3,A3,225.091032,227.788705,137.109226,142.122616,2599.8998,103195.6642,295.060027,154,75,38
4,A4,225.200989,222.557674,130.494552,135.055208,2600.0,102856.2,294.755833,148,75,38
5,A5,219.262425,224.116355,134.5927,136.976394,2600.0,101325.0,293.15,148,75,38
6,C1,223.440382,221.545157,131.58784,135.045592,2600.0,100650.4,294.146833,148,75,38
7,C2,234.831632,227.430207,140.009418,140.335213,2601.856275,100356.9848,295.465578,153,75,38
8,C3,233.883252,227.395643,138.322614,140.139693,2595.41505,100252.7728,297.643472,152,75,38
9,C4,223.948083,221.782661,131.34649,134.297502,2600.0,102484.2,295.5075,150,75,38


In [6]:
# Check for imbalances in the output variables by computing value_counts of the output_variables columns
LTR_df[output_variables].apply(pd.Series.value_counts)

Unnamed: 0,R610_HS1,R611_HS1,R612_HS1,R613_HS1
110,,,1.0,1.0
114,,,2.0,
118,1.0,,,
120,,,29.0,25.0
122,,1.0,,
125,1.0,,1.0,6.0
128,,,,1.0
130,,1.0,2.0,2.0
131,27.0,,,
135,2.0,,,


### Load Synthesized data from FDDN Simulator

In [7]:
df_final = pd.read_csv('../data/CycleGAN_ASD_dataset.csv', usecols = usecols)[usecols]
df_final

Unnamed: 0,HoV,CAOLH_SumFlow,CAORH_SumFlow,LAOLH_SumFlow,LAORH_SumFlow,MIXP,AMBP,AMBT,R600_HD,R620_HS1,R620_HS2,R620_HS3,R621_HS1,R621_HS2,R621_HS3,R610_HS1,R611_HS1,R612_HS1,R613_HS1
0,CGAN_ASD0,172.43266,176.38574,103.862540,106.087960,1560.3193,102388.140,298.07788,149.80307,75.123180,38.086280,112.976776,75.170000,38.056010,113.009950,130.361510,135.15912,118.263160,120.482760
1,CGAN_ASD1,212.15857,212.36919,128.104030,128.255190,2496.9760,101141.750,300.36334,148.41788,74.724410,37.967747,113.048640,74.779300,37.932343,112.979080,131.645690,137.05370,119.102250,122.240524
2,CGAN_ASD2,243.89989,242.00603,144.826000,147.998370,3138.4011,102511.680,299.79680,148.59084,74.839710,38.027250,113.004220,74.970450,37.973133,112.997650,130.198880,135.35751,118.725390,120.947920
3,CGAN_ASD3,184.84874,188.79890,109.596040,111.989660,1825.4059,101617.210,294.93200,148.23580,78.969170,38.989030,113.039140,77.997180,38.979897,112.989914,128.976530,135.18220,115.578310,117.512146
4,CGAN_ASD4,217.64412,219.52258,128.878050,130.810470,2482.8599,101265.370,294.14136,147.34987,78.964325,39.014550,113.012245,77.968090,38.979030,112.995720,130.301820,136.83356,116.797450,118.667310
5,CGAN_ASD5,251.04100,250.43329,148.325580,151.173930,3176.0864,101960.470,295.03772,147.91151,78.896835,39.037140,112.983740,77.935670,38.982727,113.001630,129.785250,135.64902,117.118820,118.475350
6,CGAN_ASD6,171.93361,177.85432,103.661630,105.220085,1512.0090,102519.700,295.82750,151.37503,74.591430,37.912300,113.004960,74.844574,37.905586,112.995020,126.665020,133.24149,114.704216,117.727200
7,CGAN_ASD7,208.26662,207.62265,124.975560,125.002710,2277.9854,102562.050,297.38223,149.85650,74.821840,37.966410,113.030426,74.922940,37.956837,112.987680,128.523330,133.71625,116.290440,118.883520
8,CGAN_ASD8,242.16054,246.80998,146.976780,147.471000,3149.1096,100634.880,300.76904,149.08975,74.778244,38.012300,113.059525,74.784090,37.924343,112.955400,130.763080,137.07616,119.001755,122.036440
9,CGAN_ASD9,179.85982,186.22223,108.431076,109.967064,1736.0009,101167.840,296.18823,149.20688,79.016140,39.002840,113.046740,78.012900,38.985690,112.973976,128.878650,134.65224,115.895690,117.926030


In [8]:
# Check for CLR delta in FDDN Simulation df
df_final[['HoV','R610_HS1','R611_HS1']].loc[(df_final['R611_HS1'] - df_final['R610_HS1']) < 0]

Unnamed: 0,HoV,R610_HS1,R611_HS1
206,CGAN_ASD206,115.741425,114.92687
349,CGAN_ASD349,116.563866,116.21398
350,CGAN_ASD350,115.16333,114.438225
492,CGAN_ASD492,116.73119,116.18213
780,CGAN_ASD780,118.38567,118.20043
852,CGAN_ASD852,120.9994,120.48966
925,CGAN_ASD925,116.70551,115.46706
996,CGAN_ASD996,119.223785,117.35658


In [9]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1080 entries, 0 to 1079
Data columns (total 19 columns):
HoV              1080 non-null object
CAOLH_SumFlow    1080 non-null float64
CAORH_SumFlow    1080 non-null float64
LAOLH_SumFlow    1080 non-null float64
LAORH_SumFlow    1080 non-null float64
MIXP             1080 non-null float64
AMBP             1080 non-null float64
AMBT             1080 non-null float64
R600_HD          1080 non-null float64
R620_HS1         1080 non-null float64
R620_HS2         1080 non-null float64
R620_HS3         1080 non-null float64
R621_HS1         1080 non-null float64
R621_HS2         1080 non-null float64
R621_HS3         1080 non-null float64
R610_HS1         1080 non-null float64
R611_HS1         1080 non-null float64
R612_HS1         1080 non-null float64
R613_HS1         1080 non-null float64
dtypes: float64(18), object(1)
memory usage: 160.4+ KB


In [10]:
# Check for imbalances in the output variables by computing value_counts of the output_variables columns
df_final[output_variables].apply(pd.Series.value_counts)

Unnamed: 0,R610_HS1,R611_HS1,R612_HS1,R613_HS1
107.119910,,,1.0,
107.815950,,,1.0,
108.019170,,,1.0,
108.121070,,,1.0,
108.324770,,,1.0,
108.333220,,,1.0,
108.417430,,,1.0,
108.419580,,,1.0,
108.512070,,,1.0,
108.521740,,,1.0,


### Train and Test Data Split:

In [11]:
# Set Random Seed for Reproducability
seed = 27
np.random.seed(seed)

# Split dataframe into features (x) and target (y) variable
X,y = df_final[input_features+dependent_restrictors],df_final[output_variables].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=seed)

In [12]:
# Sanity Check - Shape of Train-Test Data
print("Train Input dataset contains {0} rows and {1} columns".format(X_train.shape[0], X_train.shape[1]))
print("Test Input dataset contains {0} rows and {1} columns".format(X_test.shape[0], X_test.shape[1]))
# Sanity Check - Shape of Train-Test Data
print("Train Output dataset contains {0} rows and {1} columns".format(y_train.shape[0], y_train.shape[1]))
print("Test Output dataset contains {0} rows and {1} columns".format(y_test.shape[0], y_test.shape[1]))

Train Input dataset contains 1026 rows and 14 columns
Test Input dataset contains 54 rows and 14 columns
Train Output dataset contains 1026 rows and 4 columns
Test Output dataset contains 54 rows and 4 columns


### XG Boost Model Architecture

In [13]:
params = {
    'objective': 'reg:linear',
    'silent': 1,
    'max_depth': 16,
    'learning_rate': 0.0075,    
    'n_estimators': 1000,
    'subsample': 0.5,
    'min_child_weight': 6,
    'gamma':0,    
    'colsample_bytree':0.8,
    'reg_alpha':0.005,    
    'n_jobs':4,
    'scale_pos_weight':1,
    'random_state':42
}

In [14]:
%%time
model = MultiOutputRegressor(XGBRegressor(**params)).fit(X_train, y_train)
predictions = model.predict(X_test)

Wall time: 8.11 s


In [15]:
# show the inputs and predicted outputs
Original_X = X_test.values
# for i in range(len(X_test)):
#     print("Original_X  = %s" % (Original_X[i]))
#     print("Predicted_Y = %s" % np.rint(predictions[i]))    
#     print("Actual_Y    = %s" % (y_test[i]),'\n')

In [16]:
# XG Boost model Performance
# Mean squared error with Inverse Scaled values
print("Mean squared error: %.2f"% mean_squared_error(y_test, predictions))
# RMSE
print("\nRoot Mean squared error: %.2f"% np.sqrt(mean_squared_error(y_test, predictions)))
# Explained variance score: 1 is perfect prediction
print('\nVariance score: %.2f' % r2_score(y_test, predictions))

Mean squared error: 7.99

Root Mean squared error: 2.83

Variance score: 0.72


In [17]:
# Mean squared error with Inverse Scaled values for all output variables individually
mse_output_variables = [round(mean_squared_error(y_test[i], predictions[i]),2) for i in range(len(output_variables))]
r2score_output_variables = [round(r2_score(y_test[i], predictions[i]),2) for i in range(len(output_variables))]
print("MSE for {}:".format(output_variables),mse_output_variables)
# RMSE
print("\nRMSE for {}:".format(output_variables), np.sqrt(mse_output_variables))
# Explained variance score: 1 is perfect prediction
print('\nR2 score for {}:'.format(output_variables), r2score_output_variables)

MSE for ['R610_HS1', 'R611_HS1', 'R612_HS1', 'R613_HS1']: [0.08, 0.04, 0.14, 8.14]

RMSE for ['R610_HS1', 'R611_HS1', 'R612_HS1', 'R613_HS1']: [0.282843 0.2 0.374166 2.85307]

R2 score for ['R610_HS1', 'R611_HS1', 'R612_HS1', 'R613_HS1']: [1.0, 1.0, 1.0, 0.75]


### Cross Validation MSE Scores with actual outputs

In [18]:
def model_performance_df(X_test,y_test,test_df,model):
    test_data_idx = X_test.index.values.tolist()
    hovs_test_df = test_df['HoV'].iloc[test_data_idx]
    hov_testX = hovs_test_df.tolist()    

    
    # Make a prediction with the trained model
    predictions = model.predict(X_test)
    
    # Construct Multi-index dataframe
    sub_level_index_headers = ['Predicted','Actual','Delta_P-A']
    index_array = [hov_testX,sub_level_index_headers]
    index = pd.MultiIndex.from_product(index_array, names=['HoV', 'NNParameters'])

    # show the inputs and predicted outputs    
    data = []
    for i in range(len(X_test)):    
        preds  = (predictions[i])
        data.append(preds)
        actuals = (y_test[i])
        data.append(actuals)
        delta = preds - actuals
        data.append(delta)

    df_output_testdata = pd.DataFrame(data, index=index)
    df_output_testdata.columns = output_variables
    return df_output_testdata    

In [19]:
def pred_score(model,test_X,test_Y):
    '''
    Computes Performance score for all output variables collectively
    '''
    predicted_output  = model.predict(test_X)
    mse = mean_squared_error(test_Y, predicted_output)
    rmse = np.sqrt(mean_squared_error(test_Y, predicted_output))
    variance_score = r2_score(test_Y, predicted_output,multioutput='variance_weighted')
    return mse,rmse,variance_score

In [20]:
mse,rmse,variance_score = pred_score(model,X_test,y_test)
# Mean squared error with Inverse Scaled values
print("Mean squared error: %.2f"% mse)
# RMSE
print("\nRoot Mean squared error: %.2f"% rmse)
# Explained variance score: 1 is perfect prediction
print('\nVariance score: %.2f' % variance_score)

Mean squared error: 7.99

Root Mean squared error: 2.83

Variance score: 0.72


In [21]:
def pred_score_separate_outputvar(model,test_X,test_Y):
    '''
    Computes Performance score for all output variables
    '''
    predicted_output  = model.predict(test_X)
    mse = [round(mean_squared_error(test_Y[i], predicted_output[i]),2) for i in range(len(output_variables))]
    rmse = np.sqrt(mse)
    variance_score = [round(r2_score(test_Y[i], predicted_output[i]),2) for i in range(len(output_variables))]    
    return mse,rmse,variance_score

In [22]:
mse_ov, rmse_ov, r2score_ov = pred_score_separate_outputvar(model,X_test,y_test)
print("MSE for {}:".format(output_variables),mse_ov)
print("\nRMSE for {}:".format(output_variables), rmse_ov)
print('\nVariance score for {}:'.format(output_variables), r2score_ov)

MSE for ['R610_HS1', 'R611_HS1', 'R612_HS1', 'R613_HS1']: [0.08, 0.04, 0.14, 8.14]

RMSE for ['R610_HS1', 'R611_HS1', 'R612_HS1', 'R613_HS1']: [0.282843 0.2 0.374166 2.85307]

Variance score for ['R610_HS1', 'R611_HS1', 'R612_HS1', 'R613_HS1']: [1.0, 1.0, 1.0, 0.75]


### Make Predictions with Original LTR Dataframe (Test Data)

In [23]:
# Contruct test dataframe of original LTR values
df_test = LTR_df[usecols].copy()
# Split dataframe into features (x) and target (y) variable
LTR_X,LTR_y = df_test.iloc[:,1:-len(output_variables)],df_test.iloc[:,-len(output_variables):].values

In [24]:
mse_ltr,rmse_ltr,variance_score_ltr = pred_score(model,LTR_X,LTR_y)
# Mean squared error with Inverse Scaled values
print("Mean squared error: %.2f"% mse_ltr)
# RMSE
print("\nRoot Mean squared error: %.2f"% rmse_ltr)
# Explained variance score: 1 is perfect prediction
print('\nVariance score: %.2f' % variance_score_ltr)

Mean squared error: 25.82

Root Mean squared error: 5.08

Variance score: -0.22


In [25]:
mse_ov, rmse_ov, r2score_ov = pred_score_separate_outputvar(model,LTR_X,LTR_y)
print("MSE for {}:".format(output_variables),mse_ov)
print("\nRMSE for {}:".format(output_variables), rmse_ov)
print('\nVariance score for {}:'.format(output_variables), r2score_ov)

MSE for ['R610_HS1', 'R611_HS1', 'R612_HS1', 'R613_HS1']: [5.1, 1.48, 3.76, 7.16]

RMSE for ['R610_HS1', 'R611_HS1', 'R612_HS1', 'R613_HS1']: [2.25832 1.21655 1.93907 2.67582]

Variance score for ['R610_HS1', 'R611_HS1', 'R612_HS1', 'R613_HS1']: [0.9, 0.97, 0.95, 0.85]


In [26]:
df_output_testdata = model_performance_df(LTR_X,LTR_y,df_test,model)
df_output_testdata

Unnamed: 0_level_0,Unnamed: 1_level_0,R610_HS1,R611_HS1,R612_HS1,R613_HS1
HoV,NNParameters,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A1,Predicted,132.836304,138.814743,119.993660,123.018570
A1,Actual,131.000000,136.000000,120.000000,120.000000
A1,Delta_P-A,1.836304,2.814743,-0.006340,3.018570
A1,Predicted,130.573349,136.227127,118.101334,121.441917
A1,Actual,131.000000,136.000000,120.000000,120.000000
A1,Delta_P-A,-0.426651,0.227127,-1.898666,1.441917
A2,Predicted,129.185471,135.636810,117.365921,120.522469
A2,Actual,131.000000,136.000000,114.000000,120.000000
A2,Delta_P-A,-1.814529,-0.363190,3.365921,0.522469
A3,Predicted,128.238220,133.955719,115.969185,119.241936


In [27]:
df_output_testdata.xs('Delta_P-A', axis=0, level=1, drop_level=False).sort_values(by = output_variables,ascending = False)

Unnamed: 0_level_0,Unnamed: 1_level_0,R610_HS1,R611_HS1,R612_HS1,R613_HS1
HoV,NNParameters,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Q1,Delta_P-A,14.278992,16.030731,9.489769,12.263084
D2,Delta_P-A,5.412857,6.80957,-2.545731,-0.514786
T1,Delta_P-A,2.907852,3.493759,0.419006,3.05545
F1,Delta_P-A,2.471039,3.436462,0.275124,2.60141
A1,Delta_P-A,1.836304,2.814743,-0.00634,3.01857
Q2,Delta_P-A,1.783463,2.63855,0.006897,2.576347
S2,Delta_P-A,1.621613,2.694641,-0.214386,2.275589
E2,Delta_P-A,1.197266,3.398819,-0.58709,1.820518
E1C,Delta_P-A,1.197266,3.398819,-0.58709,1.820518
F2,Delta_P-A,0.851059,2.543884,-1.390282,1.242226
