## Version Description:
 In this version - we predict just the __CLR no. of holes__ (`R610_HS1, R611_HS1, R612_HS1, R613_HS1`)in TZ6 using XGB Regressor. For training the model we use the artificially synthesized data from FDDN.

 * The input features considered for the prediction are `CAO_LH_SumFlow`,`CAO_RH_SumFlow`,`LAO_LH_SumFlow`,`LAO_RH_SumFlow`,`MIXP`,`AMBP`,`AMBT`,`R600_HD` and the CAOR Restrictors (`R620` and `R621`).

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
from functools import reduce

#Set some numpy print options for displaying numpy arrays to fit maximum width of cell
np.set_printoptions(precision=3, edgeitems=30, linewidth=1000,formatter=dict(float=lambda x: "%.6g" % x)) 

# Disable Warnings for chained assignments Eg:Setting with Copy Warning
pd.options.mode.chained_assignment = None 

from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split,cross_val_score,KFold
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'svg' 
plt.style.use('seaborn-whitegrid')

In [4]:
# List of Available Style Sheets
# plt.style.available

### Loading Data

In [5]:
input_features = ['CAOLH_SumFlow','CAORH_SumFlow','LAOLH_SumFlow','LAORH_SumFlow','MIXP','AMBP','AMBT']
dependent_restrictors  = ['R600_HD','R620_HS1','R620_HS2','R620_HS3','R621_HS1','R621_HS2','R621_HS3']
output_variables = ['R610_HS1','R611_HS1','R612_HS1','R613_HS1']
usecols = ['HoV']+input_features+dependent_restrictors+output_variables

LTR_df = pd.read_csv('../data/TZ6_dataset.csv', usecols = usecols)[usecols]
HoVs_LTR = LTR_df["HoV"].tolist()
LTR_df.iloc[:,0:11]

Unnamed: 0,HoV,CAOLH_SumFlow,CAORH_SumFlow,LAOLH_SumFlow,LAORH_SumFlow,MIXP,AMBP,AMBT,R600_HD,R620_HS1,R620_HS2
0,A1,217.197005,222.443249,133.373448,136.693697,2600.0,101401.6,299.386667,148,75,38
1,A1,217.248929,223.698078,132.780168,135.684064,2600.0,101576.3,298.448667,149,75,38
2,A2,229.778639,226.83174,140.650844,143.718662,2606.1928,102136.6035,297.109024,152,75,38
3,A3,225.091032,227.788705,137.109226,142.122616,2599.8998,103195.6642,295.060027,154,75,38
4,A4,225.200989,222.557674,130.494552,135.055208,2600.0,102856.2,294.755833,148,75,38
5,A5,219.262425,224.116355,134.5927,136.976394,2600.0,101325.0,293.15,148,75,38
6,C1,223.440382,221.545157,131.58784,135.045592,2600.0,100650.4,294.146833,148,75,38
7,C2,234.831632,227.430207,140.009418,140.335213,2601.856275,100356.9848,295.465578,153,75,38
8,C3,233.883252,227.395643,138.322614,140.139693,2595.41505,100252.7728,297.643472,152,75,38
9,C4,223.948083,221.782661,131.34649,134.297502,2600.0,102484.2,295.5075,150,75,38


In [6]:
# Check for imbalances in the output variables by computing value_counts of the output_variables columns
LTR_df[output_variables].apply(pd.Series.value_counts)

Unnamed: 0,R610_HS1,R611_HS1,R612_HS1,R613_HS1
110,,,1.0,1.0
114,,,2.0,
118,1.0,,,
120,,,29.0,25.0
122,,1.0,,
125,1.0,,1.0,6.0
128,,,,1.0
130,,1.0,2.0,2.0
131,27.0,,,
135,2.0,,,


### Load Synthesized data from FDDN Simulator

In [7]:
df_final = pd.read_csv('../data/FDDN_dataset.csv', usecols = usecols)[usecols]
df_final

Unnamed: 0,HoV,CAOLH_SumFlow,CAORH_SumFlow,LAOLH_SumFlow,LAORH_SumFlow,MIXP,AMBP,AMBT,R600_HD,R620_HS1,R620_HS2,R620_HS3,R621_HS1,R621_HS2,R621_HS3,R610_HS1,R611_HS1,R612_HS1,R613_HS1
0,FDDN_ASD0,176.194306,183.838086,108.820955,107.380847,1800,102926.0,297.15,146,75,38,113,75,38,113,131,136,120,120
1,FDDN_ASD1,213.215038,222.451801,131.595030,129.860531,2600,102426.0,299.15,146,75,38,113,75,38,113,131,136,120,120
2,FDDN_ASD2,241.185680,251.630057,148.769313,146.805694,3400,103127.0,294.15,146,75,38,113,75,38,113,131,136,120,120
3,FDDN_ASD3,177.057588,184.967479,108.840240,107.394407,1800,101825.0,295.15,146,79,39,113,78,39,113,131,136,120,120
4,FDDN_ASD4,214.406259,223.974228,131.697294,129.953811,2600,100523.0,295.15,146,79,39,113,78,39,113,131,136,120,120
5,FDDN_ASD5,243.742634,254.602406,149.615493,147.646825,3400,101925.0,295.15,146,79,39,113,78,39,113,131,136,120,120
6,FDDN_ASD6,175.721146,178.678090,108.526406,107.090940,1800,103027.0,293.15,146,75,38,113,75,38,113,131,136,114,120
7,FDDN_ASD7,214.779029,218.389113,132.553728,130.810540,2600,100523.0,295.15,146,75,38,113,75,38,113,131,136,114,120
8,FDDN_ASD8,245.794632,249.927535,151.606684,149.594922,3400,101925.0,299.15,146,75,38,113,75,38,113,131,136,114,120
9,FDDN_ASD9,179.763606,182.960842,110.506254,109.038405,1800,100724.0,298.15,146,79,39,113,78,39,113,131,136,114,120


In [8]:
# Check for CLR delta in FDDN Simulation df
df_final[['HoV','R610_HS1','R611_HS1']].loc[(df_final['R611_HS1'] - df_final['R610_HS1']) < 0]

Unnamed: 0,HoV,R610_HS1,R611_HS1
54,FDDN_ASD54,145,140
55,FDDN_ASD55,145,140
56,FDDN_ASD56,145,140
57,FDDN_ASD57,145,140
58,FDDN_ASD58,145,140
59,FDDN_ASD59,145,140
126,FDDN_ASD126,145,140
127,FDDN_ASD127,145,140
128,FDDN_ASD128,145,140
129,FDDN_ASD129,145,140


In [9]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1080 entries, 0 to 1079
Data columns (total 19 columns):
HoV              1080 non-null object
CAOLH_SumFlow    1080 non-null float64
CAORH_SumFlow    1080 non-null float64
LAOLH_SumFlow    1080 non-null float64
LAORH_SumFlow    1080 non-null float64
MIXP             1080 non-null int64
AMBP             1080 non-null float64
AMBT             1080 non-null float64
R600_HD          1080 non-null int64
R620_HS1         1080 non-null int64
R620_HS2         1080 non-null int64
R620_HS3         1080 non-null int64
R621_HS1         1080 non-null int64
R621_HS2         1080 non-null int64
R621_HS3         1080 non-null int64
R610_HS1         1080 non-null int64
R611_HS1         1080 non-null int64
R612_HS1         1080 non-null int64
R613_HS1         1080 non-null int64
dtypes: float64(6), int64(12), object(1)
memory usage: 160.4+ KB


In [10]:
# Check for imbalances in the output variables by computing value_counts of the output_variables columns
df_final[output_variables].apply(pd.Series.value_counts)

Unnamed: 0,R610_HS1,R611_HS1,R612_HS1,R613_HS1
110,,,90.0,90.0
114,,,90.0,
118,90.0,,,
120,,,720.0,450.0
122,,90.0,,
125,90.0,,90.0,360.0
128,,,,90.0
130,,90.0,90.0,90.0
131,540.0,,,
135,90.0,,,


### Train and Test Data Split:

In [11]:
# Set Random Seed for Reproducability
seed = 27
np.random.seed(seed)

# Split dataframe into features (x) and target (y) variable
X,y = df_final[input_features+dependent_restrictors],df_final[output_variables].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=seed)

In [12]:
# Sanity Check - Shape of Train-Test Data
print("Train Input dataset contains {0} rows and {1} columns".format(X_train.shape[0], X_train.shape[1]))
print("Test Input dataset contains {0} rows and {1} columns".format(X_test.shape[0], X_test.shape[1]))
# Sanity Check - Shape of Train-Test Data
print("Train Output dataset contains {0} rows and {1} columns".format(y_train.shape[0], y_train.shape[1]))
print("Test Output dataset contains {0} rows and {1} columns".format(y_test.shape[0], y_test.shape[1]))

Train Input dataset contains 1026 rows and 14 columns
Test Input dataset contains 54 rows and 14 columns
Train Output dataset contains 1026 rows and 4 columns
Test Output dataset contains 54 rows and 4 columns


### XG Boost Model Architecture

In [13]:
params = {
    'objective': 'reg:linear',
    'silent': 1,
    'max_depth': 16,
    'learning_rate': 0.0075,    
    'n_estimators': 1000,
    'subsample': 0.5,
    'min_child_weight': 6,
    'gamma':0,    
    'colsample_bytree':0.8,
    'reg_alpha':0.005,    
    'n_jobs':4,
    'scale_pos_weight':1,
    'random_state':42
}

In [14]:
%%time
model = MultiOutputRegressor(XGBRegressor(**params)).fit(X_train, y_train)
predictions = model.predict(X_test)

Wall time: 10.4 s


In [15]:
# show the inputs and predicted outputs
Original_X = X_test.values
# for i in range(len(X_test)):
#     print("Original_X  = %s" % (Original_X[i]))
#     print("Predicted_Y = %s" % np.rint(predictions[i]))    
#     print("Actual_Y    = %s" % (y_test[i]),'\n')

In [16]:
# XG Boost model Performance
# Mean squared error with Inverse Scaled values
print("Mean squared error: %.2f"% mean_squared_error(y_test, predictions))
# RMSE
print("\nRoot Mean squared error: %.2f"% np.sqrt(mean_squared_error(y_test, predictions)))
# Explained variance score: 1 is perfect prediction
print('\nVariance score: %.2f' % r2_score(y_test, predictions))

Mean squared error: 19.09

Root Mean squared error: 4.37

Variance score: 0.38


In [17]:
# Mean squared error with Inverse Scaled values for all output variables individually
mse_output_variables = [round(mean_squared_error(y_test[i], predictions[i]),2) for i in range(len(output_variables))]
r2score_output_variables = [round(r2_score(y_test[i], predictions[i]),2) for i in range(len(output_variables))]
print("MSE for {}:".format(output_variables),mse_output_variables)
# RMSE
print("\nRMSE for {}:".format(output_variables), np.sqrt(mse_output_variables))
# Explained variance score: 1 is perfect prediction
print('\nR2 score for {}:'.format(output_variables), r2score_output_variables)

MSE for ['R610_HS1', 'R611_HS1', 'R612_HS1', 'R613_HS1']: [2.08, 0.78, 39.11, 14.24]

RMSE for ['R610_HS1', 'R611_HS1', 'R612_HS1', 'R613_HS1']: [1.44222 0.883176 6.2538 3.77359]

R2 score for ['R610_HS1', 'R611_HS1', 'R612_HS1', 'R613_HS1']: [0.96, 0.99, 0.7, 0.71]


### Cross Validation MSE Scores with actual outputs

In [18]:
def model_performance_df(X_test,y_test,test_df,model):
    test_data_idx = X_test.index.values.tolist()
    hovs_test_df = test_df['HoV'].iloc[test_data_idx]
    hov_testX = hovs_test_df.tolist()    

    
    # Make a prediction with the trained model
    predictions = model.predict(X_test)
    
    # Construct Multi-index dataframe
    sub_level_index_headers = ['Predicted','Actual','Delta_P-A']
    index_array = [hov_testX,sub_level_index_headers]
    index = pd.MultiIndex.from_product(index_array, names=['HoV', 'NNParameters'])

    # show the inputs and predicted outputs    
    data = []
    for i in range(len(X_test)):    
        preds  = (predictions[i])
        data.append(preds)
        actuals = (y_test[i])
        data.append(actuals)
        delta = preds - actuals
        data.append(delta)

    df_output_testdata = pd.DataFrame(data, index=index)
    df_output_testdata.columns = output_variables
    return df_output_testdata    

In [19]:
def pred_score(model,test_X,test_Y):
    '''
    Computes Performance score for all output variables collectively
    '''
    predicted_output  = model.predict(test_X)
    mse = mean_squared_error(test_Y, predicted_output)
    rmse = np.sqrt(mean_squared_error(test_Y, predicted_output))
    variance_score = r2_score(test_Y, predicted_output,multioutput='variance_weighted')
    return mse,rmse,variance_score

In [20]:
mse,rmse,variance_score = pred_score(model,X_test,y_test)
# Mean squared error with Inverse Scaled values
print("Mean squared error: %.2f"% mse)
# RMSE
print("\nRoot Mean squared error: %.2f"% rmse)
# Explained variance score: 1 is perfect prediction
print('\nVariance score: %.2f' % variance_score)

Mean squared error: 19.09

Root Mean squared error: 4.37

Variance score: 0.38


In [21]:
def pred_score_separate_outputvar(model,test_X,test_Y):
    '''
    Computes Performance score for all output variables
    '''
    predicted_output  = model.predict(test_X)
    mse = [round(mean_squared_error(test_Y[i], predicted_output[i]),2) for i in range(len(output_variables))]
    rmse = np.sqrt(mse)
    variance_score = [round(r2_score(test_Y[i], predicted_output[i]),2) for i in range(len(output_variables))]    
    return mse,rmse,variance_score

In [22]:
mse_ov, rmse_ov, r2score_ov = pred_score_separate_outputvar(model,X_test,y_test)
print("MSE for {}:".format(output_variables),mse_ov)
print("\nRMSE for {}:".format(output_variables), rmse_ov)
print('\nVariance score for {}:'.format(output_variables), r2score_ov)

MSE for ['R610_HS1', 'R611_HS1', 'R612_HS1', 'R613_HS1']: [2.08, 0.78, 39.11, 14.24]

RMSE for ['R610_HS1', 'R611_HS1', 'R612_HS1', 'R613_HS1']: [1.44222 0.883176 6.2538 3.77359]

Variance score for ['R610_HS1', 'R611_HS1', 'R612_HS1', 'R613_HS1']: [0.96, 0.99, 0.7, 0.71]


### Make Predictions with Original LTR Dataframe (Test Data)

In [23]:
# Contruct test dataframe of original LTR values
df_test = LTR_df[usecols].copy()
# Split dataframe into features (x) and target (y) variable
LTR_X,LTR_y = df_test.iloc[:,1:-len(output_variables)],df_test.iloc[:,-len(output_variables):].values

In [24]:
mse_ltr,rmse_ltr,variance_score_ltr = pred_score(model,LTR_X,LTR_y)
# Mean squared error with Inverse Scaled values
print("Mean squared error: %.2f"% mse_ltr)
# RMSE
print("\nRoot Mean squared error: %.2f"% rmse_ltr)
# Explained variance score: 1 is perfect prediction
print('\nVariance score: %.2f' % variance_score_ltr)

Mean squared error: 36.55

Root Mean squared error: 6.05

Variance score: -0.72


In [25]:
mse_ov, rmse_ov, r2score_ov = pred_score_separate_outputvar(model,LTR_X,LTR_y)
print("MSE for {}:".format(output_variables),mse_ov)
print("\nRMSE for {}:".format(output_variables), rmse_ov)
print('\nVariance score for {}:'.format(output_variables), r2score_ov)

MSE for ['R610_HS1', 'R611_HS1', 'R612_HS1', 'R613_HS1']: [18.87, 25.33, 15.1, 30.48]

RMSE for ['R610_HS1', 'R611_HS1', 'R612_HS1', 'R613_HS1']: [4.34396 5.03289 3.88587 5.52087]

Variance score for ['R610_HS1', 'R611_HS1', 'R612_HS1', 'R613_HS1']: [0.61, 0.48, 0.8, 0.37]


In [26]:
df_output_testdata = model_performance_df(LTR_X,LTR_y,df_test,model)
df_output_testdata

Unnamed: 0_level_0,Unnamed: 1_level_0,R610_HS1,R611_HS1,R612_HS1,R613_HS1
HoV,NNParameters,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A1,Predicted,133.897064,131.221466,114.172195,116.792427
A1,Actual,131.000000,136.000000,120.000000,120.000000
A1,Delta_P-A,2.897064,-4.778534,-5.827805,-3.207573
A1,Predicted,131.987717,129.204910,113.935097,115.828354
A1,Actual,131.000000,136.000000,120.000000,120.000000
A1,Delta_P-A,0.987717,-6.795090,-6.064903,-4.171646
A2,Predicted,138.539276,137.588593,114.984634,120.215508
A2,Actual,131.000000,136.000000,114.000000,120.000000
A2,Delta_P-A,7.539276,1.588593,0.984634,0.215508
A3,Predicted,131.413605,128.104523,114.082115,115.062790


In [27]:
# Check XGBoost prediction for Outlier point 'IBE01'
df_output_testdata.loc[(df_output_testdata.index.get_level_values('HoV') == 'IBE01')]

Unnamed: 0_level_0,Unnamed: 1_level_0,R610_HS1,R611_HS1,R612_HS1,R613_HS1
HoV,NNParameters,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [28]:
df_output_testdata.xs('Delta_P-A', axis=0, level=1, drop_level=False).sort_values(by = output_variables,ascending = False)

Unnamed: 0_level_0,Unnamed: 1_level_0,R610_HS1,R611_HS1,R612_HS1,R613_HS1
HoV,NNParameters,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
E2,Delta_P-A,13.59523,5.292343,2.231682,-1.377777
E1C,Delta_P-A,13.59523,5.292343,2.231682,-1.377777
Q1,Delta_P-A,13.322769,10.832428,4.975616,9.730919
A2,Delta_P-A,7.539276,1.588593,0.984634,0.215508
S3,Delta_P-A,5.690842,-3.607391,-2.179611,-3.117599
D2,Delta_P-A,5.648376,2.724075,-2.454796,3.571823
A5,Delta_P-A,5.44693,0.658661,-4.075653,0.527473
M1,Delta_P-A,5.049698,-2.73082,-0.347763,-1.267097
Q2,Delta_P-A,4.724716,-0.924011,-2.532341,-1.107422
S1,Delta_P-A,4.496475,-7.697418,-1.334938,-2.845551
