# Phase II: Satellite Derived Bathymetry

Import required modules.

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV, train_test_split, cross_validate

from sklearn.ensemble import RandomForestRegressor as rf
from sklearn.ensemble import GradientBoostingRegressor as gb

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score as r2

from sklearn.preprocessing import StandardScaler

The following function computes the RMSE of a prediction `y_pred` against the actual values `y_test` by water depth: each bin covers `h` +/- `delta`, starting at the initial `h` and moving in jumps of `jump` until `maxh` is reached.

In [None]:
def rmseByDepth(y_test,y_pred,h = 2.5,delta = 2.5,jump = 5,maxh = 110):
    """Compute the RMSE of ``y_pred`` against ``y_test`` per water-depth bin.

    Bins are centred at ``h``, ``h + jump``, ``h + 2*jump``, ... for as long
    as the centre is below ``maxh``. A sample belongs to a bin when
    ``centre - delta <= y_test < centre + delta``.

    Parameters
    ----------
    y_test : array-like
        Actual values (NumPy array, list or pandas Series).
    y_pred : array-like
        Predicted values, in the same order and of the same length as ``y_test``.
    h : float
        Centre of the first bin.
    delta : float
        Half-width of every bin.
    jump : float
        Distance between consecutive bin centres.
    maxh : float
        Exclusive upper limit for the bin centres.

    Returns
    -------
    pandas.DataFrame
        Columns ``h`` (bin centre) and ``rmse``. Bins that contain no sample
        get ``NaN`` instead of raising.

    Raises
    ------
    ValueError
        If ``y_test`` and ``y_pred`` do not have the same number of elements.
    """
    # Work on plain 1-D arrays so that selection is positional even when a
    # pandas Series with a non-default index is passed in.
    actual = np.asarray(y_test, dtype = float).ravel()
    predicted = np.asarray(y_pred, dtype = float).ravel()
    if actual.shape != predicted.shape:
        raise ValueError("y_test and y_pred must have the same number of elements")

    hList = []
    rmseList = []

    while h < maxh:
        in_bin = (actual >= h - delta) & (actual < h + delta)
        if in_bin.any():
            # RMSE computed directly; avoids the `squared = False` argument of
            # sklearn's mean_squared_error, which newer releases removed.
            residuals = actual[in_bin] - predicted[in_bin]
            rmse = float(np.sqrt(np.mean(residuals ** 2)))
        else:
            # No observation at this depth: report a missing value.
            rmse = np.nan
        hList.append(h)
        rmseList.append(rmse)
        h = h + jump

    return pd.DataFrame({"h": hList, "rmse": rmseList})

Read the complete datasets for each study case.

In [None]:
# Load the complete dataset of every study case (A, B, C and G).
data_A, data_B, data_C, data_G = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/phase-II/complete-dataset/data_A.csv",
        "../data/phase-II/complete-dataset/data_B.csv",
        "../data/phase-II/complete-dataset/data_C.csv",
        "../data/phase-II/complete-dataset/data_G.csv",
    )
)

Extract the target values and input features of each dataset, then hold out 5% of each one as its test set.

In [None]:
# Column 0 is the target; the input features are every column from index 3 on.
y_A, X_A = data_A.iloc[:, 0], data_A.iloc[:, 3:]
y_B, X_B = data_B.iloc[:, 0], data_B.iloc[:, 3:]
y_C, X_C = data_C.iloc[:, 0], data_C.iloc[:, 3:]
y_G, X_G = data_G.iloc[:, 0], data_G.iloc[:, 3:]

# Hold out 5% of each dataset as test; the seed is fixed so the split is repeatable.
X_train_A, X_test_A, y_train_A, y_test_A = train_test_split(
    X_A, y_A, test_size=0.05, random_state=20
)
X_train_B, X_test_B, y_train_B, y_test_B = train_test_split(
    X_B, y_B, test_size=0.05, random_state=20
)
X_train_C, X_test_C, y_train_C, y_test_C = train_test_split(
    X_C, y_C, test_size=0.05, random_state=20
)
X_train_G, X_test_G, y_train_G, y_test_G = train_test_split(
    X_G, y_G, test_size=0.05, random_state=20
)

Gather all training datasets in one (including target values). Gather target values of the test set.

In [None]:
# Stack the per-dataset training splits (order A, B, C, G) into one training set.
X_train = pd.concat((X_train_A, X_train_B, X_train_C, X_train_G))
y_train = pd.concat((y_train_A, y_train_B, y_train_C, y_train_G))

# Global test targets as a plain array, in the same A, B, C, G order.
y_test = np.array(
    pd.concat((y_test_A, y_test_B, y_test_C, y_test_G))
)

Standardize data using the training dataset. Scale all other datasets. Test sets remain separated so each reservoir can be evaluated.

In [None]:
# Fit the scaler on the pooled training features only.
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)

# Apply the same scaling to each reservoir's test split ...
X_test_A, X_test_B, X_test_C, X_test_G = (
    scaler.transform(test_split)
    for test_split in (X_test_A, X_test_B, X_test_C, X_test_G)
)

# ... and to each complete dataset.
# NOTE: `transform` returns NumPy arrays by default, so from here on these
# variables no longer carry the DataFrame column labels.
X_A, X_B, X_C, X_G = (
    scaler.transform(full_set)
    for full_set in (X_A, X_B, X_C, X_G)
)

## Regression Models

### Linear Regression (LR)

10-fold cross-validate the linear regression model.

In [None]:
# 10-fold cross-validation of the linear model, scored with RMSE and R2.
cv_results = pd.DataFrame(
    cross_validate(
        LinearRegression(),
        X_train,
        y_train,
        cv=10,
        scoring=["neg_root_mean_squared_error", "r2"],
    )
)
# The scorer reports the negated RMSE; flip the sign to store positive values.
cv_results['test_neg_root_mean_squared_error'] *= -1
cv_results.to_csv("../data/phase-II/results/LR/cvresults-LR.csv", index=False)

Print RMSE.

In [None]:
# Mean and sample standard deviation of the per-fold RMSE.
rmse_mean, rmse_std = cv_results['test_neg_root_mean_squared_error'].agg(['mean', 'std'])
# The spread shown is the standard deviation divided by sqrt(10), the number of folds.
print("RMSE: %0.3f +/- %0.3f" % (rmse_mean, rmse_std / np.sqrt(10)))

Fit all training data.

In [None]:
# Final linear model, trained on the complete (scaled) training set.
reg = LinearRegression()
reg.fit(X_train, y_train)

Make predictions on the test set for each reservoir.

In [None]:
# Predict on each reservoir's held-out test split with the fitted linear model.
y_pred_A, y_pred_B, y_pred_C, y_pred_G = (
    reg.predict(test_features)
    for test_features in (X_test_A, X_test_B, X_test_C, X_test_G)
)

# Pair actual and predicted values per reservoir.
ys_A, ys_B, ys_C, ys_G = (
    pd.DataFrame({"y_test": actual, "y_pred": predicted})
    for actual, predicted in (
        (y_test_A, y_pred_A),
        (y_test_B, y_pred_B),
        (y_test_C, y_pred_C),
        (y_test_G, y_pred_G),
    )
)

# One CSV per reservoir.
for _pairs, _csv_path in (
    (ys_A, "../data/phase-II/results/LR/ytest-ypred-A-LR.csv"),
    (ys_B, "../data/phase-II/results/LR/ytest-ypred-B-LR.csv"),
    (ys_C, "../data/phase-II/results/LR/ytest-ypred-C-LR.csv"),
    (ys_G, "../data/phase-II/results/LR/ytest-ypred-G-LR.csv"),
):
    _pairs.to_csv(_csv_path, index=False)

Print RMSE and $R^2$ for each reservoir.

In [None]:
# Per-reservoir R2 and RMSE of the linear model on the held-out test splits.
# RMSE is taken as sqrt(MSE): the `squared = False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# while this form gives the same value on every version.
print("Alto-Lindoso = R2: %0.2f, RMSE %0.3f" % (r2(y_test_A,y_pred_A),np.sqrt(mse(y_test_A,y_pred_A))))
print("Bubal = R2: %0.2f, RMSE %0.3f" % (r2(y_test_B,y_pred_B),np.sqrt(mse(y_test_B,y_pred_B))))
print("Canelles = R2: %0.2f, RMSE %0.3f" % (r2(y_test_C,y_pred_C),np.sqrt(mse(y_test_C,y_pred_C))))
print("Grado = R2: %0.2f, RMSE %0.3f" % (r2(y_test_G,y_pred_G),np.sqrt(mse(y_test_G,y_pred_G))))

Print global results.

In [None]:
# Concatenate in the A, B, C, G order used to build the global `y_test`.
y_pred = np.concatenate((y_pred_A,y_pred_B,y_pred_C,y_pred_G))
# sqrt(MSE) replaces `squared = False`, which newer scikit-learn releases removed.
print("Total = R2: %0.2f, RMSE %0.3f" % (r2(y_test,y_pred),np.sqrt(mse(y_test,y_pred))))

Compute RMSE of the predicted values through water depth.

In [None]:
# RMSE of the global test predictions, binned by water depth.
rmseDepth = rmseByDepth(y_test = y_test,y_pred = y_pred)
rmseDepth.to_csv(
    "../data/phase-II/results/LR/depth-rmse-LR.csv",
    index = False,
)

Save global test and predicted values.

In [None]:
# Actual and predicted values of the pooled test set, side by side.
ys = pd.DataFrame(
    {
        "y_test": y_test,
        "y_pred": y_pred,
    }
)
ys.to_csv(
    "../data/phase-II/results/LR/ytest-ypred-GLOBAL-LR.csv",
    index = False,
)

Make predictions over all the dataset for each reservoir.

In [None]:
# Predict over every complete dataset with the linear model, store the result
# in the `z_pred` column of the corresponding frame and write it to disk.
for _frame, _features, _csv_path in (
    (data_A, X_A, "../data/phase-II/results/LR/data-all-A-LR.csv"),
    (data_B, X_B, "../data/phase-II/results/LR/data-all-B-LR.csv"),
    (data_C, X_C, "../data/phase-II/results/LR/data-all-C-LR.csv"),
    (data_G, X_G, "../data/phase-II/results/LR/data-all-G-LR.csv"),
):
    _frame['z_pred'] = reg.predict(_features)
    _frame.to_csv(_csv_path, index=False)

### Random Forest (RF)

#### Broader Tuning

First, the total estimators and the maximum features used for splitting are tuned in a broader approach. This tuning was 3-fold cross-validated.

In [None]:
# Broad grid: number of trees x number of features considered at each split.
# `1.0` (all features) replaces the former "auto" option, which meant the same
# for RandomForestRegressor but was removed in scikit-learn 1.3; the float is
# accepted by old and new releases alike. The value recorded in the
# `param_max_features` column of the results is therefore 1.0 instead of "auto".
parameters = {"n_estimators":[100,200,300,400,500],"max_features":[1.0,"sqrt"]}
# The forest is seeded (same seed as the rest of the notebook) so that the
# cross-validation scores are reproducible between runs.
rfReg = GridSearchCV(rf(random_state = 20),parameters,cv = 3,scoring = "neg_root_mean_squared_error")
rfReg.fit(X_train,y_train)

Save the results.

In [None]:
# Tabulate the grid-search results, leaving out the `params` column.
cv_results = pd.DataFrame(rfReg.cv_results_).drop(columns=['params'])
cv_results.to_csv(
    "../data/phase-II/results/RF/gridsearchcv-results-rf-broader.csv",
    index=False,
)

The results of this cross-validation were analyzed with ANOVA and LSD tests. Based on them, $\sqrt{n}$ was selected as the maximum number of features for splitting, together with 100 total estimators. A narrower tuning was then carried out starting from these results.

#### Narrower Tuning

In the narrower tuning only the total estimators were tuned. This tuning was 5-fold cross-validated.

In [None]:
# Narrow grid: 10 to 150 trees in steps of 10, with sqrt(n) features per split.
parameters = {"n_estimators":np.arange(10,151,10)}
# The forest is seeded (same seed as the rest of the notebook) so that the
# cross-validation scores are reproducible between runs.
rfReg = GridSearchCV(rf(max_features = "sqrt",random_state = 20),parameters,cv = 5,scoring = "neg_root_mean_squared_error")
rfReg.fit(X_train,y_train)

Save the results.

In [None]:
# Tabulate the grid-search results, leaving out the `params` column.
cv_results = pd.DataFrame(rfReg.cv_results_).drop(columns=['params'])
cv_results.to_csv(
    "../data/phase-II/results/RF/gridsearchcv-results-rf-narrower.csv",
    index=False,
)

Cross-validation results were analyzed in R. No significant differences were found, so 20 total estimators were selected, taking training time into account.

A Random Forest with the tuned parameters was fitted.

In [None]:
# Final Random Forest with the tuned parameters. It is seeded (same seed as the
# rest of the notebook) so that the saved importances and predictions can be
# reproduced on a fresh run.
bestRF = rf(max_features = "sqrt",n_estimators = 20,random_state = 20).fit(X_train,y_train)

Feature importances were saved.

In [None]:
# Feature names are read from `X_train_A`, which is still the DataFrame
# returned by the train/test split. `X_A` cannot be used here: it was replaced
# by the output of `scaler.transform`, a NumPy array without `.columns`.
fidf = pd.DataFrame({"feature":X_train_A.columns,"feature_importances":bestRF.feature_importances_})
fidf.to_csv("../data/phase-II/results/RF/feature-importances-rf.csv",index = False)

Make predictions on the test sets.

In [None]:
# Predict on each reservoir's held-out test split with the tuned Random Forest.
y_pred_A, y_pred_B, y_pred_C, y_pred_G = (
    bestRF.predict(test_features)
    for test_features in (X_test_A, X_test_B, X_test_C, X_test_G)
)

# Pair actual and predicted values per reservoir.
ys_A, ys_B, ys_C, ys_G = (
    pd.DataFrame({"y_test": actual, "y_pred": predicted})
    for actual, predicted in (
        (y_test_A, y_pred_A),
        (y_test_B, y_pred_B),
        (y_test_C, y_pred_C),
        (y_test_G, y_pred_G),
    )
)

# One CSV per reservoir.
for _pairs, _csv_path in (
    (ys_A, "../data/phase-II/results/RF/ytest-ypred-A-RF.csv"),
    (ys_B, "../data/phase-II/results/RF/ytest-ypred-B-RF.csv"),
    (ys_C, "../data/phase-II/results/RF/ytest-ypred-C-RF.csv"),
    (ys_G, "../data/phase-II/results/RF/ytest-ypred-G-RF.csv"),
):
    _pairs.to_csv(_csv_path, index=False)

Print results for each reservoir.

In [None]:
# Per-reservoir R2 and RMSE of the Random Forest on the held-out test splits.
# RMSE is taken as sqrt(MSE): the `squared = False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# while this form gives the same value on every version.
print("Alto-Lindoso = R2: %0.2f, RMSE %0.3f" % (r2(y_test_A,y_pred_A),np.sqrt(mse(y_test_A,y_pred_A))))
print("Bubal = R2: %0.2f, RMSE %0.3f" % (r2(y_test_B,y_pred_B),np.sqrt(mse(y_test_B,y_pred_B))))
print("Canelles = R2: %0.2f, RMSE %0.3f" % (r2(y_test_C,y_pred_C),np.sqrt(mse(y_test_C,y_pred_C))))
print("Grado = R2: %0.2f, RMSE %0.3f" % (r2(y_test_G,y_pred_G),np.sqrt(mse(y_test_G,y_pred_G))))

Print global results.

In [None]:
# Concatenate in the A, B, C, G order used to build the global `y_test`.
y_pred = np.concatenate((y_pred_A,y_pred_B,y_pred_C,y_pred_G))
# sqrt(MSE) replaces `squared = False`, which newer scikit-learn releases removed.
print("Total = R2: %0.2f, RMSE %0.3f" % (r2(y_test,y_pred),np.sqrt(mse(y_test,y_pred))))

Compute RMSE for water depths.

In [None]:
# RMSE of the global test predictions, binned by water depth.
rmseDepth = rmseByDepth(y_test = y_test,y_pred = y_pred)
rmseDepth.to_csv(
    "../data/phase-II/results/RF/depth-rmse-RF.csv",
    index = False,
)

Save global data (predicted and actual).

In [None]:
# Actual and predicted values of the pooled test set, side by side.
ys = pd.DataFrame(
    {
        "y_test": y_test,
        "y_pred": y_pred,
    }
)
ys.to_csv(
    "../data/phase-II/results/RF/ytest-ypred-GLOBAL-RF.csv",
    index = False,
)

Predict over the whole datasets.

In [None]:
# Predict over every complete dataset with the Random Forest, store the result
# in the `z_pred` column of the corresponding frame and write it to disk.
for _frame, _features, _csv_path in (
    (data_A, X_A, "../data/phase-II/results/RF/data-all-A-RF.csv"),
    (data_B, X_B, "../data/phase-II/results/RF/data-all-B-RF.csv"),
    (data_C, X_C, "../data/phase-II/results/RF/data-all-C-RF.csv"),
    (data_G, X_G, "../data/phase-II/results/RF/data-all-G-RF.csv"),
):
    _frame['z_pred'] = bestRF.predict(_features)
    _frame.to_csv(_csv_path, index=False)

### Gradient Boosting (GB)

The maximum depth of estimators was tuned.

In [None]:
# Base Gradient Boosting regressor: up to 500 stages, with early stopping
# configured through `validation_fraction`, `n_iter_no_change` and `tol`.
gbreg = gb(
    n_estimators=500,
    max_features="sqrt",
    validation_fraction=0.05,
    n_iter_no_change=10,
    random_state=20,
    tol=0.01,
)

# Candidate maximum depths: 10 to 100 in steps of 10.
parameters = {"max_depth": np.arange(10, 101, 10)}

# The search object keeps the name `rfReg` even though it wraps the GB model.
rfReg = GridSearchCV(
    gbreg,
    parameters,
    cv=5,
    scoring="neg_root_mean_squared_error",
)
rfReg.fit(X_train, y_train)

Save the results.

In [None]:
# Tabulate the grid-search results, leaving out the `params` column.
cv_results = pd.DataFrame(rfReg.cv_results_).drop(columns=['params'])
cv_results.to_csv(
    "../data/phase-II/results/GB/gridsearchcv-results-gb.csv",
    index=False,
)

The cross-validation results were analyzed in R. No significant differences were found. The maximum depth was therefore chosen as the one with the shortest time, 20 levels, which was also the one with the lowest RMSE.

In [None]:
# Take the Gradient Boosting model selected by the grid search held in `rfReg`
# (the search object keeps that name although it wraps the GB estimator).
bestGB = rfReg.best_estimator_

Save the feature importances.

In [None]:
# Feature names are read from `X_train_A`, which is still the DataFrame
# returned by the train/test split. `X_A` cannot be used here: it was replaced
# by the output of `scaler.transform`, a NumPy array without `.columns`.
fidf = pd.DataFrame({"feature":X_train_A.columns,"feature_importances":bestGB.feature_importances_})
fidf.to_csv("../data/phase-II/results/GB/feature-importances-gb.csv",index = False)

Save the decay of the loss function.

In [None]:
# One row per boosting stage: stage index (from 0) and its training score.
score_estimators = pd.DataFrame(
    {
        "estimator": np.arange(len(bestGB.train_score_)),
        "score": bestGB.train_score_,
    }
)
score_estimators.to_csv(
    "../data/phase-II/results/GB/score-estimators-gb.csv",
    index=False,
)

Make predictions on the test sets.

In [None]:
# Predict on each reservoir's held-out test split with the tuned Gradient Boosting model.
y_pred_A, y_pred_B, y_pred_C, y_pred_G = (
    bestGB.predict(test_features)
    for test_features in (X_test_A, X_test_B, X_test_C, X_test_G)
)

# Pair actual and predicted values per reservoir.
ys_A, ys_B, ys_C, ys_G = (
    pd.DataFrame({"y_test": actual, "y_pred": predicted})
    for actual, predicted in (
        (y_test_A, y_pred_A),
        (y_test_B, y_pred_B),
        (y_test_C, y_pred_C),
        (y_test_G, y_pred_G),
    )
)

# One CSV per reservoir.
for _pairs, _csv_path in (
    (ys_A, "../data/phase-II/results/GB/ytest-ypred-A-GB.csv"),
    (ys_B, "../data/phase-II/results/GB/ytest-ypred-B-GB.csv"),
    (ys_C, "../data/phase-II/results/GB/ytest-ypred-C-GB.csv"),
    (ys_G, "../data/phase-II/results/GB/ytest-ypred-G-GB.csv"),
):
    _pairs.to_csv(_csv_path, index=False)

Print results for each reservoir.

In [None]:
# Per-reservoir R2 and RMSE of Gradient Boosting on the held-out test splits.
# RMSE is taken as sqrt(MSE): the `squared = False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# while this form gives the same value on every version.
print("Alto-Lindoso = R2: %0.2f, RMSE %0.3f" % (r2(y_test_A,y_pred_A),np.sqrt(mse(y_test_A,y_pred_A))))
print("Bubal = R2: %0.2f, RMSE %0.3f" % (r2(y_test_B,y_pred_B),np.sqrt(mse(y_test_B,y_pred_B))))
print("Canelles = R2: %0.2f, RMSE %0.3f" % (r2(y_test_C,y_pred_C),np.sqrt(mse(y_test_C,y_pred_C))))
print("Grado = R2: %0.2f, RMSE %0.3f" % (r2(y_test_G,y_pred_G),np.sqrt(mse(y_test_G,y_pred_G))))

Print the global results.

In [None]:
# Concatenate in the A, B, C, G order used to build the global `y_test`.
y_pred = np.concatenate((y_pred_A,y_pred_B,y_pred_C,y_pred_G))
# sqrt(MSE) replaces `squared = False`, which newer scikit-learn releases removed.
print("Total = R2: %0.2f, RMSE %0.3f" % (r2(y_test,y_pred),np.sqrt(mse(y_test,y_pred))))

Compute RMSE by water depth.

In [None]:
# RMSE of the global test predictions, binned by water depth.
rmseDepth = rmseByDepth(y_test = y_test,y_pred = y_pred)
rmseDepth.to_csv(
    "../data/phase-II/results/GB/depth-rmse-GB.csv",
    index = False,
)

Save global predictions.

In [None]:
# Actual and predicted values of the pooled test set, side by side.
ys = pd.DataFrame(
    {
        "y_test": y_test,
        "y_pred": y_pred,
    }
)
ys.to_csv(
    "../data/phase-II/results/GB/ytest-ypred-GLOBAL-GB.csv",
    index = False,
)

Predict over the whole datasets.

In [None]:
# Predict over every complete dataset with Gradient Boosting, store the result
# in the `z_pred` column of the corresponding frame and write it to disk.
for _frame, _features, _csv_path in (
    (data_A, X_A, "../data/phase-II/results/GB/data-all-A-GB.csv"),
    (data_B, X_B, "../data/phase-II/results/GB/data-all-B-GB.csv"),
    (data_C, X_C, "../data/phase-II/results/GB/data-all-C-GB.csv"),
    (data_G, X_G, "../data/phase-II/results/GB/data-all-G-GB.csv"),
):
    _frame['z_pred'] = bestGB.predict(_features)
    _frame.to_csv(_csv_path, index=False)