# Import libraries

In [1]:
import numpy as np
import pandas as pd
from statistics import mean, stdev
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm

# Data preprocessing

### Read data from file

In [2]:
# Load the workbook into a DataFrame; the first spreadsheet row holds the column names
file_name = 'Harvesting data'
data = pd.read_excel(file_name + '.xlsx', header=0)

# Report how many rows and columns were read (DataFrame.shape is (rows, columns))
print("{0} rows and {1} columns".format(*data.shape))
print("")

152 rows and 7 columns



### Initialization

In [3]:
# Split the table into predictors (X) and the regression target (Y).
# `columns=` already selects the axis, so no extra `axis` argument is needed.
X = data.drop(columns=['Water_volume'])
Y = data.Water_volume

# Bin the continuous target into 4 equal-width intervals so it can be passed
# to train_test_split(stratify=...). The +0.1 on the upper edge keeps the
# maximum value inside the last bin instead of opening a fifth bin on its own.
bins = np.linspace(Y.min(), Y.max() + 0.1, 5)
y_binned = np.digitize(Y, bins)

# Hyper-parameter grid explored by the search (3 x 3 x 3 = 27 combinations).
params = {'n_estimators' : [100, 150, 200],
          'learning_rate' : [0.05, 0.075, 0.1],
          'max_depth' : [2, 4, 6]}

# Fix the estimator seed: without it, ties between equally good splits are
# broken randomly and the search results change between kernel restarts.
gbr = GradientBoostingRegressor(random_state=0)

# 5-fold cross-validated grid search scored by R^2.
gs = GridSearchCV(estimator=gbr, param_grid=params, scoring='r2', cv=5)

# Grid-search optimization

In [4]:
# Run the grid search on 50 different stratified 80/20 splits (seeds 0..49)
# and keep the winning score/parameters of every run for later aggregation.
search_records = []
for i in tqdm(range(50)):
    data_train, data_test = train_test_split(data, test_size=0.2,
                                             stratify=y_binned, random_state=i)

    # Manual min-max scaling (keeps the DataFrame column names). The statistics
    # come from the training split only and are then applied to the test split.
    minval = data_train.min()
    minmax = data_train.max() - minval
    # A column that is constant within the training split has zero range;
    # divide by 1 instead so it scales to 0 rather than NaN/inf.
    minmax = minmax.replace(0, 1)
    data_train_scaled = (data_train - minval) / minmax
    data_test_scaled = (data_test - minval) / minmax

    # Define X and Y
    X_train = data_train_scaled.drop(columns=['Water_volume'])
    Y_train = data_train_scaled.Water_volume
    X_test = data_test_scaled.drop(columns=['Water_volume'])
    Y_test = data_test_scaled.Water_volume

    # Fit the grid search on the training split only
    grid_result = gs.fit(X_train, Y_train)

    print('Best score = {:.4f} using {}'.format(gs.best_score_,
                                                gs.best_params_))

    # Printing alone loses the result on the next iteration; store it as well.
    search_records.append({'random_state': i,
                           'best_score': gs.best_score_,
                           **gs.best_params_})

# One row per split: seed, best cross-validated R^2 and the selected parameters.
search_summary = pd.DataFrame(search_records)

  2%|▏         | 1/50 [00:29<24:04, 29.48s/it]

Best score = 0.7687 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


  4%|▍         | 2/50 [00:39<14:14, 17.80s/it]

Best score = 0.8367 using {'learning_rate': 0.075, 'max_depth': 2, 'n_estimators': 200}


  6%|▌         | 3/50 [00:48<10:58, 14.02s/it]

Best score = 0.8400 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 150}


  8%|▊         | 4/50 [00:57<09:07, 11.90s/it]

Best score = 0.8349 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


 10%|█         | 5/50 [01:05<07:59, 10.66s/it]

Best score = 0.8191 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


 12%|█▏        | 6/50 [01:14<07:17,  9.95s/it]

Best score = 0.7883 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 150}


 14%|█▍        | 7/50 [01:22<06:47,  9.47s/it]

Best score = 0.8255 using {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 200}


 16%|█▌        | 8/50 [01:31<06:25,  9.18s/it]

Best score = 0.7912 using {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 200}


 18%|█▊        | 9/50 [01:39<06:08,  8.98s/it]

Best score = 0.7623 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


 20%|██        | 10/50 [01:48<05:53,  8.84s/it]

Best score = 0.8425 using {'learning_rate': 0.075, 'max_depth': 2, 'n_estimators': 150}


 22%|██▏       | 11/50 [01:57<05:41,  8.77s/it]

Best score = 0.8505 using {'learning_rate': 0.075, 'max_depth': 2, 'n_estimators': 150}


 24%|██▍       | 12/50 [02:06<05:39,  8.93s/it]

Best score = 0.8308 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


 26%|██▌       | 13/50 [02:16<05:47,  9.38s/it]

Best score = 0.8345 using {'learning_rate': 0.075, 'max_depth': 2, 'n_estimators': 100}


 28%|██▊       | 14/50 [02:27<05:57,  9.93s/it]

Best score = 0.8124 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


 30%|███       | 15/50 [02:36<05:31,  9.47s/it]

Best score = 0.8460 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 150}


 32%|███▏      | 16/50 [02:44<05:09,  9.11s/it]

Best score = 0.8385 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 150}


 34%|███▍      | 17/50 [02:53<04:54,  8.92s/it]

Best score = 0.8231 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


 36%|███▌      | 18/50 [03:01<04:40,  8.78s/it]

Best score = 0.8092 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 100}


 38%|███▊      | 19/50 [03:10<04:33,  8.82s/it]

Best score = 0.8198 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


 40%|████      | 20/50 [03:19<04:28,  8.96s/it]

Best score = 0.8021 using {'learning_rate': 0.075, 'max_depth': 2, 'n_estimators': 100}


 42%|████▏     | 21/50 [03:28<04:20,  8.99s/it]

Best score = 0.8315 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


 44%|████▍     | 22/50 [03:37<04:10,  8.94s/it]

Best score = 0.8151 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 150}


 46%|████▌     | 23/50 [03:46<03:57,  8.80s/it]

Best score = 0.8484 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


 48%|████▊     | 24/50 [03:55<03:51,  8.91s/it]

Best score = 0.8244 using {'learning_rate': 0.075, 'max_depth': 2, 'n_estimators': 150}


 50%|█████     | 25/50 [04:03<03:39,  8.78s/it]

Best score = 0.8019 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 150}


 52%|█████▏    | 26/50 [04:11<03:25,  8.57s/it]

Best score = 0.8062 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 150}


 54%|█████▍    | 27/50 [04:19<03:14,  8.45s/it]

Best score = 0.8286 using {'learning_rate': 0.075, 'max_depth': 2, 'n_estimators': 200}


 56%|█████▌    | 28/50 [04:28<03:04,  8.38s/it]

Best score = 0.7936 using {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 200}


 58%|█████▊    | 29/50 [04:36<02:54,  8.30s/it]

Best score = 0.8306 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


 60%|██████    | 30/50 [04:44<02:45,  8.28s/it]

Best score = 0.7988 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


 62%|██████▏   | 31/50 [04:52<02:37,  8.27s/it]

Best score = 0.8290 using {'learning_rate': 0.075, 'max_depth': 2, 'n_estimators': 200}


 64%|██████▍   | 32/50 [05:00<02:27,  8.20s/it]

Best score = 0.7954 using {'learning_rate': 0.075, 'max_depth': 2, 'n_estimators': 200}


 66%|██████▌   | 33/50 [05:09<02:19,  8.23s/it]

Best score = 0.8070 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 150}


 68%|██████▊   | 34/50 [05:17<02:12,  8.28s/it]

Best score = 0.8093 using {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 200}


 70%|███████   | 35/50 [05:25<02:04,  8.32s/it]

Best score = 0.8307 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


 72%|███████▏  | 36/50 [05:34<01:57,  8.39s/it]

Best score = 0.8028 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


 74%|███████▍  | 37/50 [05:42<01:48,  8.38s/it]

Best score = 0.8137 using {'learning_rate': 0.075, 'max_depth': 2, 'n_estimators': 200}


 76%|███████▌  | 38/50 [05:51<01:40,  8.38s/it]

Best score = 0.8012 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


 78%|███████▊  | 39/50 [05:59<01:32,  8.43s/it]

Best score = 0.7676 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


 80%|████████  | 40/50 [06:08<01:25,  8.52s/it]

Best score = 0.7788 using {'learning_rate': 0.075, 'max_depth': 2, 'n_estimators': 200}


 82%|████████▏ | 41/50 [06:17<01:18,  8.73s/it]

Best score = 0.8330 using {'learning_rate': 0.075, 'max_depth': 2, 'n_estimators': 200}


 84%|████████▍ | 42/50 [06:26<01:08,  8.60s/it]

Best score = 0.8241 using {'learning_rate': 0.075, 'max_depth': 2, 'n_estimators': 200}


 86%|████████▌ | 43/50 [06:34<00:59,  8.51s/it]

Best score = 0.8171 using {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 150}


 88%|████████▊ | 44/50 [06:42<00:50,  8.45s/it]

Best score = 0.8124 using {'learning_rate': 0.075, 'max_depth': 2, 'n_estimators': 200}


 90%|█████████ | 45/50 [06:50<00:41,  8.34s/it]

Best score = 0.8482 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


 92%|█████████▏| 46/50 [06:58<00:33,  8.26s/it]

Best score = 0.8187 using {'learning_rate': 0.075, 'max_depth': 2, 'n_estimators': 200}


 94%|█████████▍| 47/50 [07:07<00:24,  8.32s/it]

Best score = 0.8092 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


 96%|█████████▌| 48/50 [07:15<00:16,  8.35s/it]

Best score = 0.8158 using {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 200}


 98%|█████████▊| 49/50 [07:24<00:08,  8.38s/it]

Best score = 0.8088 using {'learning_rate': 0.075, 'max_depth': 2, 'n_estimators': 150}


100%|██████████| 50/50 [07:32<00:00,  9.05s/it]

Best score = 0.8351 using {'learning_rate': 0.075, 'max_depth': 2, 'n_estimators': 200}



