# Import libraries

In [5]:
import numpy as np
import pandas as pd
from statistics import mean, stdev
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm
from collections import defaultdict

# Data preprocessing

### Read data from file

In [2]:
# Load the dataset from the Excel workbook; header=0 takes the column labels from the first row
file_name = 'Harvesting data'
data = pd.read_excel(file_name + '.xlsx', header=0)

# Report how many rows and columns were read (DataFrame.shape is (n_rows, n_columns))
print("{0} rows and {1} columns".format(*data.shape))
print()

152 rows and 7 columns



### Initialization

In [4]:
# Split the frame into predictors (X) and the regression target (Y).
# `columns=` already selects the column axis, so no extra `axis` argument is needed.
X = data.drop(columns=['Water_volume'])
Y = data.Water_volume

# Bin the continuous target into 4 equal-width intervals. The bin labels are what
# train_test_split receives through `stratify`, so each split keeps a similar
# distribution of Water_volume. The +0.1 lifts the top edge above Y.max() so the
# largest observation falls into the last bin instead of an overflow bin of its own.
bins = np.linspace(Y.min(), Y.max() + 0.1, 5)
y_binned = np.digitize(Y, bins)

# Hyper-parameter grid explored by the grid search (3 x 3 x 3 = 27 candidates)
params = {'n_estimators' : [100, 150, 200],
          'learning_rate' : [0.05, 0.085, 0.1],
          'max_depth' : [2, 4, 6]}

# Seed the estimator so the search is reproducible after a kernel restart:
# the trees permute the features at every split, so equally good splits can
# otherwise be resolved differently from run to run.
RANDOM_STATE = 0
gbr = GradientBoostingRegressor(random_state=RANDOM_STATE)

# 5-fold cross-validated grid search, candidates ranked by R^2
gs = GridSearchCV(estimator=gbr, param_grid=params, scoring='r2', cv=5)

# Grid-search optimization

In [6]:
results_counter = defaultdict(lambda: defaultdict(int))
optimization_results = []

# Repeat the grid search on 50 different stratified 80/20 splits (one per seed)
# and record which hyper-parameter combination wins on each split.
for i in tqdm(range(50)):
    data_train, data_test = train_test_split(data, test_size = 0.2,
                                                stratify = y_binned, random_state = i)

    # Min-max scaling done by hand so the result stays a DataFrame with its column names.
    # The statistics come from the training split only and are reused for the test split.
    minval = data_train.min()
    # A column that is constant within this training split has a range of 0; dividing
    # by it would produce NaN/inf. Use 1 as the divisor instead, which maps such a
    # column to 0 in the training data.
    minmax = (data_train.max() - minval).replace(0, 1)
    data_train_scaled = (data_train - minval) / minmax
    data_test_scaled = (data_test - minval) / minmax

    # Define X and Y for both splits
    X_train = data_train_scaled.drop(columns = ['Water_volume'], axis=1)
    Y_train = data_train_scaled.Water_volume
    X_test = data_test_scaled.drop(columns = ['Water_volume'], axis=1)
    Y_test = data_test_scaled.Water_volume

    # Fit the grid search on the training split only
    grid_result = gs.fit(X_train, Y_train)

    # Keep the winning parameter dict; the per-value tally is built from this list
    optimization_results.append(gs.best_params_)

    print('Best score = {:.4f} using {}'.format(gs.best_score_,
                                            gs.best_params_))

  2%|▏         | 1/50 [00:10<08:37, 10.56s/it]

Best score = 0.7684 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


  4%|▍         | 2/50 [00:20<08:22, 10.48s/it]

Best score = 0.8417 using {'learning_rate': 0.085, 'max_depth': 2, 'n_estimators': 200}


  6%|▌         | 3/50 [00:31<08:04, 10.31s/it]

Best score = 0.8425 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


  8%|▊         | 4/50 [00:41<07:48, 10.18s/it]

Best score = 0.8355 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


 10%|█         | 5/50 [00:50<07:32, 10.05s/it]

Best score = 0.8189 using {'learning_rate': 0.085, 'max_depth': 2, 'n_estimators': 200}


 12%|█▏        | 6/50 [01:01<07:30, 10.25s/it]

Best score = 0.7888 using {'learning_rate': 0.085, 'max_depth': 2, 'n_estimators': 200}


 14%|█▍        | 7/50 [01:11<07:12, 10.07s/it]

Best score = 0.8270 using {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 200}


 16%|█▌        | 8/50 [01:21<07:02, 10.05s/it]

Best score = 0.7954 using {'learning_rate': 0.085, 'max_depth': 2, 'n_estimators': 150}


 18%|█▊        | 9/50 [01:31<06:55, 10.13s/it]

Best score = 0.7614 using {'learning_rate': 0.085, 'max_depth': 2, 'n_estimators': 200}


 20%|██        | 10/50 [01:41<06:49, 10.23s/it]

Best score = 0.8422 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 100}


 22%|██▏       | 11/50 [01:52<06:46, 10.42s/it]

Best score = 0.8539 using {'learning_rate': 0.085, 'max_depth': 2, 'n_estimators': 150}


 24%|██▍       | 12/50 [02:04<06:49, 10.78s/it]

Best score = 0.8308 using {'learning_rate': 0.085, 'max_depth': 2, 'n_estimators': 200}


 26%|██▌       | 13/50 [02:14<06:35, 10.70s/it]

Best score = 0.8342 using {'learning_rate': 0.085, 'max_depth': 2, 'n_estimators': 100}


 28%|██▊       | 14/50 [02:26<06:32, 10.91s/it]

Best score = 0.8128 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


 30%|███       | 15/50 [02:37<06:22, 10.92s/it]

Best score = 0.8465 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 150}


 32%|███▏      | 16/50 [02:48<06:09, 10.88s/it]

Best score = 0.8437 using {'learning_rate': 0.085, 'max_depth': 2, 'n_estimators': 200}


 34%|███▍      | 17/50 [02:59<06:03, 11.02s/it]

Best score = 0.8231 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 150}


 36%|███▌      | 18/50 [03:10<05:49, 10.92s/it]

Best score = 0.8091 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 100}


 38%|███▊      | 19/50 [03:22<05:54, 11.45s/it]

Best score = 0.8197 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


 40%|████      | 20/50 [03:33<05:37, 11.24s/it]

Best score = 0.7998 using {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 150}


 42%|████▏     | 21/50 [03:44<05:20, 11.06s/it]

Best score = 0.8301 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


 44%|████▍     | 22/50 [03:55<05:10, 11.08s/it]

Best score = 0.8141 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 150}


 46%|████▌     | 23/50 [04:06<05:01, 11.17s/it]

Best score = 0.8500 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


 48%|████▊     | 24/50 [04:17<04:50, 11.17s/it]

Best score = 0.8231 using {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 200}


 50%|█████     | 25/50 [04:28<04:34, 10.97s/it]

Best score = 0.8088 using {'learning_rate': 0.085, 'max_depth': 2, 'n_estimators': 200}


 52%|█████▏    | 26/50 [04:38<04:17, 10.71s/it]

Best score = 0.8055 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 150}


 54%|█████▍    | 27/50 [04:49<04:08, 10.83s/it]

Best score = 0.8286 using {'learning_rate': 0.085, 'max_depth': 2, 'n_estimators': 150}


 56%|█████▌    | 28/50 [04:59<03:54, 10.67s/it]

Best score = 0.7950 using {'learning_rate': 0.085, 'max_depth': 2, 'n_estimators': 200}


 58%|█████▊    | 29/50 [05:10<03:44, 10.70s/it]

Best score = 0.8307 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


 60%|██████    | 30/50 [05:20<03:29, 10.50s/it]

Best score = 0.7993 using {'learning_rate': 0.085, 'max_depth': 2, 'n_estimators': 200}


 62%|██████▏   | 31/50 [05:31<03:19, 10.51s/it]

Best score = 0.8256 using {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 200}


 64%|██████▍   | 32/50 [05:41<03:06, 10.33s/it]

Best score = 0.7919 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 150}


 66%|██████▌   | 33/50 [05:51<02:54, 10.25s/it]

Best score = 0.8072 using {'learning_rate': 0.085, 'max_depth': 2, 'n_estimators': 200}


 68%|██████▊   | 34/50 [06:01<02:42, 10.19s/it]

Best score = 0.8089 using {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 200}


 70%|███████   | 35/50 [06:11<02:31, 10.10s/it]

Best score = 0.8316 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


 72%|███████▏  | 36/50 [06:21<02:21, 10.08s/it]

Best score = 0.8015 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


 74%|███████▍  | 37/50 [06:31<02:12, 10.21s/it]

Best score = 0.8136 using {'learning_rate': 0.085, 'max_depth': 2, 'n_estimators': 200}


 76%|███████▌  | 38/50 [06:42<02:04, 10.36s/it]

Best score = 0.8023 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


 78%|███████▊  | 39/50 [06:53<01:55, 10.47s/it]

Best score = 0.7725 using {'learning_rate': 0.085, 'max_depth': 2, 'n_estimators': 200}


 80%|████████  | 40/50 [07:03<01:44, 10.47s/it]

Best score = 0.7787 using {'learning_rate': 0.085, 'max_depth': 2, 'n_estimators': 200}


 82%|████████▏ | 41/50 [07:14<01:34, 10.51s/it]

Best score = 0.8336 using {'learning_rate': 0.085, 'max_depth': 2, 'n_estimators': 200}


 84%|████████▍ | 42/50 [07:24<01:23, 10.46s/it]

Best score = 0.8239 using {'learning_rate': 0.085, 'max_depth': 2, 'n_estimators': 200}


 86%|████████▌ | 43/50 [07:34<01:12, 10.41s/it]

Best score = 0.8169 using {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 200}


 88%|████████▊ | 44/50 [07:45<01:03, 10.61s/it]

Best score = 0.8122 using {'learning_rate': 0.085, 'max_depth': 2, 'n_estimators': 200}


 90%|█████████ | 45/50 [07:57<00:53, 10.76s/it]

Best score = 0.8489 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


 92%|█████████▏| 46/50 [08:07<00:42, 10.60s/it]

Best score = 0.8272 using {'learning_rate': 0.085, 'max_depth': 2, 'n_estimators': 200}


 94%|█████████▍| 47/50 [08:17<00:31, 10.53s/it]

Best score = 0.8087 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


 96%|█████████▌| 48/50 [08:27<00:20, 10.44s/it]

Best score = 0.8167 using {'learning_rate': 0.085, 'max_depth': 2, 'n_estimators': 100}


 98%|█████████▊| 49/50 [08:38<00:10, 10.38s/it]

Best score = 0.8045 using {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 200}


100%|██████████| 50/50 [08:48<00:00, 10.57s/it]

Best score = 0.8351 using {'learning_rate': 0.085, 'max_depth': 2, 'n_estimators': 200}





In [7]:
# Rebuild the tally from scratch so that re-running this cell does not add the
# same results on top of the counts left over from a previous execution.
results_counter = defaultdict(lambda: defaultdict(int))

# Count how often each value of each hyper-parameter was selected as the best one
for result in optimization_results:
    for param, value in result.items():
        results_counter[param][value] += 1

# Print the results counter
for param, counts in results_counter.items():
    print(f"Parameter: {param}")
    for value, count in counts.items():
        print(f"  Value: {value}, Count: {count}")

Parameter: learning_rate
  Value: 0.1, Count: 20
  Value: 0.085, Count: 23
  Value: 0.05, Count: 7
Parameter: max_depth
  Value: 2, Count: 50
Parameter: n_estimators
  Value: 200, Count: 37
  Value: 150, Count: 9
  Value: 100, Count: 4
