# Import libraries

In [10]:
import numpy as np
import pandas as pd
from statistics import mean, stdev
from sklearn.svm import SVR
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm
from collections import defaultdict

# Data preprocessing

### Read data from file

In [11]:
# Load the Excel workbook named by file_name; the first sheet row supplies the column names
file_name = 'Harvesting data'
data = pd.read_excel(file_name + '.xlsx', header=0)

# Report how many rows and columns were read, then emit a blank separator line
print("{0} rows and {1} columns".format(*data.shape))
print("")

152 rows and 7 columns



### Initialization

In [22]:
# Separate the regression target (Water_volume) from the remaining feature columns
Y = data['Water_volume']
X = data.drop(columns=['Water_volume'])

# Discretise the continuous target into 4 equal-width intervals (5 edges).
# The labels are passed as `stratify` to train_test_split in the grid-search loop.
# The top edge sits 0.1 above the maximum so the largest value falls inside the
# last interval rather than past it.
bins = np.linspace(Y.min(), Y.max() + 0.1, num=5)
y_binned = np.digitize(Y, bins)

# Hyper-parameter grid explored for the SVR.
# NOTE(review): in the recorded output of this notebook all 50 splits selected
# C = 0.5, the largest value offered here -- the optimum may lie beyond the grid;
# consider widening the C range.
params = {
    'kernel': ('sigmoid', 'rbf'),
    'epsilon': [0.007, 0.01, 0.05, 0.1, 0.2],
    'C': [0.01, 0.05, 0.1, 0.5],
}

# Exhaustive search over the grid, scored by R^2 with 5-fold cross-validation
svr = SVR()
gs = GridSearchCV(svr, params, scoring='r2', cv=5)

# Grid-search optimization

In [24]:
# results_counter is tallied in a later cell; optimization_results collects the
# winning hyper-parameter dict of every split.
results_counter = defaultdict(lambda: defaultdict(int))
optimization_results = []

# Repeat the grid search on 50 different stratified 80/20 splits (seeds 0..49)
# to see how stable the selected hyper-parameters are.
for i in tqdm(range(50)):
    data_train, data_test = train_test_split(data, test_size = 0.2,
                                             stratify = y_binned, random_state = i)

    # Min-max scaling done by hand so the results stay DataFrames with column names.
    # The statistics come from the training split only and are reused for the test split.
    # NOTE(review): they are computed before GridSearchCV forms its folds, so each
    # validation fold contributes to the scaling; a Pipeline would avoid that.
    minval = data_train.min()
    # A column that is constant within the training split has range 0; dividing by
    # it would produce NaN/inf and make the fit fail. Use 1 for such columns so
    # they scale to 0 instead.
    minmax = (data_train.max() - minval).replace(0, 1)
    data_train_scaled = (data_train - minval) / minmax
    data_test_scaled = (data_test - minval) / minmax

    # Define X and Y (the test pair is prepared here but not used inside this cell)
    X_train = data_train_scaled.drop(columns = ['Water_volume'], axis=1)
    Y_train = data_train_scaled.Water_volume
    X_test = data_test_scaled.drop(columns = ['Water_volume'], axis=1)
    Y_test = data_test_scaled.Water_volume

    # Fit the grid search on the scaled training split
    grid_result = gs.fit(X_train, Y_train)

    optimization_results.append(gs.best_params_)

    # tqdm.write emits the message without overlapping the progress bar
    tqdm.write('Best score = {:.4f} using {}'.format(gs.best_score_,
                                                     gs.best_params_))

  2%|▏         | 1/50 [00:00<00:32,  1.49it/s]

Best score = 0.8288 using {'C': 0.5, 'epsilon': 0.01, 'kernel': 'rbf'}


  4%|▍         | 2/50 [00:01<00:32,  1.49it/s]

Best score = 0.8577 using {'C': 0.5, 'epsilon': 0.007, 'kernel': 'rbf'}


  6%|▌         | 3/50 [00:02<00:31,  1.48it/s]

Best score = 0.8709 using {'C': 0.5, 'epsilon': 0.007, 'kernel': 'rbf'}


  8%|▊         | 4/50 [00:02<00:32,  1.41it/s]

Best score = 0.8471 using {'C': 0.5, 'epsilon': 0.05, 'kernel': 'rbf'}


 10%|█         | 5/50 [00:03<00:33,  1.35it/s]

Best score = 0.8595 using {'C': 0.5, 'epsilon': 0.007, 'kernel': 'rbf'}


 12%|█▏        | 6/50 [00:04<00:31,  1.38it/s]

Best score = 0.8466 using {'C': 0.5, 'epsilon': 0.007, 'kernel': 'rbf'}


 14%|█▍        | 7/50 [00:04<00:29,  1.46it/s]

Best score = 0.8363 using {'C': 0.5, 'epsilon': 0.05, 'kernel': 'rbf'}


 16%|█▌        | 8/50 [00:05<00:29,  1.45it/s]

Best score = 0.8214 using {'C': 0.5, 'epsilon': 0.007, 'kernel': 'rbf'}


 18%|█▊        | 9/50 [00:06<00:28,  1.43it/s]

Best score = 0.8086 using {'C': 0.5, 'epsilon': 0.05, 'kernel': 'rbf'}


 20%|██        | 10/50 [00:06<00:27,  1.46it/s]

Best score = 0.8468 using {'C': 0.5, 'epsilon': 0.01, 'kernel': 'rbf'}


 22%|██▏       | 11/50 [00:07<00:26,  1.46it/s]

Best score = 0.8403 using {'C': 0.5, 'epsilon': 0.01, 'kernel': 'rbf'}


 24%|██▍       | 12/50 [00:08<00:25,  1.47it/s]

Best score = 0.8434 using {'C': 0.5, 'epsilon': 0.05, 'kernel': 'rbf'}


 26%|██▌       | 13/50 [00:09<00:25,  1.45it/s]

Best score = 0.8576 using {'C': 0.5, 'epsilon': 0.01, 'kernel': 'rbf'}


 28%|██▊       | 14/50 [00:09<00:24,  1.46it/s]

Best score = 0.8553 using {'C': 0.5, 'epsilon': 0.007, 'kernel': 'rbf'}


 30%|███       | 15/50 [00:10<00:25,  1.39it/s]

Best score = 0.8381 using {'C': 0.5, 'epsilon': 0.007, 'kernel': 'rbf'}


 32%|███▏      | 16/50 [00:11<00:23,  1.44it/s]

Best score = 0.8456 using {'C': 0.5, 'epsilon': 0.007, 'kernel': 'rbf'}


 34%|███▍      | 17/50 [00:11<00:22,  1.44it/s]

Best score = 0.8351 using {'C': 0.5, 'epsilon': 0.01, 'kernel': 'rbf'}


 36%|███▌      | 18/50 [00:12<00:22,  1.45it/s]

Best score = 0.8363 using {'C': 0.5, 'epsilon': 0.05, 'kernel': 'rbf'}


 38%|███▊      | 19/50 [00:13<00:21,  1.46it/s]

Best score = 0.8589 using {'C': 0.5, 'epsilon': 0.007, 'kernel': 'rbf'}


 40%|████      | 20/50 [00:13<00:20,  1.43it/s]

Best score = 0.8281 using {'C': 0.5, 'epsilon': 0.05, 'kernel': 'rbf'}


 42%|████▏     | 21/50 [00:14<00:20,  1.43it/s]

Best score = 0.8464 using {'C': 0.5, 'epsilon': 0.05, 'kernel': 'rbf'}


 44%|████▍     | 22/50 [00:15<00:19,  1.43it/s]

Best score = 0.8496 using {'C': 0.5, 'epsilon': 0.007, 'kernel': 'rbf'}


 46%|████▌     | 23/50 [00:15<00:18,  1.45it/s]

Best score = 0.8516 using {'C': 0.5, 'epsilon': 0.05, 'kernel': 'rbf'}


 48%|████▊     | 24/50 [00:16<00:18,  1.41it/s]

Best score = 0.8661 using {'C': 0.5, 'epsilon': 0.007, 'kernel': 'rbf'}


 50%|█████     | 25/50 [00:17<00:17,  1.41it/s]

Best score = 0.8232 using {'C': 0.5, 'epsilon': 0.05, 'kernel': 'rbf'}


 52%|█████▏    | 26/50 [00:18<00:17,  1.41it/s]

Best score = 0.8489 using {'C': 0.5, 'epsilon': 0.007, 'kernel': 'rbf'}


 54%|█████▍    | 27/50 [00:18<00:16,  1.42it/s]

Best score = 0.8417 using {'C': 0.5, 'epsilon': 0.007, 'kernel': 'rbf'}


 56%|█████▌    | 28/50 [00:19<00:15,  1.43it/s]

Best score = 0.8448 using {'C': 0.5, 'epsilon': 0.007, 'kernel': 'rbf'}


 58%|█████▊    | 29/50 [00:20<00:15,  1.40it/s]

Best score = 0.8531 using {'C': 0.5, 'epsilon': 0.01, 'kernel': 'rbf'}


 60%|██████    | 30/50 [00:20<00:14,  1.42it/s]

Best score = 0.8480 using {'C': 0.5, 'epsilon': 0.007, 'kernel': 'rbf'}


 62%|██████▏   | 31/50 [00:21<00:13,  1.41it/s]

Best score = 0.8667 using {'C': 0.5, 'epsilon': 0.007, 'kernel': 'rbf'}


 64%|██████▍   | 32/50 [00:22<00:12,  1.43it/s]

Best score = 0.8178 using {'C': 0.5, 'epsilon': 0.05, 'kernel': 'rbf'}


 66%|██████▌   | 33/50 [00:23<00:12,  1.36it/s]

Best score = 0.8468 using {'C': 0.5, 'epsilon': 0.007, 'kernel': 'rbf'}


 68%|██████▊   | 34/50 [00:23<00:11,  1.36it/s]

Best score = 0.8307 using {'C': 0.5, 'epsilon': 0.007, 'kernel': 'rbf'}


 70%|███████   | 35/50 [00:24<00:12,  1.22it/s]

Best score = 0.8650 using {'C': 0.5, 'epsilon': 0.007, 'kernel': 'rbf'}


 72%|███████▏  | 36/50 [00:25<00:11,  1.26it/s]

Best score = 0.8498 using {'C': 0.5, 'epsilon': 0.007, 'kernel': 'rbf'}


 74%|███████▍  | 37/50 [00:26<00:10,  1.30it/s]

Best score = 0.8338 using {'C': 0.5, 'epsilon': 0.01, 'kernel': 'rbf'}


 76%|███████▌  | 38/50 [00:27<00:09,  1.32it/s]

Best score = 0.8757 using {'C': 0.5, 'epsilon': 0.007, 'kernel': 'rbf'}


 78%|███████▊  | 39/50 [00:27<00:08,  1.32it/s]

Best score = 0.8434 using {'C': 0.5, 'epsilon': 0.01, 'kernel': 'rbf'}


 80%|████████  | 40/50 [00:28<00:07,  1.32it/s]

Best score = 0.8406 using {'C': 0.5, 'epsilon': 0.01, 'kernel': 'rbf'}


 82%|████████▏ | 41/50 [00:29<00:06,  1.38it/s]

Best score = 0.8612 using {'C': 0.5, 'epsilon': 0.007, 'kernel': 'rbf'}


 84%|████████▍ | 42/50 [00:30<00:05,  1.35it/s]

Best score = 0.8725 using {'C': 0.5, 'epsilon': 0.007, 'kernel': 'rbf'}


 86%|████████▌ | 43/50 [00:30<00:05,  1.37it/s]

Best score = 0.8326 using {'C': 0.5, 'epsilon': 0.007, 'kernel': 'rbf'}


 88%|████████▊ | 44/50 [00:31<00:04,  1.42it/s]

Best score = 0.8319 using {'C': 0.5, 'epsilon': 0.01, 'kernel': 'rbf'}


 90%|█████████ | 45/50 [00:32<00:03,  1.43it/s]

Best score = 0.8430 using {'C': 0.5, 'epsilon': 0.01, 'kernel': 'rbf'}


 92%|█████████▏| 46/50 [00:32<00:02,  1.42it/s]

Best score = 0.8471 using {'C': 0.5, 'epsilon': 0.01, 'kernel': 'rbf'}


 94%|█████████▍| 47/50 [00:33<00:02,  1.42it/s]

Best score = 0.8579 using {'C': 0.5, 'epsilon': 0.01, 'kernel': 'rbf'}


 96%|█████████▌| 48/50 [00:34<00:01,  1.46it/s]

Best score = 0.8525 using {'C': 0.5, 'epsilon': 0.007, 'kernel': 'rbf'}


 98%|█████████▊| 49/50 [00:34<00:00,  1.43it/s]

Best score = 0.8284 using {'C': 0.5, 'epsilon': 0.007, 'kernel': 'rbf'}


100%|██████████| 50/50 [00:35<00:00,  1.40it/s]

Best score = 0.8399 using {'C': 0.5, 'epsilon': 0.05, 'kernel': 'rbf'}





In [25]:
# Tally how often each hyper-parameter value was selected across the splits.
# The counter is rebuilt from scratch here so that re-running this cell does not
# add the same results on top of an earlier tally.
results_counter = defaultdict(lambda: defaultdict(int))
for result in optimization_results:
    for param, value in result.items():
        results_counter[param][value] += 1

# Print the tally, one parameter at a time
for param, counts in results_counter.items():
    print(f"Parameter: {param}")
    for value, count in counts.items():
        print(f"  Value: {value}, Count: {count}")

Parameter: C
  Value: 0.5, Count: 50
Parameter: epsilon
  Value: 0.01, Count: 13
  Value: 0.007, Count: 26
  Value: 0.05, Count: 11
Parameter: kernel
  Value: rbf, Count: 50
