# Import libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm
from collections import defaultdict
from pathlib import Path

# Data preprocessing

### Read data from file

In [None]:
# Locate the spreadsheet and load it; row 0 of the sheet is used as the header
data_path = Path("../data")
file_name = "harvesting_data.xlsx"
data = pd.read_excel(data_path / file_name, header=0)

# Report how many rows and columns were loaded, followed by a blank line
print(f"{len(data.index)} rows and {len(data.columns)} columns")
print("")

### Initialization

In [3]:
# Split the table into predictors (X) and the target column (Y)
X = data.drop(columns=["Water_volume"])
Y = data.Water_volume

# Bin the continuous target into 4 equal-width intervals so that
# train_test_split can stratify on it. The upper edge is pushed slightly past
# the maximum so the largest value lands in the last interval instead of
# getting a bin of its own.
bins = np.linspace(Y.min(), Y.max() + 0.1, 5)
y_binned = np.digitize(Y, bins)

# Hyper-parameter grid explored by the search
params = {
    "activation": ("logistic", "tanh", "relu"),
    "hidden_layer_sizes": [6, 9, 12],
    "solver": ["lbfgs"],
    "max_iter": [2000],
}

# Seed the weight initialisation: the data splits are already seeded, so
# without this the network is the only source of run-to-run variation.
mlp = MLPRegressor(random_state=0)

# 5-fold cross-validated grid search, scored with R^2
gs = GridSearchCV(estimator=mlp, param_grid=params, scoring="r2", cv=5)

# Grid-search optimization

In [None]:
results_counter = defaultdict(lambda: defaultdict(int))
optimization_results = []

# Repeat the grid search on 50 differently seeded stratified 80/20 splits and
# collect the winning hyper-parameters of each run.
for i in tqdm(range(50)):
    data_train, data_test = train_test_split(
        data, test_size=0.2, stratify=y_binned, random_state=i
    )

    # Min-max scaling done with pandas arithmetic so the result stays a
    # DataFrame with its column names. The statistics come from the training
    # split only and are reused for the test split.
    minval = data_train.min()
    minmax = data_train.max() - minval
    # A column that is constant in the training split has a zero range;
    # divide by 1 instead so it scales to 0 rather than NaN/inf.
    minmax = minmax.replace(0, 1)
    data_train_scaled = (data_train - minval) / minmax
    data_test_scaled = (data_test - minval) / minmax

    # Define X and Y
    X_train = data_train_scaled.drop(columns=["Water_volume"])
    Y_train = data_train_scaled.Water_volume
    X_test = data_test_scaled.drop(columns=["Water_volume"])
    Y_test = data_test_scaled.Water_volume

    # Fit the grid search on the training split
    grid_result = gs.fit(X_train, Y_train)

    optimization_results.append(gs.best_params_)

    # tqdm.write prints the message without breaking the progress bar
    tqdm.write(f"Best score = {gs.best_score_:.4f} using {gs.best_params_}")

In [None]:
# Count how often each hyper-parameter value was selected as the best one.
# The counter is rebuilt here so that re-running this cell does not add the
# same results a second time.
results_counter = defaultdict(lambda: defaultdict(int))
for result in optimization_results:
    for param, value in result.items():
        results_counter[param][value] += 1

# Print the results counter
for param, counts in results_counter.items():
    print(f"Parameter: {param}")
    for value, count in counts.items():
        print(f"  Value: {value}, Count: {count}")