# Artificial Neural Networks

In [1]:
import os
os.environ['PYTHONHASHSEED']=str(4)

import random
import pandas as pd
import numpy as np 
from tqdm import tqdm
import collections
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.python.keras import backend as K
import keras
from keras.models import Sequential
from keras.optimizers import Nadam, Adam
from keras.layers import Dense, Activation, Dropout
from keras.constraints import maxnorm
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, KFold, GridSearchCV, cross_validate
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score

In [2]:
def set_random_seed(x):
    """Seed the TensorFlow, NumPy and built-in Python random generators with ``x``."""
    # Same three generators as before, seeded in the same order.
    for seed_fn in (tf.random.set_seed, np.random.seed, random.seed):
        seed_fn(x)

In [3]:
def coeff_determination(y_true, y_pred):
    """Keras-backend R^2 metric: ``1 - SS_res / (SS_tot + epsilon)``.

    ``K.epsilon()`` is added to the denominator so that a constant ``y_true``
    (zero total sum of squares) does not cause a division by zero.
    """
    residual_ss = K.sum(K.square(y_true - y_pred))
    total_ss = K.sum(K.square(y_true - K.mean(y_true)))
    return 1 - residual_ss / (total_ss + K.epsilon())

# scikit-learn scorer built from r2_score; passed as `scoring` to the GridSearchCV calls below.
my_scorer = make_scorer(r2_score)

###############################################################
# Reproducibility is a Problem when using parallel processing #
###############################################################
# Forwarded as `n_jobs` to GridSearchCV; set to 1 to run the search serially
# when reproducible results matter more than speed.
n_jobs = -1

In [4]:
def build_model(optimizer='adam'):
    """Build and compile the 9-4-1 sigmoid network used for the optimizer search.

    ``optimizer`` is forwarded unchanged to ``model.compile``. The input width is
    read from the module-level ``Nfeatures``, which must be set before this is called.
    """
    layer_stack = [
        Dense(9, activation='sigmoid', input_shape=(Nfeatures,)),  # hidden layer 1, fed by the features
        Dense(4, activation='sigmoid'),                            # hidden layer 2
        Dense(1, activation='sigmoid'),                            # single output variable
    ]
    model = Sequential(layer_stack)
    model.compile(loss='mse', optimizer=optimizer, metrics=[coeff_determination])
    return model

In [6]:
maindir1 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\preparedInputData'
maindir2 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\ANN'
gf_folders = ['withoutGF','withGF']
# set random seed
seed = 4
set_random_seed(seed)

# One entry is appended to each list per processed workbook; they are combined
# into the `results` DataFrame once both folders have been searched.
GF = []
DATE = []
Nb_S2_used = []
optim = []
r2 = []
for gf_folder in gf_folders:
    subdir1 = os.path.join(maindir1, gf_folder, 'France')
    files_temp = [fileName for fileName in os.listdir(subdir1) if 'tiff' not in fileName]

    for file1 in tqdm(files_temp):
        for file2 in os.listdir(os.path.join(subdir1, file1)):
            if 'Best_4' in file2:
                continue  # workbooks whose name contains 'Best_4' are excluded from the search

            # step0: Read and split data
            excel_file = pd.read_excel(os.path.join(subdir1, file1, file2))
            # Builtin `float` replaces the `np.float` alias (deprecated in NumPy 1.20, removed in 1.24).
            y = np.array(excel_file['L0'].values, dtype=float).reshape(-1, 1)   # Target data
            X = excel_file.drop(['Unnamed: 0','idx','idy','L0'], axis=1).values  # Predictor variables
            # Data Normalization
            # NOTE(review): both scalers are fitted on the full data set before the split,
            # so the held-out rows influence the scaling of the training rows.
            y = MinMaxScaler().fit_transform(y)
            X = MinMaxScaler().fit_transform(X)
            train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=seed) # Split data

            ############### Tune Optimizer ###############
            Nfeatures = train_X.shape[1]  # read by build_model() as the input width
            # create model
            model = KerasRegressor(build_fn=build_model, epochs=500, batch_size=100, verbose=0)
            # define the grid search parameters
            param_grid = {'optimizer':['SGD', 'Adam', 'Adamax', 'Nadam']}
            grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring=my_scorer, n_jobs=n_jobs)
            grid_result = grid.fit(train_X, train_y)

            # step6: Save results
            GF.append(gf_folder)
            DATE.append(file1[:4]+'-'+file1[4:6]+'-'+file1[6:])   # folder name reformatted as YYYY-MM-DD
            Nb_S2_used.append(int(file2[17:18]))                  # presumably the S2 image count encoded in the file name; verify
            optim.append(grid_result.best_params_['optimizer'])
            r2.append(round(grid_result.best_score_*100,2))       # mean CV R^2 of the best candidate, in percent

results = pd.DataFrame({'Date':DATE,
                        'Gapfilling':GF,
                        'Nb_S2_used':Nb_S2_used,
                        'Optimizer':optim,
                        'R2':r2
                        })

100%|█████████████████████████████████████████████████████████████████████████████████| 15/15 [54:47<00:00, 219.14s/it]
100%|█████████████████████████████████████████████████████████████████████████████████| 15/15 [53:42<00:00, 214.83s/it]


In [7]:
# Optimizer that won the grid search most often
optimizer_wins = results['Optimizer'].value_counts()
optimizer_wins.idxmax()

'Adam'

In [4]:
def build_model(learn_rate=0.01):
    """Build and compile the 9-4-1 sigmoid network with an Adam optimizer.

    ``learn_rate`` is the Adam learning rate searched by the grid. The input width
    is read from the module-level ``Nfeatures``, which must be set before this is called.
    """
    # create model
    model = Sequential()
    model.add(Dense(9, activation='sigmoid', input_shape=(Nfeatures,))) # Hidden layer with 9 nodes # input_shape: input layer (n° features)
    model.add(Dense(4, activation='sigmoid')) # Hidden Layer with 4 nodes
    model.add(Dense(1, activation='sigmoid')) # Output Layer: n° output variables
    # Compile model
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias that newer Keras rejects.
    optimizer = Adam(learning_rate=learn_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-07, name="Adam")
    model.compile(loss='mse', optimizer=optimizer, metrics=[coeff_determination])
    return model

In [5]:
maindir1 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\preparedInputData'
maindir2 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\ANN'
gf_folders = ['withoutGF','withGF']
# set random seed
seed = 4
set_random_seed(seed)

# One entry is appended to each list per processed workbook; they are combined
# into the `results` DataFrame once both folders have been searched.
GF = []
DATE = []
Nb_S2_used = []
lr = []
r2 = []
for gf_folder in gf_folders:
    subdir1 = os.path.join(maindir1, gf_folder, 'France')
    files_temp = [fileName for fileName in os.listdir(subdir1) if 'tiff' not in fileName]

    for file1 in tqdm(files_temp):
        for file2 in os.listdir(os.path.join(subdir1, file1)):
            if 'Best_4' in file2:
                continue  # workbooks whose name contains 'Best_4' are excluded from the search

            # step0: Read and split data
            excel_file = pd.read_excel(os.path.join(subdir1, file1, file2))
            # Builtin `float` replaces the `np.float` alias (deprecated in NumPy 1.20, removed in 1.24).
            y = np.array(excel_file['L0'].values, dtype=float).reshape(-1, 1)   # Target data
            X = excel_file.drop(['Unnamed: 0','idx','idy','L0'], axis=1).values  # Predictor variables
            # Data Normalization
            # NOTE(review): both scalers are fitted on the full data set before the split,
            # so the held-out rows influence the scaling of the training rows.
            y = MinMaxScaler().fit_transform(y)
            X = MinMaxScaler().fit_transform(X)
            train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=seed) # Split data

            ############### Learning Rate & Momentum ###############
            # lr: # how much to update the weight at the end of each batch
            # beta_1 and beta_2: # adjust momentum # how much to let the previous update influence the current weight update
            # epsilon: stabilize learning process
            # Only the learning rate is searched here; the other Adam settings are fixed in build_model().

            Nfeatures = train_X.shape[1]  # read by build_model() as the input width
            # create model
            model = KerasRegressor(build_fn=build_model, epochs=500, batch_size=100, verbose=0)
            # define the grid search parameters
            param_grid = {'learn_rate':[0.001, 0.01, 0.1, 0.2, 0.3]}
            grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring=my_scorer, n_jobs=n_jobs)
            grid_result = grid.fit(train_X, train_y)

            # step6: Save results
            GF.append(gf_folder)
            DATE.append(file1[:4]+'-'+file1[4:6]+'-'+file1[6:])   # folder name reformatted as YYYY-MM-DD
            Nb_S2_used.append(int(file2[17:18]))                  # presumably the S2 image count encoded in the file name; verify
            lr.append(grid_result.best_params_['learn_rate'])
            r2.append(round(grid_result.best_score_*100,2))       # mean CV R^2 of the best candidate, in percent

results = pd.DataFrame({'Date':DATE,
                        'Gapfilling':GF,
                        'Nb_S2_used':Nb_S2_used,
                        'learn_rate':lr,
                        'R2':r2
                        })

100%|███████████████████████████████████████████████████████████████████████████████| 15/15 [1:14:44<00:00, 298.97s/it]
100%|███████████████████████████████████████████████████████████████████████████████| 15/15 [1:14:27<00:00, 297.87s/it]


In [6]:
# Learning rate that won the grid search most often
learn_rate_wins = results['learn_rate'].value_counts()
learn_rate_wins.idxmax()

0.01

In [4]:
def build_model(learn_rate=0.01):
    """Build and compile the 9-4-1 sigmoid network with an Adam optimizer.

    ``learn_rate`` is the Adam learning rate (default 0.01). The input width is
    read from the module-level ``Nfeatures``, which must be set before this is called.
    """
    # create model
    model = Sequential()
    model.add(Dense(9, activation='sigmoid', input_shape=(Nfeatures,))) # Hidden layer with 9 nodes # input_shape: input layer (n° features)
    model.add(Dense(4, activation='sigmoid')) # Hidden Layer with 4 nodes
    model.add(Dense(1, activation='sigmoid')) # Output Layer: n° output variables
    # Compile model
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias that newer Keras rejects.
    optimizer = Adam(learning_rate=learn_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-07, name="Adam")
    model.compile(loss='mse', optimizer=optimizer, metrics=[coeff_determination])
    return model

In [5]:
maindir1 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\preparedInputData'
maindir2 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\ANN'
gf_folders = ['withoutGF','withGF']
# set random seed
seed = 4
set_random_seed(seed)

# One entry is appended to each list per processed workbook; they are combined
# into the `results` DataFrame once both folders have been searched.
GF = []
DATE = []
Nb_S2_used = []
batchSize = []
epochsN = []
r2 = []
for gf_folder in gf_folders:
    subdir1 = os.path.join(maindir1, gf_folder, 'France')
    files_temp = [fileName for fileName in os.listdir(subdir1) if 'tiff' not in fileName]

    for file1 in tqdm(files_temp):
        for file2 in os.listdir(os.path.join(subdir1, file1)):
            if 'Best_4' in file2:
                continue  # workbooks whose name contains 'Best_4' are excluded from the search

            # step0: Read and split data
            excel_file = pd.read_excel(os.path.join(subdir1, file1, file2))
            # Builtin `float` replaces the `np.float` alias (deprecated in NumPy 1.20, removed in 1.24).
            y = np.array(excel_file['L0'].values, dtype=float).reshape(-1, 1)   # Target data
            X = excel_file.drop(['Unnamed: 0','idx','idy','L0'], axis=1).values  # Predictor variables
            # Data Normalization
            # NOTE(review): both scalers are fitted on the full data set before the split,
            # so the held-out rows influence the scaling of the training rows.
            y = MinMaxScaler().fit_transform(y)
            X = MinMaxScaler().fit_transform(X)
            train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=seed) # Split data

            ############### Tune Batch Size & Number of Epochs ###############
            # Batch size: n° of samples shown to the network before the weights are updated
            # Epochs: n° of times that the entire training dataset is shown to the network during training
            Nfeatures = train_X.shape[1]  # read by build_model() as the input width
            # create model (epochs and batch_size are supplied by the grid below)
            model = KerasRegressor(build_fn=build_model, verbose=0)
            # define the grid search parameters
            param_grid = {'batch_size':[10, 20, 40, 60, 80, 100],
                          'epochs':[10, 50, 100]}
            grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring=my_scorer, n_jobs=n_jobs)
            grid_result = grid.fit(train_X, train_y)

            # step6: Save results
            GF.append(gf_folder)
            DATE.append(file1[:4]+'-'+file1[4:6]+'-'+file1[6:])   # folder name reformatted as YYYY-MM-DD
            Nb_S2_used.append(int(file2[17:18]))                  # presumably the S2 image count encoded in the file name; verify
            batchSize.append(grid_result.best_params_['batch_size'])
            epochsN.append(grid_result.best_params_['epochs'])
            r2.append(round(grid_result.best_score_*100,2))       # mean CV R^2 of the best candidate, in percent

results = pd.DataFrame({'Date':DATE,
                        'Gapfilling':GF,
                        'Nb_S2_used':Nb_S2_used,
                        'batch_size':batchSize,
                        'epochs':epochsN,
                        'R2':r2
                        })

100%|███████████████████████████████████████████████████████████████████████████████| 15/15 [1:34:58<00:00, 379.93s/it]
100%|███████████████████████████████████████████████████████████████████████████████| 15/15 [1:36:16<00:00, 385.13s/it]


In [6]:
# Epoch count that won the grid search most often
epochs_wins = results['epochs'].value_counts()
epochs_wins.idxmax()

100

In [7]:
# Batch size that won the grid search most often
batch_size_wins = results['batch_size'].value_counts()
batch_size_wins.idxmax()

10

In [4]:
def build_model(units1=1,units2=1):
    """Build and compile a two-hidden-layer sigmoid network with tunable widths.

    ``units1`` and ``units2`` are the node counts of the first and second hidden
    layers. The input width is read from the module-level ``Nfeatures``, which
    must be set before this is called.
    """
    # create model
    model = Sequential()
    model.add(Dense(units1, activation='sigmoid', input_shape=(Nfeatures,))) # Hidden layer with `units1` nodes # input_shape: input layer (n° features)
    model.add(Dense(units2, activation='sigmoid')) # Hidden layer with `units2` nodes
    model.add(Dense(1, activation='sigmoid')) # Output Layer: n° output variables
    # Compile model
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias that newer Keras rejects.
    optimizer = Adam(learning_rate=0.01)
    model.compile(loss='mse', optimizer=optimizer, metrics=[coeff_determination])
    return model

In [None]:
maindir1 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\preparedInputData'
maindir2 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\ANN'
gf_folders = ['withoutGF','withGF']
# set random seed
seed = 4
set_random_seed(seed)

# One entry is appended to each list per processed workbook; they are combined
# into the `results` DataFrame once both folders have been searched.
GF = []
DATE = []
Nb_S2_used = []
units1 = []
units2 = []
r2 = []
for gf_folder in gf_folders:
    subdir1 = os.path.join(maindir1, gf_folder, 'France')
    files_temp = [fileName for fileName in os.listdir(subdir1) if 'tiff' not in fileName]

    for file1 in tqdm(files_temp):
        for file2 in os.listdir(os.path.join(subdir1, file1)):
            if 'Best_4' in file2:
                continue  # workbooks whose name contains 'Best_4' are excluded from the search

            # step0: Read and split data
            excel_file = pd.read_excel(os.path.join(subdir1, file1, file2))
            # Builtin `float` replaces the `np.float` alias (deprecated in NumPy 1.20, removed in 1.24).
            y = np.array(excel_file['L0'].values, dtype=float).reshape(-1, 1)   # Target data
            X = excel_file.drop(['Unnamed: 0','idx','idy','L0'], axis=1).values  # Predictor variables
            # Data Normalization
            # NOTE(review): both scalers are fitted on the full data set before the split,
            # so the held-out rows influence the scaling of the training rows.
            y = MinMaxScaler().fit_transform(y)
            X = MinMaxScaler().fit_transform(X)
            train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=seed) # Split data

            ############### Tune the Network Design ###############
            # N° Layers: fixed as 2
            # N° Neurons: a large enough single layer network can approximate any other neural network, at least in theory.

            Nfeatures = train_X.shape[1]  # read by build_model() as the input width
            # create model
            model = KerasRegressor(build_fn=build_model, epochs=100, batch_size=10, verbose=0)
            # define the grid search parameters: even widths 2..14 for each hidden layer
            param_grid = {'units1':list(range(2,15,2)),
                          'units2':list(range(2,15,2))}
            grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring=my_scorer, n_jobs=n_jobs)
            grid_result = grid.fit(train_X, train_y)

            # step6: Save results
            GF.append(gf_folder)
            DATE.append(file1[:4]+'-'+file1[4:6]+'-'+file1[6:])   # folder name reformatted as YYYY-MM-DD
            Nb_S2_used.append(int(file2[17:18]))                  # presumably the S2 image count encoded in the file name; verify
            units1.append(grid_result.best_params_['units1'])
            units2.append(grid_result.best_params_['units2'])
            r2.append(round(grid_result.best_score_*100,2))       # mean CV R^2 of the best candidate, in percent

results = pd.DataFrame({'Date':DATE,
                        'Gapfilling':GF,
                        'Nb_S2_used':Nb_S2_used,
                        'units1':units1,
                        'units2':units2,
                        'R2':r2
                        })

 40%|██████████████████████████████▍                                             | 6/15 [8:06:07<12:09:10, 4861.19s/it]

In [None]:
# First-hidden-layer width that won the grid search most often
units1_wins = results['units1'].value_counts()
units1_wins.idxmax()

In [None]:
# Second-hidden-layer width that won the grid search most often
units2_wins = results['units2'].value_counts()
units2_wins.idxmax()

In [None]:
def build_model(units1=14,units2=12,init_mode1='uniform', init_mode2='uniform' , init_mode3='uniform', activ_func1='sigmoid', activ_func2='sigmoid', activ_func3='sigmoid'):
    """Build and compile a two-hidden-layer network with tunable initializers and activations.

    ``units1``/``units2`` are the hidden-layer widths. ``init_mode1..3`` and
    ``activ_func1..3`` are the kernel initializer and activation of the first hidden
    layer, second hidden layer and output layer respectively. The input width is
    read from the module-level ``Nfeatures``, which must be set before this is called.
    """
    # create model
    model = Sequential()
    model.add(Dense(units1, kernel_initializer=init_mode1, activation=activ_func1, input_shape=(Nfeatures,))) # Hidden layer with `units1` nodes # input_shape: input layer (n° features)
    model.add(Dense(units2, kernel_initializer=init_mode2, activation=activ_func2)) # Hidden layer with `units2` nodes
    model.add(Dense(1, kernel_initializer=init_mode3, activation=activ_func3)) # Output Layer: n° output variables
    # Compile model
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias that newer Keras rejects.
    optimizer = Adam(learning_rate=0.01)
    model.compile(loss='mse', optimizer=optimizer, metrics=[coeff_determination])
    return model

In [None]:
maindir1 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\preparedInputData'
maindir2 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\ANN'
gf_folders = ['withoutGF','withGF']
# set random seed
seed = 4
set_random_seed(seed)

# One entry is appended to each list per processed workbook; they are combined
# into the `results` DataFrame once both folders have been searched.
GF = []
DATE = []
Nb_S2_used = []
activ_func1 = []
activ_func2 = []
activ_func3 = []
r2 = []
for gf_folder in gf_folders:
    subdir1 = os.path.join(maindir1, gf_folder, 'France')
    files_temp = [fileName for fileName in os.listdir(subdir1) if 'tiff' not in fileName]

    for file1 in tqdm(files_temp):
        for file2 in os.listdir(os.path.join(subdir1, file1)):
            if 'Best_4' in file2:
                continue  # workbooks whose name contains 'Best_4' are excluded from the search

            # step0: Read and split data
            excel_file = pd.read_excel(os.path.join(subdir1, file1, file2))
            # Builtin `float` replaces the `np.float` alias (deprecated in NumPy 1.20, removed in 1.24).
            y = np.array(excel_file['L0'].values, dtype=float).reshape(-1, 1)   # Target data
            X = excel_file.drop(['Unnamed: 0','idx','idy','L0'], axis=1).values  # Predictor variables
            # Data Normalization
            # NOTE(review): both scalers are fitted on the full data set before the split,
            # so the held-out rows influence the scaling of the training rows.
            y = MinMaxScaler().fit_transform(y)
            X = MinMaxScaler().fit_transform(X)
            train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=seed) # Split data

            ############### Tune Network Weight Initialization & Activation Function ###############
            # Only the three activation functions are searched; the initializers keep
            # their build_model() defaults.
            Nfeatures = train_X.shape[1]  # read by build_model() as the input width
            # create model
            model = KerasRegressor(build_fn=build_model, epochs=100, batch_size=10, verbose=0)
            # define the grid search parameters
            activ_func = ['softmax', 'relu', 'tanh', 'sigmoid']
            param_grid = {'activ_func1':activ_func,
                          'activ_func2':activ_func,
                          'activ_func3':activ_func}
            grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring=my_scorer, n_jobs=n_jobs)
            grid_result = grid.fit(train_X, train_y)

            # step6: Save results
            GF.append(gf_folder)
            DATE.append(file1[:4]+'-'+file1[4:6]+'-'+file1[6:])   # folder name reformatted as YYYY-MM-DD
            Nb_S2_used.append(int(file2[17:18]))                  # presumably the S2 image count encoded in the file name; verify
            activ_func1.append(grid_result.best_params_['activ_func1'])
            activ_func2.append(grid_result.best_params_['activ_func2'])
            activ_func3.append(grid_result.best_params_['activ_func3'])
            r2.append(round(grid_result.best_score_*100,2))       # mean CV R^2 of the best candidate, in percent

results = pd.DataFrame({'Date':DATE,
                        'Gapfilling':GF,
                        'Nb_S2_used':Nb_S2_used,
                        'activ_func1':activ_func1,
                        'activ_func2':activ_func2,
                        'activ_func3':activ_func3,
                        'R2':r2
                        })

In [None]:
# First-hidden-layer activation that won the grid search most often
activ_func1_wins = results['activ_func1'].value_counts()
activ_func1_wins.idxmax()

In [None]:
# Second-hidden-layer activation that won the grid search most often
activ_func2_wins = results['activ_func2'].value_counts()
activ_func2_wins.idxmax()

In [None]:
# Output-layer activation that won the grid search most often
activ_func3_wins = results['activ_func3'].value_counts()
activ_func3_wins.idxmax()

In [None]:
# optimizer = 'Adam'
# learn_rate = 0.01
# Epochs = 100
# Batch_size = 10
# Units1 = 14
# Units2 = 12
# ActivFunc1 = 'sigmoid'
# ActivFunc2 = 'sigmoid'
# ActivFunc3 = 'sigmoid'

# K-Nearest Neighbors

In [1]:
import os
import pandas as pd
import numpy as np 
from tqdm import tqdm
import collections
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV, GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
maindir1 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\preparedInputData'
maindir2 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\KNN'
# gf_folders = ['withoutGF','withGF']
gf_folders = ['withGF']
seed = 4
# The 'neg_*' scorers return negated errors, so the MAE/RMSE columns written to
# the per-file score workbooks below hold values <= 0.
scoring = {'r2': 'r2',
           'mae': 'neg_mean_absolute_error',
           'rmse':'neg_root_mean_squared_error'}

for gf_folder in gf_folders:
    subdir1 = os.path.join(maindir1, gf_folder, 'France')
    files_temp = [fileName for fileName in os.listdir(subdir1) if 'tiff' not in fileName]
    # Per-folder accumulators: one entry per processed workbook.
    GF = []
    DATE = []
    Nb_S2_used = []
    n_neighbors = []
    leaf_size = []
    weights = []
    MAE = []
    RMSE = []
    r2 = []
    for file1 in tqdm(files_temp):
        for file2 in os.listdir(os.path.join(subdir1, file1)):
            if 'Best_4' in file2:
                continue  # workbooks whose name contains 'Best_4' are excluded from the search

            # step0: Read and split data
            excel_file = pd.read_excel(os.path.join(subdir1, file1, file2))
            y = excel_file.L0                                                                         # Target data
            features = ['L'+str(i) for i in range(1,len(excel_file.columns)-3)]                       # Predictor variables
            X = excel_file[features]                                                                  # Subset df
            train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=seed) # Split data

            # step1: candidate values for each hyper-parameter (5 * 4 * 2 = 40 combinations)
            random_grid = {'n_neighbors':[4,5,6,7,8],
                           'leaf_size':[1,2,3,5],
                           'weights':['uniform', 'distance']
                           }

            # step2: initiate the search for best combination
            random_search = RandomizedSearchCV(estimator = KNeighborsRegressor(),     # Create base model
                                               param_distributions = random_grid,     # candidate parameter values
                                               n_iter = 40,                           # equals the grid size, so every combination is tried
                                               cv = 5,                                # using 5 fold cross validation
                                               verbose=0,                             # raise to print search progress
                                               random_state=seed,                     # fixes the order in which candidates are sampled
                                               n_jobs = -1)                           # use all available cores
            random_search.fit(train_X, train_y)                                       # Fit the random search model
            best_params = random_search.best_params_

            # step3: Calculate model using best parameters
            kfold_indexes = list(KFold(10,shuffle=True,random_state=seed).split(train_X)) # split training into Kfolds and shuffle
            model = KNeighborsRegressor(n_neighbors=best_params['n_neighbors'],
                                        leaf_size=best_params['leaf_size'],
                                        weights=best_params['weights']
                                        )                                             # Define model parameters

            # step4: Get scores of KFolds CV for verification
            scores = cross_validate(model,train_X,train_y,cv=kfold_indexes,scoring=scoring,return_estimator=True)
            df = pd.DataFrame({'test_r2':scores['test_r2'],
                               'test_mae':scores['test_mae'],
                               'test_rmse':scores['test_rmse']})

            path1 = os.path.join(maindir2,'France',gf_folder,'initialTesting')
            os.makedirs(path1, exist_ok=True)
            outputdir1 = os.path.join(path1,'scores10Folds_'+file2[17:])
            # `encoding` is no longer accepted by DataFrame.to_excel (removed in pandas 2.0).
            df.to_excel(outputdir1, index=False)

            # step5: Get scores on the held-out validation split
            model.fit(train_X, train_y)                                   # Fit model based on training data
            predictions = model.predict(val_X)                            # Apply model on validation data
            MAE1 = mean_absolute_error(val_y, predictions)                # Measure MAE (less sensitive to outliers compared to RMSE)
            # sqrt(MSE) replaces `squared=False`, which recent scikit-learn releases deprecate/remove.
            RMSE1 = np.sqrt(mean_squared_error(val_y, predictions))       # Measure RMSE
            Rsquared1 = r2_score(val_y, predictions)                      # Coefficient of determination R^2

            # step6: Save results
            GF.append(gf_folder)
            DATE.append(file1[:4]+'-'+file1[4:6]+'-'+file1[6:])           # folder name reformatted as YYYY-MM-DD
            Nb_S2_used.append(int(file2[17:18]))                          # presumably the S2 image count encoded in the file name; verify
            n_neighbors.append(best_params['n_neighbors'])
            leaf_size.append(best_params['leaf_size'])
            weights.append(best_params['weights'])
            MAE.append(round(MAE1,2))
            RMSE.append(round(RMSE1,2))
            r2.append(round(Rsquared1*100,2))

    results = pd.DataFrame({'Date':DATE,
                            'Gapfilling':GF,
                            'Nb_S2_used':Nb_S2_used,
                            'n_neighbors':n_neighbors,
                            'leaf_size':leaf_size,
                            'weights':weights,
                            'MAE':MAE,
                            'RMSE':RMSE,
                            'R2':r2
                            })

    # step7: Export as excel files
    path2 = os.path.join(maindir2,'France',gf_folder,'initialTesting')
    os.makedirs(path2, exist_ok=True)
    outputdir2 = os.path.join(path2,'Performance_'+gf_folder+'.xlsx')
    results.to_excel(outputdir2, index=False)

100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [03:59<00:00, 15.98s/it]


In [13]:
maindir = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\KNN\France'
folder = 'withGF'   # withGF or withoutGF
Nb_S2_used = 3       # 1, 2, 3, 4

# Load the exported performance table and keep the rows for the chosen Nb_S2_used.
performance_path = os.path.join(maindir, folder, 'initialTesting', 'Performance_' + folder + '.xlsx')
excel_file1 = pd.read_excel(performance_path)
df = pd.DataFrame(excel_file1)
df1 = df[df["Nb_S2_used"] == Nb_S2_used]

# Tally how often each hyper-parameter value was selected as best.
lst1, lst2, lst3 = (list(df1[column]) for column in ("n_neighbors", "leaf_size", "weights"))
for selected_values in (lst1, lst2, lst3):
    print(collections.Counter(selected_values))

Counter({4: 6, 5: 3, 7: 3, 6: 2, 8: 1})
Counter({1: 15})
Counter({'distance': 15})


In [None]:
# Record of the most frequently selected hyper-parameters per configuration.
# Every group below reassigns the same three names, so after running this cell
# only the values of the last group remain bound.
######################## WithoutGF ########################
# HyperParameters (most frequent)
##### 1
n_neighbors = 8
leaf_size = 1
weights = 'distance'
##### 2
n_neighbors = 8
leaf_size = 1
weights = 'distance'
##### 3
n_neighbors = 6
leaf_size = 1
weights = 'distance'
######################## WithGF ########################
# HyperParameters (most frequent)
##### 1
n_neighbors = 8
leaf_size = 1
weights = 'distance'
##### 2
n_neighbors = 8
leaf_size = 1
weights = 'distance'
##### 3
n_neighbors = 4
leaf_size = 1
weights = 'distance'

In [None]:
################ Selected config (both withGF or withoutGF) ################
# Single KNN configuration retained for both gap-filling variants.

n_neighbors = 8
leaf_size = 1
weights = 'distance'

# Random Forests

In [4]:
import os
import pandas as pd
import numpy as np 
from tqdm import tqdm
import collections
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV, GridSearchCV, cross_validate
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [5]:
# Random Forest hyper-parameter search and evaluation (France data).
#
# For every gap-filling variant, every sub-folder of the prepared input data and
# every workbook in it (except the 'Best_4' ones), this cell:
#   step0    reads the workbook and makes an 80/20 train/validation split,
#   step1-2  runs a randomized search over n_estimators / max_depth (5-fold CV),
#   step3-4  re-scores the best configuration with a shuffled 10-fold CV and
#            writes the per-fold scores to 'scores10Folds_*.xlsx',
#   step5    refits on the training split and scores on the validation split,
#   step6-7  collects one summary row per workbook and exports
#            'Performance_<variant>.xlsx'.
maindir1 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\preparedInputData'
maindir2 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\RF'
# gf_folders = ['withoutGF','withGF']   # uncomment to process both variants
gf_folders = ['withGF']
seed = 4
# NOTE: the MAE/RMSE scorers are scikit-learn's *negated* versions, so the
# per-fold 'test_mae' / 'test_rmse' values written below are <= 0.
scoring = {'r2': 'r2',
           'mae': 'neg_mean_absolute_error',
           'rmse': 'neg_root_mean_squared_error'}
n_iter = 100                                       # number of parameter settings sampled by the randomized search
# Column order of the exported summary workbook.
result_columns = ['Date', 'Gapfilling', 'Nb_S2_used',
                  'n_estimators', 'max_features', 'max_depth',
                  'MAE', 'RMSE', 'R2']

for gf_folder in gf_folders:
    subdir1 = os.path.join(maindir1, gf_folder, 'France')
    files_temp = [fileName for fileName in os.listdir(subdir1) if 'tiff' not in fileName]

    # Per-fold scores and the summary workbook of a variant share one output folder.
    output_path = os.path.join(maindir2, 'France', gf_folder, 'initialTesting')
    os.makedirs(output_path, exist_ok=True)

    records = []                                   # one summary row (dict) per processed workbook
    for file1 in tqdm(files_temp):
        for file2 in os.listdir(os.path.join(subdir1, file1)):
            if 'Best_4' in file2:
                continue

            # step0: Read and split data
            excel_file = pd.read_excel(os.path.join(subdir1, file1, file2))
            y = excel_file.L0                                                    # Target data
            # Predictor columns L1 .. L(n_columns - 4)
            # NOTE(review): presumably the remaining columns are non-predictor columns - confirm.
            features = ['L' + str(i) for i in range(1, len(excel_file.columns) - 3)]
            X = excel_file[features]
            train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=seed)

            # step1: create random grid (the search samples combinations of these values)
            random_grid = {'n_estimators': list(range(50, 501, 50)),             # Number of trees in random forest
                           # A third of the features per split; at least 1 so that
                           # fewer than 3 features does not give an invalid value of 0.
                           'max_features': [max(1, int(len(features) / 3.0))],
                           'max_depth': list(range(5, 61, 5))}                   # Maximum number of levels in tree

            # step2: initiate the search for best combination
            random_search = RandomizedSearchCV(estimator=RFR(random_state=seed), # Create base model
                                               param_distributions=random_grid,  # values to sample from
                                               n_iter=n_iter,                    # number of sampled combinations
                                               cv=5,                             # using 5 fold cross validation
                                               verbose=1,                        # report progress of the fits
                                               random_state=seed,                # fixed seed so the sampled combinations are repeatable
                                               n_jobs=-1)                        # use all available cores
            random_search.fit(train_X, train_y)
            best_params = random_search.best_params_

            # step3: Define the model with the best parameters
            kfold_indexes = list(KFold(10, shuffle=True, random_state=seed).split(train_X))
            forestModel = RFR(n_estimators=best_params['n_estimators'],
                              max_features=best_params['max_features'],
                              max_depth=best_params['max_depth'],
                              random_state=seed)

            # step4: Get scores of KFolds CV for verification
            scores = cross_validate(forestModel, train_X, train_y, cv=kfold_indexes,
                                    scoring=scoring, return_estimator=True)
            fold_scores = pd.DataFrame({'test_r2': scores['test_r2'],
                                        'test_mae': scores['test_mae'],
                                        'test_rmse': scores['test_rmse']})
            # NOTE(review): the file name is built from file2 only; if file2[17:] does not
            # contain the date, workbooks of different date folders overwrite each other - confirm.
            # `encoding` is not passed: pandas removed that argument from `to_excel`.
            fold_scores.to_excel(os.path.join(output_path, 'scores10Folds_' + file2[17:]), index=False)

            # step5: Fit on the training split and score on the validation split
            forestModel.fit(train_X, train_y)
            predictions = forestModel.predict(val_X)
            MAE1 = mean_absolute_error(val_y, predictions)              # MAE (less sensitive to outliers than RMSE)
            # RMSE as sqrt(MSE): avoids the `squared=False` argument, which newer
            # scikit-learn releases no longer accept.
            RMSE1 = np.sqrt(mean_squared_error(val_y, predictions))
            Rsquared1 = r2_score(val_y, predictions)                    # coefficient of determination R^2

            # step6: Save results
            records.append({'Date': file1[:4] + '-' + file1[4:6] + '-' + file1[6:],   # file1 is sliced as YYYYMMDD
                            'Gapfilling': gf_folder,
                            'Nb_S2_used': int(file2[17:18]),             # parsed from character 17 of the file name
                            'n_estimators': best_params['n_estimators'],
                            'max_features': best_params['max_features'],
                            'max_depth': best_params['max_depth'],
                            'MAE': round(MAE1, 2),
                            'RMSE': round(RMSE1, 2),
                            'R2': round(Rsquared1 * 100, 2)})            # R^2 stored as a percentage

    # `columns=` keeps the column order (and the columns themselves) even if nothing was processed.
    results = pd.DataFrame(records, columns=result_columns)

    # step7: Export as excel files
    results.to_excel(os.path.join(output_path, 'Performance_' + gf_folder + '.xlsx'), index=False)

  0%|                                                                                           | 0/15 [00:00<?, ?it/s]

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.6min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   26.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  7.5min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   24.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  7.1min finished
  7%|█████▎                                                                         | 1/15 [20:59<4:53:56, 1259.75s/it]

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.4min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   18.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  5.5min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   25.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  7.6min finished
 13%|██████████▌                                                                    | 2/15 [39:08<4:21:49, 1208.41s/it]

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.4min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   18.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  5.4min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   26.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  7.6min finished
 20%|███████████████▊                                                               | 3/15 [58:19<3:58:13, 1191.14s/it]

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.3min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   18.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  5.4min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   25.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  7.4min finished
 27%|████████████████████▌                                                        | 4/15 [1:16:14<3:32:00, 1156.41s/it]

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.3min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   18.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  5.4min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   26.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  7.6min finished
 33%|█████████████████████████▋                                                   | 5/15 [1:36:22<3:15:19, 1171.96s/it]

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.6min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   19.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  6.2min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   27.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  8.4min finished
 40%|██████████████████████████████▊                                              | 6/15 [1:58:07<3:01:44, 1211.64s/it]

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.4min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   18.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  5.7min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   25.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  7.8min finished
 47%|███████████████████████████████████▉                                         | 7/15 [2:18:42<2:42:29, 1218.72s/it]

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.6min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   19.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  6.0min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   26.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  8.2min finished
 53%|█████████████████████████████████████████                                    | 8/15 [2:41:37<2:27:39, 1265.59s/it]

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.4min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   18.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  5.6min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   25.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  7.9min finished
 60%|██████████████████████████████████████████████▏                              | 9/15 [3:01:35<2:04:32, 1245.38s/it]

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.4min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  6.0min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   23.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  7.8min finished
 67%|██████████████████████████████████████████████████▋                         | 10/15 [3:21:37<1:42:41, 1232.27s/it]

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.2min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   17.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  5.2min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   24.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  6.9min finished
 73%|███████████████████████████████████████████████████████▋                    | 11/15 [3:38:12<1:17:24, 1161.20s/it]

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.4min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   18.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  5.6min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   24.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  7.2min finished
 80%|██████████████████████████████████████████████████████████████▍               | 12/15 [3:56:13<56:51, 1137.27s/it]

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.4min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   18.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  5.4min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   24.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  7.1min finished
 87%|███████████████████████████████████████████████████████████████████▌          | 13/15 [4:13:27<36:52, 1106.19s/it]

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.3min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   18.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  5.3min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   24.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  7.1min finished
 93%|████████████████████████████████████████████████████████████████████████▊     | 14/15 [4:32:43<18:41, 1121.19s/it]

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.4min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  5.4min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   25.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  7.3min finished
100%|██████████████████████████████████████████████████████████████████████████████| 15/15 [4:49:53<00:00, 1159.60s/it]


In [18]:
# Inspect how often each Random Forest hyper-parameter value was selected by
# the search, for one gap-filling variant and one `Nb_S2_used` value.
maindir = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\RF\France'
folder = 'withGF'      # either 'withGF' or 'withoutGF'
Nb_S2_used = 3         # one of 1, 2, 3, 4

# Summary workbook written by the Random Forest search cell for the chosen variant.
performance_path = os.path.join(maindir, folder, 'initialTesting', 'Performance_' + folder + '.xlsx')
excel_file1 = pd.read_excel(performance_path)
df = pd.DataFrame(excel_file1)
df1 = df[df["Nb_S2_used"] == Nb_S2_used]   # keep only the rows of the chosen Nb_S2_used

# One frequency table per tuned hyper-parameter, printed in a fixed order.
for column in ("n_estimators", "max_depth"):
    print(collections.Counter(df1[column].tolist()))

Counter({500: 6, 100: 3, 200: 2, 50: 2, 300: 1, 450: 1})
Counter({20: 5, 25: 4, 60: 1, 30: 1, 15: 1, 35: 1, 40: 1, 10: 1})


In [None]:
# Record of the most frequently selected Random Forest hyper-parameters.
# Each "#####" group rebinds the same two names, so after running the cell only
# the values of the very last group remain bound.
#
# Where several n_estimators values are listed as alternatives, the first one
# is assigned and the others are named in a comment. The original wrote these
# as `a or b or c`, which Python evaluates to the first operand anyway, so the
# bound value is unchanged while the intent is now explicit.
######################## WithoutGF ########################
# HyperParameters (most frequent)
##### 1
n_estimators = 450
max_depth = 10
##### 2
n_estimators = 50      # alternatives noted: 50, 150 or 500
max_depth = 10
##### 3
n_estimators = 150
max_depth = 15
######################## WithGF ########################
# HyperParameters (most frequent)
##### 1
n_estimators = 450
max_depth = 10
##### 2
n_estimators = 200     # alternatives noted: 200 or 500
max_depth = 25
##### 3
n_estimators = 500
max_depth = 20

In [None]:
################ Selected config (used for both withGF and withoutGF) ################
# Single Random Forest configuration retained for both gap-filling variants.
n_estimators, max_depth = 500, 25
# A third of the predictor columns, rounded down. `features` must already be
# bound in the kernel (the search cell assigns it).
max_features = len(features) // 3