In [1]:
import os
# NOTE(review): Python reads PYTHONHASHSEED when the interpreter starts, so
# assigning it here cannot change string hashing in the already-running kernel;
# the value is only inherited by processes launched afterwards. Export the
# variable before starting Jupyter if hash determinism is actually required.
os.environ['PYTHONHASHSEED']=str(4)

import random
import pandas as pd
import numpy as np 
import rasterio as rio
from copy import deepcopy
from joblib import Parallel, delayed
from tqdm import tqdm
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.optimizers import Nadam, Adam
from keras.layers import Dense, Activation
from keras.wrappers.scikit_learn import KerasRegressor as ANN
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.colors as colors
from rasterio.plot import show

In [2]:
def set_random_seed(x):
    """Seed every pseudo-random generator used in this notebook with `x`.

    The generators are seeded in this order: TensorFlow, NumPy's legacy
    global generator, and Python's built-in `random` module.
    """
    for seed_generator in (tf.random.set_seed, np.random.seed, random.seed):
        seed_generator(x)

In [3]:
############################################################################
# Results are not reproducible when estimators run in parallel, so keep    #
# n_jobs = 1 everywhere and seed every random generator once, up front.    #
############################################################################
# `seed` is also reused below as `random_state` for the data splits and RF.
seed = 4
set_random_seed(seed) 

In [6]:
def build_model(learn_rate=0.01, units1=14,units2=12,activ_func1='sigmoid',activ_func2='sigmoid',activ_func3='sigmoid', n_features=None):
    """Build and compile the dense regression network wrapped by `ANN`.

    Parameters
    ----------
    learn_rate : float
        Learning rate handed to the Adam optimizer.
    units1, units2 : int
        Number of units in the first and second Dense layers.
    activ_func1, activ_func2, activ_func3 : str
        Activations of the two hidden layers and of the single-unit output layer.
    n_features : int, optional
        Width of the input layer. When omitted, the module-level `Nfeatures`
        (assigned by the calling cell before the model is built) is used, which
        is what this function always did before the parameter existed.

    Returns
    -------
    keras.models.Sequential
        The model compiled with mean-squared-error loss.
    """
    if n_features is None:
        # Backward-compatible fallback on the notebook-global feature count.
        n_features = Nfeatures

    model = Sequential()
    model.add(Dense(units1, kernel_initializer='uniform', activation=activ_func1, input_shape=(n_features,)))
    model.add(Dense(units2, kernel_initializer='uniform', activation=activ_func2))
    model.add(Dense(1, kernel_initializer='uniform', activation=activ_func3))
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    optimizer = Adam(learning_rate=learn_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-07, name="Adam")
    # Accuracy is a classification metric and carries no meaning for a
    # continuous target, so mean absolute error is tracked instead.
    model.compile(loss='mse', optimizer=optimizer, metrics=['mae'])
    return model

In [7]:
# 10-fold cross-validation of KNN, RF and ANN on every prepared workbook whose
# name does not contain 'Best_4'. One row of fold-averaged MSE / R2 is recorded
# per (model, gap-filling folder, date folder, workbook) and exported to Excel.
maindir1 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\preparedInputData'
maindir2 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning'
ml_models = ['KNN','RF','ANN']
# gf_folders = ['withoutGF','withGF']
gf_folders = ['withGF']
scoring = {'mse':'neg_mean_squared_error', 'r2': 'r2'}

Model = []
GF = []
DATE = []
Nb_S2_used = []
mse = []
r2 = []

for ml_model in ml_models:
    for gf_folder in gf_folders:
        subdir1 = os.path.join(maindir1,gf_folder,'France')
        files_temp = [fileName for fileName in os.listdir(subdir1) if 'tiff' not in fileName]

        for file1 in tqdm(files_temp):
            for file2 in os.listdir(os.path.join(subdir1,file1)):
                if 'Best_4' in file2:
                    continue

                # step0: Read and split data
                excel_file = pd.read_excel(os.path.join(subdir1,file1,file2))
                # `np.float` was removed from NumPy; the builtin `float` is the same dtype.
                y = np.array(excel_file['L0'].values,dtype=float).reshape(-1,1)  # Target data
                excel_file = excel_file.drop(['Unnamed: 0','idx','idy','L0'], axis=1)
                # NOTE(review): this runs after four columns were dropped, so the list
                # holds (number of columns in X) - 4 names. It only feeds RF's
                # `max_features` below; confirm the smaller count is intended.
                features = ['L'+str(i) for i in range(1,len(excel_file.columns)-3)]
                X = excel_file.values
                # Data normalization.
                # NOTE(review): both scalers are fitted on the full table before the
                # split, so validation-fold statistics influence the scaling.
                y = MinMaxScaler().fit_transform(y)
                X = MinMaxScaler().fit_transform(X)
                train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=seed) # Split data

                # split training into Kfolds and shuffle
                kfold_indexes = list(KFold(10,shuffle=True,random_state=seed).split(train_X))
                Nfeatures = train_X.shape[1]  # read by build_model() as a global
                ############### Model with selected hyper-parameters (n_jobs=1 : to ensure replicability) ###############
                if ml_model == 'ANN':
                    model = ANN(build_fn=build_model, epochs=100, batch_size=10, verbose=0)  # create model
                elif ml_model == 'KNN':
                    train_y = train_y.ravel() # scikit-learn regressors expect a 1-d target
                    model = KNN(n_neighbors=8, leaf_size=1, weights='distance')
                elif ml_model == 'RF':
                    train_y = train_y.ravel() # scikit-learn regressors expect a 1-d target
                    model = RFR(n_estimators=500, max_features=int(len(features)/3.0), max_depth=25, random_state=seed)

                scores = cross_validate(model,train_X,train_y,cv=kfold_indexes,scoring=scoring,return_estimator=True)
                avg_mse = np.mean(scores['test_mse'])
                avg_r2 = np.mean(scores['test_r2'])
                ############### Save Results ###############
                Model.append(ml_model)
                GF.append(gf_folder)
                DATE.append(file1[:4]+'-'+file1[4:6]+'-'+file1[6:])
                Nb_S2_used.append(int(file2[17:18]))  # single character at a fixed position in the workbook name
                mse.append(round(-avg_mse,4)) # the scorer returns negated MSE
                r2.append(round(avg_r2,4))

############### Export Results ###############
results = pd.DataFrame({'Model':Model,
                        'Date':DATE,
                        'Gapfilling':GF,
                        'Nb_S2_used':Nb_S2_used,
                        'mse':mse,
                        'r2':r2
                        })
outputdir2 = os.path.join(maindir2,'10k_cvResults_all3Models.xlsx')
# `encoding` is no longer accepted by DataFrame.to_excel, so it is not passed.
results.to_excel(outputdir2, index=False)

100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [01:33<00:00,  6.23s/it]
100%|█████████████████████████████████████████████████████████████████████████████████| 15/15 [56:39<00:00, 226.66s/it]
100%|███████████████████████████████████████████████████████████████████████████████| 15/15 [2:38:48<00:00, 635.25s/it]


# Select best learner

In [7]:
# Selected model: RF
# N°S2: 2

# run the algorithm on all training data
# check MSE and R2 are similar to previous
# Get feature importance results

In [9]:
# Fit the selected learner (RF) on the training split of every 'Best_2'
# workbook, score it on the 20 % hold-out split, and record the labels of the
# three most important features. Results are exported to Excel.
maindir1 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\preparedInputData'
maindir2 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning'
ml_models = ['RF']
# gf_folders = ['withoutGF', 'withGF']
gf_folders = ['withGF']

Model = []
GF = []
DATE = []
Nb_S2_used = []
mse = []
r2 = []
best3Features = []

# Label -> layer numbers (the N of layer "LN") carrying that label.
dict_temp = {'B':[1,10,19,28],'G':[4,13,22,31],'R':[7,16,25,34],
             'BG':[2,11,20,29],'GB':[5,14,23,32],'RB':[8,17,26,35],
             'BR':[3,12,21,30],'GR':[6,15,24,33],'RG':[9,18,27,36]}
# Inverted once so each layer number resolves with a single lookup.
label_by_layer = {layer: label for label, layers in dict_temp.items() for layer in layers}

for ml_model in ml_models:
    for gf_folder in gf_folders:
        subdir1 = os.path.join(maindir1,gf_folder,'France')
        files_temp = [fileName for fileName in os.listdir(subdir1) if 'tiff' not in fileName]

        for file1 in tqdm(files_temp):
            for file2 in os.listdir(os.path.join(subdir1,file1)):
                if 'Best_2' not in file2:
                    continue

                # step0: Read and split data
                excel_file = pd.read_excel(os.path.join(subdir1,file1,file2))
                # `np.float` was removed from NumPy; the builtin `float` is the same dtype.
                y = np.array(excel_file['L0'].values,dtype=float).reshape(-1,1)  # Target data
                excel_file = excel_file.drop(['Unnamed: 0','idx','idy','L0'], axis=1)
                # NOTE(review): computed after the drop, so this list holds
                # (number of columns in X) - 4 names; it feeds `max_features`.
                features = ['L'+str(i) for i in range(1,len(excel_file.columns)-3)]
                X = excel_file.values
                # Data normalization.
                # NOTE(review): scalers are fitted before the split, so the
                # hold-out rows influence the scaling of the training rows.
                y = MinMaxScaler().fit_transform(y)
                X = MinMaxScaler().fit_transform(X)
                train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=seed) # Split data

                Nfeatures = train_X.shape[1]
                train_y = train_y.ravel() # scikit-learn regressors expect a 1-d target

                ############### Predict turbidity using RF ###############
                # NOTE(review): max_depth is 10 here but 25 in the cross-validation
                # cell; confirm which value is the selected hyper-parameter.
                model = RFR(n_estimators=500, max_features=int(len(features)/3.0), max_depth=10, random_state=seed)
                model.fit(train_X, train_y)
                y_pred = model.predict(val_X)

                # Indices of the three largest importances, least important first.
                importance = model.feature_importances_
                indices = sorted(range(len(importance)), key=lambda i: importance[i])[-3:]
                # Indices are 0-based whilst layer names start from L1, hence the +1.
                # Layers without a label contribute nothing, as before.
                best3F = ''.join(label_by_layer[i+1]+' ' for i in indices if i+1 in label_by_layer)

                Model.append(ml_model)
                GF.append(gf_folder)
                DATE.append(file1[:4]+'-'+file1[4:6]+'-'+file1[6:])
                Nb_S2_used.append(int(file2[17:18]))  # single character at a fixed position in the workbook name
                # `squared=True` was the default and the argument is deprecated, so it is omitted.
                mse.append(round(mean_squared_error(val_y, y_pred),4))
                r2.append(round(r2_score(val_y, y_pred),4))
                best3Features.append(best3F)

results = pd.DataFrame({'Model':Model,
                        'Date':DATE,
                        'Gapfilling':GF,
                        'Nb_S2_used':Nb_S2_used,
                        'mse':mse,
                        'r2':r2,
                        'best3Features':best3Features
                        })

# step7: Export as excel files
outputdir2 = os.path.join(maindir2,'performanceResults_1Model.xlsx')
# `encoding` is no longer accepted by DataFrame.to_excel, so it is not passed.
results.to_excel(outputdir2, index=False)

100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [02:46<00:00, 11.10s/it]


In [5]:
# Fit the ANN on the training split of every 'Best_2' workbook (with and
# without gap filling), score it on the 20 % hold-out split, and export the
# results to Excel.
maindir1 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\preparedInputData'
maindir2 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning'
# This cell trains the ANN; the label used to read 'RF', which mislabeled
# every row of the exported 'Model' column.
ml_models = ['ANN']
gf_folders = ['withoutGF', 'withGF']

Model = []
GF = []
DATE = []
Nb_S2_used = []
mse = []
r2 = []

for ml_model in ml_models:
    for gf_folder in gf_folders:
        subdir1 = os.path.join(maindir1,gf_folder,'France')
        files_temp = [fileName for fileName in os.listdir(subdir1) if 'tiff' not in fileName]

        for file1 in tqdm(files_temp):
            for file2 in os.listdir(os.path.join(subdir1,file1)):
                if 'Best_2' not in file2:
                    continue

                # step0: Read and split data
                excel_file = pd.read_excel(os.path.join(subdir1,file1,file2))
                # `np.float` was removed from NumPy; the builtin `float` is the same dtype.
                y = np.array(excel_file['L0'].values,dtype=float).reshape(-1,1)  # Target data
                excel_file = excel_file.drop(['Unnamed: 0','idx','idy','L0'], axis=1)
                X = excel_file.values
                # Data normalization.
                # NOTE(review): scalers are fitted before the split, so the
                # hold-out rows influence the scaling of the training rows.
                y = MinMaxScaler().fit_transform(y)
                X = MinMaxScaler().fit_transform(X)
                train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=seed) # Split data

                Nfeatures = train_X.shape[1]  # read by build_model() as a global

                ############### Predict turbidity using ANN ###############
                model = ANN(build_fn=build_model, epochs=100, batch_size=10, verbose=0)  # create model
                model.fit(train_X, train_y)
                y_pred = model.predict(val_X)

                Model.append(ml_model)
                GF.append(gf_folder)
                DATE.append(file1[:4]+'-'+file1[4:6]+'-'+file1[6:])
                Nb_S2_used.append(int(file2[17:18]))  # single character at a fixed position in the workbook name
                # `squared=True` was the default and the argument is deprecated, so it is omitted.
                mse.append(round(mean_squared_error(val_y, y_pred),4))
                r2.append(round(r2_score(val_y, y_pred),4))

results = pd.DataFrame({'Model':Model,
                        'Date':DATE,
                        'Gapfilling':GF,
                        'Nb_S2_used':Nb_S2_used,
                        'mse':mse,
                        'r2':r2
                        })

# step7: Export as excel files
outputdir2 = os.path.join(maindir2,'perfANN.xlsx')
# `encoding` is no longer accepted by DataFrame.to_excel, so it is not passed.
results.to_excel(outputdir2, index=False)

100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [10:16<00:00, 41.13s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [06:50<00:00, 27.39s/it]


In [6]:
# Fit KNN on the training split of every 'Best_2' workbook (with and without
# gap filling), score it on the 20 % hold-out split, and export the results
# to Excel.
maindir1 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\preparedInputData'
maindir2 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning'
# This cell trains KNN; the label used to read 'RF', which mislabeled every
# row of the exported 'Model' column.
ml_models = ['KNN']
gf_folders = ['withoutGF', 'withGF']

Model = []
GF = []
DATE = []
Nb_S2_used = []
mse = []
r2 = []

for ml_model in ml_models:
    for gf_folder in gf_folders:
        subdir1 = os.path.join(maindir1,gf_folder,'France')
        files_temp = [fileName for fileName in os.listdir(subdir1) if 'tiff' not in fileName]

        for file1 in tqdm(files_temp):
            for file2 in os.listdir(os.path.join(subdir1,file1)):
                if 'Best_2' not in file2:
                    continue

                # step0: Read and split data
                excel_file = pd.read_excel(os.path.join(subdir1,file1,file2))
                # `np.float` was removed from NumPy; the builtin `float` is the same dtype.
                y = np.array(excel_file['L0'].values,dtype=float).reshape(-1,1)  # Target data
                excel_file = excel_file.drop(['Unnamed: 0','idx','idy','L0'], axis=1)
                X = excel_file.values
                # Data normalization.
                # NOTE(review): scalers are fitted before the split, so the
                # hold-out rows influence the scaling of the training rows.
                y = MinMaxScaler().fit_transform(y)
                X = MinMaxScaler().fit_transform(X)
                train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=seed) # Split data

                train_y = train_y.ravel() # scikit-learn regressors expect a 1-d target

                ############### Predict turbidity using KNN ###############
                model = KNN(n_neighbors=8, leaf_size=1, weights='distance')
                model.fit(train_X, train_y)
                y_pred = model.predict(val_X)

                Model.append(ml_model)
                GF.append(gf_folder)
                DATE.append(file1[:4]+'-'+file1[4:6]+'-'+file1[6:])
                Nb_S2_used.append(int(file2[17:18]))  # single character at a fixed position in the workbook name
                # `squared=True` was the default and the argument is deprecated, so it is omitted.
                mse.append(round(mean_squared_error(val_y, y_pred),4))
                r2.append(round(r2_score(val_y, y_pred),4))

results = pd.DataFrame({'Model':Model,
                        'Date':DATE,
                        'Gapfilling':GF,
                        'Nb_S2_used':Nb_S2_used,
                        'mse':mse,
                        'r2':r2
                        })

# step7: Export as excel files
outputdir2 = os.path.join(maindir2,'perfKNN.xlsx')
# `encoding` is no longer accepted by DataFrame.to_excel, so it is not passed.
results.to_excel(outputdir2, index=False)

100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:20<00:00,  1.34s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:20<00:00,  1.34s/it]
