In [1]:
import os
os.environ['PYTHONHASHSEED']=str(4)

import random
import pandas as pd
import numpy as np 
import rasterio as rio
from copy import deepcopy
from joblib import Parallel, delayed
from tqdm import tqdm
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.optimizers import Nadam, Adam
from keras.layers import Dense, Activation
from keras.wrappers.scikit_learn import KerasRegressor as ANN
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.colors as colors
from rasterio.plot import show

In [2]:
def set_random_seed(x):
    tf.random.set_seed(x) # Set the `tensorflow` pseudo-random generator at a fixed value
    np.random.seed(x)     # Set the `numpy` pseudo-random generator at a fixed value
    random.seed(x)        # Set the `python` built-in pseudo-random generator at a fixed value      

In [3]:
############################################################################
# Reproducibility is a Problem when using parallel processing  (n_jobs = 1)#
############################################################################ 
seed = 4
set_random_seed(seed) 

In [6]:
def build_model(learn_rate=0.01, units1=14,units2=12,activ_func1='sigmoid',activ_func2='sigmoid',activ_func3='sigmoid'):    
    model = Sequential()
    model.add(Dense(units1, kernel_initializer='uniform', activation=activ_func1, input_shape=(Nfeatures,))) 
    model.add(Dense(units2, kernel_initializer='uniform', activation=activ_func2))                           
    model.add(Dense(1, kernel_initializer='uniform', activation=activ_func3))
    optimizer = Adam(lr=learn_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-07, name="Adam")
    model.compile(loss='mse', optimizer=optimizer, metrics=['accuracy'])
    return model

In [7]:
maindir1 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\preparedInputData'
maindir2 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning'
ml_models = ['KNN','RF','ANN']
# gf_folders = ['withoutGF','withGF']
gf_folders = ['withGF']
scoring = {'mse':'neg_mean_squared_error', 'r2': 'r2'}

Model = []
GF = []
DATE = []
Nb_S2_used = []
mse = []
r2 = []
        
for ml_model in ml_models:    
    for gf_folder in gf_folders:    
        subdir1 = os.path.join(maindir1,gf_folder,'France')
        files_temp = [fileName for fileName in os.listdir(subdir1) if 'tiff' not in fileName]        
        
        for n in tqdm(range(len(files_temp))):
            file1 = files_temp[n]
            for file2 in os.listdir(os.path.join(subdir1,file1)): 
                if 'Best_4' not in file2:
                    excel_file = pd.read_excel(os.path.join(subdir1,file1,file2)) # step0: Read and split data
                    y = np.array(excel_file['L0'].values,dtype=np.float).reshape(-1,1)                        # Target data
                    excel_file.drop(['Unnamed: 0','idx','idy','L0'], axis=1,inplace=True)
                    features = ['L'+str(i) for i in range(1,len(excel_file.columns)-3)]<
                    X = excel_file.values                
                    y = MinMaxScaler().fit_transform(y) # Data Normalization
                    X = MinMaxScaler().fit_transform(X)
                    train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=seed) # Split data

                    kfold_indexes = list(KFold(10,shuffle=True,random_state=seed).split(train_X)) # split training into Kfolds and shuffle            
                    Nfeatures = train_X.shape[1]
                    ############### Model with selected hyper-parameters w/o cv (get scores using all data) (n_jobs=1 : to ensure replicability) ###############
                    if ml_model == 'ANN':                    
                        model = ANN(build_fn=build_model, epochs=100, batch_size=10, verbose=0)  # create model 
                    elif ml_model == 'KNN':                    
                        train_y = train_y.ravel() # flatten to 1d array # data is in a column format while it expected it in a row.
                        model = KNN(n_neighbors=8, leaf_size=1, weights='distance')
                    elif ml_model == 'RF':
                        train_y = train_y.ravel() # flatten to 1d array # data is in a column format while it expected it in a row.
                        model = RFR(n_estimators=500, max_features=int(len(features)/3.0), max_depth=25, random_state=seed)

                    scores = cross_validate(model,train_X,train_y,cv=kfold_indexes,scoring=scoring,return_estimator=True)            
                    avg_mse = np.mean(scores['test_mse'])                
                    avg_r2 = np.mean(scores['test_r2'])  
                    ############### Save Results ###############
                    Model.append(ml_model)
                    GF.append(gf_folder)
                    DATE.append(file1[:4]+'-'+file1[4:6]+'-'+file1[6:])
                    Nb_S2_used.append(int(file2[17:18]))
                    mse.append(round(-avg_mse,4)) # the computed values are negative
                    r2.append(round(avg_r2,4))    

############### Export Results ###############                
results = pd.DataFrame({'Model':Model,
                        'Date':DATE,
                        'Gapfilling':GF,
                        'Nb_S2_used':Nb_S2_used,
                        'mse':mse,
                        'r2':r2
                        })
outputdir2 = os.path.join(maindir2,'10k_cvResults_all3Models.xlsx')
results.to_excel(outputdir2, encoding='utf-8', index=False)

100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [01:33<00:00,  6.23s/it]
100%|█████████████████████████████████████████████████████████████████████████████████| 15/15 [56:39<00:00, 226.66s/it]
100%|███████████████████████████████████████████████████████████████████████████████| 15/15 [2:38:48<00:00, 635.25s/it]


# Select best learner

In [7]:
# Selected model: RF
# N°S2: 2

# run the algorithm on all training data
# check MSE and R2 are similar to previous
# Get feature importance results

In [9]:
maindir1 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\preparedInputData'
maindir2 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning'
ml_models = ['RF']
# gf_folders = ['withoutGF', 'withGF']
gf_folders = ['withGF']

Model = []
GF = []
DATE = []
Nb_S2_used = []
mse = []
r2 = []
best3Features = []
        
for ml_model in ml_models:    
    for gf_folder in gf_folders:    
        subdir1 = os.path.join(maindir1,gf_folder,'France')
        files_temp = [fileName for fileName in os.listdir(subdir1) if 'tiff' not in fileName]        
        
        for n in tqdm(range(len(files_temp))):
            file1 = files_temp[n]
            for file2 in os.listdir(os.path.join(subdir1,file1)): 
                if 'Best_2' in file2:
                    excel_file = pd.read_excel(os.path.join(subdir1,file1,file2)) # step0: Read and split data
                    y = np.array(excel_file['L0'].values,dtype=np.float).reshape(-1,1)                        # Target data
                    excel_file.drop(['Unnamed: 0','idx','idy','L0'], axis=1,inplace=True)
                    features = ['L'+str(i) for i in range(1,len(excel_file.columns)-3)]
                    X = excel_file.values                
                    y = MinMaxScaler().fit_transform(y) # Data Normalization
                    X = MinMaxScaler().fit_transform(X)
                    train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=seed) # Split data

                    Nfeatures = train_X.shape[1]
                    train_y = train_y.ravel() # flatten to 1d array # data is in a column format while it expected it in a row.
                    
                    ############### Predict turbidity using RF ()###############
                    model = RFR(n_estimators=500, max_features=int(len(features)/3.0), max_depth=10, random_state=seed)                    
                    model.fit(train_X, train_y)
                    y_pred = model.predict(val_X)
                    
                    importance = model.feature_importances_
                    indices = sorted(range(len(importance)), key=lambda i: importance[i])[-3:]
                    dict_temp = {'B':[1,10,19,28],'G':[4,13,22,31],'R':[7,16,25,34],
                                 'BG':[2,11,20,29],'GB':[5,14,23,32],'RB':[8,17,26,35],
                                 'BR':[3,12,21,30],'GR':[6,15,24,33],'RG':[9,18,27,36]}
                    best3F = ''
                    for i in indices:
                        for key,values in dict_temp.items():
                            if i+1 in values: # Add 1 because the indices were counted from 0 whilst layer names start from L1
                                best3F+=key+' '
                    
                    Model.append(ml_model)
                    GF.append(gf_folder)
                    DATE.append(file1[:4]+'-'+file1[4:6]+'-'+file1[6:])
                    Nb_S2_used.append(int(file2[17:18]))
                    mse.append(round(mean_squared_error(val_y, y_pred, squared=True),4))
                    r2.append(round(r2_score(val_y, y_pred),4)) 
                    best3Features.append(best3F)

results = pd.DataFrame({'Model':Model,
                        'Date':DATE,
                        'Gapfilling':GF,
                        'Nb_S2_used':Nb_S2_used,
                        'mse':mse,
                        'r2':r2,
                        'best3Features':best3Features
                        })

# step7: Export as excel files
outputdir2 = os.path.join(maindir2,'performanceResults_1Model.xlsx')
results.to_excel(outputdir2, encoding='utf-8', index=False)

100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [02:46<00:00, 11.10s/it]


In [5]:
maindir1 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\preparedInputData'
maindir2 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning'
ml_models = ['RF']
gf_folders = ['withoutGF', 'withGF']

Model = []
GF = []
DATE = []
Nb_S2_used = []
mse = []
r2 = []
        
for ml_model in ml_models:    
    for gf_folder in gf_folders:    
        subdir1 = os.path.join(maindir1,gf_folder,'France')
        files_temp = [fileName for fileName in os.listdir(subdir1) if 'tiff' not in fileName]        
        
        for n in tqdm(range(len(files_temp))):
            file1 = files_temp[n]
            for file2 in os.listdir(os.path.join(subdir1,file1)): 
                if 'Best_2' in file2:
                    excel_file = pd.read_excel(os.path.join(subdir1,file1,file2)) # step0: Read and split data
                    y = np.array(excel_file['L0'].values,dtype=np.float).reshape(-1,1)                        # Target data
                    excel_file.drop(['Unnamed: 0','idx','idy','L0'], axis=1,inplace=True)
                    features = ['L'+str(i) for i in range(1,len(excel_file.columns)-3)]
                    X = excel_file.values                
                    y = MinMaxScaler().fit_transform(y) # Data Normalization
                    X = MinMaxScaler().fit_transform(X)
                    train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=seed) # Split data

                    Nfeatures = train_X.shape[1]
                    
                    ############### Predict turbidity using ANN ()###############
                    model = ANN(build_fn=build_model, epochs=100, batch_size=10, verbose=0)  # create model                    
                    model.fit(train_X, train_y)
                    y_pred = model.predict(val_X)
                    
                    Model.append(ml_model)
                    GF.append(gf_folder)
                    DATE.append(file1[:4]+'-'+file1[4:6]+'-'+file1[6:])
                    Nb_S2_used.append(int(file2[17:18]))
                    mse.append(round(mean_squared_error(val_y, y_pred, squared=True),4))
                    r2.append(round(r2_score(val_y, y_pred),4)) 
                    
results = pd.DataFrame({'Model':Model,
                        'Date':DATE,
                        'Gapfilling':GF,
                        'Nb_S2_used':Nb_S2_used,
                        'mse':mse,
                        'r2':r2
                        })

# step7: Export as excel files
outputdir2 = os.path.join(maindir2,'perfANN.xlsx')
results.to_excel(outputdir2, encoding='utf-8', index=False)

100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [10:16<00:00, 41.13s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [06:50<00:00, 27.39s/it]


In [6]:
maindir1 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\preparedInputData'
maindir2 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning'
ml_models = ['RF']
gf_folders = ['withoutGF', 'withGF']

Model = []
GF = []
DATE = []
Nb_S2_used = []
mse = []
r2 = []
        
for ml_model in ml_models:    
    for gf_folder in gf_folders:    
        subdir1 = os.path.join(maindir1,gf_folder,'France')
        files_temp = [fileName for fileName in os.listdir(subdir1) if 'tiff' not in fileName]        
        
        for n in tqdm(range(len(files_temp))):
            file1 = files_temp[n]
            for file2 in os.listdir(os.path.join(subdir1,file1)): 
                if 'Best_2' in file2:
                    excel_file = pd.read_excel(os.path.join(subdir1,file1,file2)) # step0: Read and split data
                    y = np.array(excel_file['L0'].values,dtype=np.float).reshape(-1,1)                        # Target data
                    excel_file.drop(['Unnamed: 0','idx','idy','L0'], axis=1,inplace=True)
                    features = ['L'+str(i) for i in range(1,len(excel_file.columns)-3)]
                    X = excel_file.values                
                    y = MinMaxScaler().fit_transform(y) # Data Normalization
                    X = MinMaxScaler().fit_transform(X)
                    train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=seed) # Split data

                    Nfeatures = train_X.shape[1]
                    train_y = train_y.ravel() # flatten to 1d array # data is in a column format while it expected it in a row.
                    
                    ############### Predict turbidity using KNN ()###############
                    model = KNN(n_neighbors=8, leaf_size=1, weights='distance')
                    model.fit(train_X, train_y)
                    y_pred = model.predict(val_X)
                    
                    Model.append(ml_model)
                    GF.append(gf_folder)
                    DATE.append(file1[:4]+'-'+file1[4:6]+'-'+file1[6:])
                    Nb_S2_used.append(int(file2[17:18]))
                    mse.append(round(mean_squared_error(val_y, y_pred, squared=True),4))
                    r2.append(round(r2_score(val_y, y_pred),4)) 
                    
results = pd.DataFrame({'Model':Model,
                        'Date':DATE,
                        'Gapfilling':GF,
                        'Nb_S2_used':Nb_S2_used,
                        'mse':mse,
                        'r2':r2
                        })

# step7: Export as excel files
outputdir2 = os.path.join(maindir2,'perfKNN.xlsx')
results.to_excel(outputdir2, encoding='utf-8', index=False)

100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:20<00:00,  1.34s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:20<00:00,  1.34s/it]


In [10]:
my_dir = r'G:\MScThesis\waterQualityMonitoring\Results\ML'
excel_file = pd.read_excel(os.path.join(my_dir,'performanceResults_1Model.xlsx'))
print(excel_file[excel_file['Gapfilling']=='withoutGF']['BestF1'].value_counts().idxmax()) # Most frequent 
print(excel_file[excel_file['Gapfilling']=='withoutGF']['BestF2'].value_counts().idxmax())
print(excel_file[excel_file['Gapfilling']=='withoutGF']['BestF3'].value_counts().idxmax())

BR
GB
G


In [11]:
print(excel_file[excel_file['Gapfilling']=='withGF']['BestF1'].value_counts().idxmax()) # Most frequent 
print(excel_file[excel_file['Gapfilling']=='withGF']['BestF2'].value_counts().idxmax())
print(excel_file[excel_file['Gapfilling']=='withGF']['BestF3'].value_counts().idxmax())

BG
GB
G


# Generate new turbidity maps

In [None]:
# Prepare excel files containing all pixel values of best 2 S2 (including missing values = -99)

In [5]:
def getPixelValue(array,idx1,idx2,idx3):
    return array[idx1,idx2,idx3]

In [None]:
maindir1 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\preparedInputData'
maindir2 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\massProduction\preparedInputData'
gf_folders = ['withoutGF', 'withGF']

for gf_folder in gf_folders:
    subdir1 = os.path.join(maindir1,gf_folder,'France')
    subdir2 = os.path.join(maindir2,gf_folder,'France')
    files_temp = [fileName for fileName in os.listdir(subdir1) if 'tiff' in fileName]
    for n in tqdm(range(len(files_temp))):
        file1 = files_temp[n]        
        img = rio.open(os.path.join(subdir1,file1)) # start by reading all layers
        arr = img.read()
        # Rank S2 scenes based on n° KP
        indices = [i for i in range(1,36,9)]
        nb_KP = []
        for i in indices:
            temp_copy = deepcopy(arr[i])
            temp_copy[temp_copy==-99]=9.96921e+36
            nb_KP.append(len(np.argwhere(temp_copy<=1e+36).tolist()))        
        df1 = pd.DataFrame({'indices':indices,'nb_KP':nb_KP})
        df1.sort_values('nb_KP', inplace=True)  # order based on nb_KP and make changes to df permanent (order from worst to best)
        df1.reset_index(drop=True, inplace=True) # Drop old index and make changes to df permanent
        # Select reflectance layers associated with best 2 images
        name = 'Pixels_From_Best_2_S2_'+file1[7:15]
        l = list(df1[2:]['indices']) # out of the 4 indices, removes the 1st two
        # Create a new stacked array of layers to be used
        arr_temp = np.expand_dims(arr[0], axis=0)
        for k1 in l:
            for k2 in range(k1,k1+9):
                arr_temp = np.append(arr_temp,np.expand_dims(arr[k2], axis=0),axis=0) # get 19 layers (1st layer is turbidity + 18 layers of best S2 images and associated combinations of bands )

        # Get all possible pixel coordinates for valid or none pixel values (=9.96921e+36)
        idX = []
        idY = [] 
        idX_none = []
        idY_none = []
        for idx in range(arr_temp.shape[1]):    # get all pixel coordinates
            for idy in range(arr_temp.shape[2]):
                if arr_temp[1,idx,idy] <= 1e+36 or arr_temp[10,idx,idy] <= 1e+36:  # if both s2 images have known values
                    if arr_temp[1,idx,idy] != -99 or arr_temp[10,idx,idy] != -99:
                        idX.append(idx)                                      # 1: 1st best S2 image # 10: 2nd best S2 image 
                        idY.append(idy)
                    else:
                        idX_none.append(idx)                                            
                        idY_none.append(idy)
                else:
                    idX_none.append(idx)                                            
                    idY_none.append(idy)                    
        
        # Store all pixel values (!=none) in an empty df            
        rows = ['L'+str(index) for index in range(len(arr_temp))]
        columns = [index for index in range(len(idX))]
        results = pd.DataFrame(index=rows, columns=columns)
        data = [] # It is recommended to collect data in a list of lists and then assign it to a df (Than modifying a df each iteration => time costly and prone to error of dtypes)
        for idxLayer in range(len(arr_temp)):
            pixelValues = Parallel(n_jobs=-1)(delayed(getPixelValue)(arr_temp,idxLayer,idX[k],idY[k]) for k in range(len(idX)))
            data.append(pixelValues)
        results = pd.DataFrame(data, index=rows, columns=columns).T
        results.insert(loc=0, column='idx', value=idX)   # Add coordinates to df (while specifying position)
        results.insert(loc=1, column='idy', value=idY)  
        # Store all pixel values (==none) in an empty df            
        results_none = pd.DataFrame({'idx_none':idX_none, 'idy_none':idY_none})
        
        # Export as excel files
        os.makedirs(subdir2, exist_ok=True)
        outputdir = os.path.join(subdir2, name+'.xlsx')
        results.to_excel(outputdir, encoding='utf-8')
        
        outputdir2 = os.path.join(subdir2,'coordsNonePixelValues'+str(n)+'.xlsx') 
        results_none.to_excel(outputdir2, encoding='utf-8', index=False) # The coords of none pixel values are the same # save them 1 time

100%|█████████████████████████████████████████████████████████████████████████████████| 15/15 [38:27<00:00, 153.86s/it]
 47%|██████████████████████████████████████▋                                            | 7/15 [08:02<09:15, 69.43s/it]

In [None]:
###############################
# generate new turbidity maps #
###############################

In [None]:
# predict turbidity using all training dataset
maindir1 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\preparedInputData'
maindir2 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\massProduction\preparedInputData'
maindir3 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\massProduction\predictedTurbidity'
gf_folders = ['withoutGF', 'withGF']
    
for gf_folder in gf_folders:    
    subdir1 = os.path.join(maindir1,gf_folder,'France')
    subdir2 = os.path.join(maindir2,gf_folder,'France')
    subdir3 = os.path.join(maindir3,gf_folder,'France')
    files_temp = [fileName for fileName in os.listdir(subdir1) if 'tiff' not in fileName]        

    for n in tqdm(range(len(files_temp))):
        file1 = files_temp[n]
        for file2 in os.listdir(os.path.join(subdir1,file1)): 
            if 'Best_2' in file2:
                ############### Read all training dataset (without splitting) ###############
                # first train the model with the previously prepared training set. Then, apply the model to predict turbidity in whole study area #
                excel_file = pd.read_excel(os.path.join(subdir1,file1,file2)) # step0: Read and split data
                y = np.array(excel_file['L0'].values,dtype=np.float).reshape(-1,1)                        # Target data
                excel_file.drop(['Unnamed: 0','idx','idy','L0'], axis=1,inplace=True)
                features = ['L'+str(i) for i in range(1,len(excel_file.columns)-3)]
                X = excel_file.values                
#                 y = MinMaxScaler().fit_transform(y) # Data Normalization is not necessary for random forests
#                 X = MinMaxScaler().fit_transform(X) # This will save us the time of invert normalization afterwards

                Nfeatures = X.shape[1]
                y = y.ravel() # flatten to 1d array # data is in a column format while it expected it in a row.
                ############### Read all pixel values in 2 S2 images to predict corresponding turbidity values ###############
                excel_file2 = pd.read_excel(os.path.join(subdir2,file2)) 
                idx = np.array(excel_file2['idx'].values,dtype=np.float).reshape(-1,1)                        
                idy = np.array(excel_file2['idy'].values,dtype=np.float).reshape(-1,1)
                excel_file2.drop(['Unnamed: 0','idx','idy','L0'], axis=1,inplace=True)
                features2 = ['L'+str(i) for i in range(1,len(excel_file2.columns)-3)]
                S2_values = excel_file2.values                
#                 S2_values = MinMaxScaler().fit_transform(S2_values) # Data Normalization
                ############### Predict turbidity using RF ()###############
                model = KNN(n_neighbors=8, leaf_size=1, weights='distance')
                    
        
                # model = RFR(n_estimators=500, max_features=int(len(features)/3.0), max_depth=10, random_state=seed)                    
                model.fit(X, y)
                y_pred = model.predict(S2_values)
                results = pd.DataFrame({'idx':idx.ravel(), 'idy':idy.ravel(), 'predTur':y_pred.ravel()})

                # step7: Export as excel files
                outputdir2 = os.path.join(subdir3,'Tur_'+file1+'.xlsx')
                results.to_excel(outputdir2, encoding='utf-8', index=False) 

In [None]:
maindir1 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\massProduction\preparedInputData'
maindir2 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\massProduction\predictedTurbidity'
gf_folders = ['withoutGF', 'withGF']
    
for gf_folder in gf_folders:
    subdir1 = os.path.join(maindir1,gf_folder,'France')
    subdir2 = os.path.join(maindir2,gf_folder,'France')
    files_temp = [fileName for fileName in os.listdir(subdir1) if 'None' in fileName]
    for n in tqdm(range(len(files_temp))):
        file1 = files_temp[n]
        # Read file: coordsNonePixelValues
        excel_file1 = pd.read_excel(os.path.join(subdir1,file1)) # step0: Read and split data
        idx_temp1 = list(excel_file1['idx_none'])
        idy_temp1 = list(excel_file1['idy_none'])
        noneValues = []
        for i in range(excel_file1.shape[0]):
            noneValues.append(9.96921e+36)
        
        df = pd.DataFrame({'idx':idx_temp1, 'idy':idy_temp1, 'predTur':noneValues})
        # step7: Export as excel files
        outputdir2 = os.path.join(subdir2,file1)
        df.to_excel(outputdir2, encoding='utf-8', index=False)

In [None]:
# Read predicted turbidity pixel values and add the none values to it
maindir1 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\massProduction\predictedTurbidity'
maindir2 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\preparedInputData'
gf_folders = ['withoutGF', 'withGF']
    
for gf_folder in gf_folders:
    subdir1 = os.path.join(maindir1, gf_folder, 'France')
    subdir2 = os.path.join(maindir2, gf_folder, 'France')
    files_temp1 = [fileName for fileName in os.listdir(subdir1) if ('Tur' in fileName)and('tiff' not in fileName)]  

    for n in tqdm(range(len(files_temp1))):
        file1 = files_temp1[n]
        excel_file1 = pd.read_excel(os.path.join(subdir1,file1))
        idx_temp1 = list(excel_file1['idx'])
        idy_temp1 = list(excel_file1['idy'])
        predTur_temp1 = list(excel_file1['predTur'])
        
        file2 = 'coordsNonePixelValues'+str(n)+'.xlsx'        
        excel_file2 = pd.read_excel(os.path.join(subdir1,file2))        
        idx_temp2 = list(excel_file2['idx'])
        idy_temp2 = list(excel_file2['idy'])
        predTur_temp2 = list(excel_file2['predTur'])
        
        idx = idx_temp1+idx_temp2
        idy = idy_temp1+idy_temp2
        predTur = predTur_temp1+predTur_temp2

        results = pd.DataFrame({'idx':idx, 'idy':idy, 'predTur':predTur})
        results.sort_values(by=['idx', 'idy'], ascending=True, inplace=True) # Sort Values by idx then by idy

        rowsList = results['idx']
        colList = results['idy']
        turList = results['predTur']

        file3 = file1[4:12]        
        img = rio.open(os.path.join(subdir2,'merged_'+file3+'.tiff')) # start by reading all layers
        arr = img.read()
                
        ######## Update Array ########  
        # Export as images 
        temp_copy1 = deepcopy(arr[0]) # retain layer as actual turbidity
        outputdir1 = os.path.join(subdir1, 'actual_'+file1[:-5]+'.tiff')
        with rio.open(outputdir1,'w',driver='Gtiff', width=img.width, height=img.height, 
                            count=1,crs=img.crs,transform=img.transform, dtype='float32', nodata=9.96921e+36) as newImg:
            newImg.write(temp_copy1,1)
            newImg.close()
        
        temp_copy2 = deepcopy(arr[0]) # to be filled with predicted turbidity
        for item in range(len(rowsList)):
            temp_copy2[int(rowsList[item]),int(colList[item])] = turList[item]
        outputdir2 = os.path.join(subdir1, 'predicted_'+file1[:-5]+'.tiff')
        with rio.open(outputdir2,'w',driver='Gtiff', width=img.width, height=img.height, 
                            count=1,crs=img.crs,transform=img.transform, dtype='float32', nodata=9.96921e+36) as newImg:
            newImg.write(temp_copy2,1)
            newImg.close()

In [8]:
# generate over/under estimation maps (for areas that have been gap filled display None)
# in NTU 
# Need to exclude pixels where turbidity is none while S2 is known

In [5]:
maindir1 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\massProduction\predictedTurbidity'
maindir2 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\massProduction\slope'
gf_folders = ['withoutGF', 'withGF']

for gf_folder in gf_folders:
    subdir1 = os.path.join(maindir1,gf_folder,'France')
    files_temp = [fileName for fileName in os.listdir(subdir1) if ('tiff' in fileName) and ('actual' in fileName) ]
    
    for n in tqdm(range(len(files_temp))):
        img = rio.open(os.path.join(subdir1,'actual_Tur_20190121.tiff'))        
        file1 = files_temp[n]        
        actual = rio.open(os.path.join(subdir1,file1)).read(1)
        predicted = rio.open(os.path.join(subdir1,'predicted_'+file1[7:])).read(1)
        
        arrayBias = predicted-actual
        for i in range(actual.shape[0]):  # Exclude none values from this analysis
            for j in range(actual.shape[1]):
                if actual[i,j] > 1e+36:
                    arrayBias[i,j] = actual[i,j]
        
        # Export as image
        outputdir1 = os.path.join(maindir2,gf_folder,'France','biasArray'+file1[7:19]+'.tiff')
        with rio.open(outputdir1,'w',driver='Gtiff', width=img.width, height=img.height, 
                            count=1,crs=img.crs,transform=img.transform, dtype='float32', nodata=9.96921e+36) as newImg:
            newImg.write(arrayBias,1)
            newImg.close()

  arrayBias = (predicted-actual)/actual
100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:19<00:00,  1.30s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:18<00:00,  1.24s/it]


In [None]:
## https://gist.github.com/bshishov/5dc237f59f019b26145648e2124ca1c9

EPSILON = 1e-10

def _error(actual: np.ndarray, predicted: np.ndarray):
    """ Simple error """
    return actual - predicted
def _absolute_error(actual: np.ndarray, predicted: np.ndarray):
    """ absolute error """
    return abs(actual - predicted)
def _percentage_error(actual: np.ndarray, predicted: np.ndarray):
    """
    Percentage error
    Note: result is NOT multiplied by 100
    """
    return _error(actual, predicted) / (actual + EPSILON)
def error(actual: np.ndarray, predicted: np.ndarray):
    """ Simple error """
    return np.mean(_error(actual, predicted))
def percentage_error(actual: np.ndarray, predicted: np.ndarray):
    """
    Percentage error
    Note: result is NOT multiplied by 100
    """
    return np.mean(_error(actual, predicted)/(actual + EPSILON))
    
def mse(actual: np.ndarray, predicted: np.ndarray):
    """ Mean Squared Error """
    return np.mean(np.square(_error(actual, predicted)))
def mdape(actual: np.ndarray, predicted: np.ndarray):
    """
    Median Absolute Percentage Error
    """
    return np.median(np.abs(_percentage_error(actual, predicted)))
def R2_score(actual: np.ndarray, predicted: np.ndarray):
    return r2_score(actual, predicted)

METRICS = {
    'mse': mse,
    'mdape': mdape, # less affected by outliers
    '_error':_error,
    '_percentage_error':_percentage_error,
    'error':error,
    'percentage_error':percentage_error,
    'R2_score':R2_score,
}

def evaluate(actual: np.ndarray, predicted: np.ndarray, metrics=('mse', 'mdape', '_error','_percentage_error', 'error','percentage_error','R2_score')):
    results = {}
    for name in metrics:
        try:
            results[name] = METRICS[name](actual, predicted)
        except Exception as err:
            results[name] = np.nan
            print('Unable to compute metric {0}: {1}'.format(name, err))
    return results

def evaluate_all(actual: np.ndarray, predicted: np.ndarray):
    return evaluate(actual, predicted, metrics=set(METRICS.keys()))

In [None]:
maindir1 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\massProduction\predictedTurbidity'
maindir2 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\massProduction\slope'
gf_folders = ['withoutGF', 'withGF']

gf = []
d = []
err = []
errP = []
MdAPE = []
MSE = []
R2score = []

for gf_folder in gf_folders:
    subdir1 = os.path.join(maindir1,gf_folder,'France')
    files_temp = [fileName for fileName in os.listdir(subdir1) if ('tiff' in fileName) and ('actual' in fileName) ]
    
    for n in tqdm(range(len(files_temp))):
        img = rio.open(os.path.join(subdir1,'actual_Tur_20190121.tiff'))        
        file1 = files_temp[n]        
        actual = rio.open(os.path.join(subdir1,file1)).read(1)
        predicted = rio.open(os.path.join(subdir1,'predicted_'+file1[7:])).read(1)

        actualValues = []
        predValues = []
        for i in range(actual.shape[0]):  # Exclude none values from this analysis
            for j in range(actual.shape[1]):
                if actual[i,j]<10000 and actual[i,j]>-90 and predicted[i,j]<10000 and predicted[i,j]>-90:
                    actualValues.append(actual[i,j])
                    predValues.append(predicted[i,j])
        # Use error metrics that do not penalize large differences between actual and predicted
        errorMetrics = evaluate(np.array(actualValues,dtype=np.float64), np.array(predValues,dtype=np.float64), metrics=('error', 'percentage_error','mdape','mse','R2_score'))
    
        gf.append(gf_folder)
        d.append(file1[11:19])
        err.append(round(errorMetrics['error'],4))
        errP.append(round(100*errorMetrics['percentage_error'],2))
        MdAPE.append(round(100*errorMetrics['mdape'],2))
        MSE.append(round(errorMetrics['mse'],4))
        R2score.append(round(100*errorMetrics['R2_score'],2))
df = pd.DataFrame({'gf':gf,'d':d,'err':err, 'errP':errP, 'MdAPE':MdAPE,'MSE':MSE,'R2score':R2score})
df

In [None]:
# Get n° pixels ranging between 0-10 / 10-30 / 30-max

In [30]:
maindir1 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\massProduction\predictedTurbidity'
maindir2 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\massProduction\slope'
gf_folders = ['withoutGF', 'withGF']

gf = []
d = []
a11 = []
a30 = []
amax = []
p11 = []
p30 = []
pmax = []

for gf_folder in gf_folders:    
    subdir1 = os.path.join(maindir1,gf_folder,'France')
    files_temp = [fileName for fileName in os.listdir(subdir1) if ('tiff' in fileName) and ('actual' in fileName) ]
    
    for n in tqdm(range(len(files_temp))):
        img = rio.open(os.path.join(subdir1,'actual_Tur_20190121.tiff'))        
        file1 = files_temp[n]        
        actual = rio.open(os.path.join(subdir1,file1)).read(1)
        predicted = rio.open(os.path.join(subdir1,'predicted_'+file1[7:])).read(1)
        
        A11 = len(actual[actual<11])
        A30 = len(actual[actual<30]) - len(actual[actual<11])
        Amax = len(actual[actual<10000]) - len(actual[actual<30])
        
        P11 = len(predicted[predicted<11])
        P30 = len(predicted[predicted<30]) - len(predicted[predicted<11])
        Pmax = len(predicted[predicted<10000]) - len(predicted[predicted<30])
        
        gf.append(gf_folder)
        d.append(file1[11:19])
        
        a11.append(A11)
        a30.append(A30)
        amax.append(Amax)
        p11.append(P11)
        p30.append(P30)
        pmax.append(Pmax)
        
        
df = pd.DataFrame({'d':d,'gf':gf,'a11':a11,'p11':p11,'a30':a30, 'p30':p30, 'amax':amax, 'pmax':pmax})
df     

100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 55.92it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 57.90it/s]


Unnamed: 0,d,gf,a11,p11,a30,p30,amax,pmax
0,20190121,withoutGF,89155,100594,2217,2464,1,0
1,20190211,withoutGF,46189,101368,893,499,14,0
2,20190311,withoutGF,102720,103007,180,51,2,0
3,20190411,withoutGF,102516,103058,12,0,2,0
4,20190511,withoutGF,102419,103057,18,1,0,0
5,20190611,withoutGF,102231,103058,0,0,4,0
6,20190711,withoutGF,103052,103058,3,0,2,0
7,20190811,withoutGF,102805,102805,3,0,6,0
8,20190911,withoutGF,103057,103058,0,0,1,0
9,20191011,withoutGF,7015,103058,0,0,0,0


In [50]:
maindir1 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\massProduction\predictedTurbidity'
maindir2 = r'G:\MScThesis\waterQualityMonitoring\Data\MLearning\massProduction\slope'
gf_folders = ['withoutGF', 'withGF']

gf = []
d = []
pixels11 = []
pixels31 = []
pixelsMax = []
N_pixels_changed = []
N_pixels_noChange = []

for gf_folder in gf_folders:    
    subdir1 = os.path.join(maindir1,gf_folder,'France')
    files_temp = [fileName for fileName in os.listdir(subdir1) if ('tiff' in fileName) and ('actual' in fileName) ]
    
    for n in tqdm(range(len(files_temp))):
        img = rio.open(os.path.join(subdir1,'actual_Tur_20190121.tiff'))        
        file1 = files_temp[n]        
        actual = rio.open(os.path.join(subdir1,file1)).read(1)
        predicted = rio.open(os.path.join(subdir1,'predicted_'+file1[7:])).read(1)
        
        gf.append(gf_folder)
        d.append(file1[11:19])
        
        pixels_total = 0
        noChangeinClass = 0
        changeinClass = 0
        
        v11=0
        v31=0
        vmax=0
        for i in range(actual.shape[0]):  # Exclude none values from this analysis
            for j in range(actual.shape[1]):
                if actual[i,j]<1e+7 and actual[i,j]>-90:
                    pixels_total+=1
                    actualValue = actual[i,j]
                    predValue = predicted[i,j]
                    if (actualValue<11 and predValue<11):
                        v11+=1
                        noChangeinClass+=1
                    elif (actualValue<31 and predValue<31 and actualValue>=11 and predValue>=11):
                        v31+=1
                        noChangeinClass+=1
                    elif (actualValue>=31 and predValue>=31):
                        vmax+=1
                        noChangeinClass+=1
                    else:
                        changeinClass+=1
                    
        pixels11.append(v11)
        pixels31.append(v31)
        pixelsMax.append(vmax)
        N_pixels_changed.append(changeinClass)
        N_pixels_noChange.append(noChangeinClass)
    
df = pd.DataFrame({'gf':gf,'d':d,'pixels11':pixels11,'pixels31':pixels31,'pixelsMax':pixelsMax,'N_pixels_changed':N_pixels_changed,'N_pixels_noChange':N_pixels_noChange,'pixels_total':pixels_total})
df

100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:33<00:00,  2.25s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:35<00:00,  2.40s/it]


Unnamed: 0,gf,d,pixels11,pixels31,pixelsMax,N_pixels_changed,N_pixels_noChange,pixels_total
0,withoutGF,20190121,88944,2106,0,323,91050,103058
1,withoutGF,20190211,45106,23,0,1967,45129,103058
2,withoutGF,20190311,102711,42,0,149,102753,103058
3,withoutGF,20190411,102516,0,0,14,102516,103058
4,withoutGF,20190511,102419,1,0,17,102420,103058
5,withoutGF,20190611,102231,0,0,4,102231,103058
6,withoutGF,20190711,103052,0,0,5,103052,103058
7,withoutGF,20190811,102552,0,0,262,102552,103058
8,withoutGF,20190911,103057,0,0,1,103057,103058
9,withoutGF,20191011,7015,0,0,0,7015,103058
