In [None]:
import os
os.environ['PYTHONHASHSEED']=str(4)

import random
import pandas as pd
import numpy as np 
import rasterio as rio
from copy import deepcopy
from joblib import Parallel, delayed
from tqdm import tqdm
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.optimizers import Nadam, Adam
from keras.layers import Dense, Activation
from keras.wrappers.scikit_learn import KerasRegressor as ANN
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.colors as colors
from rasterio.plot import show

In [None]:
def set_random_seed(x):
    """Seed the TensorFlow, NumPy and Python `random` generators with the same value `x`."""
    # Same seeding order as before: TensorFlow first, then NumPy, then the stdlib.
    for seed_fn in (tf.random.set_seed, np.random.seed, random.seed):
        seed_fn(x)

In [None]:
# Single seed shared by the global RNGs (seeded right here) and by the
# `random_state` arguments of the train/test splits, K-fold shuffling and
# random forests in the cells below.
seed = 4
set_random_seed(seed) 

In [None]:
def build_model(learn_rate=0.01, units1=14,units2=12,activ_func1='sigmoid',activ_func2='sigmoid',activ_func3='sigmoid',n_features=None):
    """Build and compile a two-hidden-layer dense regression network.

    Parameters
    ----------
    learn_rate : float
        Learning rate passed to the Adam optimizer.
    units1, units2 : int
        Number of units in the first and second hidden layers.
    activ_func1, activ_func2, activ_func3 : str
        Activations of the first hidden, second hidden and output layer.
    n_features : int or None
        Width of the input layer. When None (the default, which keeps existing
        callers working) the module-level global `Nfeatures` is used; the
        calling cell must have assigned it before the model is built.

    Returns
    -------
    A compiled Keras `Sequential` model with a single output unit, trained
    with mean-squared-error loss.
    """
    if n_features is None:
        # Fall back to the notebook-level global set by the data-loading cells.
        n_features = Nfeatures
    model = Sequential()
    model.add(Dense(units1, kernel_initializer='uniform', activation=activ_func1, input_shape=(n_features,)))
    model.add(Dense(units2, kernel_initializer='uniform', activation=activ_func2))
    model.add(Dense(1, kernel_initializer='uniform', activation=activ_func3))
    # `learning_rate` is the supported keyword; the old `lr` alias is deprecated/removed.
    optimizer = Adam(learning_rate=learn_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-07, name="Adam")
    # Classification accuracy is meaningless for a continuous target, so track MAE instead.
    model.compile(loss='mse', optimizer=optimizer, metrics=['mae'])
    return model

In [None]:
# 10-fold cross-validation of KNN, RF and ANN on every prepared input table.
# For each gap-filling folder and each sub-folder in it, every workbook whose
# name does not contain 'Best_4' is loaded, min-max scaled, split 80/20, and
# the training part is cross-validated. The mean MSE and R2 over the folds are
# collected and written to one Excel sheet.
maindir1 = r'G:\preparedInputData'
maindir2 = r'G:\MLearning'
ml_models = ['KNN','RF','ANN']
gf_folders = ['withoutGF','withGF']
# sklearn scorer names; the MSE scorer is negated, so its sign is flipped when stored.
scoring = {'mse':'neg_mean_squared_error', 'r2': 'r2'}
Model = []
GF = []
DATE = []
Nb_S2_used = []
mse = []
r2 = []
for ml_model in ml_models:
    for gf_folder in gf_folders:
        subdir1 = os.path.join(maindir1,gf_folder)
        # Entries whose name contains 'tiff' are skipped; the rest are listed as folders below.
        files_temp = [fileName for fileName in os.listdir(subdir1) if 'tiff' not in fileName]
        for file1 in tqdm(files_temp):
            for file2 in os.listdir(os.path.join(subdir1,file1)):
                if 'Best_4' in file2:
                    continue
                excel_file = pd.read_excel(os.path.join(subdir1,file1,file2))
                # Target column 'L0' as an (n, 1) float array (builtin float: np.float was removed in NumPy 1.24).
                y = np.array(excel_file['L0'].values,dtype=float).reshape(-1,1)
                excel_file.drop(['Unnamed: 0','idx','idy','L0'], axis=1,inplace=True)
                # NOTE(review): evaluated after the drop, so this list has 4 fewer entries than
                # the remaining columns; it only feeds the RF `max_features` below - confirm intended.
                features = ['L'+str(i) for i in range(1,len(excel_file.columns)-3)]
                X = excel_file.values
                # NOTE(review): both scalers are fitted on the full table before the split, so
                # held-out rows influence the scaling - confirm this is acceptable.
                y = MinMaxScaler().fit_transform(y)
                X = MinMaxScaler().fit_transform(X)
                train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=seed)
                kfold_indexes = list(KFold(10,shuffle=True,random_state=seed).split(train_X))
                # Read as a global by build_model() to size the ANN input layer.
                Nfeatures = train_X.shape[1]
                if ml_model == 'ANN':
                    model = ANN(build_fn=build_model, epochs=100, batch_size=10, verbose=0)
                elif ml_model == 'KNN':
                    train_y = train_y.ravel()
                    model = KNN(n_neighbors=8, leaf_size=1, weights='distance')
                elif ml_model == 'RF':
                    train_y = train_y.ravel()
                    # max(1, ...) keeps max_features valid when the table has very few columns.
                    model = RFR(n_estimators=500, max_features=max(1,int(len(features)/3.0)), max_depth=25, random_state=seed)
                scores = cross_validate(model,train_X,train_y,cv=kfold_indexes,scoring=scoring,return_estimator=True)
                avg_mse = np.mean(scores['test_mse'])
                avg_r2 = np.mean(scores['test_r2'])
                Model.append(ml_model)
                GF.append(gf_folder)
                # Folder name is reformatted as three dash-separated parts (4 chars, 2 chars, rest).
                DATE.append(file1[:4]+'-'+file1[4:6]+'-'+file1[6:])
                # Single digit at position 17 of the workbook name.
                Nb_S2_used.append(int(file2[17:18]))
                mse.append(round(-avg_mse,4))
                r2.append(round(avg_r2,4))
results = pd.DataFrame({'Model':Model,
                        'Date':DATE,
                        'Gapfilling':GF,
                        'Nb_S2_used':Nb_S2_used,
                        'mse':mse,
                        'r2':r2
                        })
outputdir2 = os.path.join(maindir2,'10k_cvResults_all3Models.xlsx')
# `encoding` was never used by to_excel and is rejected by pandas >= 2.0.
results.to_excel(outputdir2, index=False)

In [None]:
# Hold-out evaluation of the random forest on every 'Best_2' workbook.
# Each table is min-max scaled and split 80/20; the forest is fitted on the
# training part and scored (MSE, R2) on the validation part. The three most
# important features are mapped to group labels and stored with the scores.
maindir1 = r'G:\preparedInputData'
maindir2 = r'G:\MLearning'
gf_folders = ['withoutGF', 'withGF']
Model = []
GF = []
DATE = []
Nb_S2_used = []
mse = []
r2 = []
best3Features = []
# Group label -> 1-based feature positions carrying that label (loop-invariant, so defined once).
dict_temp = {'B':[1,10,19,28],'G':[4,13,22,31],'R':[7,16,25,34],
             'BG':[2,11,20,29],'GB':[5,14,23,32],'RB':[8,17,26,35],
             'BR':[3,12,21,30],'GR':[6,15,24,33],'RG':[9,18,27,36]}
for gf_folder in gf_folders:
    subdir1 = os.path.join(maindir1,gf_folder)
    files_temp = [fileName for fileName in os.listdir(subdir1) if 'tiff' not in fileName]
    for file1 in tqdm(files_temp):
        for file2 in os.listdir(os.path.join(subdir1,file1)):
            if 'Best_2' not in file2:
                continue
            excel_file = pd.read_excel(os.path.join(subdir1,file1,file2))
            # Target column 'L0' as an (n, 1) float array (builtin float: np.float was removed in NumPy 1.24).
            y = np.array(excel_file['L0'].values,dtype=float).reshape(-1,1)
            excel_file.drop(['Unnamed: 0','idx','idy','L0'], axis=1,inplace=True)
            # NOTE(review): evaluated after the drop, so this list has 4 fewer entries than
            # the remaining columns; it only feeds `max_features` below - confirm intended.
            features = ['L'+str(i) for i in range(1,len(excel_file.columns)-3)]
            X = excel_file.values
            # NOTE(review): scalers are fitted before the split, so validation rows
            # influence the scaling - confirm this is acceptable.
            y = MinMaxScaler().fit_transform(y)
            X = MinMaxScaler().fit_transform(X)
            train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=seed)
            Nfeatures = train_X.shape[1]
            train_y = train_y.ravel()
            # max(1, ...) keeps max_features valid when the table has very few columns.
            model = RFR(n_estimators=500, max_features=max(1,int(len(features)/3.0)), max_depth=10, random_state=seed)
            model.fit(train_X, train_y)
            y_pred = model.predict(val_X)
            importance = model.feature_importances_
            # 0-based positions of the three largest importances, in ascending order of importance.
            indices = sorted(range(len(importance)), key=lambda i: importance[i])[-3:]
            best3F = ''
            for i in indices:
                for key,values in dict_temp.items():
                    if i+1 in values:
                        best3F+=key+' '
            Model.append('RFR')
            GF.append(gf_folder)
            DATE.append(file1[:4]+'-'+file1[4:6]+'-'+file1[6:])
            # Single digit at position 17 of the workbook name.
            Nb_S2_used.append(int(file2[17:18]))
            # MSE is the default output; the `squared` keyword is gone from recent scikit-learn.
            mse.append(round(mean_squared_error(val_y, y_pred),4))
            r2.append(round(r2_score(val_y, y_pred),4))
            best3Features.append(best3F)
results = pd.DataFrame({'Model':Model,
                        'Date':DATE,
                        'Gapfilling':GF,
                        'Nb_S2_used':Nb_S2_used,
                        'mse':mse,
                        'r2':r2,
                        'best3Features':best3Features
                        })
outputdir2 = os.path.join(maindir2,'perfRFR.xlsx')
# `encoding` was never used by to_excel and is rejected by pandas >= 2.0.
results.to_excel(outputdir2, index=False)

In [None]:
# Hold-out evaluation of the ANN on every 'Best_2' workbook.
# Each table is min-max scaled and split 80/20; the network from build_model()
# is fitted on the training part and scored (MSE, R2) on the validation part.
maindir1 = r'G:\preparedInputData'
maindir2 = r'G:\MLearning'
gf_folders = ['withoutGF', 'withGF']
Model = []
GF = []
DATE = []
Nb_S2_used = []
mse = []
r2 = []
for gf_folder in gf_folders:
    subdir1 = os.path.join(maindir1,gf_folder)
    files_temp = [fileName for fileName in os.listdir(subdir1) if 'tiff' not in fileName]
    for file1 in tqdm(files_temp):
        for file2 in os.listdir(os.path.join(subdir1,file1)):
            if 'Best_2' not in file2:
                continue
            excel_file = pd.read_excel(os.path.join(subdir1,file1,file2))
            # Target column 'L0' as an (n, 1) float array (builtin float: np.float was removed in NumPy 1.24).
            y = np.array(excel_file['L0'].values,dtype=float).reshape(-1,1)
            excel_file.drop(['Unnamed: 0','idx','idy','L0'], axis=1,inplace=True)
            X = excel_file.values
            # NOTE(review): scalers are fitted before the split, so validation rows
            # influence the scaling - confirm this is acceptable.
            y = MinMaxScaler().fit_transform(y)
            X = MinMaxScaler().fit_transform(X)
            train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=seed)
            # Read as a global by build_model() to size the input layer.
            Nfeatures = train_X.shape[1]
            model = ANN(build_fn=build_model, epochs=100, batch_size=10, verbose=0)
            model.fit(train_X, train_y)
            y_pred = model.predict(val_X)
            Model.append('ANN')
            GF.append(gf_folder)
            DATE.append(file1[:4]+'-'+file1[4:6]+'-'+file1[6:])
            # Single digit at position 17 of the workbook name.
            Nb_S2_used.append(int(file2[17:18]))
            # MSE is the default output; the `squared` keyword is gone from recent scikit-learn.
            mse.append(round(mean_squared_error(val_y, y_pred),4))
            r2.append(round(r2_score(val_y, y_pred),4))
results = pd.DataFrame({'Model':Model,
                        'Date':DATE,
                        'Gapfilling':GF,
                        'Nb_S2_used':Nb_S2_used,
                        'mse':mse,
                        'r2':r2
                        })
outputdir2 = os.path.join(maindir2,'perfANN.xlsx')
# `encoding` was never used by to_excel and is rejected by pandas >= 2.0.
results.to_excel(outputdir2, index=False)

In [None]:
# Hold-out evaluation of KNN on every 'Best_2' workbook.
# Each table is min-max scaled and split 80/20; the regressor is fitted on the
# training part and scored (MSE, R2) on the validation part.
maindir1 = r'G:\preparedInputData'
maindir2 = r'G:\MLearning'
gf_folders = ['withoutGF', 'withGF']
Model = []
GF = []
DATE = []
Nb_S2_used = []
mse = []
r2 = []
for gf_folder in gf_folders:
    subdir1 = os.path.join(maindir1,gf_folder)
    files_temp = [fileName for fileName in os.listdir(subdir1) if 'tiff' not in fileName]
    for file1 in tqdm(files_temp):
        for file2 in os.listdir(os.path.join(subdir1,file1)):
            if 'Best_2' not in file2:
                continue
            excel_file = pd.read_excel(os.path.join(subdir1,file1,file2))
            # Target column 'L0' as an (n, 1) float array (builtin float: np.float was removed in NumPy 1.24).
            y = np.array(excel_file['L0'].values,dtype=float).reshape(-1,1)
            excel_file.drop(['Unnamed: 0','idx','idy','L0'], axis=1,inplace=True)
            X = excel_file.values
            # NOTE(review): scalers are fitted before the split, so validation rows
            # influence the scaling - confirm this is acceptable.
            y = MinMaxScaler().fit_transform(y)
            X = MinMaxScaler().fit_transform(X)
            train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=seed)
            Nfeatures = train_X.shape[1]
            train_y = train_y.ravel()
            model = KNN(n_neighbors=8, leaf_size=1, weights='distance')
            model.fit(train_X, train_y)
            y_pred = model.predict(val_X)
            Model.append('KNN')
            GF.append(gf_folder)
            DATE.append(file1[:4]+'-'+file1[4:6]+'-'+file1[6:])
            # Single digit at position 17 of the workbook name.
            Nb_S2_used.append(int(file2[17:18]))
            # MSE is the default output; the `squared` keyword is gone from recent scikit-learn.
            mse.append(round(mean_squared_error(val_y, y_pred),4))
            r2.append(round(r2_score(val_y, y_pred),4))
results = pd.DataFrame({'Model':Model,
                        'Date':DATE,
                        'Gapfilling':GF,
                        'Nb_S2_used':Nb_S2_used,
                        'mse':mse,
                        'r2':r2
                        })
outputdir2 = os.path.join(maindir2,'perfKNN.xlsx')
# `encoding` was never used by to_excel and is rejected by pandas >= 2.0.
results.to_excel(outputdir2, index=False)