## Imports

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, scale
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import shutil
from sklearn.svm import SVC
import seaborn as sn
import pandas as pd
import tensorflow as tf

# Let TensorFlow allocate GPU memory on demand instead of reserving it all up
# front. Every visible GPU is configured (TensorFlow requires the setting to be
# the same on all of them), and a CPU-only machine simply skips the loop rather
# than failing on ``devices[0]``.
devices = tf.config.experimental.list_physical_devices('GPU')
for device in devices:
    tf.config.experimental.set_memory_growth(device, True)

import models
import utils
import attribute_params as myparams

## Functions

In [None]:

def gridSearch_reduceAttributes_RF(X_train, y_train, X_validation, y_validation, X_test, y_test, test_sourcename, predictX, predict_sourcename, n_reduced, save_dir, reduce_method_str, split_num):
    """Fit a random forest on attribute-reduced data and append a report to a log.

    A 5-fold ``GridSearchCV`` is run over one fixed random-forest
    configuration. The refit best estimator is scored on the validation and
    test sets and used to predict ``predictX``. Everything is appended to
    ``<save_dir>/split_<split_num>_RF_result_<reduce_method_str>.log``.

    Args:
        X_train, y_train: data the grid search is fitted on.
        X_validation, y_validation: held-out data scored with the best model.
        X_test, y_test: test data; one log row is written per test sample.
        test_sourcename: names written next to each test sample (strings,
            parallel to the rows of ``X_test``).
        predictX: samples without labels to classify.
        predict_sourcename: names written next to each ``predictX`` sample.
        n_reduced: value logged under "reduced features".
        save_dir: directory the log file is written to (must exist).
        reduce_method_str: suffix used in the log file name.
        split_num: split identifier used in the log file name.
    """
    param_grid = [{
        'n_estimators': [700],
        'criterion': ['entropy'],
        'max_features': ['sqrt'],
        'oob_score': [False],
    }]

    gs = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, verbose=1, cv=5, n_jobs=1)
    gs.fit(X_train, y_train)

    clf = gs.best_estimator_
    validation_score = clf.score(X_validation, y_validation)

    # Per-sample class probabilities for the test set and the unlabeled set.
    probs = clf.predict_proba(X_test)
    prediction_probs = clf.predict_proba(predictX)
    test_score = clf.score(X_test, y_test)

    # The columns of predict_proba follow clf.classes_, so the arg-max column
    # index has to be mapped back to the actual class label.
    test_labels = clf.classes_[np.argmax(probs, axis=1)]
    predicted_labels = clf.classes_[np.argmax(prediction_probs, axis=1)]

    log_path = os.path.join(save_dir, 'split_' + str(split_num) + '_RF_result_' + reduce_method_str + '.log')
    with open(log_path, 'a') as log:
        log.write('\n-----------------------\nreduced features:\n' + str(n_reduced))
        # Call get_params() so the parameters are logged, not the bound method.
        log.write('\n-----------------------\nthe best model:\n' + str(clf.get_params()))
        log.write('\n-----------------------\nbest validation score (CV):\n' + str(gs.best_score_))
        log.write('\n-----------------------\nvalidation (given) accuracy with the best model:\n%.5f' % validation_score)
        log.write('\n-----------------------\ntest accuracy: {:.5f}'.format(test_score))
        log.write('\n-----------------------\n\n')
        log.write('------------------------------------\n')
        log.write('index\tvalidation_source_name\tactual\tprediction\tprobability\n')
        for i, (name, actual, label, row) in enumerate(zip(test_sourcename, y_test, test_labels, probs)):
            log.write(str(i) + '\t' + name + '\t' + str(int(actual)) + '\t' + str(int(label)) + '\t' + str(np.max(row)) + '\n')
        log.write('\n=========== prediction ===========\n')
        log.write('index\tsource_name\tprediction\tprobability\n')
        for i, (name, label, row) in enumerate(zip(predict_sourcename, predicted_labels, prediction_probs)):
            log.write(str(i) + '\t' + name + '\t' + str(int(label)) + '\t' + str(np.max(row)) + '\n')
        log.write('\n-------------------------------------------------------------------------\n\n\n\n')



def gridSearch_PCA_RF(X_train, y_train, X_validation, y_validation, X_test, y_test, test_sourcename, predictX, predict_sourcename, n_components, save_dir, split_num):
    """Fit a StandardScaler -> PCA -> random-forest pipeline and log the results.

    A 5-fold ``GridSearchCV`` is run over one fixed random-forest
    configuration inside the pipeline. The refit best pipeline is scored on
    the validation and test sets and used to predict ``predictX``. The report
    is appended to ``<save_dir>/split_<split_num>_RF_result_PCA.log``.

    Args:
        X_train, y_train: data the grid search is fitted on.
        X_validation, y_validation: held-out data scored with the best model.
        X_test, y_test: test data; one log row is written per test sample.
        test_sourcename: names written next to each test sample (strings,
            parallel to the rows of ``X_test``).
        predictX: samples without labels to classify.
        predict_sourcename: names written next to each ``predictX`` sample.
        n_components: number of PCA components to keep (also logged).
        save_dir: directory the log file is written to (must exist).
        split_num: split identifier used in the log file name.
    """
    pipe_scal_pca_forest = Pipeline([
        ('prepro', StandardScaler()),
        ('pca', PCA(n_components=n_components)),
        ('clf', RandomForestClassifier()),
    ])
    param_grid_forest = [{
        'clf__n_estimators': [700],
        'clf__criterion': ['entropy'],
        'clf__max_features': ['sqrt'],
        'clf__oob_score': [False],
    }]
    gs = GridSearchCV(estimator=pipe_scal_pca_forest, param_grid=param_grid_forest, verbose=1, cv=5, n_jobs=1)
    gs.fit(X_train, y_train)

    best_clf = gs.best_estimator_
    validation_score = best_clf.score(X_validation, y_validation)

    # Per-sample class probabilities for the test set and the unlabeled set.
    probs = best_clf.predict_proba(X_test)
    prediction_probs = best_clf.predict_proba(predictX)
    test_score = best_clf.score(X_test, y_test)

    # The columns of predict_proba follow classes_, so the arg-max column index
    # has to be mapped back to the actual class label.
    test_labels = best_clf.classes_[np.argmax(probs, axis=1)]
    predicted_labels = best_clf.classes_[np.argmax(prediction_probs, axis=1)]

    with open(os.path.join(save_dir, 'split_' + str(split_num) + '_RF_result_PCA.log'), 'a') as log:
        log.write('\n-----------------------\nretain components: %d\n' % (n_components))
        # Call get_params() so the parameters are logged, not the bound method.
        log.write('\n-----------------------\nthe best model:\n' + str(best_clf.get_params()))
        log.write('\n-----------------------\nbest validation score: %.5f\n' % gs.best_score_)
        log.write('\n-----------------------\nvalidation (given) accuracy with the best model:\n%.5f' % validation_score)
        log.write('\n-----------------------\ntest accuracy with the best model:\n%.5f' % test_score)
        log.write('\n-------------------------------------------------------------------------\n')
        log.write('------------------------------------\n')
        log.write('index\tvalidation_source_name\tactual\tprediction\tprobability\n')
        for i, (name, actual, label, row) in enumerate(zip(test_sourcename, y_test, test_labels, probs)):
            log.write(str(i) + '\t' + name + '\t' + str(int(actual)) + '\t' + str(int(label)) + '\t' + str(np.max(row)) + '\n')
        log.write('\n\n=========== prediction ===========\n')
        log.write('index\tsource_name\tprediction\tprobability\n')
        for i, (name, label, row) in enumerate(zip(predict_sourcename, predicted_labels, prediction_probs)):
            log.write(str(i) + '\t' + name + '\t' + str(int(label)) + '\t' + str(np.max(row)) + '\n')
        log.write('\n-------------------------------------------------------------------------\n\n\n\n')

 

def gridSearch_SVM(X_train, y_train, X_validation, y_validation, X_test, y_test, test_sourcename, predictX, bcu_sourcename, save_dir, reduce_method_str, n_param, split_num):
    """Fit a scaled (optionally PCA-reduced) SVM and append a report to a log.

    The pipeline is ``StandardScaler -> [PCA] -> SVC``; the PCA step is only
    inserted when ``reduce_method_str == 'PCA'``. A 5-fold ``GridSearchCV``
    is run over one fixed SVC configuration, the refit best pipeline is scored
    on the validation and test sets and used to predict ``predictX``. The
    report is appended to ``<save_dir>/split_<split_num>_result.log``.

    Args:
        X_train, y_train: data the grid search is fitted on.
        X_validation, y_validation: held-out data scored with the best model.
        X_test, y_test: test data; one log row is written per test sample.
        test_sourcename: names written next to each test sample (strings,
            parallel to the rows of ``X_test``).
        predictX: samples without labels to classify.
        bcu_sourcename: names written next to each ``predictX`` sample.
        save_dir: directory the log file is written to (must exist).
        reduce_method_str: ``'PCA'`` enables the PCA step; any other value
            skips it.
        n_param: PCA ``n_components`` when PCA is enabled; always logged as an
            integer.
        split_num: split identifier used in the log file name.
    """
    # Single-point grid. 'probability' must be True because predict_proba is
    # used below (the original dict listed the key twice, False then True, and
    # only the last value was effective).
    svm_params = {
        'C': [5.0],
        'kernel': ['rbf'],
        'degree': [1],
        'gamma': ['scale'],
        'coef0': [0.0],
        'shrinking': [True],
        'probability': [True],
        'tol': [0.001],
        'cache_size': [2048],
        'class_weight': [None],
        'verbose': [False],
        'max_iter': [1000000],
        # 'max_iter': [2],
        'decision_function_shape': ['ovr'],
        'break_ties': [False],
        'random_state': [None],
    }
    # Address the SVC step of the pipeline through the 'clf__' prefix.
    param_grid_gs = [{'clf__' + name: values for name, values in svm_params.items()}]

    steps = [('prepro', StandardScaler())]
    if reduce_method_str == 'PCA':
        steps.append(('pca', PCA(n_components=n_param)))
    steps.append(('clf', SVC()))
    pipe_scal_clf = Pipeline(steps)

    gs = GridSearchCV(estimator=pipe_scal_clf, param_grid=param_grid_gs, verbose=1, cv=5, n_jobs=1)
    gs.fit(X_train, y_train)

    best_clf = gs.best_estimator_
    validation_score = best_clf.score(X_validation, y_validation)

    # Per-sample class probabilities for the test set and the unlabeled set.
    probs = best_clf.predict_proba(X_test)
    prediction_probs = best_clf.predict_proba(predictX)
    test_score = best_clf.score(X_test, y_test)

    # The columns of predict_proba follow classes_, so the arg-max column index
    # has to be mapped back to the actual class label.
    test_labels = best_clf.classes_[np.argmax(probs, axis=1)]
    predicted_labels = best_clf.classes_[np.argmax(prediction_probs, axis=1)]

    with open(os.path.join(save_dir, 'split_' + str(split_num) + '_result.log'), 'a') as log:
        log.write('\n-----------------------\nn_params (reduce or component): %d\n' % (n_param))
        # Call get_params() so the parameters are logged, not the bound method.
        log.write('\n-----------------------\nthe best model:\n' + str(best_clf.get_params()))
        log.write('\n-----------------------\nbest validation score: %.5f\n' % gs.best_score_)
        log.write('\n-----------------------\nvalidation (given) accuracy with the best model:\n%.5f' % validation_score)
        log.write('\n-----------------------\ntest accuracy with the best model:\n%.5f' % test_score)
        log.write('\n-------------------------------------------------------------------------\n')
        log.write('------------------------------------\n')
        log.write('index\tvalidation_source_name\tactual\tprediction\tprobability\n')
        for i, (name, actual, label, row) in enumerate(zip(test_sourcename, y_test, test_labels, probs)):
            log.write(str(i) + '\t' + name + '\t' + str(int(actual)) + '\t' + str(int(label)) + '\t' + str(np.max(row)) + '\n')
        log.write('\n\n=========== prediction ===========\n')
        log.write('index\tsource_name\tprediction\tprobability\n')
        for i, (name, label, row) in enumerate(zip(bcu_sourcename, predicted_labels, prediction_probs)):
            log.write(str(i) + '\t' + name + '\t' + str(int(label)) + '\t' + str(np.max(row)) + '\n')
        log.write('\n-------------------------------------------------------------------------\n\n\n\n')



def gridSearch_AdaBoost(X_train, y_train, X_validation, y_validation, X_test, y_test, test_sourcename, predictX, bcu_sourcename, save_dir, reduce_method_str, n_param, split_num):
    """Fit a min-max-scaled (optionally PCA-reduced) AdaBoost model and log it.

    The pipeline is ``MinMaxScaler -> [PCA] -> AdaBoostClassifier``; the PCA
    step is only inserted when ``reduce_method_str == 'PCA'``. A 5-fold
    ``GridSearchCV`` is run over one fixed AdaBoost configuration, the refit
    best pipeline is scored on the validation and test sets and used to
    predict ``predictX``. The report is appended to
    ``<save_dir>/split_<split_num>_result.log``.

    Args:
        X_train, y_train: data the grid search is fitted on.
        X_validation, y_validation: held-out data scored with the best model.
        X_test, y_test: test data; one log row is written per test sample.
        test_sourcename: names written next to each test sample (strings,
            parallel to the rows of ``X_test``).
        predictX: samples without labels to classify.
        bcu_sourcename: names written next to each ``predictX`` sample.
        save_dir: directory the log file is written to (must exist).
        reduce_method_str: ``'PCA'`` enables the PCA step; any other value
            skips it.
        n_param: PCA ``n_components`` when PCA is enabled; always logged as an
            integer.
        split_num: split identifier used in the log file name.
    """
    steps = [('prepro', MinMaxScaler())]
    if reduce_method_str == 'PCA':
        steps.append(('pca', PCA(n_components=n_param)))
    steps.append(('clf', AdaBoostClassifier()))
    pipe_minmax_clf = Pipeline(steps)

    param_grid_gs = [{
        # 'clf__n_estimators': [3],
        'clf__n_estimators': [300],
        'clf__learning_rate': [0.5],
    }]
    gs = GridSearchCV(estimator=pipe_minmax_clf, param_grid=param_grid_gs, verbose=1, cv=5, n_jobs=1)
    gs.fit(X_train, y_train)

    best_clf = gs.best_estimator_
    validation_score = best_clf.score(X_validation, y_validation)

    # Per-sample class probabilities for the test set and the unlabeled set.
    probs = best_clf.predict_proba(X_test)
    prediction_probs = best_clf.predict_proba(predictX)
    test_score = best_clf.score(X_test, y_test)

    # The columns of predict_proba follow classes_, so the arg-max column index
    # has to be mapped back to the actual class label.
    test_labels = best_clf.classes_[np.argmax(probs, axis=1)]
    predicted_labels = best_clf.classes_[np.argmax(prediction_probs, axis=1)]

    with open(os.path.join(save_dir, 'split_' + str(split_num) + '_result.log'), 'a') as log:
        log.write('\n-----------------------\nn_params (reduce or component): %d\n' % (n_param))
        # Call get_params() so the parameters are logged, not the bound method.
        log.write('\n-----------------------\nthe best model:\n' + str(best_clf.get_params()))
        log.write('\n-----------------------\nbest validation score: %.5f\n' % gs.best_score_)
        log.write('\n-----------------------\nvalidation (given) accuracy with the best model:\n%.5f' % validation_score)
        log.write('\n-----------------------\ntest accuracy with the best model:\n%.5f' % test_score)
        log.write('\n-------------------------------------------------------------------------\n')
        log.write('------------------------------------\n')
        log.write('index\tvalidation_source_name\tactual\tprediction\tprobability\n')
        for i, (name, actual, label, row) in enumerate(zip(test_sourcename, y_test, test_labels, probs)):
            log.write(str(i) + '\t' + name + '\t' + str(int(actual)) + '\t' + str(int(label)) + '\t' + str(np.max(row)) + '\n')
        log.write('\n\n=========== prediction ===========\n')
        log.write('index\tsource_name\tprediction\tprobability\n')
        for i, (name, label, row) in enumerate(zip(bcu_sourcename, predicted_labels, prediction_probs)):
            log.write(str(i) + '\t' + name + '\t' + str(int(label)) + '\t' + str(np.max(row)) + '\n')
        log.write('\n-------------------------------------------------------------------------\n\n\n\n')


def MLP_train(model_output_classes, model_hidden_units, model_dropout_list, model_act_fn,
          trainX, validationX, testX, trainY, validationY, testY, predictX, predict_sourcename,
          result_dir, batch_size, epochs, learning_rate):
    """Train an MLP, evaluate the lowest-val-loss checkpoint and save the results.

    A ``models.MLP_MODEL`` is compiled with Adam and sparse categorical
    cross-entropy and trained with ``fit``. A full-model checkpoint is written
    to ``<result_dir>/models`` after every epoch. The checkpoint whose file
    name carries the lowest validation loss is reloaded, evaluated on the test
    set and used to predict ``predictX``.

    Files written to ``result_dir``: ``results.log`` (test metrics, per-epoch
    history, predictions), ``loss.png`` and ``accuracy.png``.

    Args:
        model_output_classes, model_hidden_units, model_dropout_list,
            model_act_fn: forwarded to ``models.MLP_MODEL``.
        trainX, trainY: training data.
        validationX, validationY: validation data passed to ``fit``.
        testX, testY: test data evaluated with the reloaded best checkpoint.
        predictX: samples without labels to classify.
        predict_sourcename: names written next to each ``predictX`` sample.
        result_dir: output directory.
        batch_size: batch size for training, evaluation and prediction.
        epochs: number of training epochs.
        learning_rate: Adam learning rate.

    Raises:
        FileNotFoundError: if no usable checkpoint is found after training.
    """
    model_dir = os.path.join(result_dir, 'models')
    os.makedirs(model_dir, exist_ok=True)

    # Save the whole model after every epoch; the validation loss is embedded
    # in the file name and is parsed back below to select the best epoch.
    checkpoint_path = os.path.join(model_dir, 'weights.{epoch}-{val_loss:.5f}.h5')
    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_path, monitor='val_loss', save_weights_only=False, mode='min')
    # NOTE(review): the original also built an EarlyStopping callback
    # (monitor='val_loss', patience=10, restore_best_weights=True) but never
    # passed it to fit(), so training always runs for all epochs. It is left
    # disabled here to preserve the existing training behavior.

    train_model = models.MLP_MODEL(n_input=np.shape(trainX)[-1],
                                    output_classes=model_output_classes,
                                    hidden_units=model_hidden_units,
                                    dropout_list=model_dropout_list,
                                    hidden_act_fn=model_act_fn,
                                    output_act_fn='softmax')

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    train_model.compile(loss=loss,
                        optimizer=optimizer,
                        metrics=['accuracy'])
    train_model.summary()

    hist = train_model.fit(
                            x=trainX,
                            y=trainY,
                            batch_size=batch_size,
                            epochs=epochs,
                            callbacks=[checkpoint_callback],
                            validation_data=(validationX, validationY),
                            shuffle=True,
                            validation_freq=1,
    )

    # Find the checkpoint with the lowest validation loss.
    # NOTE(review): every *.h5 file in model_dir is considered, including
    # checkpoints left over from an earlier run into the same result_dir.
    lowest_val_loss = float('inf')  # no arbitrary cap such as 100
    best_model_path = None
    for f in os.listdir(model_dir):
        if f.split('.')[-1] != 'h5':
            continue
        try:
            val_loss = float(f.split('-')[-1].split('.h5')[0])
        except ValueError:
            # Not a checkpoint written by the callback above; ignore it.
            continue
        if val_loss < lowest_val_loss:
            lowest_val_loss = val_loss
            best_model_path = os.path.join(model_dir, f)
    if best_model_path is None:
        raise FileNotFoundError('no usable checkpoint found in ' + model_dir)

    loaded_model = tf.keras.models.load_model(best_model_path)
    test_results = loaded_model.evaluate(
                                    x=testX,
                                    y=testY,
                                    batch_size=batch_size,
                                    return_dict=False,
                                )
    test_loss, test_acc = test_results[0], test_results[1]
    prediction_probs = loaded_model.predict(x=predictX, batch_size=batch_size)

    # Save results.
    history = hist.history
    metrics = list(history.keys())
    epoch_indices = hist.epoch  # 0-based epoch numbers reported by Keras
    with open(os.path.join(result_dir, 'results.log'), 'w') as fp:
        fp.write('\nload model: {}\nTest: loss {:.5f}, accuracy: {:.5f}\n\n\n'.format(best_model_path, test_loss, test_acc))

        fp.write('Train epoch\t')
        for metric in metrics:
            fp.write(metric + '\t')
        fp.write('\n')
        for row, epoch_index in enumerate(epoch_indices):
            fp.write(str(epoch_index + 1) + '\t')
            for metric in metrics:
                fp.write(str(round(history[metric][row], 3)) + '\t')
            fp.write('\n')

        fp.write('\n\n=========== prediction ===========\n')
        fp.write('index\tsource_name\tprediction\tprobability\n')
        for i, (name, row_probs) in enumerate(zip(predict_sourcename, prediction_probs)):
            fp.write(str(i) + '\t' + name + '\t' + str(int(np.argmax(row_probs))) + '\t' + str(np.max(row_probs)) + '\n')

    # Learning curves: one figure for the loss and one for the accuracy.
    n_logged = len(epoch_indices)
    epoch_list = list(range(1, n_logged + 1))
    curves = (
        ('Loss', 'loss.png', 'loss', 'val_loss'),
        ('Accuracy', 'accuracy.png', 'accuracy', 'val_accuracy'),
    )
    for title, file_name, train_key, val_key in curves:
        plt.figure()
        plt.plot(epoch_list, list(history.get(train_key, []))[:n_logged], color='r', label='train')
        plt.plot(epoch_list, list(history.get(val_key, []))[:n_logged], color='b', label='validation')
        plt.grid()
        plt.legend()
        plt.title(title)
        plt.savefig(os.path.join(result_dir, file_name))
        plt.close()



def proposed_train(trainX, validationX, testX, trainY, validationY, testY, predictX, predict_sourcename,
          result_dir, batch_size, epochs, learning_rate, load_model_path=None, add_noise=False, pos_weight=False):
    """Train the proposed network with a manual batch loop and save the results.

    The model is either loaded from ``load_model_path`` or built as
    ``models.MINI_MATCHBOX_NET`` and compiled with Adam and sparse categorical
    cross-entropy. Each epoch reshuffles the training data, trains on full
    batches with ``train_on_batch`` and evaluates on the train, validation and
    test sets. Whenever the test accuracy exceeds the best value so far, the
    model, a confusion-matrix image and the predictions for ``predictX`` are
    written to ``result_dir``. At the end ``results.log``, ``loss.png`` and
    ``accuracy.png`` are written.

    NOTE(review): snapshots are selected on *test* accuracy, so the reported
    best test accuracy is not an unbiased estimate. Left unchanged.

    Args:
        trainX, trainY: training data (indexable with an integer index array).
        validationX, validationY: validation data (evaluated only).
        testX, testY: test data (evaluated and used for snapshot selection).
        predictX: samples without labels to classify at each new best epoch.
        predict_sourcename: names written next to each ``predictX`` sample.
        result_dir: output directory; if it contains ``'Mission_A'`` the
            confusion matrix is labeled AGN / non-AGN, otherwise BLL / FSRQ.
        batch_size: batch size for training, evaluation and prediction.
        epochs: number of training epochs.
        learning_rate: Adam learning rate (unused when a model is loaded).
        load_model_path: optional path of a saved model to continue from.
        add_noise: add Gaussian noise (stddev 0.2) to every training batch.
        pos_weight: train with class weights ``{0: 0.2, 1: 0.8}``.
    """
    model_dir = os.path.join(result_dir, 'models')
    os.makedirs(model_dir, exist_ok=True)

    if load_model_path:
        train_model = tf.keras.models.load_model(load_model_path)
    else:
        train_model = models.MINI_MATCHBOX_NET(n_input=np.shape(trainX)[-1])
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
        train_model.compile(loss=loss,
                            optimizer=optimizer,
                            metrics=['accuracy'])
        train_model.summary()
        tf.keras.utils.plot_model(
                                model=train_model,
                                to_file=os.path.join(result_dir, 'model.png'),
                                show_shapes=True,
                                show_layer_names=True,
                                expand_nested=True)

    # Axis labels of the confusion matrix depend on the mission.
    if 'Mission_A' in result_dir:
        all_labels = ['AGN', 'non-AGN']
    else:
        all_labels = ['BLL', 'FSRQ']

    n_train = len(trainX)
    # Only full batches are used; the shuffled remainder of each epoch is
    # dropped, as before. If the training set is smaller than one batch, train
    # on it as a single short batch instead of silently not training at all.
    batchs = n_train // batch_size
    if batchs == 0 and n_train > 0:
        batchs = 1

    max_test_acc = 0
    train_loss_list, val_loss_list, test_loss_list = [], [], []
    train_acc_list, val_acc_list, test_acc_list = [], [], []
    for epoch in range(epochs):
        indx = np.random.permutation(n_train)
        xtrain, ytrain = trainX[indx], trainY[indx]
        for b in range(batchs):
            start, stop = batch_size * b, batch_size * (b + 1)
            xtrain_batch, ytrain_batch = xtrain[start:stop], ytrain[start:stop]

            if add_noise:
                xtrain_batch = xtrain_batch + tf.random.normal(shape=tf.shape(xtrain_batch), mean=0.0, stddev=0.2)

            if pos_weight:
                train_model.train_on_batch(x=xtrain_batch, y=ytrain_batch, class_weight={0:0.2, 1:0.8})
            else:
                train_model.train_on_batch(x=xtrain_batch, y=ytrain_batch)

        # evaluate() returns [loss, accuracy] for the compiled metrics.
        train_loss, train_acc = train_model.evaluate(
                                    x=xtrain,
                                    y=ytrain,
                                    batch_size=batch_size,
                                    return_dict=False)[:2]
        validation_loss, validation_acc = train_model.evaluate(
                                    x=validationX,
                                    y=validationY,
                                    batch_size=batch_size,
                                    return_dict=False)[:2]
        test_loss, test_acc = train_model.evaluate(
                                    x=testX,
                                    y=testY,
                                    batch_size=batch_size,
                                    return_dict=False)[:2]
        train_loss_list.append(train_loss)
        val_loss_list.append(validation_loss)
        test_loss_list.append(test_loss)
        train_acc_list.append(train_acc)
        val_acc_list.append(validation_acc)
        test_acc_list.append(test_acc)
        print('epoch: {}/{}, train_acc: {}, val_acc: {}, test_acc: {}, max_test_acc: {}'.format(epoch, epochs, round(train_acc,5), round(validation_acc,5), round(test_acc,5), round(max_test_acc,5)))

        if test_acc <= max_test_acc:
            continue

        # New best test accuracy: snapshot the model and its outputs.
        max_test_acc = test_acc
        snapshot_prefix = os.path.join(result_dir, 'e'+str(epoch)+'_testacc_'+str(round(test_acc,3)))
        train_model.save(snapshot_prefix + '.h5')

        # Confusion matrix. num_classes is fixed to the number of labels so
        # the matrix keeps its full shape even when a class never appears in
        # the test labels or predictions (with num_classes=None it would
        # shrink and no longer match the label list).
        test_probs = train_model.predict(x=testX, batch_size=batch_size)
        cmatrix = tf.math.confusion_matrix(
            labels=np.reshape(testY, (len(testY),)),
            predictions=np.reshape(np.argmax(test_probs, axis=-1), (len(test_probs),)),
            num_classes=len(all_labels))
        conf_df = pd.DataFrame(np.asarray(cmatrix), index=all_labels, columns=all_labels)
        plt.figure()  # draw on a fresh figure, not on whatever is current
        conf_fig = sn.heatmap(conf_df, annot=True, fmt="d", cmap="BuPu")
        plt.xlabel('Prediction', fontsize=16)
        plt.ylabel('Ground Truth', fontsize=16)
        cmatrix_fig = conf_fig.get_figure()
        cmatrix_fig.savefig(snapshot_prefix + '_confusion_matrix.png', dpi=400)
        plt.close(cmatrix_fig)

        prediction_probs = train_model.predict(x=predictX, batch_size=batch_size)
        with open(snapshot_prefix + '_unassociate_prediction.log', 'w') as fp:
            fp.write('\n\n=========== prediction ===========\n')
            fp.write('index\tsource_name\tprediction\tprobability\n')
            for i, (name, row_probs) in enumerate(zip(predict_sourcename, prediction_probs)):
                fp.write(str(i) + '\t' + name + '\t' + str(int(np.argmax(row_probs))) + '\t' + str(np.max(row_probs)) + '\n')

    with open(os.path.join(result_dir, 'results.log'), 'w') as fp:
        fp.write('epoch\ttrain_loss\tval_loss\ttest_loss\ttrain_acc\tval_acc\ttest_acc\n')
        per_epoch = zip(train_loss_list, val_loss_list, test_loss_list,
                        train_acc_list, val_acc_list, test_acc_list)
        for epoch_number, values in enumerate(per_epoch, start=1):
            fp.write(str(epoch_number) + '\t' + '\t'.join(str(round(v, 5)) for v in values) + '\n')

    # Learning curves: one figure for the loss and one for the accuracy.
    epoch_list = list(range(1, len(train_loss_list) + 1))
    curves = (
        ('Loss', 'loss.png', train_loss_list, val_loss_list, test_loss_list),
        ('Accuracy', 'accuracy.png', train_acc_list, val_acc_list, test_acc_list),
    )
    for title, file_name, train_vals, val_vals, test_vals in curves:
        plt.figure()
        plt.plot(epoch_list, train_vals, color='r', label='train')
        plt.plot(epoch_list, val_vals, color='b', label='validation')
        plt.plot(epoch_list, test_vals, color='g', label='test')
        plt.grid()
        plt.legend()
        plt.title(title)
        plt.savefig(os.path.join(result_dir, file_name))
        plt.close()


### Parameters

In [None]:
# Number of train/validation/test splits evaluated per data set; the driver
# cells iterate split numbers 1..split_repeat.
split_repeat = 10

# One entry per classification task: [data_npy_dir, mission_str].
check_info = [
    ['datasets/NPY_DATA_A', 'Mission_A'],
    ['datasets/NPY_DATA_B', 'Mission_B'],
]

### Random Forest

In [None]:
# Random-forest experiments: for every mission, run the three dimensionality
# reduction variants in order (feature importance, PCA, FDASE), each on all
# splits. The result directory of a variant is wiped before it is refilled.
for data_npy_dir, mission_str in check_info:

    use_mission_a = mission_str == 'Mission_A'
    important_attribute_index = (myparams.important_attribute_index_missionA if use_mission_a
                                 else myparams.important_attribute_index_missionB)
    core_attribute_index = (myparams.core_attribute_index_missionA if use_mission_a
                            else myparams.core_attribute_index_missionB)

    # (result sub-directory, tag used in the log file name, attribute subsets to keep)
    rf_runs = (
        ('Feature_Importance_RF', 'FeatureImportance', important_attribute_index),
        ('PCA', 'PCA', None),
        ('FDASE', 'FDASE', core_attribute_index),
    )
    for method_dir, method_tag, attribute_sets in rf_runs:
        result_dir = os.path.join('RESULTS', mission_str, method_dir, 'TRAIN', 'RF')
        if os.path.exists(result_dir):
            shutil.rmtree(result_dir)
        os.makedirs(result_dir, exist_ok=True)

        for split_num in range(1, split_repeat + 1):
            (trainX, validationX, testX,
             trainY, validationY, testY,
             train_sourcename, validation_sourcename, test_sourcename,
             predict_data, predict_sourcename, header_name) = utils.load_npy_data(
                data_npy_dir, split_num,
                delete_attris_list=['Unc_Flux1000', 'Unc_PL_Index', 'Unc_Frac_Variability'])

            if method_dir == 'PCA':
                # PCA works on the full attribute set; only the number of
                # retained components varies.
                for important_component in myparams.important_component_num:
                    gridSearch_PCA_RF(trainX, trainY, validationX, validationY, testX, testY, test_sourcename, predict_data, predict_sourcename, important_component, result_dir, split_num)
                continue

            all_attribute_index = list(range(len(header_name)))
            for important_attribute in attribute_sets:
                # Drop every column that is not in the subset to keep.
                reduce_index = list(set(all_attribute_index) - set(important_attribute))
                reduced_trainX, reduced_validationX, reduced_testX, reduced_predictX = (
                    np.delete(data, reduce_index, axis=1)
                    for data in (trainX, validationX, testX, predict_data))
                gridSearch_reduceAttributes_RF(reduced_trainX, trainY, reduced_validationX, validationY, reduced_testX, testY, test_sourcename, reduced_predictX, predict_sourcename, len(reduce_index), result_dir, method_tag, split_num)


### SVM

In [None]:
# SVM experiments: for every mission, run the three dimensionality reduction
# variants in order (feature importance, PCA, FDASE), each on all splits. The
# result directory of a variant is wiped before it is refilled.
for data_npy_dir, mission_str in check_info:

    use_mission_a = mission_str == 'Mission_A'
    important_attribute_index = (myparams.important_attribute_index_missionA if use_mission_a
                                 else myparams.important_attribute_index_missionB)
    core_attribute_index = (myparams.core_attribute_index_missionA if use_mission_a
                            else myparams.core_attribute_index_missionB)

    # (result sub-directory, reduce_method_str passed to gridSearch_SVM, attribute subsets to keep)
    svm_runs = (
        ('Feature_Importance_RF', 'FeatureImportance', important_attribute_index),
        ('PCA', 'PCA', None),
        ('FDASE', 'FDASE', core_attribute_index),
    )
    for method_dir, method_tag, attribute_sets in svm_runs:
        result_dir = os.path.join('RESULTS', mission_str, method_dir, 'TRAIN', 'SVM')
        if os.path.exists(result_dir):
            shutil.rmtree(result_dir)
        os.makedirs(result_dir, exist_ok=True)

        for split_num in range(1, split_repeat + 1):
            (trainX, validationX, testX,
             trainY, validationY, testY,
             train_sourcename, validation_sourcename, test_sourcename,
             predict_data, predict_sourcename, header_name) = utils.load_npy_data(
                data_npy_dir, split_num,
                delete_attris_list=['Unc_Flux1000', 'Unc_PL_Index', 'Unc_Frac_Variability'])

            if method_dir == 'PCA':
                # PCA works on the full attribute set; only the number of
                # retained components varies.
                for important_component in myparams.important_component_num:
                    gridSearch_SVM(trainX, trainY, validationX, validationY, testX, testY, test_sourcename, predict_data, predict_sourcename, result_dir, method_tag, important_component, split_num)
                continue

            all_attribute_index = list(range(len(header_name)))
            for important_attribute in attribute_sets:
                # Drop every column that is not in the subset to keep.
                reduce_index = list(set(all_attribute_index) - set(important_attribute))
                reduced_trainX, reduced_validationX, reduced_testX, reduced_predictX = (
                    np.delete(data, reduce_index, axis=1)
                    for data in (trainX, validationX, testX, predict_data))
                gridSearch_SVM(reduced_trainX, trainY, reduced_validationX, validationY, reduced_testX, testY, test_sourcename, reduced_predictX, predict_sourcename, result_dir, method_tag, len(reduce_index), split_num)


### AdaBoost

In [None]:
for data_npy_dir, mission_str in check_info:

    # The attribute subsets are stored per mission in attribute_params.
    if mission_str == 'Mission_A':
        important_attribute_index = myparams.important_attribute_index_missionA
        core_attribute_index = myparams.core_attribute_index_missionA
    else:
        important_attribute_index = myparams.important_attribute_index_missionB
        core_attribute_index = myparams.core_attribute_index_missionB

    # One row per reduction method, processed in this order:
    # (result sub-folder, tag handed to gridSearch_AdaBoost, attribute subsets).
    # The 'PCA' row has no subsets: that run passes the full attribute matrix.
    reduction_runs = (
        ('Feature_Importance_RF', 'FeatureImportance', important_attribute_index),
        ('PCA', 'PCA', None),
        ('FDASE', 'FDASE', core_attribute_index),
    )

    for method_dir, method_tag, attribute_subsets in reduction_runs:
        # Each method starts from an empty result directory.
        result_dir = os.path.join('RESULTS', mission_str, method_dir, 'TRAIN', 'AdaBoost')
        if os.path.exists(result_dir):
            shutil.rmtree(result_dir)
        os.makedirs(result_dir, exist_ok=True)

        for split_num in range(1, split_repeat + 1):
            (trainX, validationX, testX,
             trainY, validationY, testY,
             train_sourcename, validation_sourcename, test_sourcename,
             predict_data, predict_sourcename, header_name) = utils.load_npy_data(
                data_npy_dir, split_num,
                delete_attris_list=['Unc_Flux1000', 'Unc_PL_Index', 'Unc_Frac_Variability'])

            if method_tag == 'PCA':
                # One grid search per configured component count, on the
                # unreduced arrays.
                for important_component in myparams.important_component_num:
                    gridSearch_AdaBoost(
                        trainX, trainY,
                        validationX, validationY,
                        testX, testY, test_sourcename,
                        predict_data, predict_sourcename,
                        result_dir, method_tag, important_component, split_num)
                continue

            all_attribute_index = list(range(len(header_name)))
            for important_attribute in attribute_subsets:
                # Drop every column that is not in the current subset.
                reduce_index = list(set(all_attribute_index) - set(important_attribute))
                reduced_trainX, reduced_validationX, reduced_testX, reduced_predictX = (
                    np.delete(features, reduce_index, axis=1)
                    for features in (trainX, validationX, testX, predict_data)
                )
                gridSearch_AdaBoost(
                    reduced_trainX, trainY,
                    reduced_validationX, validationY,
                    reduced_testX, testY, test_sourcename,
                    reduced_predictX, predict_sourcename,
                    result_dir, method_tag, len(reduce_index), split_num)


### MLP

In [None]:
# Model settings forwarded to MLP_train further down in this cell.
model_output_classes = 2
model_hidden_units = [32, 32, 64, 32, 32]
# Same length as model_hidden_units, every entry 0.5.
model_dropout_list = [0.5] * len(model_hidden_units)
model_act_fn = 'leaky_relu'

# Training settings (switch epochs to 5 for a quick trial run).
batch_size, epochs, learning_rate = 64, 500, 0.0005

# Reduction methods to loop over, and the folder name used for this classifier.
preprocessing_methods = ['Feature_Importance_RF', 'PCA', 'FDASE']
classifier_str = 'MLP'



# Train one MLP for every mission, split, reduction method and
# retained-feature setting. Each run writes into its own directory
#   RESULTS/<mission>/<method>/TRAIN/<classifier_str>/split_<n>_...
# which is emptied before the run starts.
for data_npy_dir, mission_str in check_info:

    # The attribute subsets are stored per mission in attribute_params.
    if mission_str == 'Mission_A':
        important_attribute_index = myparams.important_attribute_index_missionA
        core_attribute_index = myparams.core_attribute_index_missionA
    else:
        important_attribute_index = myparams.important_attribute_index_missionB
        core_attribute_index = myparams.core_attribute_index_missionB

    for split_num in range(1, split_repeat+1):
        (trainX, validationX, testX,
         trainY, validationY, testY,
         train_sourcename, validation_sourcename, test_sourcename,
         predictX, predict_sourcename, header_name) = utils.load_npy_data(
            data_npy_dir, split_num,
            delete_attris_list=['Unc_Flux1000', 'Unc_PL_Index', 'Unc_Frac_Variability'])

        all_attribute_index = list(range(len(header_name)))

        # Learn the per-column mean/std on the training split only and apply
        # that same transform to the validation, test and prediction arrays.
        # Standardising each array with its own statistics would put the
        # splits on different feature scales than the one the model is
        # trained on.
        scaler = StandardScaler().fit(trainX)
        trainX_scaled = scaler.transform(trainX)
        validationX_scaled = scaler.transform(validationX)
        testX_scaled = scaler.transform(testX)
        predictX_scaled = scaler.transform(predictX)

        # NOTE: trainX / validationX / testX / predictX are rebound below to
        # the reduced arrays of the current run; the *_scaled arrays stay
        # untouched and are the source for every run of this split.
        for prepro_str in preprocessing_methods:

            if prepro_str == 'PCA':
                for important_component in myparams.important_component_num:
                    result_dir = os.path.join('RESULTS', mission_str, prepro_str, 'TRAIN', classifier_str, 'split_'+str(split_num)+'_component_'+str(important_component))
                    if os.path.exists(result_dir):
                        shutil.rmtree(result_dir)
                    os.makedirs(result_dir, exist_ok=True)

                    # The projection is fitted on the training split and then
                    # applied unchanged to the other arrays.
                    pca = PCA(n_components=important_component)
                    trainX = pca.fit_transform(trainX_scaled)
                    validationX = pca.transform(validationX_scaled)
                    testX = pca.transform(testX_scaled)
                    predictX = pca.transform(predictX_scaled)

                    MLP_train(model_output_classes, model_hidden_units, model_dropout_list, model_act_fn,
                        trainX, validationX, testX, trainY, validationY, testY, predictX, predict_sourcename,
                        result_dir, batch_size, epochs, learning_rate)
                continue

            if prepro_str == 'Feature_Importance_RF':
                retain_attribute_index = important_attribute_index
            elif prepro_str == 'FDASE':
                retain_attribute_index = core_attribute_index
            else:
                # Fail loudly instead of reusing the subsets of a previous method.
                raise ValueError('unknown preprocessing method: ' + str(prepro_str))

            for important_attribute in retain_attribute_index:
                # Columns outside the current subset are removed; the number of
                # removed columns names the result directory.
                reduce_index = list(set(all_attribute_index) - set(important_attribute))
                result_dir = os.path.join('RESULTS', mission_str, prepro_str, 'TRAIN', classifier_str, 'split_'+str(split_num)+'_reduce_'+str(len(reduce_index)))
                if os.path.exists(result_dir):
                    shutil.rmtree(result_dir)
                os.makedirs(result_dir, exist_ok=True)

                trainX = np.delete(trainX_scaled, reduce_index, axis=1)
                validationX = np.delete(validationX_scaled, reduce_index, axis=1)
                testX = np.delete(testX_scaled, reduce_index, axis=1)
                predictX = np.delete(predictX_scaled, reduce_index, axis=1)

                MLP_train(model_output_classes, model_hidden_units, model_dropout_list, model_act_fn,
                    trainX, validationX, testX, trainY, validationY, testY, predictX, predict_sourcename,
                    result_dir, batch_size, epochs, learning_rate)

### Proposed

In [None]:
# Labels and training settings for the "Proposed" cell.
prepro_str, classifier_str = 'FDIDWT', 'CONV1D'
batch_size, epochs, learning_rate = 64, 500, 0.0005
# Sub-folder under RESULTS/<mission>/Proposed that receives this run's output.
debug_str = 'proposed_demo'


# Train the proposed model on the FDIDWT output files of every mission and
# split. Files are grouped by keyword; each keyword gets its own result
# directory under RESULTS/<mission>/Proposed/<debug_str>/split_<n>/.
for data_npy_dir, mission_str in check_info:
    for split_num in range(1, split_repeat+1):
        # Only the labels and predict_sourcename from this load are used
        # below; the features come from the FDIDWT files.
        (trainX, validationX, testX,
         trainY, validationY, testY,
         train_sourcename, validation_sourcename, test_sourcename,
         predictX, predict_sourcename, header_name) = utils.load_npy_data(
            data_npy_dir, split_num,
            delete_attris_list=['Unc_Flux1000', 'Unc_PL_Index', 'Unc_Frac_Variability'])

        level_data_dir = os.path.join('RESULTS', mission_str, 'FDIDWT', 'step2_4_2_iwt_outputs', 'split_'+str(split_num))
        file_path_list = utils.return_FDIDWT_files(level_data_dir)
        result_base_dir = os.path.join('RESULTS', mission_str, 'Proposed', debug_str, 'split_'+str(split_num))
        os.makedirs(result_base_dir, exist_ok=True)

        # Group the files by "<parent folder>_<file name without '.txt' and
        # without its last '_'-separated token>".
        for_train_data_dict = {}
        for file_path in file_path_list:
            name_tokens = os.path.basename(file_path).split('.txt')[0].split('_')[:-1]
            keyword = os.path.basename(os.path.dirname(file_path)) + '_' + '_'.join(name_tokens)
            for_train_data_dict.setdefault(keyword, []).append(file_path)

        for kw, filelist in for_train_data_dict.items():
            result_dir = os.path.join(result_base_dir, kw)
            # A keyword whose result directory already exists is not trained
            # again, so there is no point in loading its files.
            # NOTE(review): the directory is created before training starts, so
            # an interrupted run is also skipped next time; delete it to retrain.
            if os.path.exists(result_dir):
                continue

            # Start every keyword from a clean slate so that a missing file can
            # never be silently replaced by the previous keyword's array.
            FDIDWT_trainX = FDIDWT_validationX = FDIDWT_testX = FDIDWT_predictX = None

            for file_path in filelist:
                basename = os.path.basename(file_path)

                # The order of the checks matters: the first matching
                # substring decides which array the file provides.
                if 'train' in basename:
                    FDIDWT_trainX = utils.load_FDIDWT_data(file_path)
                elif 'validation' in basename:
                    FDIDWT_validationX = utils.load_FDIDWT_data(file_path)
                elif 'test' in basename:
                    FDIDWT_testX = utils.load_FDIDWT_data(file_path)
                elif 'un' in basename:
                    FDIDWT_predictX = utils.load_FDIDWT_data(file_path)
                else:
                    # Raise rather than call exit(): the error then surfaces
                    # with a traceback instead of ending the whole session.
                    raise ValueError('error file: ' + str(file_path))

            missing = [label for label, data in (('train', FDIDWT_trainX),
                                                 ('validation', FDIDWT_validationX),
                                                 ('test', FDIDWT_testX),
                                                 ('un', FDIDWT_predictX))
                       if data is None]
            if missing:
                raise ValueError('keyword ' + str(kw) + ' has no file for: ' + ', '.join(missing))

            os.makedirs(result_dir, exist_ok=True)
            proposed_train(FDIDWT_trainX, FDIDWT_validationX, FDIDWT_testX, trainY, validationY, testY, FDIDWT_predictX, predict_sourcename,
                result_dir, batch_size, epochs, learning_rate)