## Imports

In [5]:
import os
import numpy as np
import pandas as pd
from random import shuffle


## Functions

In [6]:

def read_excel_data(excel_path):
    """Read a spreadsheet and split it into names, labels and numeric features.

    The first spreadsheet column is always discarded (presumably a row-index
    column -- confirm against the actual files). Among the remaining columns,
    the one headed ``'source_name'`` and the one headed ``'label'`` are
    returned as plain lists; every other column is converted to ``float``.

    A cell that cannot be converted to ``float`` is reported on stdout and
    stored as ``NaN``. This keeps every row the same length and keeps
    ``data`` row-aligned with ``source_name_list`` and ``label_list``.

    Parameters
    ----------
    excel_path : str or path-like
        Location of the workbook; passed straight to ``pandas.read_excel``.

    Returns
    -------
    source_name_list : list
        Values of the ``'source_name'`` column, in row order.
    header_name_list : list
        Headers of the feature columns, in column order.
    label_list : list
        Values of the ``'label'`` column, in row order.
    data : numpy.ndarray
        Feature matrix, one row per spreadsheet row and one column per entry
        of ``header_name_list``.

    Raises
    ------
    ValueError
        If the sheet has no ``'source_name'`` or no ``'label'`` column.
    """
    df = pd.read_excel(excel_path)

    # Drop the leading column before looking at anything else.
    table = df.iloc[:, 1:]
    column_names = list(table.columns)
    print(len(column_names), column_names)
    # The reported shape counts the header as a row (historical log format).
    print('read all_data shape: {}'.format((table.shape[0] + 1, table.shape[1])))

    # Locate the two bookkeeping columns; everything else is a feature.
    source_name_index = label_name_index = None
    header_name_list, feature_indices = [], []
    for j, name in enumerate(column_names):
        if name == 'source_name':
            source_name_index = j
            print('source_name_index: {}'.format(source_name_index))
        elif name == 'label':
            label_name_index = j
            print('label_name_index: {}'.format(label_name_index))
        else:
            header_name_list.append(name)
            feature_indices.append(j)

    missing = [
        name
        for name, index in (('source_name', source_name_index), ('label', label_name_index))
        if index is None
    ]
    if missing:
        raise ValueError(
            'column(s) {} not found in {!r}'.format(', '.join(missing), excel_path)
        )

    values = table.to_numpy(dtype=object)
    source_name_list = values[:, source_name_index].tolist()
    label_list = values[:, label_name_index].tolist()

    data = []
    # Row numbers in the error message are 1-based because row 0 is the header.
    for i, row_values in enumerate(values, start=1):
        row_data = []
        for j in feature_indices:
            cell = row_values[j]
            try:
                row_data.append(float(cell))
            except (TypeError, ValueError):
                print('error in ({}, {}) -> {}'.format(i, j, cell))
                # Keep the slot so the row stays aligned with the headers.
                row_data.append(np.nan)
        if row_data:
            data.append(row_data)

    return source_name_list, header_name_list, label_list, np.array(data)


## Load and split Data

In [8]:

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# One entry per dataset:
#   [file_path, label_str_1, label_str_2, label_predict, save_npy_base_dir]
# Rows labelled `label_predict` are unlabelled samples set aside for later
# prediction; rows labelled `label_str_1` / `label_str_2` are the two classes.
# Paths are built with os.path.join so the cell runs on any operating system.
split_info = [
    [os.path.join('datasets', 'Dataset_A.xlsx'), 'agn', 'non-agn', 'un', os.path.join('datasets', 'NPY_DATA_A')],
    [os.path.join('datasets', 'Dataset_B.xlsx'), 'bll', 'fsrq', 'bcu', os.path.join('datasets', 'NPY_DATA_B')],
]
train_ratio, validation_ratio = 0.8, 0.1  # whatever is left of each class is the test set
split_repeat = 10  # randomly split 10 times
split_seed = 0  # fixed seed so a fresh "Restart & Run All" reproduces the same splits
split_rng = np.random.default_rng(split_seed)


for file_path, label_str_1, label_str_2, label_predict, save_npy_base_dir in split_info:
    os.makedirs(save_npy_base_dir, exist_ok=True)

    # load excel data
    source_name_list, header_name_list, label_list, data = read_excel_data(file_path)

    # Map every label except `label_predict` to an integer, numbered in order
    # of first appearance in the file.
    label2index, index2label = {}, {}
    cnt = 0
    for lab in label_list:
        if lab != label_predict and lab not in label2index:
            label2index[lab] = cnt
            index2label[cnt] = lab
            cnt += 1
    for lab, indx in label2index.items():
        print(lab, indx)

    # Names, labels and features must describe the same rows and columns.
    row, col = np.shape(data)
    assert len(source_name_list) == row
    assert len(header_name_list) == col
    assert len(label_list) == row

    # save the predict samples
    predict_sourcename_list, predict_data_list = [], []
    for source_name, label, attri_data in zip(source_name_list, label_list, data):
        if label == label_predict:
            predict_sourcename_list.append(source_name)
            predict_data_list.append(attri_data)
    predict_data_list = np.array(predict_data_list)
    print('shape of predict samples', np.shape(predict_data_list))
    np.save(os.path.join(save_npy_base_dir, 'predict_data.npy'), predict_data_list)
    np.save(os.path.join(save_npy_base_dir, 'predict_sourcename.npy'), predict_sourcename_list)
    np.save(os.path.join(save_npy_base_dir, 'header_name.npy'), header_name_list)

    # Separate the labelled samples by class so each class is split with the
    # same ratios. Rows are walked positionally: looking a row up by its source
    # name would return the first match and mislabel duplicated names.
    source_name_list_1, source_name_list_2 = [], []
    label_list_1, label_list_2 = [], []
    attri_data_list_1, attri_data_list_2 = [], []
    for source_name, label, attri_data in zip(source_name_list, label_list, data):
        if label == label_predict:
            continue
        label_int = label2index[label]
        if label == label_str_1:
            source_name_list_1.append(source_name)
            label_list_1.append(label_int)
            attri_data_list_1.append(attri_data)
        elif label == label_str_2:
            source_name_list_2.append(source_name)
            label_list_2.append(label_int)
            attri_data_list_2.append(attri_data)
        else:
            # Raise instead of exit(): exit() would shut the kernel down.
            raise ValueError('label error: {}'.format(label))
    source_name_list_1 = np.array(source_name_list_1)
    label_list_1 = np.array(label_list_1)
    attri_data_list_1 = np.array(attri_data_list_1)
    source_name_list_2 = np.array(source_name_list_2)
    label_list_2 = np.array(label_list_2)
    attri_data_list_2 = np.array(attri_data_list_2)
    print('data 1:', np.shape(attri_data_list_1), 'data 2:', np.shape(attri_data_list_2))

    # int() truncates, so any rounding remainder ends up in the test set.
    num_samples_1 = len(source_name_list_1)
    num_samples_2 = len(source_name_list_2)
    num_train_1 = int(train_ratio*num_samples_1)
    num_validation_1 = int(validation_ratio*num_samples_1)
    num_train_2 = int(train_ratio*num_samples_2)
    num_validation_2 = int(validation_ratio*num_samples_2)

    # randomly split
    for split_num in range(1, split_repeat+1):
        # A fresh random ordering of each class for every repetition.
        index_list_1 = split_rng.permutation(num_samples_1)
        index_list_2 = split_rng.permutation(num_samples_2)

        train_sourcename_list, train_data, train_labels = [], [], []
        validation_sourcename_list, validation_data, validation_labels = [], [], []
        test_sourcename_list, test_data, test_labels = [], [], []

        per_class = (
            (index_list_1, source_name_list_1, attri_data_list_1, label_list_1, num_train_1, num_validation_1),
            (index_list_2, source_name_list_2, attri_data_list_2, label_list_2, num_train_2, num_validation_2),
        )
        for order, names, feats, labels, n_train, n_val in per_class:
            # First n_train shuffled samples -> train, next n_val -> validation,
            # the rest -> test. Class 1 is appended before class 2 in each subset.
            subsets = (
                (order[:n_train], train_sourcename_list, train_data, train_labels),
                (order[n_train:n_train+n_val], validation_sourcename_list, validation_data, validation_labels),
                (order[n_train+n_val:], test_sourcename_list, test_data, test_labels),
            )
            for picked, names_out, data_out, labels_out in subsets:
                names_out.extend(names[picked])
                data_out.extend(feats[picked])
                labels_out.extend(labels[picked])

        train_data = np.array(train_data)
        validation_data = np.array(validation_data)
        test_data = np.array(test_data)
        print('split {}, train: {}, validation: {}, test: {}'.format(split_num, np.shape(train_data), np.shape(validation_data), np.shape(test_data)))

        # Each repetition gets its own sub-directory: <base>/split_<n>/
        save_npy_dir = os.path.join(save_npy_base_dir, 'split_'+str(split_num))
        os.makedirs(save_npy_dir, exist_ok=True)

        np.save(os.path.join(save_npy_dir, 'train_data.npy'), train_data)
        np.save(os.path.join(save_npy_dir, 'validation_data.npy'), validation_data)
        np.save(os.path.join(save_npy_dir, 'test_data.npy'), test_data)
        np.save(os.path.join(save_npy_dir, 'train_sourcename.npy'), train_sourcename_list)
        np.save(os.path.join(save_npy_dir, 'validation_sourcename.npy'), validation_sourcename_list)
        np.save(os.path.join(save_npy_dir, 'test_sourcename.npy'), test_sourcename_list)
        np.save(os.path.join(save_npy_dir, 'train_labels.npy'), train_labels)
        np.save(os.path.join(save_npy_dir, 'validation_labels.npy'), validation_labels)
        np.save(os.path.join(save_npy_dir, 'test_labels.npy'), test_labels)





18 ['source_name', 'label', 'Pivot_Energy', 'Flux1000', 'Unc_Flux1000', 'PL_Index', 'Unc_PL_Index', 'Variability_Index', 'Frac_Variability', 'Unc_Frac_Variability', 'Flux_Band1', 'Flux_Band2', 'Flux_Band3', 'Flux_Band4', 'Flux_Band5', 'Flux_Band6', 'Flux_Band7', 'Flux_Band8']
read all_data shape: (6660, 18)
source_name_index: 0
label_name_index: 1
agn 0
non-agn 1
shape of predict samples (2291, 16)
data 1: (3809, 16) data 2: (559, 16)
split 1, train: (3494, 16), validation: (435, 16), test: (439, 16)
split 2, train: (3494, 16), validation: (435, 16), test: (439, 16)
split 3, train: (3494, 16), validation: (435, 16), test: (439, 16)
split 4, train: (3494, 16), validation: (435, 16), test: (439, 16)
split 5, train: (3494, 16), validation: (435, 16), test: (439, 16)
split 6, train: (3494, 16), validation: (435, 16), test: (439, 16)
split 7, train: (3494, 16), validation: (435, 16), test: (439, 16)
split 8, train: (3494, 16), validation: (435, 16), test: (439, 16)
split 9, train: (3494, 16