In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from scipy.io import arff
from sklearn.preprocessing import MinMaxScaler
from scipy import stats

import sys
sys.path.insert(0, '..')

from algorithms.fs_neat import FS_NEAT
from algorithms.n3o import N3O
from algorithms.neat import set_seed
from utilities.activation_functions import Gaussian
from utilities.fitness_functions import torch_fitness_function

In [2]:
# Load train dataset
data = arff.loadarff('../datasets/ALL-AML_train.arff')
df_train = pd.DataFrame(data[0])

# Load test dataset
data = arff.loadarff('../datasets/ALL-AML_test.arff')
df_test = pd.DataFrame(data[0])

df_train.head()

Unnamed: 0,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute8,attribute9,attribute10,...,attribute7121,attribute7122,attribute7123,attribute7124,attribute7125,attribute7126,attribute7127,attribute7128,attribute7129,myclass
0,-214.0,-153.0,-58.0,88.0,-295.0,-558.0,199.0,-176.0,252.0,206.0,...,511.0,-125.0,389.0,-37.0,793.0,329.0,36.0,191.0,-37.0,b'ALL'
1,-139.0,-73.0,-1.0,283.0,-264.0,-400.0,-330.0,-168.0,101.0,74.0,...,837.0,-36.0,442.0,-17.0,782.0,295.0,11.0,76.0,-14.0,b'ALL'
2,-76.0,-49.0,-307.0,309.0,-376.0,-650.0,33.0,-367.0,206.0,-215.0,...,1199.0,33.0,168.0,52.0,1138.0,777.0,41.0,228.0,-41.0,b'ALL'
3,-135.0,-114.0,265.0,12.0,-419.0,-585.0,158.0,-253.0,49.0,31.0,...,835.0,218.0,174.0,-110.0,627.0,170.0,-50.0,126.0,-91.0,b'ALL'
4,-106.0,-125.0,-76.0,168.0,-230.0,-284.0,4.0,-122.0,70.0,252.0,...,649.0,57.0,504.0,-26.0,250.0,314.0,14.0,56.0,-25.0,b'ALL'


In [3]:
df_train['myclass'].unique()

array([b'ALL', b'AML'], dtype=object)

In [4]:
# Change category class label to binary class label
labels = {b'ALL' : 1, b'AML' : 0}
df_train['myclass'] = df_train['myclass'].replace(labels)
df_test['myclass'] = df_test['myclass'].replace(labels)

# Count class distribution from both datasets
n_all_train = np.sum(df_train['myclass'].to_numpy(dtype=np.float32))
n_aml_train = df_train.shape[0] - n_all_train
n_all_test = np.sum(df_test['myclass'].to_numpy(dtype=np.float32))
n_aml_test = df_test.shape[0] - n_all_test

# Print information
print(f"Train dataset shape: {df_train.shape}, Relapsed instances: {n_all_train}, Non-Relapsed instances: {n_aml_train}")
print(f"Test dataset shape: {df_test.shape}, Relapsed instances: {n_all_test}, Non-Relapsed instances: {n_aml_test}")


Train dataset shape: (38, 7130), Relapsed instances: 27.0, Non-Relapsed instances: 11.0
Test dataset shape: (34, 7130), Relapsed instances: 20.0, Non-Relapsed instances: 14.0


In [5]:
# Convert train dataset to Numpy array
x_train = df_train.iloc[:, :-1].to_numpy(dtype=np.float32)
y_train = df_train.iloc[:, -1].to_numpy(dtype=np.float32)

# Convert test dataset to Numpy array
x_test = df_test.iloc[:, :-1].to_numpy(dtype=np.float32)
y_test = df_test.iloc[:, -1].to_numpy(dtype=np.float32)

# Kruskal Wallis H Test
kw_pvalue = np.zeros(x_train.shape[1])

for feature in range(x_train.shape[1]):
	_, kw_pvalue[feature] = stats.kruskal(x_train[:, feature], y_train)

kw_feature_selected = np.argwhere(kw_pvalue < 1e-5)
kw_pvalue = kw_pvalue[kw_feature_selected]
x_train_kw = x_train[:, kw_feature_selected[:, 0]]
x_test_kw = x_test[:, kw_feature_selected[:, 0]]

print(f'Attributes selected after KW H Test: {kw_feature_selected.shape[0]}')


Attributes selected after KW H Test: 5499
