In [1]:
import pandas as pd

In [2]:
# Read every workbook into a nested mapping: df[split][kind].
# For each split ("train", "val", "test") and each kind ("data", "bong") the
# file "setting/{kind}_v2_{split}.xlsx" is loaded with its first column used as
# the row index.
# NOTE(review): "data" appears to hold the labelled records and "bong" the
# feature matrix (see how later cells use them) - confirm the naming.
df = {
    split: {
        kind: pd.read_excel(f"setting/{kind}_v2_{split}.xlsx", index_col=0)
        for kind in ["data", "bong"]
    }
    for split in ["train", "val", "test"]
}

In [3]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

In [4]:
def _without_duplicate_columns(frame):
    """Return `frame` keeping only the first occurrence of each column label."""
    return frame.loc[:, ~frame.columns.duplicated()]


def _include_flag(records):
    """Return 1 where "Final decision" equals "INCLUDE" and 0 everywhere else."""
    return (records["Final decision"] == "INCLUDE").apply(int)


# Feature matrices: the "bong" sheet of each split with repeated column labels removed.
X_train, X_val, X_test = (
    _without_duplicate_columns(df[split]["bong"])
    for split in ("train", "val", "test")
)

# Binary targets derived from the "Final decision" column of each split's "data" sheet.
y_train, y_val, y_test = (
    _include_flag(df[split]["data"])
    for split in ("train", "val", "test")
)

In [5]:
from sklearn_genetic import GAFeatureSelectionCV
import numpy as np
import random

2023-10-21 17:22:20.993918: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
from sklearn.model_selection import KFold

In [7]:
# Seed both global RNGs before any stochastic work in this cell.
# NOTE(review): presumably the genetic search draws from these global
# generators - confirm against the sklearn-genetic-opt documentation.
random_seed = 54

np.random.seed(random_seed)
random.seed(random_seed)

# Pool the train and validation splits (features + label side by side) into one
# frame with a fresh 0..n-1 index; KFold below partitions these pooled rows.
train_test_sample = pd.concat([
    pd.concat([X_train, y_train], axis=1),
    pd.concat([X_val, y_val], axis=1)
    ], ignore_index=True)

# Everything below is identical for every fold, so it is built once here
# instead of being recomputed inside the loop.
support = [x for x in train_test_sample.columns if x not in ["Final decision"]]
feature_frame = train_test_sample.drop(columns="Final decision")[support]
feature_names = list(feature_frame.columns)
# The estimator is fitted on positional (0..n-1) column labels; selected
# positions are mapped back to the real names via `feature_names` afterwards.
X_all = pd.DataFrame(feature_frame.values)
y_all = train_test_sample["Final decision"]

kf = KFold(n_splits=15, random_state=42, shuffle=True)

# Names of the features chosen in each fold, concatenated. A feature picked in
# several folds appears several times (no de-duplication is performed).
F_features = []
for i, (train_index, test_index) in enumerate(kf.split(train_test_sample)):
    print(f"Fold {i}:")

    evolved_estimator = GAFeatureSelectionCV(
        estimator=DecisionTreeClassifier(
            random_state=42,
            max_depth=5,
            criterion="gini",
        ),
        # A single predefined split: this fold's train/test row positions.
        cv=[(train_index, test_index)],
        scoring="f1_macro",
        population_size=30,
        generations=10,
        n_jobs=-1,
        verbose=True,
        keep_top_k=2,
        elitism=True,
    )

    evolved_estimator.fit(X_all, y_all)

    # `best_features_` is indexed per feature column; keep the names whose
    # entry is truthy.
    best_features = [
        name
        for name, selected in zip(feature_names, evolved_estimator.best_features_)
        if selected
    ]
    F_features += best_features

Fold 0:


gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	30    	0.618411	0.0391128  	0.700699   	0.549474   
1  	60    	0.685155	0.0379312  	0.738295   	0.587885   
2  	60    	0.687462	0.0379756  	0.754414   	0.607442   
3  	60    	0.688484	0.0339617  	0.754414   	0.616286   
4  	60    	0.700262	0.0300808  	0.754414   	0.635387   
5  	60    	0.692401	0.0413004  	0.754414   	0.607442   
6  	60    	0.700378	0.0436278  	0.754414   	0.62535    
7  	60    	0.71801 	0.0382481  	0.783153   	0.635004   
8  	60    	0.691626	0.0491951  	0.783153   	0.611192   
9  	60    	0.710215	0.0479424  	0.783153   	0.639539   
10 	60    	0.700846	0.0388811  	0.783153   	0.64458    
Fold 1:
gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	30    	0.661204	0.0241653  	0.722773   	0.624899   
1  	60    	0.67836 	0.0249342  	0.722773   	0.635004   
2  	60    	0.695019	0.0233858  	0.738112   	0.657051   
3  	60    	0.688494	0.0226821  	0.738112   	0.643333   
4  	60    	0.695743	0.0234376  	0.738112

In [8]:
# Persist the accumulated per-fold feature names as a NumPy array in .npy format.
selected_features = np.array(F_features)
with open('output/bong_features.npy', 'wb') as sink:
    np.save(sink, selected_features)