In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Read every workbook into a two-level mapping, df[split][kind], where split is one of
# "train"/"val"/"test" and kind is one of "data"/"nllf"/"lf". The first spreadsheet
# column of each file is used as the frame's index.
df = {
    split: {
        kind: pd.read_excel(f"setting/{kind}_v2_{split}.xlsx", index_col=0)
        for kind in ["data", "nllf", "lf"]
    }
    for split in ["train", "val", "test"]
}

In [3]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

In [4]:
# Feature matrices: the "nllf" frame of each split, keeping only the first occurrence
# of any column name that appears more than once.
X_train, X_val, X_test = (
    df[split]["nllf"].loc[:, ~df[split]["nllf"].columns.duplicated()]
    for split in ("train", "val", "test")
)

# Binary targets: 1 where the "Final decision" column equals "INCLUDE", otherwise 0.
y_train, y_val, y_test = (
    (df[split]["data"]["Final decision"] == "INCLUDE").apply(int)
    for split in ("train", "val", "test")
)

In [5]:
from sklearn_genetic import GAFeatureSelectionCV
import numpy as np
import random

2023-10-21 17:22:48.896420: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
from sklearn.model_selection import KFold

In [7]:
# Genetic-algorithm feature selection, repeated once per fold of a 15-way shuffled
# KFold over the pooled train + validation rows. The names of the features selected in
# each fold are appended to F_features, so a feature chosen in several folds appears
# several times in that list.
random_seed = 54

# Seed both global RNGs before the search starts.
# NOTE(review): presumably this is what makes the GA search repeatable -- confirm that
# sklearn_genetic draws from these global generators. KFold and the decision tree below
# use their own fixed random_state of 42, not random_seed.
np.random.seed(random_seed)
random.seed(random_seed)

# Pool train and validation rows; ignore_index=True gives a fresh 0..n-1 index, so the
# positional indices produced by KFold line up with the rows handed to fit().
train_test_sample = pd.concat(
    [
        pd.concat([X_train, y_train], axis=1),
        pd.concat([X_val, y_val], axis=1),
    ],
    ignore_index=True,
)

# Candidate features: every non-target column whose name starts with "b".
# These inputs do not change between folds, so they are built once, outside the loop.
support = [
    col
    for col in train_test_sample.columns
    if col != "Final decision" and col.startswith("b")
]
# The estimator is fitted on a frame with positional column labels (0..n-1); the
# boolean mask it returns is mapped back to names through `support`.
candidate_features = pd.DataFrame(train_test_sample[support].values)
target = train_test_sample["Final decision"]

kf = KFold(n_splits=15, random_state=42, shuffle=True)
F_features = []
for i, (train_index, test_index) in enumerate(kf.split(train_test_sample)):
    print(f"Fold {i}:")

    evolved_estimator = GAFeatureSelectionCV(
        estimator=DecisionTreeClassifier(
            random_state=42,
            max_depth=5,
            criterion="gini",
        ),
        # A single (train, test) split per run: each candidate feature subset is
        # scored with f1_macro on this fold's held-out rows only.
        cv=[(train_index, test_index)],
        scoring="f1_macro",
        population_size=30,
        generations=10,
        n_jobs=-1,
        verbose=True,
        keep_top_k=2,
        elitism=True,
    )

    evolved_estimator.fit(candidate_features, target)

    # best_features_ is indexed per candidate column, in the same order as `support`.
    best_features = [
        name
        for name, selected in zip(support, evolved_estimator.best_features_)
        if selected
    ]
    F_features += best_features

Fold 0:


gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	30    	0.650458	0.0215086  	0.700908   	0.607442   
1  	60    	0.665909	0.0142724  	0.70028    	0.633465   
2  	60    	0.672611	0.0223039  	0.707728   	0.606618   
3  	60    	0.681417	0.0117988  	0.70028    	0.651159   
4  	60    	0.686806	0.0224016  	0.762286   	0.642329   
5  	60    	0.696188	0.0307086  	0.762286   	0.644829   
6  	60    	0.706373	0.0343031  	0.762286   	0.653722   
7  	60    	0.700775	0.0338943  	0.762286   	0.643333   
8  	60    	0.70641 	0.0319333  	0.753369   	0.654206   
9  	60    	0.703916	0.0293565  	0.753369   	0.654839   
10 	60    	0.697265	0.0274696  	0.753369   	0.661154   
Fold 1:
gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	30    	0.633153	0.0284846  	0.682215   	0.563652   
1  	60    	0.645241	0.0172173  	0.682215   	0.606618   
2  	60    	0.654241	0.0175603  	0.688871   	0.624561   
3  	60    	0.654215	0.0211635  	0.69774    	0.616286   
4  	60    	0.66971 	0.0226813  	0.719013

In [8]:
# Persist the feature names collected in F_features as a NumPy array.
# The output directory is created first so that a missing "output/" folder cannot
# raise FileNotFoundError and discard the results of the feature-selection run.
features_path = Path('output/nllf_features.npy')
features_path.parent.mkdir(parents=True, exist_ok=True)
np.save(features_path, np.array(F_features))