In [1]:
import pandas as pd

In [2]:
# Load every spreadsheet into a two-level mapping: df[split][kind].
# Splits are "train"/"val"/"test"; kinds are "data"/"nllf"/"lf".
# The first spreadsheet column is used as the row index of each frame.
df = {
    k: {
        c: pd.read_excel(f"setting/{c}_v2_{k}.xlsx", index_col=0)
        for c in ("data", "nllf", "lf")
    }
    for k in ("train", "val", "test")
}

In [3]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

In [4]:
# Feature matrices: the "lf" frame of each split, keeping only the first
# occurrence of any repeated column name.
X_train, X_val, X_test = (
    frame.loc[:, ~frame.columns.duplicated()]
    for frame in (df[split]["lf"] for split in ("train", "val", "test"))
)

# Binary targets: 1 where "Final decision" equals "INCLUDE", otherwise 0.
y_train, y_val, y_test = (
    (df[split]["data"]["Final decision"] == "INCLUDE").apply(int)
    for split in ("train", "val", "test")
)

In [5]:
from sklearn_genetic import GAFeatureSelectionCV
import numpy as np
import random

2023-10-21 17:22:30.417546: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
from sklearn.model_selection import KFold

In [7]:
# Genetic-algorithm feature selection, repeated once per K-fold split.
# Every feature chosen by a fold's best individual is appended to F_features,
# so a name appears once for each fold that selected it (no de-duplication).
random_seed = 54

# Seed NumPy's and the stdlib's global RNGs before the search starts
# (presumably the sources the GA draws from -- TODO confirm).
np.random.seed(random_seed)
random.seed(random_seed)

# Stack the train and validation splits row-wise; each part carries its
# feature columns plus the binary "Final decision" label, and the row index
# is reset to 0..n-1.
train_test_sample = pd.concat(
    [
        pd.concat([X_train, y_train], axis=1),
        pd.concat([X_val, y_val], axis=1),
    ],
    ignore_index=True,
)

# Loop-invariant inputs, built once rather than once per fold.
support = [col for col in train_test_sample.columns if col != "Final decision"]
# The estimator is given a frame with positional column labels (built from
# `.values`); selected names are recovered below by position from `support`.
X_ga = pd.DataFrame(train_test_sample[support].values)
y_ga = train_test_sample["Final decision"]

kf = KFold(n_splits=15, random_state=42, shuffle=True)
F_features = []
for i, (train_index, test_index) in enumerate(kf.split(train_test_sample)):
    print(f"Fold {i}:")

    evolved_estimator = GAFeatureSelectionCV(
        estimator=DecisionTreeClassifier(
            random_state=42,
            max_depth=5,
            criterion="gini",
        ),
        # A single (train, test) pair: candidates are scored on this fold's
        # split only.
        cv=[(train_index, test_index)],
        scoring="f1_macro",
        population_size=30,
        generations=10,
        n_jobs=-1,
        verbose=True,
        keep_top_k=2,
        elitism=True,
    )

    evolved_estimator.fit(X_ga, y_ga)

    # best_features_ holds one truthy/falsy flag per input column, in the
    # same order as `support`.
    best_features = [
        name
        for name, keep in zip(support, evolved_estimator.best_features_)
        if keep
    ]
    F_features.extend(best_features)

Fold 0:


gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	30    	0.66335	0.0498815  	0.733072   	0.511706   
1  	60    	0.71022	0.0294163  	0.778787   	0.641687   
2  	60    	0.724663	0.0272024  	0.778787   	0.672783   
3  	60    	0.730313	0.0316545  	0.778787   	0.654085   
4  	60    	0.731279	0.0261637  	0.778787   	0.669513   
5  	60    	0.72759 	0.0178233  	0.77234    	0.691157   
6  	60    	0.730674	0.0186385  	0.77234    	0.700699   
7  	60    	0.727898	0.0202073  	0.763379   	0.680597   
8  	60    	0.738078	0.0252335  	0.765618   	0.680597   
9  	60    	0.74075 	0.0267698  	0.802635   	0.674014   
10 	60    	0.734458	0.0296719  	0.765618   	0.674014   
Fold 1:
gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	30    	0.653765	0.0339332  	0.714614   	0.58894    
1  	60    	0.673562	0.0215756  	0.710498   	0.636878   
2  	60    	0.682155	0.0241394  	0.721079   	0.628786   
3  	60    	0.689668	0.021458   	0.721079   	0.64499    
4  	60    	0.687371	0.0208175  	0.712366   

In [8]:
# Persist the pooled feature names from all folds as a NumPy array.
# The list is saved as-is, so names picked in several folds appear repeatedly.
selected_features = np.array(F_features)
with open('output/lf_features.npy', 'wb') as f:
    np.save(f, selected_features)