In [1]:
import os
import pickle

import numpy as np
import pandas as pd
# Directory prefix (trailing slash included) that is prepended to the
# "<feature-set>_<split>.xlsx" file names when the spreadsheets are read.
root = "data/"

In [2]:
# Nested mapping of the input spreadsheets: df[split][feature_set] -> DataFrame.
# Each workbook is read with its first column used as the index.
df = {
    split: {
        feature_set: pd.read_excel(root + f"{feature_set}_{split}.xlsx", index_col=0)
        for feature_set in ["bong"]
    }
    for split in ["train", "val", "test"]
}

In [3]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

In [4]:
def _features_and_labels(frame, feature_cols):
    """Split one "bong" frame into its feature matrix and integer labels.

    Args:
        frame: DataFrame holding the feature columns plus a "label" column.
        feature_cols: Names of the columns to keep as features.

    Returns:
        (features, labels): `features` is `frame[feature_cols]` with any
        repeated column name reduced to its first occurrence; `labels` is the
        "label" column with every value passed through `int`.
    """
    features = frame[feature_cols]
    first_occurrence = ~features.columns.duplicated()
    features = features.loc[:, first_occurrence]
    labels = (frame["label"]).apply(int)
    return features, labels


# Feature columns are taken from the training frame: everything but the
# identifier and the target. The same list is applied to every split.
cols_bong = [col for col in df["train"]["bong"].columns if col not in ["id", "label"]]

X_train, y_train = _features_and_labels(df["train"]["bong"], cols_bong)
X_val, y_val = _features_and_labels(df["val"]["bong"], cols_bong)
X_test, y_test = _features_and_labels(df["test"]["bong"], cols_bong)

In [5]:
from sklearn_genetic import GAFeatureSelectionCV
import random

2023-10-21 17:24:35.528562: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
from sklearn.model_selection import KFold

In [7]:
# Genetic-algorithm feature selection, repeated once per K-fold split.
# F_features collects the names picked in every fold (a name is appended once
# for each fold that selected it, so repeats are expected).

# Seed both global RNGs up front so reruns start from the same state.
# NOTE(review): presumably the genetic search draws from Python's `random`
# and/or NumPy's global generator -- confirm against sklearn-genetic-opt.
random_seed = 54

np.random.seed(random_seed)
random.seed(random_seed)

# Pool the training and validation rows (despite the variable's name, the
# test split is not included here).
train_test_sample = pd.concat(
    [
        pd.concat([X_train, y_train], axis=1),
        pd.concat([X_val, y_val], axis=1),
    ],
    ignore_index=True,
)

# Loop-invariant inputs: built once instead of being rebuilt on every fold.
support = [x for x in train_test_sample.columns if x not in ["label"]]
# The search is fed a frame with positional (0..n-1) column labels; `support`
# maps each position back to the original column name afterwards.
ga_features = pd.DataFrame(train_test_sample[support].values)
ga_labels = train_test_sample["label"]

kf = KFold(n_splits=15, random_state=42, shuffle=True)
F_features = []
for i, (train_index, test_index) in enumerate(kf.split(train_test_sample)):
    print(f"Fold {i}:")

    # A single (train, test) pair is passed as `cv`, so each search is scored
    # on exactly this fold's held-out rows.
    evolved_estimator = GAFeatureSelectionCV(
        estimator=DecisionTreeClassifier(
            random_state=42,
            max_depth=5,
            criterion="gini",
        ),
        cv=[(train_index, test_index)],
        scoring="f1_macro",
        population_size=30,
        generations=5,
        n_jobs=-1,
        verbose=True,
        keep_top_k=2,
        elitism=True,
    )

    evolved_estimator.fit(ga_features, ga_labels)

    # `best_features_` is indexed by column position; keep the names whose
    # entry is truthy.
    best_features = [
        name
        for name, selected in zip(support, evolved_estimator.best_features_)
        if selected
    ]
    F_features += best_features

Fold 0:
gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	30    	0.487077	0.0120581  	0.49867    	0.469163   
1  	60    	0.495847	0.00697735 	0.515873   	0.479763   
2  	60    	0.499635	0.00672178 	0.524517   	0.488121   
3  	60    	0.499635	0.0112656  	0.524517   	0.470038   
4  	60    	0.502543	0.0110299  	0.524517   	0.479388   
5  	60    	0.501268	0.00931962 	0.515194   	0.47033    
Fold 1:
gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	30    	0.474063	0.0109451  	0.491367   	0.456901   
1  	60    	0.482677	0.00485929 	0.491367   	0.464349   
2  	60    	0.48201 	0.00667338 	0.491367   	0.457513   
3  	60    	0.48302 	0.00571149 	0.489838   	0.463652   
4  	60    	0.482886	0.00584315 	0.489838   	0.464698   
5  	60    	0.483242	0.0063178  	0.491367   	0.464001   
Fold 2:
gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	30    	0.484149	0.0177439  	0.514111   	0.462054   
1  	60    	0.501271	0.010574   	0.514111   	0.469258   
2  	60    	0.50392 	0.00

In [8]:
# Write the accumulated feature names in F_features to disk as a NumPy array.
output_path = 'output/bong_features.npy'
# Create the target directory first: open(..., 'wb') raises FileNotFoundError
# when "output/" does not exist yet.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'wb') as f:
    o = np.array(F_features)
    np.save(f, o)