In [1]:
import os
import pickle

import numpy as np
import pandas as pd
# Path prefix for the input spreadsheets. It is joined to file names by plain
# string concatenation, so the trailing slash is required.
root="data/"

In [2]:
# Load one spreadsheet per (split, dataset) pair into a nested mapping:
# df[split][dataset] -> DataFrame, using the first sheet column as the index.
df = {
    split: {
        dataset: pd.read_excel(root + f"{dataset}_{split}_sample_v3.xlsx", index_col=0)
        for dataset in ["nlfl"]
    }
    for split in ["train", "val", "test"]
}

In [3]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

In [4]:
# Candidate feature columns: every training column whose name contains both
# "chatgpt_" and an opening parenthesis.
cols_nlfl = [
    name
    for name in df["train"]["nlfl"].columns
    if "chatgpt_" in name and "(" in name
]


def _select_unique_columns(frame):
    """Return the `cols_nlfl` columns of `frame`, keeping only the first
    occurrence of any repeated column name."""
    picked = frame[cols_nlfl]
    return picked.loc[:, ~picked.columns.duplicated()]


# Feature matrices, one per split, all built from the training column list.
X_train, X_val, X_test = (
    _select_unique_columns(df[split]["nlfl"]) for split in ("train", "val", "test")
)

# Targets, coerced element-wise to Python ints.
y_train, y_val, y_test = (
    df[split]["nlfl"]["label"].apply(int) for split in ("train", "val", "test")
)

In [5]:
from sklearn_genetic import GAFeatureSelectionCV
import random

2023-10-21 17:24:15.580820: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
from sklearn.model_selection import KFold

In [7]:
# Seed the global NumPy and stdlib RNGs before the stochastic search below.
# NOTE(review): presumably these are the generators the genetic algorithm
# draws from -- confirm against the sklearn_genetic version in use.
random_seed = 54

np.random.seed(random_seed)
random.seed(random_seed)

# Pool the training and validation rows (features + label) into one frame
# with a fresh 0..n-1 index; the folds below are drawn from this pool.
train_test_sample = pd.concat([
    pd.concat([X_train, y_train], axis=1),
    pd.concat([X_val, y_val], axis=1),
], ignore_index=True)

# Everything in this section is identical for every fold, so it is computed
# once instead of inside the loop.
support = [
    x for x in train_test_sample.columns
    if x != "label" and ("chatgpt_v" in x or "Q3" in x)
]
candidate_features = train_test_sample[support]
# The search is fed a frame with positional (0..k-1) column labels; the real
# names are recovered afterwards from `candidate_features.columns`.
ga_inputs = pd.DataFrame(candidate_features.values)
ga_target = train_test_sample["label"]

kf = KFold(n_splits=15, random_state=42, shuffle=True)

# Names of the features selected in each fold, concatenated. A feature chosen
# in several folds appears several times.
F_features = []
for i, (train_index, test_index) in enumerate(kf.split(train_test_sample)):
    print(f"Fold {i}:")

    # Each fold is handed to the search as its single train/test split.
    evolved_estimator = GAFeatureSelectionCV(
        estimator=DecisionTreeClassifier(
            random_state=42,
            max_depth=5,
            criterion="gini",
        ),
        cv=[(train_index, test_index)],
        scoring="f1_macro",
        population_size=30,
        generations=5,
        n_jobs=-1,
        verbose=True,
        keep_top_k=2,
        elitism=True,
    )

    evolved_estimator.fit(ga_inputs, ga_target)

    # `best_features_` is a per-column mask aligned with `ga_inputs`; map it
    # back to the original column names.
    best_features = [
        name
        for name, keep in zip(candidate_features.columns, evolved_estimator.best_features_)
        if keep
    ]
    F_features += best_features

Fold 0:


gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	30    	0.70583	0.0187928  	0.750106   	0.676524   
1  	60    	0.725547	0.017481   	0.765382   	0.701475   
2  	60    	0.729666	0.0188883  	0.765382   	0.699979   
3  	60    	0.7401  	0.0146738  	0.765382   	0.708022   
4  	60    	0.743044	0.0171708  	0.765382   	0.693534   
5  	60    	0.748809	0.0149853  	0.765382   	0.699979   
Fold 1:
gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	30    	0.676148	0.0267245  	0.75037    	0.619495   
1  	60    	0.700496	0.019825   	0.75037    	0.664016   
2  	60    	0.713459	0.0191016  	0.75037    	0.672934   
3  	60    	0.717005	0.0171201  	0.75037    	0.690554   
4  	60    	0.719619	0.0188449  	0.75037    	0.677609   
5  	60    	0.719643	0.016588   	0.753276   	0.685691   
Fold 2:
gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	30    	0.719078	0.0280757  	0.757545   	0.666778   
1  	60    	0.742236	0.0150137  	0.776264   	0.689006   
2  	60    	0.749553	0.0127119  	0.

In [8]:
# Persist the accumulated per-fold feature names as a NumPy array.
# The output directory is created first so that a missing folder cannot
# discard the results of the long-running search above.
output_path = "output/nllf_features.npy"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

selected_features = np.array(F_features)
with open(output_path, "wb") as f:
    np.save(f, selected_features)