In [1]:
import os
import pickle

import numpy as np
import pandas as pd
# Directory prefix (with trailing slash) prepended to every input file name.
root = "data/"

In [2]:
# File-name pattern for each table kind; "{split}" is filled with train/val/test.
table_files = {
    "nlfl": "nlfl_{split}_sample_v3.xlsx",
    "mf": "mf_features_{split}_task_C1.xlsx",
}

# df[split][kind] -> DataFrame read from the matching Excel file,
# using the first spreadsheet column as the index.
df = {}
for split in ("train", "val", "test"):
    df[split] = {
        kind: pd.read_excel(root + pattern.format(split=split), index_col=0)
        for kind, pattern in table_files.items()
    }

In [3]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

In [4]:
# Feature columns: every column of the training "mf" table whose name does not
# contain "linguistic". The same list is applied to all three splits.
cols_mf = [name for name in df["train"]["mf"].columns if "linguistic" not in name]


def _select_features(split):
    """Return the `cols_mf` columns of a split's "mf" table, keeping only the
    first occurrence of any repeated column name."""
    features = df[split]["mf"][cols_mf]
    is_repeat = features.columns.duplicated()
    return features.loc[:, ~is_repeat]


def _select_labels(split, row_ids):
    """Return the "label" column of a split's "nlfl" table as ints, looked up
    by "id" in the order given by `row_ids`."""
    by_id = df[split]["nlfl"].set_index("id")
    return by_id.loc[row_ids]["label"].apply(int)


X_train = _select_features("train")
X_val = _select_features("val")
X_test = _select_features("test")

# Labels are aligned to the feature rows through the feature frames' index.
y_train = _select_labels("train", X_train.index)
y_val = _select_labels("val", X_val.index)
y_test = _select_labels("test", X_test.index)

In [5]:
from sklearn_genetic import GAFeatureSelectionCV
import random

2023-10-21 17:25:08.646143: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
from sklearn.model_selection import KFold

In [7]:
# Seed both NumPy's and the stdlib's global RNGs before the genetic search runs.
random_seed = 54

np.random.seed(random_seed)
random.seed(random_seed)

# Pool the train and validation splits (features plus the "label" column) into
# one frame with a fresh 0..n-1 index; the test split is not used in this cell.
train_test_sample = pd.concat([
    pd.concat([X_train, y_train], axis=1),
    pd.concat([X_val, y_val], axis=1),
], ignore_index=True)

# Loop-invariant search inputs, built once rather than once per fold.
# `support` holds the candidate feature names (everything except the label).
support = [x for x in train_test_sample.columns if x not in ["label"]]
# The estimator is fitted on a frame with positional (0..k-1) column names, so
# the selection mask it produces lines up with `support` by position.
ga_features = pd.DataFrame(train_test_sample[support].values)
ga_labels = train_test_sample["label"]

kf = KFold(n_splits=15, random_state=42, shuffle=True)

# Names of the features selected in each fold, concatenated. A feature chosen
# in several folds appears several times.
F_features = []
for i, (train_index, test_index) in enumerate(kf.split(train_test_sample)):
    print(f"Fold {i}:")

    # One genetic search per fold: the fold's (train, held-out) row positions
    # are passed as the only CV split.
    evolved_estimator = GAFeatureSelectionCV(
        estimator=DecisionTreeClassifier(
            random_state=42,
            max_depth=5,
            criterion="gini",
        ),
        cv=[(train_index, test_index)],
        scoring="f1_macro",
        population_size=30,
        generations=5,
        n_jobs=-1,
        verbose=True,
        keep_top_k=2,
        elitism=True,
    )

    evolved_estimator.fit(ga_features, ga_labels)

    # `best_features_` is indexed by column position; map it back to names.
    selected_mask = evolved_estimator.best_features_
    best_features = [name for pos, name in enumerate(support) if selected_mask[pos]]
    F_features += best_features

Fold 0:


gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	30    	0.804092	0.0187542  	0.838976   	0.76822    
1  	60    	0.822216	0.0140232  	0.841706   	0.795395   
2  	60    	0.827483	0.0120324  	0.841706   	0.804337   
3  	60    	0.831584	0.0113229  	0.845759   	0.8069     
4  	60    	0.829977	0.0139064  	0.845759   	0.801359   
5  	60    	0.832758	0.0140049  	0.845759   	0.799167   
Fold 1:
gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	30    	0.851175	0.0126742  	0.884377   	0.82071    
1  	60    	0.860346	0.00809068 	0.884377   	0.84148    
2  	60    	0.865444	0.00811309 	0.884377   	0.845766   
3  	60    	0.865436	0.00792519 	0.889151   	0.846848   
4  	60    	0.870126	0.0120527  	0.896206   	0.850952   
5  	60    	0.870859	0.0108485  	0.896206   	0.855272   
Fold 2:
gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	30    	0.814973	0.0129222  	0.842311   	0.7906     
1  	60    	0.826755	0.0101236  	0.858771   	0.814806   
2  	60    	0.831902	0.0108503  	

In [8]:
# Persist the per-fold selected feature names (names may repeat across folds)
# as a NumPy array.
# Create the target directory first so a missing "output/" folder does not
# abort the save after the long search above.
os.makedirs("output", exist_ok=True)
with open("output/mf_features.npy", "wb") as f:
    np.save(f, np.array(F_features))