In [1]:
import pandas as pd

In [2]:
# Load every workbook into a nested mapping addressed as df[split][table].
# Files are read from "setting/<table>_v2_<split>.xlsx"; the first column of
# each sheet becomes the row index.
df = {}
for split in ("train", "val", "test"):
    df[split] = {
        table: pd.read_excel(f"setting/{table}_v2_{split}.xlsx", index_col=0)
        for table in ("data", "nllf", "lf")
    }

In [3]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

In [4]:
def _unique_columns(frame):
    """Return `frame` keeping only the first occurrence of each column label."""
    return frame.loc[:, ~frame.columns.duplicated()]


def _include_label(frame):
    """Return 1 where "Final decision" equals "INCLUDE", otherwise 0."""
    return (frame["Final decision"] == "INCLUDE").apply(int)


# Feature matrices: the "nllf" table of each split with duplicate columns dropped.
X_train, X_val, X_test = (
    _unique_columns(df[split]["nllf"]) for split in ("train", "val", "test")
)

# Binary targets derived from the "data" table of each split.
y_train, y_val, y_test = (
    _include_label(df[split]["data"]) for split in ("train", "val", "test")
)

In [5]:
import numpy as np
import random

In [6]:
# Attach each split's label column to its features, then stack the train and
# val rows into one frame with a fresh 0..n-1 index.
# NOTE(review): despite the name, this frame is built from train + val only.
train_with_label = pd.concat([X_train, y_train], axis=1)
val_with_label = pd.concat([X_val, y_val], axis=1)
train_test_sample = pd.concat([train_with_label, val_with_label], ignore_index=True)
train_test_sample

Unnamed: 0,lf5(N),lf5(Y),b2(N),b2(Y),exp26(N),exp26(Y),b12_v1_v3(N),b12_v1_v3(Y),raw10(N),raw10(Y),...,raw13(Y),exp18(N),exp18(Y),b13(N),b13(Y),raw39(N),raw39(Y),exp21_v2(N),exp21_v2(Y),Final decision
0,0.006797,0.991204,0.007208,0.993486,0.005154,0.997402,0.996565,0.001720,0.010484,0.983034,...,0.988148,0.015532,0.988488,0.009329,0.985278,0.006359,0.994883,0.010598,0.990263,1
1,0.464686,0.680502,0.986265,0.012898,0.456685,0.725145,0.994612,0.004176,0.585268,0.561689,...,0.976192,0.809093,0.236501,0.994463,0.005285,0.581683,0.601325,0.526812,0.646516,1
2,0.008111,0.990680,0.019364,0.979410,0.993112,0.014233,0.997153,0.001900,0.689119,0.399902,...,0.961028,0.627380,0.475045,0.994930,0.005071,0.015963,0.982280,0.736898,0.375038,0
3,0.938699,0.039409,0.006707,0.996540,0.008989,0.996240,0.015612,0.991958,0.981622,0.012530,...,0.892292,0.978232,0.014809,0.988281,0.010148,0.977685,0.016512,0.971108,0.021210,1
4,0.004827,0.995260,0.991239,0.005303,0.018197,0.991456,0.995041,0.002370,0.654691,0.582867,...,0.996197,0.097908,0.879996,0.009482,0.991748,0.014242,0.995127,0.021497,0.982178,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595,0.012703,0.965127,0.027734,0.971884,0.062056,0.926370,0.997019,0.001740,0.470471,0.406782,...,0.794686,0.134199,0.851395,0.857962,0.059040,0.028292,0.933151,0.150524,0.754581,0
1596,0.028358,0.942425,0.985174,0.012300,0.039989,0.980104,0.691809,0.433737,0.718664,0.245848,...,0.989001,0.216472,0.784942,0.991878,0.006347,0.057826,0.958073,0.028895,0.959353,1
1597,0.007458,0.994409,0.005312,0.994881,0.007673,0.994913,0.978652,0.015731,0.014239,0.989168,...,0.994663,0.015180,0.987056,0.988918,0.006183,0.008343,0.994538,0.008844,0.992776,0
1598,0.020486,0.990103,0.012529,0.991320,0.007614,0.996302,0.809357,0.356282,0.060577,0.968685,...,0.992049,0.045135,0.976912,0.978252,0.017559,0.031301,0.983528,0.026847,0.986191,0


In [7]:
# Array of feature names saved by an earlier step; how it was produced is not
# visible in this notebook — presumably one entry per selection, TODO confirm.
nllf_features_path = "output/nllf_features.npy"
F_features = np.load(nllf_features_path)

In [8]:
# Keep only the feature names that occur at least `k` times in F_features.
features, counts = np.unique(F_features, return_counts=True)
k = 5  # minimum number of occurrences for a feature to be retained
new_best_features = features[counts >= k]

# Split the combined frame back into the feature matrix and the label column.
X_train_val = train_test_sample.drop(columns="Final decision")
y_train_val = train_test_sample["Final decision"]

# Select the retained columns once and reuse the selections below instead of
# re-slicing the frames for every call.
X_fit_selected = X_train_val[new_best_features]
X_test_selected = X_test[new_best_features]

clf = DecisionTreeClassifier(random_state=42, max_depth=5)
clf.fit(X_fit_selected, y_train_val)
# Score on the same rows the tree was fitted on (not a held-out estimate).
print(clf.score(X_fit_selected, y_train_val))

# Predict on the test split once; the text report and the dict report are both
# derived from the same predictions.
y_test_pred = clf.predict(X_test_selected)
print(classification_report(y_test, y_test_pred, digits=4))
o = classification_report(y_test, y_test_pred, digits=4, output_dict=True)
o

0.75625
              precision    recall  f1-score   support

           0     0.5885    0.6543    0.6196       188
           1     0.6597    0.5943    0.6253       212

    accuracy                         0.6225       400
   macro avg     0.6241    0.6243    0.6225       400
weighted avg     0.6262    0.6225    0.6226       400



{'0': {'precision': 0.5885167464114832,
  'recall': 0.6542553191489362,
  'f1-score': 0.619647355163728,
  'support': 188},
 '1': {'precision': 0.6596858638743456,
  'recall': 0.5943396226415094,
  'f1-score': 0.6253101736972705,
  'support': 212},
 'accuracy': 0.6225,
 'macro avg': {'precision': 0.6241013051429144,
  'recall': 0.6242974708952228,
  'f1-score': 0.6224787644304992,
  'support': 400},
 'weighted avg': {'precision': 0.6262363786668003,
  'recall': 0.6225,
  'f1-score': 0.6226486489865056,
  'support': 400}}