In [1]:
import pandas as pd

In [2]:
# Read every (split, table) spreadsheet into a nested dict: df[split][table].
# Files live at setting/<table>_v2_<split>.xlsx; the first column of each
# sheet is used as the row index.
df = {}
for k in ("train", "val", "test"):
    for c in ("data", "nllf", "bong"):
        # setdefault creates the per-split dict the first time a split is seen.
        df.setdefault(k, {})[c] = pd.read_excel(
            f"setting/{c}_v2_{k}.xlsx", index_col=0
        )

In [3]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

In [4]:
def _split_features(tables):
    """Build the feature matrix for one split.

    Places the "nllf" and "bong" tables of ``tables`` side by side
    (column-wise concat, aligned on the row index) and drops any column
    whose name has already appeared, keeping the first occurrence.
    """
    combined = pd.concat([tables["nllf"], tables["bong"]], axis=1)
    return combined.loc[:, ~combined.columns.duplicated()]


def _split_labels(tables):
    """Build the binary target for one split.

    Returns an integer Series that is 1 where the "Final decision" column of
    the "data" table equals "INCLUDE" and 0 otherwise.
    """
    # Vectorised cast instead of a per-element .apply(int); int64 matches the
    # dtype the element-wise conversion produced.
    return (tables["data"]["Final decision"] == "INCLUDE").astype("int64")


# Same recipe for every split, so apply the helpers instead of repeating it.
X_train, X_val, X_test = (
    _split_features(df[split]) for split in ("train", "val", "test")
)
y_train, y_val, y_test = (
    _split_labels(df[split]) for split in ("train", "val", "test")
)

In [5]:
import numpy as np
import random

In [6]:
# Attach each split's labels to its features as an extra column, then stack
# the train rows on top of the validation rows with a fresh 0..n-1 index.
# NOTE(review): despite its name, this frame holds train + val rows, not test.
train_test_sample = pd.concat(
    [
        pd.concat([features, labels], axis="columns")
        for features, labels in ((X_train, y_train), (X_val, y_val))
    ],
    axis="index",
    ignore_index=True,
)
train_test_sample

Unnamed: 0,lf5(N),lf5(Y),b2(N),b2(Y),exp26(N),exp26(Y),b12_v1_v3(N),b12_v1_v3(Y),raw10(N),raw10(Y),...,bong_992,bong_993,bong_994,bong_995,bong_996,bong_997,bong_998,bong_999,bong_1000,Final decision
0,0.006797,0.991204,0.007208,0.993486,0.005154,0.997402,0.996565,0.001720,0.010484,0.983034,...,0.000000,0.0,0.00000,0.0,0.000000,0.000000,0.0,0.0,0.0,1
1,0.464686,0.680502,0.986265,0.012898,0.456685,0.725145,0.994612,0.004176,0.585268,0.561689,...,0.000000,0.0,0.00000,0.0,0.000000,0.065172,0.0,0.0,0.0,1
2,0.008111,0.990680,0.019364,0.979410,0.993112,0.014233,0.997153,0.001900,0.689119,0.399902,...,0.000000,0.0,0.06129,0.0,0.000000,0.000000,0.0,0.0,0.0,0
3,0.938699,0.039409,0.006707,0.996540,0.008989,0.996240,0.015612,0.991958,0.981622,0.012530,...,0.045521,0.0,0.00000,0.0,0.000000,0.000000,0.0,0.0,0.0,1
4,0.004827,0.995260,0.991239,0.005303,0.018197,0.991456,0.995041,0.002370,0.654691,0.582867,...,0.000000,0.0,0.00000,0.0,0.000000,0.000000,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595,0.012703,0.965127,0.027734,0.971884,0.062056,0.926370,0.997019,0.001740,0.470471,0.406782,...,0.000000,0.0,0.00000,0.0,0.052758,0.000000,0.0,0.0,0.0,0
1596,0.028358,0.942425,0.985174,0.012300,0.039989,0.980104,0.691809,0.433737,0.718664,0.245848,...,0.000000,0.0,0.00000,0.0,0.000000,0.039327,0.0,0.0,0.0,1
1597,0.007458,0.994409,0.005312,0.994881,0.007673,0.994913,0.978652,0.015731,0.014239,0.989168,...,0.000000,0.0,0.00000,0.0,0.000000,0.076594,0.0,0.0,0.0,0
1598,0.020486,0.990103,0.012529,0.991320,0.007614,0.996302,0.809357,0.356282,0.060577,0.968685,...,0.000000,0.0,0.00000,0.0,0.000000,0.000000,0.0,0.0,0.0,0


In [7]:
# Load the saved feature-name arrays and count how often each name occurs.
# presumably each array lists a feature once per selection run that picked
# it -- TODO confirm against the code that writes these .npy files.
F_features1 = np.load("output/nllf_features.npy")
F_features2 = np.load("output/bong_features.npy")
features1, counts1 = np.unique(F_features1, return_counts=True)
features2, counts2 = np.unique(F_features2, return_counts=True)

# Keep only the features that occur at least k times in their own array.
k = 5
new_best_features = list(features1[counts1 >= k]) + list(features2[counts2 >= k])

# Separate the combined train+val frame back into features and target.
X_train_val = train_test_sample.drop(columns="Final decision")
y_train_val = train_test_sample["Final decision"]

# Fit a depth-limited tree on the selected features; the fixed random_state
# keeps the fit reproducible across runs.
clf = DecisionTreeClassifier(random_state=42, max_depth=5)
clf.fit(X_train_val[new_best_features], y_train_val)

# Accuracy on the data the tree was fitted on (not a held-out estimate).
print(clf.score(X_train_val[new_best_features], y_train_val))

# Predict on the test split once and reuse the result for both the printed
# report and the dict-form report, instead of running predict twice.
y_test_pred = clf.predict(X_test[new_best_features])
print(classification_report(y_test, y_test_pred, digits=4))
o = classification_report(y_test, y_test_pred, digits=4, output_dict=True)
o

0.78375
              precision    recall  f1-score   support

           0     0.6345    0.6649    0.6494       188
           1     0.6897    0.6604    0.6747       212

    accuracy                         0.6625       400
   macro avg     0.6621    0.6626    0.6620       400
weighted avg     0.6637    0.6625    0.6628       400



{'0': {'precision': 0.6345177664974619,
  'recall': 0.6648936170212766,
  'f1-score': 0.6493506493506492,
  'support': 188},
 '1': {'precision': 0.6896551724137931,
  'recall': 0.660377358490566,
  'f1-score': 0.6746987951807228,
  'support': 212},
 'accuracy': 0.6625,
 'macro avg': {'precision': 0.6620864694556275,
  'recall': 0.6626354877559213,
  'f1-score': 0.6620247222656861,
  'support': 400},
 'weighted avg': {'precision': 0.6637405916331176,
  'recall': 0.6625,
  'f1-score': 0.6627851666405883,
  'support': 400}}