In [1]:
import pandas as pd

In [2]:
# Build a nested mapping so that df[split][kind] is the DataFrame read from
# "setting/{kind}_v2_{split}.xlsx"; the first spreadsheet column becomes the index.
df = {}
for k in ("train", "val", "test"):
    per_split = df[k] = {}
    for c in ("data", "nllf", "lf"):
        per_split[c] = pd.read_excel(f"setting/{c}_v2_{k}.xlsx", index_col=0)

In [3]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

In [4]:
def _drop_repeated_columns(frame):
    """Return ``frame`` keeping only the first occurrence of each column label."""
    return frame.loc[:, ~frame.columns.duplicated()]


def _include_flags(split):
    """Return a 0/1 Series for ``split``: 1 where "Final decision" equals "INCLUDE"."""
    decisions = df[split]["data"]["Final decision"]
    return (decisions == "INCLUDE").apply(int)


# Feature matrices: the "lf" table of each split with duplicated column labels removed.
X_train, X_val, X_test = (
    _drop_repeated_columns(df[name]["lf"]) for name in ("train", "val", "test")
)

# Binary targets derived from the "Final decision" column of each split's "data" table.
y_train, y_val, y_test = (_include_flags(name) for name in ("train", "val", "test"))

In [5]:
import numpy as np
import random

In [6]:
# Attach each split's label Series to its feature matrix (column-wise concat),
# then stack the rows of both splits under a fresh 0..n-1 index.
# Note: despite its name, this frame combines the *train* and *val* splits.
labelled_splits = [
    pd.concat([features_part, labels_part], axis=1)
    for features_part, labels_part in ((X_train, y_train), (X_val, y_val))
]
train_test_sample = pd.concat(labelled_splits, ignore_index=True)
train_test_sample

Unnamed: 0,farming systems,significantly,ecosystem,conventional,farming,footprints,higher,nt,tillage practices,agriculture,...,inorganic,synthetic,chemical,intensive,intensification,livestock,silage,water,soil erosion,Final decision
0,0,0,0,0,2,0,0,0,0,1,...,0,0,0,0,0,4,0,0,0,1
1,0,0,0,1,2,0,1,0,0,1,...,0,0,0,1,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595,0,0,0,0,1,0,0,0,0,6,...,0,0,0,0,0,0,0,0,0,0
1596,0,0,0,0,0,0,0,0,0,1,...,2,0,0,0,0,1,0,1,0,1
1597,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1598,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Array of feature names saved by an earlier step; repeated entries are counted
# later to pick the most frequent features.
# NOTE(review): presumably one entry per selection run — confirm against the producer.
lf_features_path = "output/lf_features.npy"
F_features = np.load(lf_features_path)

In [8]:
# Keep only the feature names that occur at least `k` times in F_features.
features, counts = np.unique(F_features, return_counts=True)
k = 5  # minimum number of occurrences required for a feature to be kept
new_best_features = features[counts >= k]

# Split the combined frame back into predictors and target.
X_train_val = train_test_sample.drop(columns="Final decision")
y_train_val = train_test_sample["Final decision"]

# Slice each design matrix once instead of re-indexing it for every call.
X_fit = X_train_val[new_best_features]
X_eval = X_test[new_best_features]

# Shallow tree with a fixed seed so repeated runs give the same model.
clf = DecisionTreeClassifier(random_state=42, max_depth=5)
clf.fit(X_fit, y_train_val)
print(clf.score(X_fit, y_train_val))  # accuracy on the data the tree was fitted on

# Predict on the held-out split once and reuse the predictions for both the
# printed text report and the dict form kept in `o`.
y_test_pred = clf.predict(X_eval)
print(classification_report(y_test, y_test_pred, digits=4))
o = classification_report(y_test, y_test_pred, digits=4, output_dict=True)
o

0.730625
              precision    recall  f1-score   support

           0     0.5954    0.8298    0.6933       188
           1     0.7681    0.5000    0.6057       212

    accuracy                         0.6550       400
   macro avg     0.6818    0.6649    0.6495       400
weighted avg     0.6869    0.6550    0.6469       400



{'0': {'precision': 0.5954198473282443,
  'recall': 0.8297872340425532,
  'f1-score': 0.6933333333333332,
  'support': 188},
 '1': {'precision': 0.7681159420289855,
  'recall': 0.5,
  'f1-score': 0.6057142857142858,
  'support': 212},
 'accuracy': 0.655,
 'macro avg': {'precision': 0.6817678946786149,
  'recall': 0.6648936170212766,
  'f1-score': 0.6495238095238095,
  'support': 400},
 'weighted avg': {'precision': 0.686948777519637,
  'recall': 0.655,
  'f1-score': 0.6468952380952382,
  'support': 400}}