In [1]:
import pandas as pd
import numpy as np
import pickle
# Directory prefix that is string-concatenated in front of the dataset
# filenames when they are read; the trailing slash is therefore required.
root="data/"

In [2]:
# Read one workbook per (split, dataset) pair into a nested mapping addressed
# as df[split][dataset]. The first spreadsheet column is used as the index.
split_names = ("train", "val", "test")
dataset_names = ("nlfl",)
df = {
    split: {
        dataset: pd.read_excel(root + f"{dataset}_{split}_sample_v3.xlsx", index_col=0)
        for dataset in dataset_names
    }
    for split in split_names
}

In [3]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

In [4]:
# Feature columns are chosen from the training frame only: every column whose
# name contains both "chatgpt_" and "(" (for example "chatgpt_v1 (N)").
cols_nlfl = [
    name
    for name in df["train"]["nlfl"].columns
    if "chatgpt_" in name and "(" in name
]


def _feature_matrix(frame):
    """Return the `cols_nlfl` columns of `frame`, keeping the first of any repeated column name."""
    selected = frame[cols_nlfl]
    keep_mask = ~selected.columns.duplicated()
    return selected.loc[:, keep_mask]


def _int_labels(frame):
    """Return the "label" column of `frame` with every entry passed through `int`."""
    return frame["label"].apply(int)


X_train, X_val, X_test = (
    _feature_matrix(df[split]["nlfl"]) for split in ("train", "val", "test")
)

y_train, y_val, y_test = (
    _int_labels(df[split]["nlfl"]) for split in ("train", "val", "test")
)

In [5]:
# Attach each split's labels to its feature matrix (column-wise), then stack
# the two labelled frames row-wise under a fresh 0..n-1 index.
# Note: despite the variable name, the rows come from the train and val splits.
labelled_splits = [
    pd.concat([features_part, labels_part], axis=1)
    for features_part, labels_part in ((X_train, y_train), (X_val, y_val))
]
train_test_sample = pd.concat(labelled_splits, ignore_index=True)
train_test_sample

Unnamed: 0,chatgpt_v1 (N),chatgpt_v1 (Y),chatgpt_v2 (N),chatgpt_v2 (Y),chatgpt_v3 (N),chatgpt_v3 (Y),chatgpt_v4 (N),chatgpt_v4 (Y),chatgpt_v5 (N),chatgpt_v5 (Y),...,chatgpt_SP_2 (Y),chatgpt_SP_3 (N),chatgpt_SP_3 (Y),chatgpt_SP_4 (N),chatgpt_SP_4 (Y),chatgpt_SP_5 (N),chatgpt_SP_5 (Y),chatgpt_SP_6 (N),chatgpt_SP_6 (Y),label
0,0.837045,0.074928,0.874950,0.058878,0.842697,0.072170,0.872751,0.065264,0.944624,0.030124,...,0.080620,0.602511,0.224162,0.834352,0.094326,0.823963,0.088727,0.829290,0.083835,0
1,0.893856,0.042877,0.917656,0.033072,0.899637,0.040439,0.919270,0.035785,0.960115,0.017034,...,0.046663,0.877010,0.052646,0.901047,0.042246,0.884865,0.051048,0.887147,0.048326,0
2,0.868321,0.068624,0.888266,0.058634,0.876257,0.065225,0.896162,0.053497,0.960362,0.025203,...,0.076121,0.856120,0.074968,0.880512,0.062470,0.857966,0.079416,0.859009,0.076588,0
3,0.830221,0.069570,0.884124,0.056058,0.857856,0.066485,0.863737,0.065393,0.939130,0.027966,...,0.092856,0.337625,0.653284,0.628601,0.356157,0.836543,0.112854,0.812596,0.092018,0
4,0.810073,0.117722,0.856272,0.095584,0.831209,0.106522,0.850595,0.084074,0.957828,0.023903,...,0.125209,0.780242,0.136388,0.831060,0.108094,0.788978,0.150105,0.789846,0.143326,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14452,0.884656,0.133864,0.945302,0.070176,0.900210,0.115229,0.884784,0.121732,0.959946,0.035919,...,0.149740,0.432901,0.542170,0.888701,0.134504,0.883691,0.130722,0.843456,0.161103,0
14453,0.797020,0.158155,0.883019,0.101123,0.816630,0.145359,0.812437,0.146260,0.944786,0.042557,...,0.172934,0.621840,0.345189,0.811108,0.149026,0.800502,0.173553,0.765926,0.183528,0
14454,0.833436,0.106078,0.914619,0.062911,0.853236,0.097988,0.850571,0.091232,0.947122,0.033179,...,0.124764,0.556392,0.292726,0.856776,0.085556,0.869414,0.105912,0.800037,0.134119,0
14455,0.839765,0.180800,0.940706,0.092435,0.874007,0.157076,0.803798,0.165990,0.936916,0.056630,...,0.212695,0.209000,0.829604,0.597261,0.380869,0.839076,0.198679,0.774161,0.234423,0


In [6]:
# Array saved earlier in NumPy's .npy format; its values are counted in the
# next cell to decide which features to keep.
nllf_features_path = "output/nllf_features.npy"
F_features = np.load(nllf_features_path)

In [7]:
# Count how often each distinct value occurs in F_features and keep those that
# occur at least `k` times. The survivors are used as column labels below, so
# F_features is presumably an array of feature (column) names -- TODO confirm.
features, counts = np.unique(F_features, return_counts=True)
k = 5  # minimum number of occurrences for a feature to be kept
new_best_features = features[counts >= k]
if new_best_features.size == 0:
    # Fail here with the actual cause instead of deeper inside the estimator.
    raise ValueError(
        f"No feature occurs at least {k} times in F_features; nothing to train on."
    )

# Split the combined frame back into features and target.
X_train_val = train_test_sample.drop(columns="label")
y_train_val = train_test_sample["label"]
# Select the retained columns once and reuse the result for fit and score.
X_train_val_best = X_train_val[new_best_features]

clf = DecisionTreeClassifier(random_state=42, max_depth=5)
clf.fit(X_train_val_best, y_train_val)
# Accuracy on the very data the tree was fitted on (not a held-out estimate).
print(clf.score(X_train_val_best, y_train_val))

# Predict on the test split once; both report formats are built from the same
# predictions instead of running the model twice.
y_test_pred = clf.predict(X_test[new_best_features])
print(classification_report(y_test, y_test_pred, digits=4))
o = classification_report(y_test, y_test_pred, digits=4, output_dict=True)
o

0.8980424707754029
              precision    recall  f1-score   support

           0     0.8727    0.9630    0.9156       541
           1     0.7500    0.4412    0.5556       136

    accuracy                         0.8582       677
   macro avg     0.8113    0.7021    0.7356       677
weighted avg     0.8480    0.8582    0.8433       677



{'0': {'precision': 0.8726968174204355,
  'recall': 0.9630314232902033,
  'f1-score': 0.9156414762741651,
  'support': 541},
 '1': {'precision': 0.75,
  'recall': 0.4411764705882353,
  'f1-score': 0.5555555555555556,
  'support': 136},
 'accuracy': 0.8581979320531757,
 'macro avg': {'precision': 0.8113484087102177,
  'recall': 0.7021039469392193,
  'f1-score': 0.7355985159148604,
  'support': 677},
 'weighted avg': {'precision': 0.8480487122960938,
  'recall': 0.8581979320531757,
  'f1-score': 0.8433051613292155,
  'support': 677}}