In [1]:
import pandas as pd
import numpy as np
import pickle
# Directory prefix (with trailing slash) prepended to every input file name read below.
root = "data/"

In [2]:
# Load every workbook into a nested dict: df[split][table] -> DataFrame.
# The two tables follow different file-name patterns, so the name is chosen
# first and the read itself happens in one place. The first spreadsheet
# column becomes the index (index_col=0).
df = {}
for split in ("train", "val", "test"):
    df[split] = {}
    for table in ("nlfl", "mf"):
        if table == "nlfl":
            workbook = f"{table}_{split}_sample_v3.xlsx"
        else:
            workbook = f"{table}_features_{split}_task_C1.xlsx"
        df[split][table] = pd.read_excel(root + workbook, index_col=0)

In [3]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

In [4]:
# For each split, pick the "mf" rows by label in the order given by the "id"
# column of the matching "nlfl" table, then move the index into a column and
# drop the resulting "id" column so the frame ends up with a default index.
# (Presumably the "mf" index is named "id" -- the drop would fail otherwise.)
# NOTE: this overwrites df[split]["mf"]; after one run the index is positional,
# so re-running the cell would look the ids up against that new index. Run once
# per freshly loaded `df`.
for split in ("train", "val", "test"):
    tables = df[split]
    ordered_ids = tables["nlfl"]["id"]
    mf_ordered = tables["mf"].loc[ordered_ids]
    tables["mf"] = mf_ordered.reset_index().drop(columns="id")

In [5]:
# Column selection is decided on the training tables and reused for all splits:
#   - "mf":   every column whose name does not contain "linguistic"
#   - "nlfl": every column whose name contains both "chatgpt_" and "("
cols_mf = [name for name in df["train"]["mf"].columns if "linguistic" not in name]
cols_nlfl = [
    name
    for name in df["train"]["nlfl"].columns
    if "chatgpt_" in name and "(" in name
]


def _feature_frame(split):
    """Return the selected nlfl and mf columns of `split` side by side.

    When a column name occurs more than once after concatenation, only its
    first occurrence is kept.
    """
    side_by_side = pd.concat(
        [df[split]["nlfl"][cols_nlfl], df[split]["mf"][cols_mf]],
        axis=1,
    )
    first_occurrence = ~side_by_side.columns.duplicated()
    return side_by_side.loc[:, first_occurrence]


def _label_series(split):
    """Return the "label" column of the split's nlfl table, each value passed through int()."""
    return df[split]["nlfl"]["label"].apply(int)


X_train, X_val, X_test = (_feature_frame(name) for name in ("train", "val", "test"))
y_train, y_val, y_test = (_label_series(name) for name in ("train", "val", "test"))

In [6]:
# Attach the label column to each feature frame, then stack the train rows on
# top of the validation rows with a fresh 0..n-1 index.
# Note: despite its name, `train_test_sample` holds train + validation data;
# the test split is not part of it.
train_with_label = pd.concat([X_train, y_train], axis=1)
val_with_label = pd.concat([X_val, y_val], axis=1)
train_test_sample = pd.concat([train_with_label, val_with_label], ignore_index=True)
train_test_sample

Unnamed: 0,chatgpt_v1 (N),chatgpt_v1 (Y),chatgpt_v2 (N),chatgpt_v2 (Y),chatgpt_v3 (N),chatgpt_v3 (Y),chatgpt_v4 (N),chatgpt_v4 (Y),chatgpt_v5 (N),chatgpt_v5 (Y),...,semantic<&>ratio_ud,semantic<&>ratio_slang,semantic<&>ratio_keywords,semantic<&>ratio_faces,traditional<&>ratio_vowel,traditional<&>ratio_no_numbers,traditional<&>ratio_punct,traditional<&>exist_numbs,traditional<&>max_len_number,label
0,0.837045,0.074928,0.874950,0.058878,0.842697,0.072170,0.872751,0.065264,0.944624,0.030124,...,0.000000,0.000000,0.000000,1.000000,0.333333,1.000000,0.000000,1,1,0
1,0.893856,0.042877,0.917656,0.033072,0.899637,0.040439,0.919270,0.035785,0.960115,0.017034,...,0.244898,0.022222,0.022222,0.044444,0.421053,0.555556,0.032000,1,4,0
2,0.868321,0.068624,0.888266,0.058634,0.876257,0.065225,0.896162,0.053497,0.960362,0.025203,...,0.437500,0.066667,0.066667,0.000000,0.428571,0.533333,0.045455,1,3,0
3,0.830221,0.069570,0.884124,0.056058,0.857856,0.066485,0.863737,0.065393,0.939130,0.027966,...,0.333333,0.133333,0.000000,0.000000,0.456522,0.800000,0.037736,1,2,0
4,0.810073,0.117722,0.856272,0.095584,0.831209,0.106522,0.850595,0.084074,0.957828,0.023903,...,0.384615,0.000000,0.000000,0.083333,0.523810,0.500000,0.027027,1,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14452,0.884656,0.133864,0.945302,0.070176,0.900210,0.115229,0.884784,0.121732,0.959946,0.035919,...,0.046512,0.000000,0.000000,0.050000,0.463235,1.025000,0.000000,1,2,0
14453,0.797020,0.158155,0.883019,0.101123,0.816630,0.145359,0.812437,0.146260,0.944786,0.042557,...,0.058824,0.022222,0.022222,0.044444,0.445122,1.022222,0.000000,1,2,0
14454,0.833436,0.106078,0.914619,0.062911,0.853236,0.097988,0.850571,0.091232,0.947122,0.033179,...,0.036364,0.039216,0.000000,0.019608,0.483146,1.019608,0.000000,1,2,0
14455,0.839765,0.180800,0.940706,0.092435,0.874007,0.157076,0.803798,0.165990,0.936916,0.056630,...,0.000000,0.100000,0.100000,0.100000,0.468085,1.000000,0.000000,0,0,0


In [7]:
# Feature names stored by earlier steps. Presumably each array lists the
# features chosen across several selection runs, so a name can occur many
# times -- TODO confirm against the notebook that writes these files.
F_features1 = np.load("output/nllf_features.npy")
F_features2 = np.load("output/mf_features.npy")
features1, counts1 = np.unique(F_features1, return_counts=True)
features2, counts2 = np.unique(F_features2, return_counts=True)

# Keep a feature only if it occurs at least k times within its own array.
k = 5
new_best_features = list(features1[counts1 >= k]) + list(features2[counts2 >= k])

# Fit on train + validation rows; the test split is used for evaluation only.
X_train_val = train_test_sample.drop(columns="label")
y_train_val = train_test_sample["label"]

clf = DecisionTreeClassifier(random_state=42, max_depth=5)
clf.fit(X_train_val[new_best_features], y_train_val)

# Accuracy on the same rows the tree was fitted on (not a held-out estimate).
print(clf.score(X_train_val[new_best_features], y_train_val))

# Predict the test split once and reuse the result for both report formats,
# instead of running the classifier twice on identical input.
y_pred = clf.predict(X_test[new_best_features])
print(classification_report(y_test, y_pred, digits=4))
o = classification_report(y_test, y_pred, digits=4, output_dict=True)
o

0.9440409490212354
              precision    recall  f1-score   support

           0     0.9324    0.9686    0.9501       541
           1     0.8522    0.7206    0.7809       136

    accuracy                         0.9188       677
   macro avg     0.8923    0.8446    0.8655       677
weighted avg     0.9163    0.9188    0.9161       677



{'0': {'precision': 0.9323843416370107,
  'recall': 0.9685767097966729,
  'f1-score': 0.9501359927470535,
  'support': 541},
 '1': {'precision': 0.8521739130434782,
  'recall': 0.7205882352941176,
  'f1-score': 0.7808764940239042,
  'support': 136},
 'accuracy': 0.9187592319054653,
 'macro avg': {'precision': 0.8922791273402444,
  'recall': 0.8445824725453952,
  'f1-score': 0.8655062433854789,
  'support': 677},
 'weighted avg': {'precision': 0.9162711683892699,
  'recall': 0.9187592319054653,
  'f1-score': 0.9161340845840574,
  'support': 677}}