In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from collections import Counter

In [24]:
df = pd.read_csv("../data/features_for_ml.csv")

X = df.drop(columns=["token", "label"])
y = df["label"]

print("Phân phối nhãn:")
print(y.value_counts(normalize=True))

Phân phối nhãn:
label
0    0.929492
1    0.070508
Name: proportion, dtype: float64


In [None]:
df.head(50)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [27]:
def evaluate_model(y_true, y_pred, model_name):
    print(f"Model : {model_name}")
    print("Precision:", precision_score(y_true, y_pred, zero_division=0))
    print("Recall   :", recall_score(y_true, y_pred, zero_division=0))
    print("F1 Score :", f1_score(y_true, y_pred, zero_division=0))
    print("-" * 40)


In [28]:
lr_model = LogisticRegression(class_weight='balanced')
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)
evaluate_model(y_test, lr_preds, "Logistic Regression")


Model : Logistic Regression
Precision: 0.08178320252777495
Recall   : 0.47958459412006427
F1 Score : 0.1397371357043729
----------------------------------------


In [29]:
dt_model = DecisionTreeClassifier(class_weight='balanced')
dt_model.fit(X_train, y_train)
dt_preds = dt_model.predict(X_test)
evaluate_model(y_test, dt_preds, "Decision Tree")


Model : Decision Tree
Precision: 0.09071054350749996
Recall   : 0.5290821472598902
F1 Score : 0.1548689742650476
----------------------------------------


In [30]:
rf_model = RandomForestClassifier(class_weight='balanced')
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)
evaluate_model(y_test, rf_preds, "Random Forest")


Model : Random Forest
Precision: 0.09071228647738118
Recall   : 0.5290821472598902
F1 Score : 0.1548715144887917
----------------------------------------


In [33]:
counter = Counter(y_train)
scale_pos_weight = counter[0] / counter[1]

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                          scale_pos_weight=scale_pos_weight)
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)
evaluate_model(y_test, xgb_preds, "XGBoost")


Parameters: { "use_label_encoder" } are not used.



Model : XGBoost
Precision: 0.09071228647738118
Recall   : 0.5290821472598902
F1 Score : 0.1548715144887917
----------------------------------------
