# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import classification_report, confusion_matrix

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, optimizers, callbacks
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# =========================
# 1. Data loading
# =========================

# Locations of the pre-processed train / validation / test CSV splits
# (relative to the current working directory).
train_path = "../liar2_dataset/processed_dataset/processed_train.csv"
valid_path = "../liar2_dataset/processed_dataset/processed_valid.csv"
test_path  = "../liar2_dataset/processed_dataset/processed_test.csv"

# Read the three splits in order; the unpacking targets line up with the
# tuple of paths on the right-hand side.
train_data, valid_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (train_path, valid_path, test_path)
)

# Collapse the "label" column of every split into two classes.
#   * integer labels: 0/1/2 -> 0 and 3/4/5 -> 1 (threshold at 3)
#   * text labels:    looked up in ``label_mapping_2class`` below
label_mapping_2class = {
    "pants-fire": 0,
    "false":      0,
    "barely-true":0,
    "half-true":  1,
    "mostly-true":1,
    "true":       1
}


def _binarize_labels(labels, split_name):
    """Return ``labels`` collapsed to 0/1 as an int64 Series.

    Args:
        labels: the "label" column of one split. Any integer dtype is
            thresholded at 3; every other dtype is treated as text and
            mapped through ``label_mapping_2class``.
        split_name: name of the split, used only in the error message.

    Raises:
        ValueError: if any label is missing or has no entry in the mapping.
            Without this check such rows would turn into NaN targets and be
            passed to training unnoticed.
    """
    if pd.api.types.is_integer_dtype(labels):
        # Covers int32 / int64 / nullable Int64 alike, not just np.int64.
        binary = labels >= 3
    else:
        binary = labels.map(label_mapping_2class)

    unmapped = binary.isna()
    if unmapped.any():
        bad_values = sorted(labels[unmapped].astype(str).unique())
        raise ValueError(
            f"Cannot convert {int(unmapped.sum())} label(s) of the "
            f"{split_name} split to a binary class; offending values: {bad_values}"
        )
    return binary.astype("int64")


# Each split is inspected on its own instead of assuming that valid/test
# share the dtype of the training split.
train_data["label"] = _binarize_labels(train_data["label"], "train")
valid_data["label"] = _binarize_labels(valid_data["label"], "valid")
test_data["label"]  = _binarize_labels(test_data["label"], "test")

# Plain NumPy label vectors for Keras / scikit-learn.
train_labels = train_data["label"].values
valid_labels = valid_data["label"].values
test_labels  = test_data["label"].values

# =========================
# 2. Tokenizer & Padding
# =========================

max_words = 5000  # vocabulary cap handed to the Tokenizer (num_words)
max_len   = 100   # every statement is padded / truncated to this many tokens


def _statement_texts(frame):
    """Return the ``clean_statement`` column of ``frame`` as plain strings.

    ``pd.read_csv`` loads empty cells as NaN (a float). The Keras Tokenizer
    lower-cases each text, so a single NaN would raise an AttributeError.
    Missing statements are therefore replaced by empty strings, which encode
    to an empty sequence and end up as all-padding rows.
    """
    return frame["clean_statement"].fillna("").astype(str)


def _encode_statements(frame):
    """Turn the statements of one split into padded integer sequences."""
    sequences = tokenizer.texts_to_sequences(_statement_texts(frame))
    # Pad and cut at the end so that the beginning of each statement is kept.
    return pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')


# The vocabulary is fitted on the training split only; words unseen during
# fitting are encoded with the "<OOV>" token.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(_statement_texts(train_data))

train_seq = _encode_statements(train_data)
valid_seq = _encode_statements(valid_data)
test_seq  = _encode_statements(test_data)

# =========================
# 3. Build the LSTM model (binary classification)
# =========================
# The network is assembled layer by layer, in this order:
# embedding -> bidirectional LSTM -> dropout -> batch norm -> dense -> dropout -> output.
model_2class = models.Sequential()
model_2class.add(layers.Embedding(input_dim=max_words, output_dim=128))
# return_sequences=False: only the final LSTM output is passed on.
model_2class.add(layers.Bidirectional(layers.LSTM(128, return_sequences=False)))
model_2class.add(layers.Dropout(0.5))
model_2class.add(layers.BatchNormalization())
model_2class.add(layers.Dense(128, activation='relu'))
model_2class.add(layers.Dropout(0.3))
# Binary head: a single unit with a sigmoid activation.
model_2class.add(layers.Dense(1, activation='sigmoid'))

# Binary cross-entropy pairs with the single sigmoid output above.
adam = optimizers.Adam(learning_rate=1e-3)
model_2class.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

# =========================
# 4. Model training
# =========================
# Stop once the validation loss has not improved for 4 epochs and roll the
# model back to the best weights seen.
early_stop = callbacks.EarlyStopping(
    monitor='val_loss',
    patience=4,
    restore_best_weights=True,
)

# At most 20 epochs; validation runs on the dedicated validation split.
history_2class = model_2class.fit(
    x=train_seq,
    y=train_labels,
    validation_data=(valid_seq, valid_labels),
    batch_size=128,
    epochs=20,
    callbacks=[early_stop],
    verbose=2,
)

# =========================
# 5. Visualize the training process
# =========================
# One row per panel:
# (train history key, validation history key, train legend, validation legend, title)
curve_panels = (
    ('loss', 'val_loss', 'Train Loss', 'Val Loss', '2-class LSTM - Loss'),
    ('accuracy', 'val_accuracy', 'Train Acc', 'Val Acc', '2-class LSTM - Accuracy'),
)

plt.figure(figsize=(12, 4))

# Loss goes on the left panel, accuracy on the right one.
for panel_no, (train_key, val_key, train_legend, val_legend, panel_title) in enumerate(curve_panels, start=1):
    plt.subplot(1, 2, panel_no)
    plt.plot(history_2class.history[train_key], label=train_legend)
    plt.plot(history_2class.history[val_key], label=val_legend)
    plt.title(panel_title)
    plt.legend()

plt.tight_layout()
plt.show()

# =========================
# 6. Model evaluation and prediction
# =========================
y_pred_prob_2class = model_2class.predict(test_seq)
# Flatten the model output first, then threshold:
# probability >= 0.5 -> class 1, otherwise class 0.
flat_probabilities = y_pred_prob_2class.reshape(-1)
y_pred_2class = (flat_probabilities >= 0.5).astype(int)

print("=== 2-Class Classification Report ===\n")
report_text = classification_report(
    test_labels,
    y_pred_2class,
    target_names=["fake", "real"],
)
print(report_text)

# Confusion matrix: rows are the true classes, columns the predicted ones.
cm_2class = confusion_matrix(test_labels, y_pred_2class)
plt.figure(figsize=(6, 5))
cm_axes = sns.heatmap(
    cm_2class,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=["Pred_fake","Pred_real"],
    yticklabels=["True_fake","True_real"],
)
cm_axes.set_title("Confusion Matrix - 2 Class")
cm_axes.set_xlabel("Predicted")
cm_axes.set_ylabel("True")
plt.show()

# =========================
# 7. Export the prediction results
# =========================
# One row per test sample: its id, the binary ground truth and the
# predicted class. The column order is id, true_label, predicted_label.
result_df_2class = (
    pd.DataFrame({"id": test_data["id"]})
    .assign(true_label=test_labels)
    .assign(predicted_label=y_pred_2class)
)

# Written without the DataFrame index so the CSV holds only the three columns.
result_df_2class.to_csv("./predict_result_lstm_2class.csv", index=False)
print("✅ [二分类] 预测结果已保存至: ./predict_result_lstm_2class.csv")
