<a href="https://colab.research.google.com/github/chenbrilliancesol/machine-learning/blob/main/%E5%9F%BA%E4%BA%8ELSTM%E5%AF%B9%E6%8E%A8%E7%89%B9%E4%B8%8A%E7%81%BE%E9%9A%BE%E7%9A%84%E9%A2%84%E6%B5%8B%E6%A8%A1%E5%9E%8B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import re

# 1. 加载数据
train_df = pd.read_csv("/content/train.csv")
test_df = pd.read_csv("/content/test.csv")

# 2. 简单的文本清洗
def clean_text(text):
    text = text.lower()  # 小写化
    text = re.sub(r"http\S+", "", text)  # 移除URL
    text = re.sub(r"@\w+", "", text)  # 移除@提及
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)  # 移除非字母数字字符，保留空格
    return text

train_df['cleaned_text'] = train_df['text'].apply(clean_text)
test_df['cleaned_text'] = test_df['text'].apply(clean_text)

# 3. 准备文本数据
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")  # 处理未见过的词
tokenizer.fit_on_texts(train_df['cleaned_text'])

train_sequences = tokenizer.texts_to_sequences(train_df['cleaned_text'])
test_sequences = tokenizer.texts_to_sequences(test_df['cleaned_text'])

max_length = 50  # 设定一个序列最大长度
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post', truncating='post')

# 4. 构建LSTM模型
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=max_length),  # 词嵌入层
    Bidirectional(LSTM(64, return_sequences=False)),  # 双向LSTM
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # 二分类输出
])

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])  # 注意：Kaggle评估指标是F1，训练时监控accuracy更直观，但最终要用F1评估

# 5. 划分训练集和验证集
X_train, X_val, y_train, y_val = train_test_split(train_padded, train_df['target'], test_size=0.2, random_state=42)

# 6. 训练模型
history = model.fit(X_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(X_val, y_val),
                    verbose=1)

# 7. 在验证集上预测并计算F1分数（这是比赛评估指标）
y_val_pred = (model.predict(X_val) > 0.5).astype("int32")
f1 = f1_score(y_val, y_val_pred)
print(f"Validation F1 Score: {f1}")

# 8. 对测试集进行预测并生成提交文件
test_pred = (model.predict(test_padded) > 0.5).astype("int32")
submission = pd.DataFrame({'id': test_df['id'], 'target': test_pred.flatten()})
submission.to_csv('submission.csv', index=False)

Epoch 1/10




[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 78ms/step - accuracy: 0.6539 - loss: 0.6015 - val_accuracy: 0.8030 - val_loss: 0.4426
Epoch 2/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 80ms/step - accuracy: 0.8760 - loss: 0.3167 - val_accuracy: 0.8102 - val_loss: 0.4559
Epoch 3/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 81ms/step - accuracy: 0.9236 - loss: 0.2096 - val_accuracy: 0.7991 - val_loss: 0.5515
Epoch 4/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 79ms/step - accuracy: 0.9525 - loss: 0.1432 - val_accuracy: 0.7827 - val_loss: 0.6319
Epoch 5/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 78ms/step - accuracy: 0.9647 - loss: 0.0958 - val_accuracy: 0.7708 - val_loss: 0.8223
Epoch 6/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 76ms/step - accuracy: 0.9731 - loss: 0.0720 - val_accuracy: 0.7492 - val_loss: 1.0094
Epoch 7/10
[1m191/191[0m 