# 6. LSTM + TensorFlow  Tokenizer

| **方法**                  | **特徵工程**                | **模型**                    | **準確率預估** | **優勢**                                     | **劣勢**                                      | **GPU 支援**            |
|--------------------------|----------------------------|-----------------------------|----------------|---------------------------------------------|----------------------------------------------|-------------------------|
| **TF-IDF + 隨機森林**      | 稀疏特徵表示，詞頻與逆文檔頻率權重 | 隨機森林                    | 75%-82%       | 模型穩定性強，對噪聲和高維數據不敏感          | 無法處理非線性模式，對語義信息利用不足           | 不支持                  |
| **TF-IDF + Boosting**      | 稀疏特徵表示，詞頻與逆文檔頻率權重 | XGBoost 或 LightGBM         | 78%-85%       | 擅長處理稀疏特徵，對錯分樣本有良好適應能力      | 訓練成本略高，需調參以達到最佳效果              | 支持（顯著加速，適合大數據集）|
| **Word2Vec + 隨機森林**     | 詞嵌入，計算句向量平均值       | 隨機森林                    | 72%-80%       | 能結合詞嵌入語義特徵，提升語義捕捉能力          | 詞嵌入需預處理，隨機森林對非線性語義的處理有限     | 不支持                  |
| **Word2Vec + CNN**         | 詞嵌入，保留語序            | 卷積神經網絡                | 75%-85%       | 捕捉局部語義特徵，對短文本效果佳               | 訓練需較多資源，對長文本效果有限               | 支持（顯著加速）         |
| **BERT 嵌入 + Transformer**| 上下文語義嵌入，保留全局語義 | 預訓練 BERT 模型             | 85%-90%       | 能捕捉上下文語義，分類準確率最高               | 訓練和推理成本高，需要大量數據和資源支持         | 支持（必要，否則速度較慢） |
| **Tokenizer + LSTM**       | 數字化文本序列，保留序列上下文 | 長短期記憶神經網絡           | 80%-88%       | 能捕捉文本序列特徵，適合時間序列或長文本        | 訓練成本中等，對長文本可能有梯度消失問題         | 支持（顯著加速，必要）   |


In [8]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dm-2024-isa-5810-lab-2-homework/tweets_DM.json
/kaggle/input/dm-2024-isa-5810-lab-2-homework/sampleSubmission.csv
/kaggle/input/dm-2024-isa-5810-lab-2-homework/data_identification.csv
/kaggle/input/dm-2024-isa-5810-lab-2-homework/emotion.csv


In [9]:
# 資料處理部分（保持原樣）
import json
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# 載入 JSON 數據
data = []
with open('/kaggle/input/dm-2024-isa-5810-lab-2-homework/tweets_DM.json', 'r') as f:
    for line in f:
        data.append(json.loads(line))

# 載入情緒標籤與數據標識檔案
emotion = pd.read_csv('/kaggle/input/dm-2024-isa-5810-lab-2-homework/emotion.csv')
data_identification = pd.read_csv('/kaggle/input/dm-2024-isa-5810-lab-2-homework/data_identification.csv')

# 處理 JSON 數據，提取必要欄位
df = pd.DataFrame(data)
_source = df['_source'].apply(lambda x: x['tweet'])
df = pd.DataFrame({
    'tweet_id': _source.apply(lambda x: x['tweet_id']),
    'hashtags': _source.apply(lambda x: x['hashtags']),
    'text': _source.apply(lambda x: x['text']),
})
df = df.merge(data_identification, on='tweet_id', how='left')

# 分為訓練和測試數據
train_data = df[df['identification'] == 'train']
test_data = df[df['identification'] == 'test']

# 合併情緒標籤
train_data = train_data.merge(emotion, on='tweet_id', how='left')

# 移除重複文本
train_data.drop_duplicates(subset=['text'], keep=False, inplace=True)

# 設定 Tokenizer，將文字轉為數字序列
MAX_VOCAB_SIZE = 5000  # 最大詞彙數量
MAX_SEQUENCE_LENGTH = 100  # 序列長度

tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<UNK>")
tokenizer.fit_on_texts(train_data['text'])

# 將文字轉為數字序列並進行填充
X_train_sequences = tokenizer.texts_to_sequences(train_data['text'])
X_test_sequences = tokenizer.texts_to_sequences(test_data['text'])
X_train_padded = pad_sequences(X_train_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

# 標籤編碼
le = LabelEncoder()
y_train_encoded = le.fit_transform(train_data['emotion'])

# 轉為 One-Hot 編碼格式
y_train_onehot = to_categorical(y_train_encoded)

# 使用 MirroredStrategy 進行多 GPU 訓練
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# 建立分布式策略
strategy = tf.distribute.MirroredStrategy(devices=["/gpu:0", "/gpu:1"])

with strategy.scope():
    # 嵌入層參數
    EMBEDDING_DIM = 100  # 嵌入層維度
    NUM_CLASSES = len(le.classes_)  # 類別數量

    # 建立 LSTM 模型
    model = Sequential([
        Embedding(input_dim=MAX_VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH),
        LSTM(128, return_sequences=False),  # LSTM 層，128 個隱藏神經元
        Dropout(0.5),  # Dropout，減少過擬合
        Dense(64, activation='relu'),  # 隱藏層
        Dropout(0.5),  # 再次 Dropout
        Dense(NUM_CLASSES, activation='softmax')  # 輸出層，使用 Softmax 進行多分類
    ])

    # 編譯模型
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# 顯示模型結構
model.summary()

# 模型訓練
history = model.fit(
    X_train_padded, y_train_onehot,
    validation_split=0.2,  # 使用部分數據作為驗證集
    epochs=10,  # 訓練輪數
    batch_size=64,  # 批次大小（分布式策略下會自動分配到多 GPU）
    verbose=1  # 顯示訓練過程
)

# 預測與提交
y_test_pred = model.predict(X_test_padded)
y_pred_labels = le.inverse_transform(y_test_pred.argmax(axis=1))  # 將預測轉回情緒標籤

submission = pd.DataFrame({
    'tweet_id': test_data['tweet_id'],
    'emotion': y_pred_labels
})
# submission.to_csv('/kaggle/working/submission.csv', index=False)  # 儲存為提交檔案




Epoch 1/10
[1m18115/18115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m269s[0m 15ms/step - accuracy: 0.3528 - loss: 1.8009 - val_accuracy: 0.3544 - val_loss: 1.7879
Epoch 2/10
[1m18115/18115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 14ms/step - accuracy: 0.3540 - loss: 1.7881 - val_accuracy: 0.3544 - val_loss: 1.7878
Epoch 3/10
[1m18115/18115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 14ms/step - accuracy: 0.3548 - loss: 1.7865 - val_accuracy: 0.3544 - val_loss: 1.7877
Epoch 4/10
[1m18115/18115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 14ms/step - accuracy: 0.3543 - loss: 1.7875 - val_accuracy: 0.3544 - val_loss: 1.7877
Epoch 5/10
[1m18115/18115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m264s[0m 15ms/step - accuracy: 0.3548 - loss: 1.7877 - val_accuracy: 0.3544 - val_loss: 1.7877
Epoch 6/10
[1m18115/18115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m263s[0m 15ms/step - accuracy: 0.3542 - loss: 1.7872 - val_accuracy: 0.3544 - val

In [None]:
submission.head

In [7]:
emotion.head()

Unnamed: 0,tweet_id,emotion
0,0x3140b1,sadness
1,0x368b73,disgust
2,0x296183,anticipation
3,0x2bd6e1,joy
4,0x2ee1dd,anticipation


In [None]:
# 確保 test_data 中包含真實標籤
if 'emotion' not in test_data.columns:
    raise ValueError("test_data 中缺少 'emotion' 欄位，無法進行分析。請確保 test_data 包含真實標籤。")

# 編碼測試集的真實標籤
y_test_encoded = le.transform(test_data['emotion'])
y_pred_encoded = y_test_pred.argmax(axis=1)  # 預測的數值標籤

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# 1. 列出混淆矩陣
conf_matrix = confusion_matrix(y_test_encoded, y_pred_encoded)
print("\n", "----", "\n")
print("Confusion Matrix:\n", conf_matrix)

# 2. 畫出混淆矩陣
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=le.classes_, yticklabels=le.classes_)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.show()

print("\n", "----", "\n")

# 3. 印出 Classification Report
class_report = classification_report(y_test_encoded, y_pred_encoded, target_names=le.classes_)
print("Classification Report:\n", class_report)

print("\n", "----", "\n")

# 4. 印出 Accuracy
accuracy = accuracy_score(y_test_encoded, y_pred_encoded)
print("Accuracy:", accuracy)
