In [1]:
import json
import pandas as pd
import numpy as np
import torch
import string
import nltk
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
from sklearn.decomposition import PCA
from transformers import AutoTokenizer, AutoModel
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from contractions import fix
import swifter
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import VotingClassifier
import xgboost as xgb
import lightgbm as lgb

In [2]:
data = []
with open('tweet/tweets_DM.json', 'r') as f:
    for line in f:
        data.append(json.loads(line)) 
f.close()

emotion = pd.read_csv('tweet/emotion.csv')
data_identification = pd.read_csv('tweet/data_identification.csv')

df = pd.DataFrame(data)
_source = df['_source'].apply(lambda x: x['tweet'])
df = pd.DataFrame({
    'tweet_id': _source.apply(lambda x: x['tweet_id']),
    'hashtags': _source.apply(lambda x: x['hashtags']),
    'text': _source.apply(lambda x: x['text']),
})
df = df.merge(data_identification, on='tweet_id', how='left')

train_data = df[df['identification'] == 'train']
test_data = df[df['identification'] == 'test']

train_data = train_data.merge(emotion, on='tweet_id', how='left')
train_data.drop_duplicates(subset=['text'], keep=False, inplace=True)

train_data_sample = train_data.sample(frac=0.5, random_state=42)
train_data_sample.to_pickle("train_dsample.pkl")
train_df = pd.read_pickle("train_dsample.pkl")

test_data.to_pickle("test_d.pkl")
test_df = pd.read_pickle("test_d.pkl")

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import wordnet
import emoji
import re

stop_words = set(stopwords.words('english'))

# 表情符號替換詞典
emoji_dict = {
    '😂': '[joy]', '❤️': '[love]', '😍': '[adoration]', '😭': '[cry]',
    '❤': '[care]', '😊': '[happy]', '🙏': '[pray]', '😘': '[kiss]',
    '💕': '[love_each_other]', '🔥': '[fire]', '😩': '[weary]',
    '🤔': '[think]', '💯': '[perfect]', '💙': '[loyalty]',
    '🙄': '[annoyed]', '😁': '[happy]', '🙌': '[celebrate]',
    '🙏🏾': '[pray]', '👍': '[approve]', '🙏🏽': '[pray]'
}

# Define a dictionary for common Twitter abbreviations/slangs
slang_dict = {
    "lol": "laugh out_loud",
    "u": "you",
    "idk": "I do not know",
    "omg": "oh my god",
    "btw": "by the way",
    "lmao": "laugh my_ass_off",
    "lmfao": "laugh my_ass_off",
    "fyi": "for your information",
    "brb": "be right back"
    # Add more as needed
}

# 預處理函數
def preprocess_text(text):
    # 替換 emoji
    for emj, keyword in emoji_dict.items():
        text = text.replace(emj, keyword)
    text = emoji.replace_emoji(text, replace='')  # 移除其他 emoji
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # 移除網址
    text = re.sub(r'RT[\s]+', '', text)  # Remove RT
    text = text.replace('<LH>', '')
    text = re.sub(r'\@\w+|\#', '', text)  # 移除 @user 和 hashtags
    #text = re.sub(r"[^a-zA-Z0-9\s]", '', text)  # 移除特殊字元
    text = text.lower()
    text = re.sub(r'[^\w\s!?]', '', text)
    text = re.sub(r'not\s+(\w+)', r'not_\1', text)
    
    wds = text.split()
    tweet = " ".join([slang_dict[wd.lower()] if wd.lower() in slang_dict else wd for wd in wds])
    
    text = fix(text)
    #text = str(TextBlob(text).correct())

    lemmatizer = WordNetLemmatizer()
    text = " ".join([lemmatizer.lemmatize(wd) for wd in text.split()])
    
    text = text.strip()
    
    
    words = word_tokenize(text)
    return ' '.join([word for word in words])

# 清理訓練與測試資料
train_df['clean_text'] = train_df['text'].swifter.apply(preprocess_text)
test_df['clean_text'] = test_df['text'].swifter.apply(preprocess_text)

# 打亂訓練資料
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
train_df

Pandas Apply:   0%|          | 0/724591 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/411972 [00:00<?, ?it/s]

Unnamed: 0,tweet_id,hashtags,text,identification,emotion,clean_text
0,0x36fc6e,"[development, future, winner]",Escaping pain is not the answer. Embracing pai...,train,sadness,escaping pain is not_the answer embracing pain...
1,0x36f312,[ignored],If I don't like you more then likely you've be...,train,sadness,if i do not like you more then likely you have...
2,0x1d7398,[Silverdome],"Two stadiums I've been two and photographed, i...",train,sadness,two stadium i have been two and photographed i...
3,0x26d0d1,[trans],The racial trans badge #trans <LH> <LH>,train,trust,the racial trans badge trans
4,0x2c580d,[weddingdressfitting],Very special day with Luda and her mom ❤️Feeli...,train,joy,very special day with luda and her mom lovefee...
...,...,...,...,...,...,...
724586,0x380c45,[Power5at5],Hey @POWERATL @maddoxradio please play <LH> by...,train,sadness,hey please play by on power5at5
724587,0x36c504,[],damn my foot healed when everyone is already i...,train,disgust,damn my foot healed when everyone is already i...
724588,0x2e8018,"[ZENii, skincareroutine, moisturiser, health, ...",@skinandbodyclin we're excited 🙌 #ZENii #skinc...,train,trust,were excited celebrate zenii skincareroutine m...
724589,0x31e324,[job],So excited! Just got a call that I have an int...,train,fear,so excited ! just got a call that i have an in...


In [5]:
#w2v preprocessing
def w2v_preprocess_text(text):
    words = word_tokenize(text)
    return ' '.join([word for word in words if word not in stop_words])
    
train_df['w2v_text'] = train_df['clean_text'].swifter.apply(w2v_preprocess_text)
test_df['w2v_text'] = test_df['clean_text'].swifter.apply(w2v_preprocess_text)

Pandas Apply:   0%|          | 0/724591 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/411972 [00:00<?, ?it/s]

In [7]:
train_df

Unnamed: 0,tweet_id,hashtags,text,identification,emotion,clean_text,w2v_text
0,0x2de314,pharmtech ota,When your good at what you do offers come to y...,train,joy,when your good at what you do offer come to yo...,good offer come pharmtech ota
1,0x264caf,,"Watching <LH> again after a really long time, ...",train,surprise,watching again after a really long time and de...,watching really long time dear god jack insuff...
2,0x27a5c3,Ido,I <LH> what I do #Ido what I love,train,joy,i what i do ido what i love,ido love
3,0x389940,depression,"This is a new level of #depression, the old tr...",train,trust,this is a new level of depression the old tric...,new level depression old trick used get around...
4,0x2a44ad,Memoir inspiration,@TwBookClub @GwenLeane A story of a man that w...,train,anticipation,a story of a man that will truly inspire every...,story man truly inspire everyone read memoir i...
...,...,...,...,...,...,...,...
724586,0x28d4c9,,"Dear God, I'm afraid to die because I'm not fo...",train,anticipation,dear god i am afraid to die because i am not_f...,dear god afraid die not_following anymore plea...
724587,0x22ec3d,Life,70 The moments in your life are only once #Lif...,train,anticipation,70 the moment in your life are only once life ...,70 moment life life july 08 2017 0115am
724588,0x305abc,MukamaWanyi Delaware,Thanking <LH> for each & everyday I wake up fr...,train,anticipation,thanking for each everyday i wake up fresh muk...,thanking everyday wake fresh mukamawanyi great...
724589,0x2a947d,,@realDonaldTrump That’s too bad that they chos...,train,sadness,that is too bad that they chose to go,bad chose go


In [10]:
from gensim.models import Word2Vec
import numpy as np

# 將每段文字轉為詞列表
train_sentences = train_df['w2v_text'].apply(lambda x: x.split()).tolist()
test_sentences = test_df['w2v_text'].apply(lambda x: x.split()).tolist()

# 訓練 Word2Vec 模型
w2v_model = Word2Vec(sentences=train_sentences, vector_size=100, window=5, min_count=2, workers=4)

# 將文字轉換為向量平均值
def sentence_to_vector(sentence, model):
    vectors = [model.wv[word] for word in sentence if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

train_text_vectors = np.array([sentence_to_vector(sent, w2v_model) for sent in train_sentences])
test_text_vectors = np.array([sentence_to_vector(sent, w2v_model) for sent in test_sentences])


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 將 hashtags 組合成字串
train_df['hashtags'] = train_df['hashtags'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')
test_df['hashtags'] = test_df['hashtags'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')

# 訓練 TF-IDF
tfidf = TfidfVectorizer(max_features=500, stop_words='english')
tfidf_train = tfidf.fit_transform(train_df['hashtags'])
tfidf_test = tfidf.transform(test_df['hashtags'])

In [13]:
cache_path = "C:\\Users\\t1070\\.cache\\huggingface\\transformers"

In [14]:
from transformers import AutoTokenizer, AutoModel

MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=cache_path)
model = AutoModel.from_pretrained(MODEL_NAME, cache_dir=cache_path)

# 將模型移至 GPU
model.to('cuda')

def extract_bert_embeddings(text_series, batch_size=32):
    def process_batch(batch_texts):
        encodings = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=128)
        for key in encodings:
            encodings[key] = encodings[key].to('cuda')  # 移到 GPU
        outputs = model(**encodings)
        return outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()  # 回到 CPU 進行 NumPy 處理

    embeddings = (
        text_series
        .swifter.apply(lambda text: process_batch([text])[0])  # 單一文本處理
        .to_numpy()
    )
    return np.stack(embeddings)

bert_train = extract_bert_embeddings(train_df['clean_text'])
bert_test = extract_bert_embeddings(test_df['clean_text'])

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

loading configuration file config.json from cache at C:\Users\t1070\.cache\huggingface\transformers\models--distilbert-base-uncased\snapshots\12040accade4e8a0f71eabdb258fecc2e7e948be\config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.46.3",
  "vocab_size": 30522
}



vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

loading file vocab.txt from cache at C:\Users\t1070\.cache\huggingface\transformers\models--distilbert-base-uncased\snapshots\12040accade4e8a0f71eabdb258fecc2e7e948be\vocab.txt
loading file tokenizer.json from cache at C:\Users\t1070\.cache\huggingface\transformers\models--distilbert-base-uncased\snapshots\12040accade4e8a0f71eabdb258fecc2e7e948be\tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at C:\Users\t1070\.cache\huggingface\transformers\models--distilbert-base-uncased\snapshots\12040accade4e8a0f71eabdb258fecc2e7e948be\tokenizer_config.json
loading configuration file config.json from cache at C:\Users\t1070\.cache\huggingface\transformers\models--distilbert-base-uncased\snapshots\12040accade4e8a0f71eabdb258fecc2e7e948be\config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "Distil

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

loading weights file model.safetensors from cache at C:\Users\t1070\.cache\huggingface\transformers\models--distilbert-base-uncased\snapshots\12040accade4e8a0f71eabdb258fecc2e7e948be\model.safetensors
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of DistilBertModel were initialized from the model checkpoint at distilbert-ba

Pandas Apply:   0%|          | 0/724591 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/411972 [00:00<?, ?it/s]

In [15]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# 使用 NRC Lexicon 特徵
sid = SentimentIntensityAnalyzer()
def extract_sentiment_features(text_series):
    sentiment_scores = text_series.swifter.apply(lambda text: sid.polarity_scores(text))
    return pd.DataFrame(list(sentiment_scores))

sentiment_train = extract_sentiment_features(train_df['text'])
sentiment_test = extract_sentiment_features(test_df['text'])

Pandas Apply:   0%|          | 0/724591 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/411972 [00:00<?, ?it/s]

In [16]:
from scipy.sparse import hstack

# 轉換為 GPU 張量並堆疊
train_features = torch.cat([
    torch.tensor(tfidf_train.toarray(), dtype=torch.float32).to('cuda'),
    torch.tensor(train_text_vectors, dtype=torch.float32).to('cuda'),
    torch.tensor(bert_train, dtype=torch.float32).to('cuda'),
    torch.tensor(sentiment_train.values.astype(np.float32), dtype=torch.float32).to('cuda')
], dim=1)

test_features = torch.cat([
    torch.tensor(tfidf_test.toarray(), dtype=torch.float32).to('cuda'),
    torch.tensor(test_text_vectors, dtype=torch.float32).to('cuda'),
    torch.tensor(bert_test, dtype=torch.float32).to('cuda'),
    torch.tensor(sentiment_test.values.astype(np.float32), dtype=torch.float32).to('cuda')
], dim=1)



In [17]:
# 標籤編碼
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['emotion'])

# 將特徵與標籤進行分割
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    train_features,  # 從 train_features 中劃分
    y_train, # 已編碼的目標值
    test_size=0.2,
    random_state=42,
    stratify=y_train  # 確保類別分佈一致
)

In [18]:
import os
os.environ['DMLC_WORKER_TEMP'] = "C:\\Users\\t1070\\.cache\\xgb"

# 設定緩存路徑
torch_cache_dir = "C:\\Users\\t1070\\.cache\\torch\\cache"
os.environ['TORCH_HOME'] = torch_cache_dir  # 指定 PyTorch 模型和數據緩存位置

In [19]:
# XGB模型定義
# 1. XGBoost
# xgb_model = xgb.XGBClassifier(
#     objective='multi:softmax', 
#     num_class=len(label_encoder.classes_), 
#     use_label_encoder=False, 
#     eval_metric='mlogloss', 
#     max_depth=6, 
#     learning_rate=0.01, 
#     n_estimators=500
# )

# # 訓練 XGBoost
# xgb_model.fit(X_train_split.cpu().numpy(), y_train_split)

# # 驗證 XGBoost
# xgb_val_preds = xgb_model.predict(X_val_split.cpu().numpy())
# print("XGBoost Confusion Matrix:")
# print(confusion_matrix(y_val_split, xgb_val_preds))
# print("\nXGBoost Classification Report:")
# print(classification_report(y_val_split, xgb_val_preds))

# 1. XGBoost with GPU support
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax', 
    num_class=len(label_encoder.classes_), 
    use_label_encoder=False, 
    eval_metric='mlogloss', 
    max_depth=6, 
    learning_rate=0.1, 
    n_estimators=100,
    tree_method = "hist", device = "cuda",
    verbosity = 2
    #predictor='gpu_predictor'  # 運行時也使用 GPU
)

# 訓練 XGBoost
xgb_model.fit(
    X_train_split.cpu().numpy(),
    y_train_split,
    eval_set=[(X_val_split.cpu().numpy(), y_val_split)],
    verbose=True
)

# 驗證 XGBoost
xgb_val_preds = xgb_model.predict(X_val_split.cpu().numpy())
print("XGBoost (GPU) Confusion Matrix:")
print(confusion_matrix(y_val_split, xgb_val_preds))
print("\nXGBoost (GPU) Classification Report:")
print(classification_report(y_val_split, xgb_val_preds))

Parameters: { "use_label_encoder" } are not used.



[0]	validation_0-mlogloss:2.00358
[1]	validation_0-mlogloss:1.94264
[2]	validation_0-mlogloss:1.89134
[3]	validation_0-mlogloss:1.84738
[4]	validation_0-mlogloss:1.80903
[5]	validation_0-mlogloss:1.77517
[6]	validation_0-mlogloss:1.74510
[7]	validation_0-mlogloss:1.71798
[8]	validation_0-mlogloss:1.69325
[9]	validation_0-mlogloss:1.67111
[10]	validation_0-mlogloss:1.65094
[11]	validation_0-mlogloss:1.63246
[12]	validation_0-mlogloss:1.61555
[13]	validation_0-mlogloss:1.60044
[14]	validation_0-mlogloss:1.58632
[15]	validation_0-mlogloss:1.57323
[16]	validation_0-mlogloss:1.56107
[17]	validation_0-mlogloss:1.54969
[18]	validation_0-mlogloss:1.53928
[19]	validation_0-mlogloss:1.52957
[20]	validation_0-mlogloss:1.52057
[21]	validation_0-mlogloss:1.51220
[22]	validation_0-mlogloss:1.50435
[23]	validation_0-mlogloss:1.49708
[24]	validation_0-mlogloss:1.49024
[25]	validation_0-mlogloss:1.48381
[26]	validation_0-mlogloss:1.47769
[27]	validation_0-mlogloss:1.47194
[28]	validation_0-mlogloss:1.4

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




XGBoost (GPU) Confusion Matrix:
[[  494   161   679    32  1659   907     1    22]
 [   24 11262   824    67 10717  1430     3   420]
 [   77   412  4550   137  5189  3481     3    88]
 [    6   380   460  1330  3331   809     5    44]
 [   30  2753  1181   170 44296  2164     6   816]
 [  104   701  2523   127  8310  7382     9    93]
 [   10   183   553    36  2475   903   556    29]
 [   13  1643   664    42 13691   994     3  3455]]

XGBoost (GPU) Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.12      0.21      3955
           1       0.64      0.46      0.53     24747
           2       0.40      0.33      0.36     13937
           3       0.69      0.21      0.32      6365
           4       0.49      0.86      0.63     51416
           5       0.41      0.38      0.40     19249
           6       0.95      0.12      0.21      4745
           7       0.70      0.17      0.27     20505

    accuracy                     

In [20]:
# 2. LightGBM
lgb_model = lgb.LGBMClassifier(
    objective='multiclass', 
    num_class=len(label_encoder.classes_), 
    boosting_type='gbdt', 
    max_depth=-1, 
    learning_rate=0.1, 
    n_estimators=100,
    device='gpu',              # 啟用 GPU
    gpu_device_id=0            # 指定 GPU 卡 ID，通常是 0
)

# 訓練 LightGBM
lgb_model.fit(X_train_split.cpu().numpy(), y_train_split)

# 驗證 LightGBM
lgb_val_preds = lgb_model.predict(X_val_split.cpu().numpy())
print("LightGBM Confusion Matrix:")
print(confusion_matrix(y_val_split, lgb_val_preds))
print("\nLightGBM Classification Report:")
print(classification_report(y_val_split, lgb_val_preds))

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 229051
[LightGBM] [Info] Number of data points in the train set: 579672, number of used features: 1372
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3080, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 871 dense feature groups (482.06 MB) transferred to GPU in 0.163718 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score -3.601124
[LightGBM] [Info] Start training from score -1.767484
[LightGBM] [Info] Start training from score -2.341639
[LightGBM] [Info] Start training from score -3.125354
[LightGBM] [Info] Start training from score -1.036238
[LightGBM] [Info] Start training from score -2.018709
[LightGBM] [Info] Start training from score -3.418971
[LightGBM] [Info] Start training from score -1.955499
LightGBM Confusion Matrix:
[[  58

In [21]:
from torch import nn

class LSTMWithAttention(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.attention = nn.Linear(hidden_dim * 2, 1)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
    
    def forward(self, x):
        # Ensure x is 3D: [batch_size, sequence_length, features]
        if x.dim() == 2:
            x = x.unsqueeze(1)  # Add sequence dimension if missing
        
        lstm_out, _ = self.lstm(x)
        
        # Compute attention scores
        attention_scores = self.attention(lstm_out).squeeze(-1)
        attention_weights = torch.softmax(attention_scores, dim=1)
        
        # Weighted sum of LSTM outputs
        attended_output = torch.sum(attention_weights.unsqueeze(-1) * lstm_out, dim=1)
        
        return self.fc(attended_output)

# 2. LSTM+Attention 模型
# 構建 DataLoader
batch_size = 16
accumulation_steps = 2
num_epochs = 5

# Create dataset and loader
train_dataset = TensorDataset(
    torch.tensor(X_train_split, dtype=torch.float32), 
    torch.tensor(y_train_split, dtype=torch.long)
)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Initialize model, loss, and optimizer
lstm_attention_model = LSTMWithAttention(
    input_dim=X_train_split.shape[1], 
    hidden_dim=64,  # Reduced hidden dimension 
    output_dim=len(label_encoder.classes_)
).to('cuda')

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lstm_attention_model.parameters(), lr=0.001)

# Training loop
for epoch in range(num_epochs):
    lstm_attention_model.train()
    epoch_loss = 0.0
    
    for i, (batch_features, batch_labels) in enumerate(train_loader):
        batch_features = batch_features.to('cuda')
        batch_labels = batch_labels.to('cuda')
        
        # Forward pass
        outputs = lstm_attention_model(batch_features)
        
        # Compute loss
        loss = criterion(outputs, batch_labels) / accumulation_steps
        
        # Backward pass
        loss.backward()
        
        # Update weights
        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        
        epoch_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {epoch_loss / len(train_loader)}")

from sklearn.metrics import classification_report, confusion_matrix

# Validation loop
val_dataset = TensorDataset(
    torch.tensor(X_val_split, dtype=torch.float32), 
    torch.tensor(y_val_split, dtype=torch.long)
)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

lstm_attention_model.eval()
val_preds = []
val_true = []
with torch.no_grad():
    for batch_features, batch_labels in val_loader:
        batch_features = batch_features.to('cuda')
        outputs = lstm_attention_model(batch_features)
        batch_preds = torch.argmax(outputs, dim=1).cpu().numpy()
        val_preds.extend(batch_preds)
        val_true.extend(batch_labels.numpy())

print("LSTM Confusion Matrix:")
print(confusion_matrix(val_true, val_preds))
print("\nLSTM Classification Report:")
print(classification_report(val_true, val_preds))

  torch.tensor(X_train_split, dtype=torch.float32),


Epoch 1/5, Average Loss: 0.6686749887300268
Epoch 2/5, Average Loss: 0.6368004484660975
Epoch 3/5, Average Loss: 0.6261931888325222
Epoch 4/5, Average Loss: 0.6191142202106392
Epoch 5/5, Average Loss: 0.613716012470603


  torch.tensor(X_val_split, dtype=torch.float32),


LSTM Confusion Matrix:
[[  582   182   951    56  1278   820     5    81]
 [   33 13144  1038   193  8228  1066    21  1024]
 [  110   485  6209   146  3951  2672    26   338]
 [   10   424   666  2018  2442   647    10   148]
 [   43  3166  1836   355 41639  1865    24  2488]
 [  152   782  3727   223  6182  7723    27   433]
 [   17   236   905    66  2042   708   634   137]
 [   15  1934   822   111 10741   796    14  6072]]

LSTM Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.15      0.24      3955
           1       0.65      0.53      0.58     24747
           2       0.38      0.45      0.41     13937
           3       0.64      0.32      0.42      6365
           4       0.54      0.81      0.65     51416
           5       0.47      0.40      0.43     19249
           6       0.83      0.13      0.23      4745
           7       0.57      0.30      0.39     20505

    accuracy                           0.54    1449

In [22]:
lstm_val_preds = val_preds

val_predictions = np.array([lstm_val_preds, xgb_val_preds, lgb_val_preds])
major_voting_val_preds = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=val_predictions)

print("Major Voting Confusion Matrix:")
print(confusion_matrix(y_val_split, major_voting_val_preds))
print("\nMajor Voting Classification Report:")
print(classification_report(y_val_split, major_voting_val_preds))

Major Voting Confusion Matrix:
[[  574   195   770    40  1468   875     1    32]
 [   39 12332   919    85  9615  1267     3   487]
 [  123   522  5228   132  4610  3207     7   108]
 [   19   468   539  1558  2974   751     2    54]
 [   61  2993  1473   197 43617  1989     8  1078]
 [  157   850  2931   170  7364  7653    10   114]
 [   18   219   678    51  2283   871   582    43]
 [   23  1933   725    53 12701   887     5  4178]]

Major Voting Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.15      0.23      3955
           1       0.63      0.50      0.56     24747
           2       0.39      0.38      0.38     13937
           3       0.68      0.24      0.36      6365
           4       0.52      0.85      0.64     51416
           5       0.44      0.40      0.42     19249
           6       0.94      0.12      0.22      4745
           7       0.69      0.20      0.31     20505

    accuracy                       

In [27]:
torch.cuda.empty_cache()
# 測試集預測
xgb_test_preds = xgb_model.predict(test_features.cpu().numpy())

# 測試集預測
lgb_test_preds = lgb_model.predict(test_features.cpu().numpy())

test_features_tensor = torch.tensor(test_features, dtype=torch.float32).to('cuda')

# Predict on test set
lstm_attention_model.eval()
with torch.no_grad():
    lstm_test_preds = torch.argmax(lstm_attention_model(test_features_tensor), dim=1).cpu().numpy()

# Major Voting for Test
# lstm_test_preds = y_test_pred  # LSTM 測試預測
test_predictions = np.array([lstm_test_preds, xgb_test_preds, lgb_test_preds])
major_voting_test_preds = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=test_predictions)

  test_features_tensor = torch.tensor(test_features, dtype=torch.float32).to('cuda')


In [28]:
# Inverse transform predictions
test_df['emotion'] = label_encoder.inverse_transform(major_voting_test_preds)

# Output submission
submission = test_df[['tweet_id', 'emotion']]
submission.to_csv('major_submission.csv', index=False)