In [1]:
import json
import pandas as pd
import numpy as np
import torch
import string
import nltk
import emoji
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, TensorDataset
from sklearn.decomposition import PCA
from transformers import AutoTokenizer, AutoModel
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from contractions import fix
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import wordnet
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import swifter

In [2]:
data = []
with open('tweet/tweets_DM.json', 'r') as f:
    for line in f:
        data.append(json.loads(line)) 
f.close()

emotion = pd.read_csv('tweet/emotion.csv')
data_identification = pd.read_csv('tweet/data_identification.csv')

df = pd.DataFrame(data)
_source = df['_source'].apply(lambda x: x['tweet'])
df = pd.DataFrame({
    'tweet_id': _source.apply(lambda x: x['tweet_id']),
    'hashtags': _source.apply(lambda x: x['hashtags']),
    'text': _source.apply(lambda x: x['text']),
})
df = df.merge(data_identification, on='tweet_id', how='left')

train_data = df[df['identification'] == 'train']
test_data = df[df['identification'] == 'test']

train_data = train_data.merge(emotion, on='tweet_id', how='left')
train_data.drop_duplicates(subset=['text'], keep=False, inplace=True)

train_data_sample = train_data.sample(frac=0.1, random_state=42)
train_data_sample.to_pickle("train_dsample.pkl")
train_df = pd.read_pickle("train_dsample.pkl")

test_data.to_pickle("test_d.pkl")
test_df = pd.read_pickle("test_d.pkl")

In [3]:
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
# from nltk import wordnet


stop_words = set(stopwords.words('english'))

# 表情符號替換詞典
emoji_dict = {
    '😂': '[joy]', '❤️': '[love]', '😍': '[adoration]', '😭': '[cry]',
    '❤': '[care]', '😊': '[happy]', '🙏': '[pray]', '😘': '[kiss]',
    '💕': '[love_each_other]', '🔥': '[fire]', '😩': '[weary]',
    '🤔': '[think]', '💯': '[perfect]', '💙': '[loyalty]',
    '🙄': '[annoyed]', '😁': '[happy]', '🙌': '[celebrate]',
    '🙏🏾': '[pray]', '👍': '[approve]', '🙏🏽': '[pray]'
}

# Define a dictionary for common Twitter abbreviations/slangs
slang_dict = {
    "lol": "laugh out_loud",
    "u": "you",
    "idk": "I do not know",
    "omg": "oh my god",
    "btw": "by the way",
    "lmao": "laugh my_ass_off",
    "lmfao": "laugh my_ass_off",
    "fyi": "for your information",
    "brb": "be right back"
    # Add more as needed
}

# 預處理函數
def preprocess_text(text):
    # 替換 emoji
    for emj, keyword in emoji_dict.items():
        text = text.replace(emj, keyword)
    text = emoji.replace_emoji(text, replace='')  # 移除其他 emoji
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # 移除網址
    text = re.sub(r'RT[\s]+', '', text)  # Remove RT
    text = text.replace('<LH>', '')
    text = re.sub(r'\@\w+|\#', '', text)  # 移除 @user 和 hashtags
    text = re.sub(r"[^a-zA-Z0-9\s]", '', text)  # 移除特殊字元
    text = text.lower()
    text = re.sub(r'[^\w\s!?]', '', text)
    text = re.sub(r'not\s+(\w+)', r'not_\1', text)
    
    wds = text.split()
    tweet = " ".join([slang_dict[wd.lower()] if wd.lower() in slang_dict else wd for wd in wds])
    
    text = fix(text)
    #text = str(TextBlob(text).correct())

    lemmatizer = WordNetLemmatizer()
    text = " ".join([lemmatizer.lemmatize(wd) for wd in text.split()])
    
    text = text.strip()
    
    
    words = word_tokenize(text)
    return ' '.join([word for word in words if word not in stop_words])

# 清理訓練與測試資料
train_df['clean_text'] = train_df['text'].swifter.apply(preprocess_text)
test_df['clean_text'] = test_df['text'].swifter.apply(preprocess_text)

# 打亂訓練資料
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

Pandas Apply:   0%|          | 0/144918 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/411972 [00:00<?, ?it/s]

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 將 hashtags 組合成字串
train_df['hashtags'] = train_df['hashtags'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')
test_df['hashtags'] = test_df['hashtags'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')

# 訓練 TF-IDF
tfidf = TfidfVectorizer(max_features=500, stop_words='english')
tfidf_train = tfidf.fit_transform(train_df['hashtags'].fillna(''))
tfidf_test = tfidf.transform(test_df['hashtags'].fillna(''))

In [5]:
from gensim.models import Word2Vec
import numpy as np

# 將每段文字轉為詞列表
train_sentences = train_df['clean_text'].apply(lambda x: x.split()).tolist()
test_sentences = test_df['clean_text'].apply(lambda x: x.split()).tolist()

# 訓練 Word2Vec 模型
w2v_model = Word2Vec(sentences=train_sentences, vector_size=100, window=5, min_count=2, workers=4)

# 將文字轉換為向量平均值
def sentence_to_vector(sentence, model):
    vectors = [model.wv[word] for word in sentence if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

train_text_vectors = np.array([sentence_to_vector(sent, w2v_model) for sent in train_sentences])
test_text_vectors = np.array([sentence_to_vector(sent, w2v_model) for sent in test_sentences])


In [6]:
# 分批處理：BERT 嵌入
MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# 將模型移至 GPU
model.to('cuda')

def extract_bert_embeddings(text_series, batch_size=32):
    def process_batch(batch_texts):
        encodings = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=128)
        for key in encodings:
            encodings[key] = encodings[key].to('cuda')  # 移到 GPU
        outputs = model(**encodings)
        return outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()  # 回到 CPU 進行 NumPy 處理

    embeddings = (
        text_series
        .swifter.apply(lambda text: process_batch([text])[0])  # 單一文本處理
        .to_numpy()
    )
    return np.stack(embeddings)

bert_train = extract_bert_embeddings(train_df['text'])
bert_test = extract_bert_embeddings(test_df['text'])

Pandas Apply:   0%|          | 0/144918 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/411972 [00:00<?, ?it/s]

In [7]:
# 使用 NRC Lexicon 特徵
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()
def extract_sentiment_features(text_series):
    sentiment_scores = text_series.swifter.apply(lambda text: sid.polarity_scores(text))
    return pd.DataFrame(list(sentiment_scores))

sentiment_train = extract_sentiment_features(train_df['text'])
sentiment_test = extract_sentiment_features(test_df['text'])

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\t1070\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Pandas Apply:   0%|          | 0/144918 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/411972 [00:00<?, ?it/s]

In [9]:
print(sentiment_train.dtypes)
print(sentiment_train.head())

clean_text    object
dtype: object
                                          clean_text
0  {'neg': 0.355, 'neu': 0.376, 'pos': 0.269, 'co...
1  {'neg': 0.388, 'neu': 0.432, 'pos': 0.18, 'com...
2  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...
3  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...
4  {'neg': 0.0, 'neu': 0.431, 'pos': 0.569, 'comp...


In [8]:
# 合併嵌入與情緒特徵
from scipy.sparse import hstack

# 轉換為 GPU 張量並堆疊
train_features = torch.cat([
    torch.tensor(tfidf_train.toarray(), dtype=torch.float32).to('cuda'),
    torch.tensor(train_text_vectors, dtype=torch.float32).to('cuda'),
    torch.tensor(bert_train, dtype=torch.float32).to('cuda'),
    torch.tensor(sentiment_train.values.astype(np.float32), dtype=torch.float32).to('cuda')
], dim=1)

test_features = torch.cat([
    torch.tensor(tfidf_test.toarray(), dtype=torch.float32).to('cuda'),
    torch.tensor(test_text_vectors, dtype=torch.float32).to('cuda'),
    torch.tensor(bert_test, dtype=torch.float32).to('cuda'),
    torch.tensor(sentiment_test.values.astype(np.float32), dtype=torch.float32).to('cuda')
], dim=1)

In [9]:
# 標籤編碼
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['emotion'])
#y_test = label_encoder.transform(test_df['emotion'])


In [13]:
# 模型定義
import xgboost as xgb
from torch import nn

xgb_model = xgb.XGBClassifier(
    objective='multi:softmax', 
    num_class=len(label_encoder.classes_), 
    n_estimators=500,
    learning_rate=0.01,
    max_depth=5,
    random_state=42,
    tree_method='auto'
)
# xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=len(label_encoder.classes_))

# 模型定義：LSTM + Attention
# class LSTMWithAttention(nn.Module):
#     def __init__(self, input_dim, hidden_dim, output_dim):
#         super(LSTMWithAttention, self).__init__()
#         self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
#         self.attention = nn.Linear(hidden_dim * 2, 1)  # Linear layer for attention scores
#         self.fc = nn.Linear(hidden_dim * 2, output_dim)  # Final output layer

#     def forward(self, x):
#         lstm_out, _ = self.lstm(x)  # lstm_out shape: (batch_size, seq_len, hidden_dim*2)
        
#         # Compute attention scores (shape: [batch_size, seq_len, 1])
#         attention_scores = torch.softmax(self.attention(lstm_out), dim=1)
        
#         # Apply attention weights to LSTM output
#         attended_output = torch.sum(attention_scores * lstm_out, dim=1)  # Summing over the sequence length

#         # Pass through the final fully connected layer
#         output = self.fc(attended_output)
#         return output
class LSTMWithAttention(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.attention = nn.Linear(hidden_dim * 2, 1)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
    
    def forward(self, x):
        # Ensure x is 3D: [batch_size, sequence_length, features]
        if x.dim() == 2:
            x = x.unsqueeze(1)  # Add sequence dimension if missing
        
        lstm_out, _ = self.lstm(x)
        
        # Compute attention scores
        attention_scores = self.attention(lstm_out).squeeze(-1)
        attention_weights = torch.softmax(attention_scores, dim=1)
        
        # Weighted sum of LSTM outputs
        attended_output = torch.sum(attention_weights.unsqueeze(-1) * lstm_out, dim=1)
        
        return self.fc(attended_output)



In [11]:
# 將特徵與標籤進行分割
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    train_features,  # 從 train_features 中劃分
    y_train, # 已編碼的目標值
    test_size=0.2,
    random_state=42,
    stratify=y_train  # 確保類別分佈一致
)

In [25]:
print(X_train_split.shape, y_train_split.shape)
print(X_val_split.shape, y_val_split.shape)

torch.Size([579672, 1372]) (579672,)
torch.Size([144919, 1372]) (144919,)


In [32]:
y_train_split_encoded = label_encoder.fit_transform(y_train_split)
y_train_split

array([4, 1, 7, ..., 7, 3, 7])

In [27]:
train_features_cpu = train_features.cpu().numpy()  # Move to CPU and convert to NumPy
y_train_cpu = y_train

# Check for NaN or infinity values in the features and labels
print(np.any(np.isnan(train_features_cpu)))  # Check for NaNs in the features
print(np.any(np.isnan(y_train_cpu)))         # Check for NaNs in the labels
print(np.any(np.isinf(train_features_cpu)))  # Check for infinities in the features
print(np.any(np.isinf(y_train_cpu)))         # Check for infinities in the labels

False
False
False
False


In [46]:
# 模型訓練與評估
# 1. XGBoost 模型
xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=len(label_encoder.classes_))
dtrain = xgb.DMatrix(train_features, label=y_train)
# xgb_model.fit(
#     X_train_split, y_train_split,
#     eval_set=[(X_val_split, y_val_split)],  # 驗證集評估
#     verbose=True
# )
#xgb_model.fit(X_train_split, y_train_split)
#y_pred_val_encoded = xgb_model.predict(X_val_split)

from sklearn.metrics import classification_report, confusion_matrix

# 將數值標籤轉回文字標籤
y_pred_val = label_encoder.inverse_transform(y_pred_val_encoded)
y_val_split_text = label_encoder.inverse_transform(y_val_split)

# 評估模型
print(confusion_matrix(y_val_split_text, y_pred_val))
print(classification_report(y_val_split_text, y_pred_val))

TypeError: fit() missing 1 required positional argument: 'y'

In [20]:
# 2. LSTM+Attention 模型
# 構建 DataLoader
batch_size = 16
accumulation_steps = 2
num_epochs = 5

# Create dataset and loader
train_dataset = TensorDataset(
    torch.tensor(X_train_split, dtype=torch.float32), 
    torch.tensor(y_train_split, dtype=torch.long)
)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Initialize model, loss, and optimizer
lstm_attention_model = LSTMWithAttention(
    input_dim=X_train_split.shape[1], 
    hidden_dim=64,  # Reduced hidden dimension 
    output_dim=len(label_encoder.classes_)
).to('cuda')

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lstm_attention_model.parameters(), lr=0.001)

# Training loop
for epoch in range(num_epochs):
    lstm_attention_model.train()
    epoch_loss = 0.0
    
    for i, (batch_features, batch_labels) in enumerate(train_loader):
        batch_features = batch_features.to('cuda')
        batch_labels = batch_labels.to('cuda')
        
        # Forward pass
        outputs = lstm_attention_model(batch_features)
        
        # Compute loss
        loss = criterion(outputs, batch_labels) / accumulation_steps
        
        # Backward pass
        loss.backward()
        
        # Update weights
        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        
        epoch_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {epoch_loss / len(train_loader)}")


  torch.tensor(X_train_split, dtype=torch.float32),


Epoch 1/5, Average Loss: 0.7421766256730004
Epoch 2/5, Average Loss: 0.7017438596547917
Epoch 3/5, Average Loss: 0.6837248608342462
Epoch 4/5, Average Loss: 0.6710774905850075
Epoch 5/5, Average Loss: 0.6609893236499513


In [22]:
from sklearn.metrics import classification_report, confusion_matrix
# Move test features to CUDA and ensure correct tensor type
# val_features_tensor = torch.tensor(X_val_split, dtype=torch.float32).to('cuda')

# # Evaluate LSTM model
# lstm_attention_model.eval()
# with torch.no_grad():
#     lstm_preds = torch.argmax(lstm_attention_model(val_features_tensor), dim=1).cpu().numpy()

# print("LSTM Confusion Matrix:")
# print(confusion_matrix(y_val_split, lstm_preds))
# print("\nLSTM Classification Report:")
# print(classification_report(y_val_split, lstm_preds))

# Validation loop
val_dataset = TensorDataset(
    torch.tensor(X_val_split, dtype=torch.float32), 
    torch.tensor(y_val_split, dtype=torch.long)
)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

lstm_attention_model.eval()
val_preds = []
val_true = []
with torch.no_grad():
    for batch_features, batch_labels in val_loader:
        batch_features = batch_features.to('cuda')
        outputs = lstm_attention_model(batch_features)
        batch_preds = torch.argmax(outputs, dim=1).cpu().numpy()
        val_preds.extend(batch_preds)
        val_true.extend(batch_labels.numpy())

print("LSTM Confusion Matrix:")
print(confusion_matrix(val_true, val_preds))
print("\nLSTM Classification Report:")
print(classification_report(val_true, val_preds))

  torch.tensor(X_val_split, dtype=torch.float32),


LSTM Confusion Matrix:
[[  92   61  100   10  251  265    2   23]
 [   3 2285   94   31 1837  369    7  285]
 [  28  146  703   31  863  908    7   85]
 [   1   94   51  273  585  221    3   42]
 [   5  659  170   55 8010  769   12  618]
 [  23  178  351   49 1299 1858    3   94]
 [   4   66   70   15  419  245  102   32]
 [   5  397  103   24 2214  334    3 1042]]

LSTM Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.11      0.19       804
           1       0.59      0.47      0.52      4911
           2       0.43      0.25      0.32      2771
           3       0.56      0.21      0.31      1270
           4       0.52      0.78      0.62     10298
           5       0.37      0.48      0.42      3855
           6       0.73      0.11      0.19       953
           7       0.47      0.25      0.33      4122

    accuracy                           0.50     28984
   macro avg       0.53      0.33      0.36     28984
weighte

In [None]:
# 3. Ensemble Voting
voting_clf = VotingClassifier(estimators=[
    ('xgb', xgb_model),
    ('lstm', lstm_attention_model)  # 注意：需調整框架以支持 VotingClassifier
], voting='hard')

voting_preds = [np.argmax(np.bincount([x, y])) for x, y in zip(xgb_preds, lstm_preds)]

print("Voting Classifier Confusion Matrix:")
print(confusion_matrix(y_test, voting_preds))
print("\nVoting Classifier Classification Report:")
print(classification_report(y_test, voting_preds))

In [None]:
test_data['emotion'] = label_encoder.inverse_transform(voting_preds)
submission = test_data[['tweet_id', 'emotion']]
submission.to_csv('submission.csv', index=False)
print("Submission file created: 'submission.csv'")

In [23]:
test_features_tensor = torch.tensor(test_features, dtype=torch.float32).to('cuda')

# Predict on test set
lstm_attention_model.eval()
with torch.no_grad():
    y_test_pred = torch.argmax(lstm_attention_model(test_features_tensor), dim=1).cpu().numpy()

# Inverse transform predictions
test_df['emotion'] = label_encoder.inverse_transform(y_test_pred)

# Output submission
submission = test_df[['tweet_id', 'emotion']]
submission.to_csv('lstm_submission.csv', index=False)

  test_features_tensor = torch.tensor(test_features, dtype=torch.float32).to('cuda')


In [None]:
y_train = label_encoder.fit_transform(train_data['emotion'])
y_test = label_encoder.transform(test_data['emotion'])

X_train = np.array(train_data['bert_embeddings'].tolist())
X_test = np.array(test_data['bert_embeddings'].tolist())

train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long))
test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.long))

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# 訓練模型
input_dim = X_train.shape[1]
hidden_dim = 128
output_dim = len(label_encoder.classes_)

model = LSTMWithAttention(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

epochs = 5
for epoch in range(epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

In [None]:
# Voting Classifier
voting_clf = VotingClassifier(estimators=[
    ('xgb', xgb_model),
    ('lstm', lstm_attention_model)
], voting='hard')

# 注意：LSTM + Attention 的訓練需要使用 DataLoader 和之前的訓練程式碼。
# Voting 會將 XGBoost 和 LSTM 的預測結合
voting_clf.fit(train_features, y_train)
final_predictions = voting_clf.predict(test_features)

In [8]:
from sklearn.metrics import classification_report, confusion_matrix

# 將數值標籤轉回文字標籤
y_pred_val = label_encoder.inverse_transform(y_pred_val_encoded)
y_val_split_text = label_encoder.inverse_transform(y_val_split)

# 評估模型
print(confusion_matrix(y_val_split_text, y_pred_val))
print(classification_report(y_val_split_text, y_pred_val))


[[  460   196   638    37  1737   769     4    38]
 [   32 11747   880   137 10413  1243    14   486]
 [   70   635  4319   152  5325  3074     9   120]
 [    6   455   431  1623  3175   653     7    56]
 [   32  3258  1455   305 42798  2354    28  1251]
 [   82   979  2585   204  8177  6952    17   198]
 [    8   235   599    53  2447   781   614    72]
 [   12  1950   811    78 12834  1042    14  3753]]
              precision    recall  f1-score   support

       anger       0.66      0.12      0.20      3879
anticipation       0.60      0.47      0.53     24952
     disgust       0.37      0.32      0.34     13704
        fear       0.63      0.25      0.36      6406
         joy       0.49      0.83      0.62     51481
     sadness       0.41      0.36      0.39     19194
    surprise       0.87      0.13      0.22      4809
       trust       0.63      0.18      0.28     20494

    accuracy                           0.50    144919
   macro avg       0.58      0.33      0.37    14

In [10]:
# 預測測試集
y_test_pred_encoded = xgb_model.predict(X_test)
test_df['emotion'] = label_encoder.inverse_transform(y_test_pred_encoded)

# 輸出為 submission.csv
submission = test_df[['tweet_id', 'emotion']]
submission.to_csv('submission_final.csv', index=False)