In [2]:
import json
import pandas as pd
import numpy as np
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
from sklearn.decomposition import PCA
from transformers import AutoTokenizer, AutoModel
import swifter

In [3]:
data = []
with open('tweet/tweets_DM.json', 'r') as f:
    for line in f:
        data.append(json.loads(line)) 
f.close()

emotion = pd.read_csv('tweet/emotion.csv')
data_identification = pd.read_csv('tweet/data_identification.csv')

df = pd.DataFrame(data)
_source = df['_source'].apply(lambda x: x['tweet'])
df = pd.DataFrame({
    'tweet_id': _source.apply(lambda x: x['tweet_id']),
    'hashtags': _source.apply(lambda x: x['hashtags']),
    'text': _source.apply(lambda x: x['text']),
})
df = df.merge(data_identification, on='tweet_id', how='left')

train_data = df[df['identification'] == 'train']
test_data = df[df['identification'] == 'test']

train_data = train_data.merge(emotion, on='tweet_id', how='left')
train_data.drop_duplicates(subset=['text'], keep=False, inplace=True)

train_data_sample = train_data.sample(frac=0.5, random_state=42)
train_data_sample.to_pickle("train_dsample.pkl")
train_df = pd.read_pickle("train_dsample.pkl")

test_data.to_pickle("test_d.pkl")
test_df = pd.read_pickle("test_d.pkl")

In [4]:
import swifter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import emoji
import re

stop_words = set(stopwords.words('english'))

# 表情符號替換詞典
emoji_dict = {
    '😂': '[joy]', '❤️': '[love]', '😍': '[adoration]', '😭': '[cry]',
    '❤': '[care]', '😊': '[happy]', '🙏': '[pray]', '😘': '[kiss]',
    '💕': '[love_each_other]', '🔥': '[fire]', '😩': '[weary]',
    '🤔': '[think]', '💯': '[perfect]', '💙': '[loyalty]',
    '🙄': '[annoyed]', '😁': '[happy]', '🙌': '[celebrate]',
    '🙏🏾': '[pray]', '👍': '[approve]', '🙏🏽': '[pray]'
}

# 預處理函數
def preprocess_text(text):
    # 替換 emoji
    for emj, keyword in emoji_dict.items():
        text = text.replace(emj, keyword)
    text = emoji.replace_emoji(text, replace='')  # 移除其他 emoji
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # 移除網址
    text = text.replace('<LH>', '')
    text = re.sub(r'\@\w+|\#', '', text)  # 移除 @user 和 hashtags
    text = re.sub(r"[^a-zA-Z0-9\s]", '', text)  # 移除特殊字元
    text = text.lower()  # 小寫化
    text = text.strip()
    words = word_tokenize(text)
    return ' '.join([word for word in words if word not in stop_words])

# 清理訓練與測試資料
train_df['clean_text'] = train_df['text'].swifter.apply(preprocess_text)
test_df['clean_text'] = test_df['text'].swifter.apply(preprocess_text)

# 打亂訓練資料
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

Pandas Apply:   0%|          | 0/724591 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/411972 [00:00<?, ?it/s]

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 將 hashtags 組合成字串
train_df['hashtags'] = train_df['hashtags'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')
test_df['hashtags'] = test_df['hashtags'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')

# 訓練 TF-IDF
tfidf = TfidfVectorizer(max_features=500, stop_words='english')
tfidf_train = tfidf.fit_transform(train_df['hashtags'])
tfidf_test = tfidf.transform(test_df['hashtags'])

In [6]:
from gensim.models import Word2Vec
import numpy as np

# 將每段文字轉為詞列表
train_sentences = train_df['clean_text'].apply(lambda x: x.split()).tolist()
test_sentences = test_df['clean_text'].apply(lambda x: x.split()).tolist()

# 訓練 Word2Vec 模型
w2v_model = Word2Vec(sentences=train_sentences, vector_size=100, window=5, min_count=2, workers=4)

# 將文字轉換為向量平均值
def sentence_to_vector(sentence, model):
    vectors = [model.wv[word] for word in sentence if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

train_text_vectors = np.array([sentence_to_vector(sent, w2v_model) for sent in train_sentences])
test_text_vectors = np.array([sentence_to_vector(sent, w2v_model) for sent in test_sentences])


In [7]:
from scipy.sparse import hstack

# 將 TF-IDF 與 Word2Vec 合併
X_train = hstack([tfidf_train, train_text_vectors])
X_test = hstack([tfidf_test, test_text_vectors])

# 訓練標籤
y_train = train_df['emotion']


In [9]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 將文字標籤轉換為數值
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# 分割訓練與驗證資料
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train_encoded, test_size=0.2, random_state=42)

# 訓練模型
xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=len(label_encoder.classes_))
xgb_model.fit(X_train_split, y_train_split)

# 預測驗證集
y_pred_val_encoded = xgb_model.predict(X_val_split)


In [10]:
from sklearn.metrics import classification_report, confusion_matrix

# 將數值標籤轉回文字標籤
y_pred_val = label_encoder.inverse_transform(y_pred_val_encoded)
y_val_split_text = label_encoder.inverse_transform(y_val_split)

# 評估模型
print(confusion_matrix(y_val_split_text, y_pred_val))
print(classification_report(y_val_split_text, y_pred_val))


[[  450   182   580    51  1691   875     1    49]
 [   24 11887   854   140 10375  1247    15   410]
 [   60   564  4299   150  5309  3195     6   121]
 [    7   428   461  1627  3183   636     4    60]
 [   31  3197  1539   319 42950  2349    16  1080]
 [   63   915  2647   208  8139  7035    12   175]
 [   12   229   590    57  2427   824   616    54]
 [   18  1988   787   104 12959  1045    10  3583]]
              precision    recall  f1-score   support

       anger       0.68      0.12      0.20      3879
anticipation       0.61      0.48      0.54     24952
     disgust       0.37      0.31      0.34     13704
        fear       0.61      0.25      0.36      6406
         joy       0.49      0.83      0.62     51481
     sadness       0.41      0.37      0.39     19194
    surprise       0.91      0.13      0.22      4809
       trust       0.65      0.17      0.28     20494

    accuracy                           0.50    144919
   macro avg       0.59      0.33      0.37    14

In [11]:
# 預測測試集
y_test_pred_encoded = xgb_model.predict(X_test)
test_df['emotion'] = label_encoder.inverse_transform(y_test_pred_encoded)

# 輸出為 submission.csv
submission = test_df[['id', 'emotion']]
submission.to_csv('submission_fin.csv', index=False)

1. 你認為使用xgboost是好方法嗎 還是有別的更好的辦法 code要做那些修改
2. 你認為降維會提高最後的結果嗎 應該在哪裡降
3. 你認為前處理還有什麼可以更加強的做法嗎 例如正則化之類的 或是其他 請提出

In [13]:
sub = pd.read_csv("submission_fin.csv")
sub

Unnamed: 0,tweet_id,emotion
0,0x28b412,anticipation
1,0x2de201,anticipation
2,0x218443,joy
3,0x2939d5,joy
4,0x26289a,anticipation
...,...,...
411967,0x2913b4,anticipation
411968,0x2a980e,anticipation
411969,0x316b80,sadness
411970,0x29d0cb,joy


In [14]:
sub1 = pd.read_csv("submission.csv")
sub1

Unnamed: 0,id,emotion
0,0x28b412,anticipation
1,0x2de201,anticipation
2,0x218443,joy
3,0x2939d5,joy
4,0x26289a,trust
...,...,...
411967,0x2913b4,anticipation
411968,0x2a980e,anticipation
411969,0x316b80,joy
411970,0x29d0cb,joy


In [15]:
# 找出兩個 column 中獨有的值
unique_to_df1 = set(sub['id']) - set(sub1['id'])
unique_to_df2 = set(sub1['id']) - set(sub['id'])

if unique_to_df1:
    print(f"sub.csv 中有 sub1.csv 沒有的值：{unique_to_df1}")
if unique_to_df2:
    print(f"sub1.csv 的  column 中有 sub.csv 沒有的值：{unique_to_df2}")

KeyError: 'id'