In [1]:
import json
import pandas as pd
import numpy as np
import torch
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
from sklearn.decomposition import PCA
from transformers import AutoTokenizer, AutoModel
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from contractions import fix
import swifter

In [2]:
data = []
with open('tweet/tweets_DM.json', 'r') as f:
    for line in f:
        data.append(json.loads(line)) 
f.close()

emotion = pd.read_csv('tweet/emotion.csv')
data_identification = pd.read_csv('tweet/data_identification.csv')

df = pd.DataFrame(data)
_source = df['_source'].apply(lambda x: x['tweet'])
df = pd.DataFrame({
    'tweet_id': _source.apply(lambda x: x['tweet_id']),
    'hashtags': _source.apply(lambda x: x['hashtags']),
    'text': _source.apply(lambda x: x['text']),
})
df = df.merge(data_identification, on='tweet_id', how='left')

train_data = df[df['identification'] == 'train']
test_data = df[df['identification'] == 'test']

train_data = train_data.merge(emotion, on='tweet_id', how='left')
train_data.drop_duplicates(subset=['text'], keep=False, inplace=True)

train_data_sample = train_data.sample(frac=0.5, random_state=42)
train_data_sample.to_pickle("train_dsample.pkl")
train_df = pd.read_pickle("train_dsample.pkl")

test_data.to_pickle("test_d.pkl")
test_df = pd.read_pickle("test_d.pkl")

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import wordnet
import emoji
import re

stop_words = set(stopwords.words('english'))

# 表情符號替換詞典
emoji_dict = {
    '😂': '[joy]', '❤️': '[love]', '😍': '[adoration]', '😭': '[cry]',
    '❤': '[care]', '😊': '[happy]', '🙏': '[pray]', '😘': '[kiss]',
    '💕': '[love_each_other]', '🔥': '[fire]', '😩': '[weary]',
    '🤔': '[think]', '💯': '[perfect]', '💙': '[loyalty]',
    '🙄': '[annoyed]', '😁': '[happy]', '🙌': '[celebrate]',
    '🙏🏾': '[pray]', '👍': '[approve]', '🙏🏽': '[pray]'
}

# Define a dictionary for common Twitter abbreviations/slangs
slang_dict = {
    "lol": "laugh out_loud",
    "u": "you",
    "idk": "I do not know",
    "omg": "oh my god",
    "btw": "by the way",
    "lmao": "laugh my_ass_off",
    "lmfao": "laugh my_ass_off",
    "fyi": "for your information",
    "brb": "be right back"
    # Add more as needed
}

# 預處理函數
def preprocess_text(text):
    # 替換 emoji
    for emj, keyword in emoji_dict.items():
        text = text.replace(emj, keyword)
    text = emoji.replace_emoji(text, replace='')  # 移除其他 emoji
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # 移除網址
    text = re.sub(r'RT[\s]+', '', text)  # Remove RT
    text = text.replace('<LH>', '')
    text = re.sub(r'\@\w+|\#', '', text)  # 移除 @user 和 hashtags
    text = re.sub(r"[^a-zA-Z0-9\s]", '', text)  # 移除特殊字元
    text = text.lower()
    text = re.sub(r'[^\w\s!?]', '', text)
    text = re.sub(r'not\s+(\w+)', r'not_\1', text)
    
    wds = text.split()
    tweet = " ".join([slang_dict[wd.lower()] if wd.lower() in slang_dict else wd for wd in wds])
    
    text = fix(text)
    #text = str(TextBlob(text).correct())

    lemmatizer = WordNetLemmatizer()
    text = " ".join([lemmatizer.lemmatize(wd) for wd in text.split()])
    
    text = text.strip()
    
    
    words = word_tokenize(text)
    return ' '.join([word for word in words if word not in stop_words])

# 清理訓練與測試資料
train_df['clean_text'] = train_df['text'].swifter.apply(preprocess_text)
test_df['clean_text'] = test_df['text'].swifter.apply(preprocess_text)

# 打亂訓練資料
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

Pandas Apply:   0%|          | 0/724591 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/411972 [00:00<?, ?it/s]

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 將 hashtags 組合成字串
train_df['hashtags'] = train_df['hashtags'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')
test_df['hashtags'] = test_df['hashtags'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')

# 訓練 TF-IDF
tfidf = TfidfVectorizer(max_features=500, stop_words='english')
tfidf_train = tfidf.fit_transform(train_df['hashtags'])
tfidf_test = tfidf.transform(test_df['hashtags'])

In [5]:
from gensim.models import Word2Vec
import numpy as np

# 將每段文字轉為詞列表
train_sentences = train_df['clean_text'].apply(lambda x: x.split()).tolist()
test_sentences = test_df['clean_text'].apply(lambda x: x.split()).tolist()

# 訓練 Word2Vec 模型
w2v_model = Word2Vec(sentences=train_sentences, vector_size=100, window=5, min_count=2, workers=4)

# 將文字轉換為向量平均值
def sentence_to_vector(sentence, model):
    vectors = [model.wv[word] for word in sentence if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

train_text_vectors = np.array([sentence_to_vector(sent, w2v_model) for sent in train_sentences])
test_text_vectors = np.array([sentence_to_vector(sent, w2v_model) for sent in test_sentences])


In [6]:
from scipy.sparse import hstack

# 將 TF-IDF 與 Word2Vec 合併
X_train = hstack([tfidf_train, train_text_vectors])
X_test = hstack([tfidf_test, test_text_vectors])

# 訓練標籤
y_train = train_df['emotion']


In [7]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 將文字標籤轉換為數值
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# 分割訓練與驗證資料
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train_encoded, test_size=0.2, random_state=42)

# 訓練模型
xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=len(label_encoder.classes_))
xgb_model.fit(X_train_split, y_train_split)

# 預測驗證集
y_pred_val_encoded = xgb_model.predict(X_val_split)


In [8]:
from sklearn.metrics import classification_report, confusion_matrix

# 將數值標籤轉回文字標籤
y_pred_val = label_encoder.inverse_transform(y_pred_val_encoded)
y_val_split_text = label_encoder.inverse_transform(y_val_split)

# 評估模型
print(confusion_matrix(y_val_split_text, y_pred_val))
print(classification_report(y_val_split_text, y_pred_val))


[[  460   196   638    37  1737   769     4    38]
 [   32 11747   880   137 10413  1243    14   486]
 [   70   635  4319   152  5325  3074     9   120]
 [    6   455   431  1623  3175   653     7    56]
 [   32  3258  1455   305 42798  2354    28  1251]
 [   82   979  2585   204  8177  6952    17   198]
 [    8   235   599    53  2447   781   614    72]
 [   12  1950   811    78 12834  1042    14  3753]]
              precision    recall  f1-score   support

       anger       0.66      0.12      0.20      3879
anticipation       0.60      0.47      0.53     24952
     disgust       0.37      0.32      0.34     13704
        fear       0.63      0.25      0.36      6406
         joy       0.49      0.83      0.62     51481
     sadness       0.41      0.36      0.39     19194
    surprise       0.87      0.13      0.22      4809
       trust       0.63      0.18      0.28     20494

    accuracy                           0.50    144919
   macro avg       0.58      0.33      0.37    14

In [10]:
# 預測測試集
y_test_pred_encoded = xgb_model.predict(X_test)
test_df['emotion'] = label_encoder.inverse_transform(y_test_pred_encoded)

# 輸出為 submission.csv
submission = test_df[['tweet_id', 'emotion']]
submission.to_csv('submission_fin1.csv', index=False)

1. 你認為使用xgboost是好方法嗎 還是有別的更好的辦法 code要做那些修改
2. 你認為降維會提高最後的結果嗎 應該在哪裡降
3. 你認為前處理還有什麼可以更加強的做法嗎 例如正則化之類的 或是其他 請提出

In [11]:
train_df

Unnamed: 0,tweet_id,hashtags,text,identification,emotion,clean_text
0,0x36fc6e,development future winner,Escaping pain is not the answer. Embracing pai...,train,sadness,escaping pain not_the answer embracing pain al...
1,0x36f312,ignored,If I don't like you more then likely you've be...,train,sadness,like likely disrespectful unpleasant soul plea...
2,0x1d7398,Silverdome,"Two stadiums I've been two and photographed, i...",train,sadness,two stadium two photographed imploded within w...
3,0x26d0d1,trans,The racial trans badge #trans <LH> <LH>,train,trust,racial trans badge trans
4,0x2c580d,weddingdressfitting,Very special day with Luda and her mom ❤️Feeli...,train,joy,special day luda mom lovefeeling blessed weddi...
...,...,...,...,...,...,...
724586,0x380c45,Power5at5,Hey @POWERATL @maddoxradio please play <LH> by...,train,sadness,hey please play power5at5
724587,0x36c504,,damn my foot healed when everyone is already i...,train,disgust,damn foot healed everyone already tj
724588,0x2e8018,ZENii skincareroutine moisturiser health healt...,@skinandbodyclin we're excited 🙌 #ZENii #skinc...,train,trust,excited celebrate zenii skincareroutine moistu...
724589,0x31e324,job,So excited! Just got a call that I have an int...,train,fear,excited got call interview wednesday job
