## Kaggle DM2023 ISA5810 Lab2 Homework

### Load data

In [1]:
import pandas as pd

# Load data_identification.csv, supplying the column names explicitly
# because the file is read as header-less (header=None).
# NOTE(review): if the CSV actually starts with a header line, that line is
# read as an ordinary data row; the filters below would simply not match it.
df = pd.read_csv(
    "./data_identification.csv",
    sep=",",
    header=None,
    names=["tweet_id", "identification"],
)

# Split the rows by their identification tag.
is_train = df["identification"] == "train"
is_test = df["identification"] == "test"
train_id = df[is_train]
test_id = df[is_test]

# Last expression of the cell: display the training ids.
train_id

Unnamed: 0,tweet_id,identification
2,0x29e452,train
3,0x2b3819,train
5,0x2a2acc,train
6,0x2a8830,train
7,0x20b21d,train
...,...,...
1867531,0x227e25,train
1867532,0x293813,train
1867533,0x1e1a7e,train
1867534,0x2156a5,train


### Processing json

In [2]:
import json

# Read tweets_DM.json: one JSON document per line; keep only the nested
# tweet record found under ["_source"]["tweet"].
with open("./tweets_DM.json", "r", encoding="utf-8") as f:
    tweets_data = []
    for line in f:
        record = json.loads(line)
        tweets_data.append(record["_source"]["tweet"])

# Collect the raw tweet texts.
tweet_texts = [tweet["text"] for tweet in tweets_data]

# Strip every "<LH>" tag from the texts.
cleaned_texts = [text.replace("<LH>", "") for text in tweet_texts]

# Write the cleaned text back into each tweet record (same order as above).
for tweet, cleaned in zip(tweets_data, cleaned_texts):
    tweet["text"] = cleaned

# Serialise the updated records back to a JSON string.
# NOTE(review): no visible cell reads updated_json_str — confirm it is still
# needed, since it holds a second, pretty-printed copy of the whole dataset.
updated_json_str = json.dumps(tweets_data, indent=2)


### Merge dataset

In [4]:
# Flatten the parsed tweet records into a DataFrame, keeping only id and text.
tweets_df = pd.json_normalize(tweets_data)[["tweet_id", "text"]]

# Read emotion.csv (tweet_id -> emotion), naming the columns explicitly.
emotion_df = pd.read_csv("./emotion.csv", sep=",", header=None, names=["tweet_id", "emotion"])

# Training set: ids -> tweet text -> emotion label (left joins keep every train id).
merged_train_df = pd.merge(train_id, tweets_df, on="tweet_id", how="left")
train_df = pd.merge(merged_train_df, emotion_df, on="tweet_id", how="left")

# Test set: attach the tweet text, which is what the model consumes.
# Joining emotion_df here instead would leave test_df without a "text" column,
# and the prediction step that reads test_df["text"] would raise KeyError.
test_df = pd.merge(test_id, tweets_df, on="tweet_id", how="left")

# Drop the 'identification' column; it is constant within each split.
train_df = train_df.drop(columns=["identification"])
test_df = test_df.drop(columns=["identification"])

# Show the result.
print("Shape of train_df: ", train_df.shape)
print(train_df.head())


Shape of train_df:  (1455563, 3)
   tweet_id                                               text       emotion
0  0x29e452  Huge Respect🖒 @JohnnyVegasReal talking about l...           joy
1  0x2b3819  Yoooo we hit all our monthly goals with the ne...           joy
2  0x2a2acc  @KIDSNTS @PICU_BCH @uhbcomms @BWCHBoss Well do...         trust
3  0x2a8830  Come join @ambushman27 on #PUBG while he striv...           joy
4  0x20b21d  @fanshixieen2014 Blessings!My #strength little...  anticipation


In [5]:
# Shuffle the rows of both frames.
# A fixed random_state makes the row order reproducible on a fresh
# Restart & Run All; without it the order differs on every run.
train_df = train_df.sample(frac=1, random_state=42)
test_df = test_df.sample(frac=1, random_state=42)

In [6]:
# Report the (rows, columns) of both splits after shuffling.
for label, frame in (
    ("Shape of Training df: ", train_df),
    ("Shape of Testing df: ", test_df),
):
    print(label, frame.shape)

Shape of Training df:  (1455563, 3)
Shape of Testing df:  (411972, 2)


---
### Save data

We will save our data in Pickle format. The pickle module implements binary protocols for serializing and de-serializing a Python object structure.   
  
Some advantages for using pickle structure:  
* Because it stores the data type of every attribute, objects are restored exactly as they were saved, which makes it more convenient for cross-platform use.  
* When your data is huge, it can take less space to store and less time to load.   

In [7]:
## Persist both frames as pickle files in the working directory.
for frame, pickle_path in ((train_df, "train_df.pkl"), (test_df, "test_df.pkl")):
    frame.to_pickle(pickle_path)

In [8]:
import pandas as pd

## Reload both frames from their pickle files.
## Only unpickle files you created yourself: loading a pickle can execute
## arbitrary code.
train_df, test_df = (
    pd.read_pickle(pickle_path) for pickle_path in ("train_df.pkl", "test_df.pkl")
)

In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import torch

# Preprocessing hyper-parameters.
max_words = 5000            # passed to Tokenizer as num_words
max_sequence_length = 100   # passed to pad_sequences as maxlen

# Encode the emotion strings as integer labels.
label_encoder = LabelEncoder()
label_encoder.fit(train_df["emotion"])
train_df["emotion_label"] = label_encoder.transform(train_df["emotion"])

# Tokenise the tweet text; "<OOV>" is the out-of-vocabulary token.
# NOTE(review): the tokenizer is fitted on all of train_df, i.e. before the
# train/validation split below — confirm this is intended.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df["text"])
sequences = tokenizer.texts_to_sequences(train_df["text"])

# Bring every sequence to the same length (maxlen=max_sequence_length).
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

# Hold out 20% of the rows as a validation set, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    padded_sequences,
    train_df["emotion_label"],
    test_size=0.2,
    random_state=42,
)

In [7]:
# Fail fast: the prediction step below reads test_df["text"]. Check for it
# before training so a missing column is reported immediately rather than
# after the whole training run has finished.
if "text" not in test_df.columns:
    raise KeyError("test_df has no 'text' column; merge the tweet texts into test_df before training")

# Width of the output layer: one unit per emotion class seen by the encoder,
# instead of a hard-coded count.
num_classes = len(label_encoder.classes_)

# Build the model: embedding -> LSTM -> dense -> softmax over the classes.
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_sequence_length))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
# NOTE(review): this Dense layer has no activation, so it is a linear
# projection in front of the softmax layer — confirm whether a non-linearity
# (e.g. "relu") was intended.
model.add(Dense(64))
model.add(Dense(num_classes, activation="softmax"))

# Compile the model; the sparse loss matches the integer-encoded labels.
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Early stopping: stop once val_loss has not improved for 3 epochs and
# restore the best weights seen.
early_stopping = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)

# Train the model.
model.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val), callbacks=[early_stopping])

# Predict on the test data, using the same tokenizer and padding length as training.
test_sequences = tokenizer.texts_to_sequences(test_df["text"])
padded_test_sequences = pad_sequences(test_sequences, maxlen=max_sequence_length)
predictions = model.predict(padded_test_sequences)

# Convert the highest-scoring class index of each row back to its emotion string.
predicted_labels = label_encoder.inverse_transform(predictions.argmax(axis=1))

# Build the results DataFrame (tweet id + predicted emotion).
test_results_df = pd.DataFrame({"id": test_df["tweet_id"], "emotion": predicted_labels})

# Show the result.
print(test_results_df.head())

# Save the test results.
test_results_df.to_csv("test_results.csv", index=False)


2023-12-27 12:34:59.656071: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-27 12:34:59.697578: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-27 12:34:59.697979: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-27 12:35:55.504969: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-12-27 12:35:55.507189: W tensorflow/core/common_runtime/gpu/gpu_device.

Epoch 1/10


2023-12-27 12:35:55.859809: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 465780000 exceeds 10% of free system memory.




2023-12-27 13:07:27.880423: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 116445200 exceeds 10% of free system memory.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


2023-12-27 16:56:44.824483: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 164788800 exceeds 10% of free system memory.


              id       emotion
113935  0x2ef354           joy
234392  0x24b811         trust
123482  0x36c277           joy
395126  0x2cbaa6  anticipation
343251  0x2dd496       disgust


In [9]:
# Reload the saved results file and report its (rows, columns).
results_path = "./test_results.csv"
test_results = pd.read_csv(results_path, sep=",")
results_shape = test_results.shape
print("Shape of Testing df: ", results_shape)

Shape of Testing df:  (411972, 2)


---