# 1. Load Data

In [1]:
import json
import pandas as pd
import numpy as np
import nltk

In [2]:
data = []
with open('./dm-2024-isa-5810-lab-2-homework/tweets_DM.json', 'r') as f:
    for line in f:
        try:
            data.append(json.loads(line))  # Safeguard against malformed JSON
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")

f.close()

In [3]:
emotion_list = pd.read_csv('./dm-2024-isa-5810-lab-2-homework/emotion.csv')
data_identification = pd.read_csv('./dm-2024-isa-5810-lab-2-homework/data_identification.csv')

In [4]:
df = pd.DataFrame(data)

# Extract '_source' and validate structure
if '_source' not in df.columns:
    raise KeyError("'_source' column not found in the data")

_source = df['_source'].apply(lambda x: x['tweet'])
df = pd.DataFrame({
    'tweet_id': _source.apply(lambda x: x['tweet_id']),
    'hashtags': _source.apply(lambda x: x['hashtags']),
    'text': _source.apply(lambda x: x['text']),
})

# Ensure tweet_id is of a consistent type
df['tweet_id'] = df['tweet_id'].astype(str)

# Validate and prepare `data_identification`
data_identification['tweet_id'] = data_identification['tweet_id'].astype(str)
df = df.merge(data_identification, on='tweet_id', how='left')

train_data = df[df['identification'] == 'train']
test_data = df[df['identification'] == 'test']

In [5]:
train_data = train_data.merge(emotion_list, on='tweet_id', how='left')
train_data.head()

Unnamed: 0,tweet_id,hashtags,text,identification,emotion
0,0x376b20,[Snapchat],"People who post ""add me on #Snapchat"" must be ...",train,anticipation
1,0x2d5350,"[freepress, TrumpLegacy, CNN]","@brianklaas As we see, Trump is dangerous to #...",train,sadness
2,0x1cd5b0,[],Now ISSA is stalking Tasha 😂😂😂 <LH>,train,fear
3,0x1d755c,"[authentic, LaughOutLoud]",@RISKshow @TheKevinAllison Thx for the BEST TI...,train,joy
4,0x2c91a8,[],Still waiting on those supplies Liscus. <LH>,train,anticipation


In [6]:
test_data.head()

Unnamed: 0,tweet_id,hashtags,text,identification
2,0x28b412,[bibleverse],"Confident of your obedience, I write to you, k...",test
4,0x2de201,[],"""Trust is not the same as faith. A friend is s...",test
9,0x218443,"[materialism, money, possessions]",When do you have enough ? When are you satisfi...,test
30,0x2939d5,"[GodsPlan, GodsWork]","God woke you up, now chase the day #GodsPlan #...",test
33,0x26289a,[],"In these tough times, who do YOU turn to as yo...",test


In [7]:
train_data.drop_duplicates(subset=['text'], keep=False, inplace=True)

In [8]:
# shuffle dataset
train_data = train_data.sample(frac=1)
test_data = test_data.sample(frac=1)

print("Shape of Training df: ", train_data.shape)
print("Shape of Testing df: ", test_data.shape)
train_data.head()

Shape of Training df:  (1449182, 5)
Shape of Testing df:  (411972, 4)


Unnamed: 0,tweet_id,hashtags,text,identification,emotion
643748,0x309ab8,[],@divyamisra2 @S1dharthM jisne homour wau me ma...,train,disgust
1410239,0x32c971,[],Be in the streets with Issa <LH>,train,fear
295981,0x313632,[],Death does not have the last word. <LH> <LH>,train,joy
787661,0x1d1596,[],@JoyBlackgirl She does tho. <LH>,train,sadness
1345241,0x21fe6b,[],fouseyTUBE CALLS OUT h3h3Productions | <LH> 😭,train,surprise


In [9]:
test_data.head()

Unnamed: 0,tweet_id,hashtags,text,identification
417467,0x320fd4,"[destinationheaven, nolimits]",I pray for people because sometimes I feel we ...,test
576651,0x1f096d,[Prayers],Family over errthang and god above all #Prayer...,test
1078224,0x227763,[],@realDonaldTrump All you have done is turn the...,test
809198,0x1de939,[],I️ don’t want anything more than I want my whi...,test
205630,0x35db7f,[],@Dongho94 What a shame i didn't know u were he...,test


In [10]:
train_data_sample = train_data.sample(frac=0.3, random_state=42)

In [11]:
y_train_alter = train_data_sample['emotion']
y_train_data = pd.DataFrame(y_train_alter)
X_train_data = train_data_sample.drop(['tweet_id', 'emotion', 'identification', 'hashtags'], axis=1)
ans_data = test_data.drop(['tweet_id', 'identification', 'hashtags'], axis=1)

In [12]:
y_train_data.head()

Unnamed: 0,emotion
532966,joy
1092221,joy
141091,trust
770831,sadness
353563,trust


In [13]:
X_train_data.head()

Unnamed: 0,text
532966,@StarPlus @akshaykumar @MallikaDua @Zakirism @...
1092221,Little to say idol..... 👑 @Harry_Styles #Harr...
141091,Just made a three lane shift <LH> #diditforsonic
770831,@realDonaldTrump “Do not try us” are those you...
353563,Anyone got any #spinning #weaving #dyeing <LH>...


In [14]:
ans_data.head()

Unnamed: 0,text
417467,I pray for people because sometimes I feel we ...
576651,Family over errthang and god above all #Prayer...
1078224,@realDonaldTrump All you have done is turn the...
809198,I️ don’t want anything more than I want my whi...
205630,@Dongho94 What a shame i didn't know u were he...


# 2. Deep learning

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_train_data, y_train_data, test_size=0.2, random_state=42)

In [16]:
import keras

### 2.1 BOW is still needed

In [17]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to C:\Users\Daisy
[nltk_data]     Liu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [18]:
# build analyzers (bag-of-words)
BOW_500 = CountVectorizer(max_features=1800, tokenizer=nltk.word_tokenize,
                         token_pattern=None)

In [19]:
# apply analyzer to training data
BOW_500.fit(train_data['text'])

### 2.2 Prepare the data(X, y)

In [20]:
y_train.head()

Unnamed: 0,emotion
620599,trust
929279,disgust
395783,anticipation
572992,sadness
711689,joy


In [21]:
X_test.head()

Unnamed: 0,text
1268043,#Caldwell <LH> coach
501638,@NateStetsonxx Your Beauty is Always Super Stu...
1294852,Waiting for this phone call is gonna be the de...
730343,Let's #KickIt! #Dance! I'm on a #Roll! <LH> (I...
847721,Sipping a glass of fanta😂😍 <LH> #TheUndateables


In [22]:
# standardize name (X, y)
X_trainv2 = BOW_500.transform(X_train['text'])
y_trainv2 = y_train['emotion']

X_testv2 = BOW_500.transform(X_test['text'])
y_testv2 = y_test['emotion']

## check dimension is a good habbit
print('X_train.shape: ', X_trainv2.shape)
print('y_train.shape: ', y_trainv2.shape)
print('X_test.shape: ', X_testv2.shape)
print('y_test.shape: ', y_testv2.shape)

X_train.shape:  (347804, 1800)
y_train.shape:  (347804,)
X_test.shape:  (86951, 1800)
y_test.shape:  (86951,)


In [23]:
ans_data.head()

Unnamed: 0,text
417467,I pray for people because sometimes I feel we ...
576651,Family over errthang and god above all #Prayer...
1078224,@realDonaldTrump All you have done is turn the...
809198,I️ don’t want anything more than I want my whi...
205630,@Dongho94 What a shame i didn't know u were he...


In [24]:
ans_datav2 = BOW_500.transform(ans_data['text'])

In [25]:
ans_datav2.shape

(411972, 1800)

### 2.3 Deal with categorical label(y)

In [26]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(y_trainv2)
print('check label: ', label_encoder.classes_)
print('\n## Before convert')
print('y_train[0:4]:\n', y_trainv2[0:4])
print('\ny_train.shape: ', y_trainv2.shape)
print('y_test.shape: ', y_testv2.shape)

def label_encode(le, labels):
    enc = le.transform(labels)
    return keras.utils.to_categorical(enc)

def label_decode(le, one_hot_label):
    dec = np.argmax(one_hot_label, axis=1)
    return le.inverse_transform(dec)

y_trainv2 = label_encode(label_encoder, y_trainv2)
y_testv2 = label_encode(label_encoder, y_testv2)

print('\n\n## After convert')
print('y_train[0:4]:\n', y_trainv2[0:4])
print('\ny_train.shape: ', y_trainv2.shape)
print('y_test.shape: ', y_testv2.shape)

check label:  ['anger' 'anticipation' 'disgust' 'fear' 'joy' 'sadness' 'surprise'
 'trust']

## Before convert
y_train[0:4]:
 620599           trust
929279         disgust
395783    anticipation
572992         sadness
Name: emotion, dtype: object

y_train.shape:  (347804,)
y_test.shape:  (86951,)


## After convert
y_train[0:4]:
 [[0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]]

y_train.shape:  (347804, 8)
y_test.shape:  (86951, 8)


### 2.4 Build model

In [27]:
# I/O check
input_shape = X_trainv2.shape[1]
print('input_shape: ', input_shape)

output_shape = len(label_encoder.classes_)
print('output_shape: ', output_shape)

input_shape:  1800
output_shape:  8


In [28]:
from keras.models import Model
from keras.layers import Input, Dense
from keras.layers import ReLU, Softmax

# input layer
model_input = Input(shape=(input_shape, ))  # 500
X = model_input

# 1st hidden layer
X_W1 = Dense(units=64)(X)  # 64
H1 = ReLU()(X_W1)

# 2nd hidden layer
H1_W2 = Dense(units=64)(H1)  # 64
H2 = ReLU()(H1_W2)

# output layer
H2_W3 = Dense(units=output_shape)(H2)  # 4
H3 = Softmax()(H2_W3)

model_output = H3

# create model
model = Model(inputs=[model_input], outputs=[model_output])

# loss function & optimizer
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# show model construction
model.summary()

### 2.5 Train

In [29]:
from keras.callbacks import CSVLogger

# csv_logger = CSVLogger('/content/drive/My Drive/NTHU/DM-Lab2-HW/logs/training_log.csv')

# training setting
epochs = 40
batch_size = 512

# training!
history = model.fit(X_trainv2, y_trainv2,
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_data = (X_testv2, y_testv2))
print('training finish')

Epoch 1/40




[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.4351 - loss: 1.5582 - val_accuracy: 0.5155 - val_loss: 1.3507
Epoch 2/40
[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.5237 - loss: 1.3184 - val_accuracy: 0.5251 - val_loss: 1.3211
Epoch 3/40
[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.5367 - loss: 1.2770 - val_accuracy: 0.5274 - val_loss: 1.3116
Epoch 4/40
[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.5473 - loss: 1.2461 - val_accuracy: 0.5286 - val_loss: 1.3102
Epoch 5/40
[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.5568 - loss: 1.2208 - val_accuracy: 0.5295 - val_loss: 1.3093
Epoch 6/40
[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.5646 - loss: 1.2016 - val_accuracy: 0.5293 - val_loss: 1.3127
Epoch 7/40
[1m680/680[0m [32m━━━━━━━

### 2.6 Predict data

In [30]:
pred_X_test = model.predict(X_testv2, batch_size=128)
pred_X_test[:5]

[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step


array([[2.3538783e-02, 9.6136488e-02, 2.1596494e-01, 1.2982958e-02,
        3.0430767e-01, 7.6994151e-02, 1.3865712e-01, 1.3141786e-01],
       [3.8904184e-03, 2.4113175e-02, 6.6254819e-03, 3.5725336e-03,
        8.9367276e-01, 8.8233883e-03, 1.4669494e-02, 4.4632655e-02],
       [4.9896240e-01, 6.2520415e-02, 1.1282626e-01, 2.4880974e-01,
        5.4115098e-02, 1.3066899e-02, 9.1584669e-03, 5.4064288e-04],
       [9.5914248e-03, 1.7831582e-01, 9.3216583e-02, 1.3483165e-01,
        4.2458272e-01, 5.5549126e-02, 1.4379644e-02, 8.9533083e-02],
       [8.2118906e-02, 5.7810377e-02, 2.6981270e-01, 1.9852199e-02,
        2.2526246e-01, 1.6931698e-01, 4.0597200e-02, 1.3522916e-01]],
      dtype=float32)

In [31]:
pred_X_test = label_decode(label_encoder, pred_X_test)
pred_X_test[:5]

array(['joy', 'joy', 'anger', 'joy', 'disgust'], dtype=object)

In [32]:
from sklearn.metrics import accuracy_score
#Accuracy
print('testing accuracy: {}'.format(round(accuracy_score(label_decode(label_encoder, y_testv2), pred_X_test), 2)))

testing accuracy: 0.49


In [33]:
## predict
pred_result = model.predict(ans_datav2, batch_size=64)
pred_result[:5]

[1m6438/6438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 909us/step


array([[8.1507303e-03, 3.8440329e-01, 4.2355280e-02, 8.9443497e-02,
        1.5531722e-01, 6.1976641e-02, 1.6635915e-02, 2.4171735e-01],
       [3.7897666e-05, 9.8512298e-01, 8.8211018e-06, 1.3993505e-03,
        5.2362741e-03, 3.1651420e-04, 2.3533902e-04, 7.6429122e-03],
       [1.4054726e-03, 2.1502303e-03, 4.1912787e-02, 1.0869135e-03,
        4.9343077e-04, 9.4048935e-01, 7.9537965e-03, 4.5080972e-03],
       [2.7991671e-02, 2.0230529e-01, 1.9906616e-01, 3.3557010e-03,
        4.0911853e-01, 2.3490136e-02, 7.2064827e-04, 1.3395186e-01],
       [1.2215980e-02, 2.0832533e-02, 1.9420098e-01, 4.7702687e-03,
        3.0469856e-01, 4.0040794e-01, 4.7041293e-02, 1.5832432e-02]],
      dtype=float32)

In [34]:
pred_result = label_decode(label_encoder, pred_result)
pred_result[:5]

array(['anticipation', 'anticipation', 'sadness', 'joy', 'sadness'],
      dtype=object)

In [35]:
submission = pd.DataFrame({
    'id': test_data['tweet_id'],
    'emotion': pred_result,
})

In [36]:
submission.to_csv('./submission.csv', index=False)