# 1. Load Data

In [None]:
# import necessary libraries
import json
import pandas as pd
import numpy as np
import nltk

In [2]:
data = []
with open('./dm-2024-isa-5810-lab-2-homework/tweets_DM.json', 'r') as f:
    for line in f:
        try:
            data.append(json.loads(line))  # Safeguard against malformed JSON
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")

f.close()

In [3]:
emotion_list = pd.read_csv('./dm-2024-isa-5810-lab-2-homework/emotion.csv')
data_identification = pd.read_csv('./dm-2024-isa-5810-lab-2-homework/data_identification.csv')

In [4]:
df = pd.DataFrame(data)

# Extract '_source' and validate structure
if '_source' not in df.columns:
    raise KeyError("'_source' column not found in the data")

_source = df['_source'].apply(lambda x: x['tweet'])
df = pd.DataFrame({
    'tweet_id': _source.apply(lambda x: x['tweet_id']),
    'hashtags': _source.apply(lambda x: x['hashtags']),
    'text': _source.apply(lambda x: x['text']),
})

# Ensure tweet_id is of a consistent type
df['tweet_id'] = df['tweet_id'].astype(str)

# Validate and prepare `data_identification`
data_identification['tweet_id'] = data_identification['tweet_id'].astype(str)
df = df.merge(data_identification, on='tweet_id', how='left')

train_data = df[df['identification'] == 'train']
test_data = df[df['identification'] == 'test']

In [5]:
train_data = train_data.merge(emotion_list, on='tweet_id', how='left')
train_data.head()

Unnamed: 0,tweet_id,hashtags,text,identification,emotion
0,0x376b20,[Snapchat],"People who post ""add me on #Snapchat"" must be ...",train,anticipation
1,0x2d5350,"[freepress, TrumpLegacy, CNN]","@brianklaas As we see, Trump is dangerous to #...",train,sadness
2,0x1cd5b0,[],Now ISSA is stalking Tasha 😂😂😂 <LH>,train,fear
3,0x1d755c,"[authentic, LaughOutLoud]",@RISKshow @TheKevinAllison Thx for the BEST TI...,train,joy
4,0x2c91a8,[],Still waiting on those supplies Liscus. <LH>,train,anticipation


In [6]:
test_data.head()

Unnamed: 0,tweet_id,hashtags,text,identification
2,0x28b412,[bibleverse],"Confident of your obedience, I write to you, k...",test
4,0x2de201,[],"""Trust is not the same as faith. A friend is s...",test
9,0x218443,"[materialism, money, possessions]",When do you have enough ? When are you satisfi...,test
30,0x2939d5,"[GodsPlan, GodsWork]","God woke you up, now chase the day #GodsPlan #...",test
33,0x26289a,[],"In these tough times, who do YOU turn to as yo...",test


In [7]:
train_data.drop_duplicates(subset=['text'], keep=False, inplace=True)

In [8]:
# shuffle dataset
train_data = train_data.sample(frac=1)
test_data = test_data.sample(frac=1)

print("Shape of Training df: ", train_data.shape)
print("Shape of Testing df: ", test_data.shape)
train_data.head()

Shape of Training df:  (1449182, 5)
Shape of Testing df:  (411972, 4)


Unnamed: 0,tweet_id,hashtags,text,identification,emotion
316008,0x247010,[],How can I even prove this? <LH>,train,surprise
487757,0x32334d,[],.@crissle called that rent increase!! <LH> <LH>,train,fear
322506,0x21d312,"[poster, mencap]",Thank you for making my ride to work even happ...,train,joy
1442041,0x1f245d,[],@SondagegadnoS <LH> bloods,train,sadness
586750,0x1e5492,"[spirit, soul]",@JosiePinaRivas 1 Thessalonians 5:23 Now may t...,train,anticipation


In [9]:
test_data.head()

Unnamed: 0,tweet_id,hashtags,text,identification
1068678,0x35fc84,[],@TimHortons you wanna know how my week-end wen...,test
682037,0x1d06f3,"[say, No, No]",@SNRaja_ @kashmir_rise Complete independence ...,test
103113,0x20482d,[],"One of the best things about traveling, is tha...",test
70680,0x325b53,[],"@NerdlifeNow I watched that battle, when @MRDI...",test
534685,0x1e42d9,[],@liesagreedupon @chrislhayes First it was 'too...,test


In [None]:
# train_data_sample = train_data.sample(frac=0.8, random_state=42)
# used .sample() before but now discarded because it can still run without discarding

In [11]:
y_train_alter = train_data['emotion']
y_train_data = pd.DataFrame(y_train_alter)
X_train_data = train_data.drop(['tweet_id', 'emotion', 'identification', 'hashtags'], axis=1)
ans_data = test_data.drop(['tweet_id', 'identification', 'hashtags'], axis=1)

In [12]:
y_train_data.head()

Unnamed: 0,emotion
316008,surprise
487757,fear
322506,joy
1442041,sadness
586750,anticipation


In [13]:
X_train_data.head()

Unnamed: 0,text
316008,How can I even prove this? <LH>
487757,.@crissle called that rent increase!! <LH> <LH>
322506,Thank you for making my ride to work even happ...
1442041,@SondagegadnoS <LH> bloods
586750,@JosiePinaRivas 1 Thessalonians 5:23 Now may t...


In [14]:
ans_data.head()

Unnamed: 0,text
1068678,@TimHortons you wanna know how my week-end wen...
682037,@SNRaja_ @kashmir_rise Complete independence ...
103113,"One of the best things about traveling, is tha..."
70680,"@NerdlifeNow I watched that battle, when @MRDI..."
534685,@liesagreedupon @chrislhayes First it was 'too...


In [None]:
# the .head() above is used for checking the datatype
# sometimes the datatype goes different
# the following heads all serve for the same purpose

# 2. Deep learning

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_train_data, y_train_data, test_size=0.2, random_state=42)

In [16]:
import keras

### 2.1 BOW is still needed

In [17]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to C:\Users\Daisy
[nltk_data]     Liu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# build analyzers (bag-of-words)
BOW_500 = CountVectorizer(max_features=1800, tokenizer=nltk.word_tokenize,
                         token_pattern=None)

In [19]:
# apply analyzer to training data
BOW_500.fit(train_data['text'])

### 2.2 Prepare the data(X, y)

In [20]:
y_train.head()

Unnamed: 0,emotion
82861,joy
668056,anticipation
168143,trust
1226826,joy
1147242,sadness


In [21]:
X_test.head()

Unnamed: 0,text
306777,marajlooks is stealing nickiarchives’ tweets <...
884812,I'm genuinely blessed to have such amazing fri...
513321,i am so mad. paul 100% deserved that half a mi...
319500,@LJPBR @MostRequestLive @OnAirRomeo @FifthHarm...
761381,<LH> has a job for each of us. Get the job done.


In [22]:
# standardize name (X, y)
X_trainv2 = BOW_500.transform(X_train['text'])
y_trainv2 = y_train['emotion']

X_testv2 = BOW_500.transform(X_test['text'])
y_testv2 = y_test['emotion']

## check dimension is a good habbit
print('X_train.shape: ', X_trainv2.shape)
print('y_train.shape: ', y_trainv2.shape)
print('X_test.shape: ', X_testv2.shape)
print('y_test.shape: ', y_testv2.shape)

X_train.shape:  (1159345, 2000)
y_train.shape:  (1159345,)
X_test.shape:  (289837, 2000)
y_test.shape:  (289837,)


In [23]:
ans_data.head()

Unnamed: 0,text
1068678,@TimHortons you wanna know how my week-end wen...
682037,@SNRaja_ @kashmir_rise Complete independence ...
103113,"One of the best things about traveling, is tha..."
70680,"@NerdlifeNow I watched that battle, when @MRDI..."
534685,@liesagreedupon @chrislhayes First it was 'too...


In [24]:
ans_datav2 = BOW_500.transform(ans_data['text'])

In [25]:
ans_datav2.shape

(411972, 2000)

In [None]:
# if .shape() appears, it is used to check the shape
# if the shape is not the same as the input, an error message pops when training
# (and the message is not usually popped initially QQ)

### 2.3 Deal with categorical label(y)

In [26]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(y_trainv2)
print('check label: ', label_encoder.classes_)
print('\n## Before convert')
print('y_train[0:4]:\n', y_trainv2[0:4])
print('\ny_train.shape: ', y_trainv2.shape)
print('y_test.shape: ', y_testv2.shape)

def label_encode(le, labels):
    enc = le.transform(labels)
    return keras.utils.to_categorical(enc)

def label_decode(le, one_hot_label):
    dec = np.argmax(one_hot_label, axis=1)
    return le.inverse_transform(dec)

y_trainv2 = label_encode(label_encoder, y_trainv2)
y_testv2 = label_encode(label_encoder, y_testv2)

print('\n\n## After convert')
print('y_train[0:4]:\n', y_trainv2[0:4])
print('\ny_train.shape: ', y_trainv2.shape)
print('y_test.shape: ', y_testv2.shape)

check label:  ['anger' 'anticipation' 'disgust' 'fear' 'joy' 'sadness' 'surprise'
 'trust']

## Before convert
y_train[0:4]:
 82861               joy
668056     anticipation
168143            trust
1226826             joy
Name: emotion, dtype: object

y_train.shape:  (1159345,)
y_test.shape:  (289837,)


## After convert
y_train[0:4]:
 [[0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0. 0. 0.]]

y_train.shape:  (1159345, 8)
y_test.shape:  (289837, 8)


### 2.4 Build model

In [27]:
# I/O check
input_shape = X_trainv2.shape[1]
print('input_shape: ', input_shape)

output_shape = len(label_encoder.classes_)
print('output_shape: ', output_shape)

input_shape:  2000
output_shape:  8


In [28]:
from keras.models import Model
from keras.layers import Input, Dense
from keras.layers import ReLU, Softmax

# input layer
model_input = Input(shape=(input_shape, ))  # 500
X = model_input

# 1st hidden layer
X_W1 = Dense(units=64)(X)  # 64
H1 = ReLU()(X_W1)

# 2nd hidden layer
H1_W2 = Dense(units=64)(H1)  # 64
H2 = ReLU()(H1_W2)

# output layer
H2_W3 = Dense(units=output_shape)(H2)  # 4
H3 = Softmax()(H2_W3)

model_output = H3

# create model
model = Model(inputs=[model_input], outputs=[model_output])

# loss function & optimizer
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# show model construction
model.summary()

### 2.5 Train

In [29]:
from keras.callbacks import CSVLogger

# csv_logger = CSVLogger('/content/drive/My Drive/NTHU/DM-Lab2-HW/logs/training_log.csv')

# training setting
epochs = 40
batch_size = 512

# training!
history = model.fit(X_trainv2, y_trainv2,
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_data = (X_testv2, y_testv2))
print('training finish')

Epoch 1/40




[1m2265/2265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.4875 - loss: 1.4231 - val_accuracy: 0.5333 - val_loss: 1.2822
Epoch 2/40
[1m2265/2265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.5416 - loss: 1.2587 - val_accuracy: 0.5399 - val_loss: 1.2634
Epoch 3/40
[1m2265/2265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.5514 - loss: 1.2319 - val_accuracy: 0.5427 - val_loss: 1.2603
Epoch 4/40
[1m2265/2265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.5589 - loss: 1.2136 - val_accuracy: 0.5430 - val_loss: 1.2565
Epoch 5/40
[1m2265/2265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 11ms/step - accuracy: 0.5633 - loss: 1.2003 - val_accuracy: 0.5432 - val_loss: 1.2568
Epoch 6/40
[1m2265/2265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 13ms/step - accuracy: 0.5660 - loss: 1.1949 - val_accuracy: 0.5432 - val_loss: 1.2569
Epoch 7/40
[1m2265/

### 2.6 Predict data

In [30]:
pred_X_test = model.predict(X_testv2, batch_size=128)
pred_X_test[:5]

[1m2265/2265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step


array([[1.42088812e-02, 2.36802120e-02, 2.02893019e-01, 7.47117698e-02,
        3.75353098e-01, 1.64794177e-01, 7.00824559e-02, 7.42763877e-02],
       [4.81919160e-05, 9.89136286e-03, 1.04244056e-04, 7.22546189e-04,
        3.02117884e-01, 1.02199975e-03, 3.93541442e-04, 6.85700238e-01],
       [2.12083697e-01, 5.23787849e-02, 2.22781271e-01, 1.38525199e-02,
        2.11864352e-01, 1.91414177e-01, 4.50220965e-02, 5.06031513e-02],
       [4.66801297e-15, 8.79422471e-15, 7.79089927e-14, 2.39966915e-17,
        1.02159274e-10, 1.00000000e+00, 7.30062495e-13, 1.86243156e-16],
       [1.93281025e-02, 3.69024098e-01, 5.35539575e-02, 2.04092842e-02,
        1.23580799e-01, 6.30526617e-02, 3.69356982e-02, 3.14115345e-01]],
      dtype=float32)

In [31]:
pred_X_test = label_decode(label_encoder, pred_X_test)
pred_X_test[:5]

array(['joy', 'trust', 'disgust', 'sadness', 'anticipation'], dtype=object)

In [32]:
from sklearn.metrics import accuracy_score
#Accuracy
print('testing accuracy: {}'.format(round(accuracy_score(label_decode(label_encoder, y_testv2), pred_X_test), 2)))

testing accuracy: 0.53


In [33]:
## predict
pred_result = model.predict(ans_datav2, batch_size=64)
pred_result[:5]

[1m6438/6438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step


array([[0.01388774, 0.1401227 , 0.25539365, 0.03174779, 0.28268674,
        0.1290502 , 0.08422185, 0.06288928],
       [0.03252774, 0.06196433, 0.38484958, 0.00163747, 0.10723633,
        0.22868854, 0.00631213, 0.1767838 ],
       [0.00631628, 0.27391368, 0.01694799, 0.01955212, 0.45259035,
        0.0431312 , 0.02087561, 0.16667283],
       [0.00588445, 0.09687   , 0.23415683, 0.02107537, 0.30342263,
        0.15868835, 0.08594123, 0.09396115],
       [0.05881261, 0.06292286, 0.1672858 , 0.0435072 , 0.19503428,
        0.33528677, 0.07264417, 0.06450633]], dtype=float32)

In [34]:
pred_result = label_decode(label_encoder, pred_result)
pred_result[:5]

array(['joy', 'disgust', 'joy', 'joy', 'sadness'], dtype=object)

In [35]:
submission = pd.DataFrame({
    'id': test_data['tweet_id'],
    'emotion': pred_result,
})

In [36]:
submission.to_csv('./submission.csv', index=False)