# 1. Load Data

In [1]:
import json
import pandas as pd
import numpy as np
import nltk

In [2]:
data = []
with open('./dm-2024-isa-5810-lab-2-homework/tweets_DM.json', 'r') as f:
    for line in f:
        try:
            data.append(json.loads(line))  # Safeguard against malformed JSON
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")

f.close()

In [3]:
emotion_list = pd.read_csv('./dm-2024-isa-5810-lab-2-homework/emotion.csv')
data_identification = pd.read_csv('./dm-2024-isa-5810-lab-2-homework/data_identification.csv')

In [4]:
df = pd.DataFrame(data)

# Extract '_source' and validate structure
if '_source' not in df.columns:
    raise KeyError("'_source' column not found in the data")

_source = df['_source'].apply(lambda x: x['tweet'])
df = pd.DataFrame({
    'tweet_id': _source.apply(lambda x: x['tweet_id']),
    'hashtags': _source.apply(lambda x: x['hashtags']),
    'text': _source.apply(lambda x: x['text']),
})

# Ensure tweet_id is of a consistent type
df['tweet_id'] = df['tweet_id'].astype(str)

# Validate and prepare `data_identification`
data_identification['tweet_id'] = data_identification['tweet_id'].astype(str)
df = df.merge(data_identification, on='tweet_id', how='left')

train_data = df[df['identification'] == 'train']
test_data = df[df['identification'] == 'test']

In [5]:
train_data = train_data.merge(emotion_list, on='tweet_id', how='left')
train_data.head()

Unnamed: 0,tweet_id,hashtags,text,identification,emotion
0,0x376b20,[Snapchat],"People who post ""add me on #Snapchat"" must be ...",train,anticipation
1,0x2d5350,"[freepress, TrumpLegacy, CNN]","@brianklaas As we see, Trump is dangerous to #...",train,sadness
2,0x1cd5b0,[],Now ISSA is stalking Tasha 😂😂😂 <LH>,train,fear
3,0x1d755c,"[authentic, LaughOutLoud]",@RISKshow @TheKevinAllison Thx for the BEST TI...,train,joy
4,0x2c91a8,[],Still waiting on those supplies Liscus. <LH>,train,anticipation


In [6]:
test_data.head()

Unnamed: 0,tweet_id,hashtags,text,identification
2,0x28b412,[bibleverse],"Confident of your obedience, I write to you, k...",test
4,0x2de201,[],"""Trust is not the same as faith. A friend is s...",test
9,0x218443,"[materialism, money, possessions]",When do you have enough ? When are you satisfi...,test
30,0x2939d5,"[GodsPlan, GodsWork]","God woke you up, now chase the day #GodsPlan #...",test
33,0x26289a,[],"In these tough times, who do YOU turn to as yo...",test


In [7]:
train_data.drop_duplicates(subset=['text'], keep=False, inplace=True)

In [8]:
# shuffle dataset
train_data = train_data.sample(frac=1)
test_data = test_data.sample(frac=1)

print("Shape of Training df: ", train_data.shape)
print("Shape of Testing df: ", test_data.shape)
train_data.head()

Shape of Training df:  (1449182, 5)
Shape of Testing df:  (411972, 4)


Unnamed: 0,tweet_id,hashtags,text,identification,emotion
588766,0x225a82,"[BacktoSchool, WeekofWelcome, SAPro, SAGrad]","My favorite ""Welcome Back"" tradition on my cam...",train,joy
837067,0x24fa99,"[ForcedMeToGoToWalmart, foodlion, noexcuses]",@FoodLion No Blueberry unfrosted pop tarts! U ...,train,anger
1142346,0x28b819,[RamStatueAtAyodhya],@republic It's the need of the hour.... <LH> #...,train,joy
1432731,0x272293,"[uhuruto, lost, canaanisreal]",Certificate of #uhuruto be burnt with polythen...,train,surprise
1195983,0x34ad85,[dream],39 Never give up on your #dream. <LH> me to ma...,train,anticipation


In [9]:
test_data.head()

Unnamed: 0,tweet_id,hashtags,text,identification
637160,0x2fb3ae,[],"Nothing in London, been waiting & looking for ...",test
1039283,0x235b35,"[reality, depravity, good, jesusdiditall, ItIs...",be convinced of your ultimate #reality not you...,test
855514,0x2360e0,[],ooooog did i ever get the coolest gemstone sta...,test
12460,0x360057,[],Stuck on a slow train with the the PIS test on...,test
725602,0x2ee96e,[],@christinawilkie @chrislhayes @VP He looks sad...,test


In [10]:
train_data_sample = train_data.sample(frac=0.02, random_state=42)

In [11]:
y_train_alter = train_data_sample['emotion']
y_train_data = pd.DataFrame(y_train_alter)
X_train_data = train_data_sample.drop(['tweet_id', 'emotion', 'identification', 'hashtags'], axis=1)
ans_data = test_data.drop(['tweet_id', 'identification', 'hashtags'], axis=1)

In [12]:
y_train_data.head()

Unnamed: 0,emotion
365668,disgust
370570,trust
1154690,disgust
1130123,anticipation
1056918,anticipation


In [13]:
X_train_data.head()

Unnamed: 0,text
365668,@cnn your reporter must not think the police c...
370570,Not sure what it says about my life that im ge...
1154690,The people of #PuertoRico are devastated and t...
1130123,Finally some sleep... a whole 9 hours! 👌🏼 <LH>
1056918,<LH> for anyone that's in the path of #Hurrica...


In [14]:
ans_data.head()

Unnamed: 0,text
637160,"Nothing in London, been waiting & looking for ..."
1039283,be convinced of your ultimate #reality not you...
855514,ooooog did i ever get the coolest gemstone sta...
12460,Stuck on a slow train with the the PIS test on...
725602,@christinawilkie @chrislhayes @VP He looks sad...


# 2. N-grams

### 2.1 N-grams processing

In [15]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import keras

In [16]:
# Create a CountVectorizer with bigrams (2-grams)
vectorizer = CountVectorizer(ngram_range=(2, 2))  # (2, 2) means bigrams
vectorizer.fit(train_data['text'])

In [17]:
X_training = vectorizer.transform(X_train_data['text'])

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_training, y_train_data, test_size=0.2, random_state=42)

In [19]:
## check dimension is a good habit
print('X_train.shape: ', X_train.shape)
print('y_train.shape: ', y_train.shape)
print('X_test.shape: ', X_test.shape)
print('y_test.shape: ', y_test.shape)

X_train.shape:  (23187, 4613735)
y_train.shape:  (23187, 1)
X_test.shape:  (5797, 4613735)
y_test.shape:  (5797, 1)


In [20]:
ans_datav2 = vectorizer.transform(ans_data['text'])

In [21]:
ans_datav2.shape

(411972, 4613735)

### 2.2 Deal with categorical label(y)

In [22]:
import keras
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(y_train)
print('check label: ', label_encoder.classes_)
print('\n## Before convert')
print('y_train[0:4]:\n', y_train[0:4])
print('\ny_train.shape: ', y_train.shape)
print('y_test.shape: ', y_test.shape)

def label_encode(le, labels):
    enc = le.transform(labels)
    return keras.utils.to_categorical(enc)

def label_decode(le, one_hot_label):
    dec = np.argmax(one_hot_label, axis=1)
    return le.inverse_transform(dec)

y_train = label_encode(label_encoder, y_train)
y_test = label_encode(label_encoder, y_test)

print('\n\n## After convert')
print('y_train[0:4]:\n', y_train[0:4])
print('\ny_train.shape: ', y_train.shape)
print('y_test.shape: ', y_test.shape)

check label:  ['anger' 'anticipation' 'disgust' 'fear' 'joy' 'sadness' 'surprise'
 'trust']

## Before convert
y_train[0:4]:
         emotion
849538  sadness
415065     fear
67921     trust
655844  disgust

y_train.shape:  (23187, 1)
y_test.shape:  (5797, 1)


## After convert
y_train[0:4]:
 [[0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 0. 0.]]

y_train.shape:  (23187, 8)
y_test.shape:  (5797, 8)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


### 2.3 Build Model

In [23]:
# I/O check
input_shape = X_train.shape[1]
print('input_shape: ', input_shape)

output_shape = len(label_encoder.classes_)
print('output_shape: ', output_shape)

input_shape:  4613735
output_shape:  8


In [24]:
from keras.models import Model
from keras.layers import Input, Dense
from keras.layers import ReLU, Softmax

# input layer
model_input = Input(shape=(input_shape, ))  # 500
X = model_input

# 1st hidden layer
X_W1 = Dense(units=64)(X)  # 64
H1 = ReLU()(X_W1)

# 2nd hidden layer
H1_W2 = Dense(units=64)(H1)  # 64
H2 = ReLU()(H1_W2)

# output layer
H2_W3 = Dense(units=output_shape)(H2)  # 4
H3 = Softmax()(H2_W3)

model_output = H3

# create model
model = Model(inputs=[model_input], outputs=[model_output])

# loss function & optimizer
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# show model construction
model.summary()

### 2.4 Train

In [25]:
from keras.callbacks import CSVLogger

# csv_logger = CSVLogger('/content/drive/My Drive/NTHU/DM-Lab2-HW/logs/training_log.csv')

# training setting
epochs = 3
batch_size = 256

# training!
history = model.fit(X_train, y_train,
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_data = (X_test, y_test))
print('training finish')

Epoch 1/3




[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m495s[0m 5s/step - accuracy: 0.3542 - loss: 1.9412 - val_accuracy: 0.4112 - val_loss: 1.6109
Epoch 2/3
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m530s[0m 6s/step - accuracy: 0.7236 - loss: 0.9635 - val_accuracy: 0.4209 - val_loss: 1.6749
Epoch 3/3
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m418s[0m 5s/step - accuracy: 0.9835 - loss: 0.1516 - val_accuracy: 0.4128 - val_loss: 1.8439
training finish


### 2.5 Predict data

In [26]:
pred_X_test = model.predict(X_test, batch_size=128)
pred_X_test[:5]

[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 1s/step


array([[0.02916964, 0.14712252, 0.02127169, 0.01206742, 0.22055367,
        0.08412079, 0.05441986, 0.4312745 ],
       [0.02050239, 0.0661504 , 0.01479801, 0.0235328 , 0.5001234 ,
        0.14630736, 0.06212623, 0.16645949],
       [0.02980358, 0.1578653 , 0.05756778, 0.10653523, 0.47496772,
        0.05520085, 0.02512129, 0.0929382 ],
       [0.0178059 , 0.3974055 , 0.03648446, 0.0677234 , 0.4010341 ,
        0.01000329, 0.00804611, 0.06149718],
       [0.16079837, 0.07607036, 0.20134313, 0.11727511, 0.00678097,
        0.22817628, 0.13843468, 0.0711211 ]], dtype=float32)

In [27]:
pred_X_test = label_decode(label_encoder, pred_X_test)
pred_X_test[:5]

array(['trust', 'joy', 'joy', 'joy', 'sadness'], dtype=object)

In [28]:
from sklearn.metrics import accuracy_score
#Accuracy
print('testing accuracy: {}'.format(round(accuracy_score(label_decode(label_encoder, y_test), pred_X_test), 2)))

testing accuracy: 0.41


In [29]:
## predict
pred_result = model.predict(ans_datav2, batch_size=64)
pred_result[:5]

[1m6438/6438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3225s[0m 501ms/step


array([[0.04401606, 0.06802962, 0.05158221, 0.02924934, 0.13279283,
        0.20367159, 0.08333143, 0.38732696],
       [0.00373494, 0.12826854, 0.10991728, 0.0168424 , 0.02962064,
        0.68782663, 0.00499513, 0.0187944 ],
       [0.06433347, 0.04251293, 0.06363722, 0.19930093, 0.3722699 ,
        0.1189106 , 0.08884893, 0.050186  ],
       [0.00639918, 0.00881083, 0.1149656 , 0.03263991, 0.09331073,
        0.68348277, 0.0445736 , 0.01581739],
       [0.02366037, 0.11829582, 0.04859892, 0.05067644, 0.15400486,
        0.442912  , 0.04331945, 0.11853211]], dtype=float32)

In [31]:
pred_result2 = label_decode(label_encoder, pred_result)
pred_result2[:5]

array(['trust', 'sadness', 'joy', 'sadness', 'sadness'], dtype=object)

In [32]:
submission = pd.DataFrame({
    'id': test_data['tweet_id'],
    'emotion': pred_result2,
})

In [33]:
submission.to_csv('./submission.csv', index=False)