# 1. Load Data

In [1]:
import json
import pandas as pd
import numpy as np
import nltk

In [2]:
data = []
with open('./dm-2024-isa-5810-lab-2-homework/tweets_DM.json', 'r') as f:
    for line in f:
        try:
            data.append(json.loads(line))  # Safeguard against malformed JSON
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")

f.close()

In [3]:
emotion_list = pd.read_csv('./dm-2024-isa-5810-lab-2-homework/emotion.csv')
data_identification = pd.read_csv('./dm-2024-isa-5810-lab-2-homework/data_identification.csv')

In [4]:
df = pd.DataFrame(data)

# Extract '_source' and validate structure
if '_source' not in df.columns:
    raise KeyError("'_source' column not found in the data")

_source = df['_source'].apply(lambda x: x['tweet'])
df = pd.DataFrame({
    'tweet_id': _source.apply(lambda x: x['tweet_id']),
    'hashtags': _source.apply(lambda x: x['hashtags']),
    'text': _source.apply(lambda x: x['text']),
})

# Ensure tweet_id is of a consistent type
df['tweet_id'] = df['tweet_id'].astype(str)

# Validate and prepare `data_identification`
data_identification['tweet_id'] = data_identification['tweet_id'].astype(str)
df = df.merge(data_identification, on='tweet_id', how='left')

train_data = df[df['identification'] == 'train']
test_data = df[df['identification'] == 'test']

In [5]:
train_data = train_data.merge(emotion_list, on='tweet_id', how='left')
train_data.head()

Unnamed: 0,tweet_id,hashtags,text,identification,emotion
0,0x376b20,[Snapchat],"People who post ""add me on #Snapchat"" must be ...",train,anticipation
1,0x2d5350,"[freepress, TrumpLegacy, CNN]","@brianklaas As we see, Trump is dangerous to #...",train,sadness
2,0x1cd5b0,[],Now ISSA is stalking Tasha 😂😂😂 <LH>,train,fear
3,0x1d755c,"[authentic, LaughOutLoud]",@RISKshow @TheKevinAllison Thx for the BEST TI...,train,joy
4,0x2c91a8,[],Still waiting on those supplies Liscus. <LH>,train,anticipation


In [6]:
test_data.head()

Unnamed: 0,tweet_id,hashtags,text,identification
2,0x28b412,[bibleverse],"Confident of your obedience, I write to you, k...",test
4,0x2de201,[],"""Trust is not the same as faith. A friend is s...",test
9,0x218443,"[materialism, money, possessions]",When do you have enough ? When are you satisfi...,test
30,0x2939d5,"[GodsPlan, GodsWork]","God woke you up, now chase the day #GodsPlan #...",test
33,0x26289a,[],"In these tough times, who do YOU turn to as yo...",test


In [7]:
train_data.drop_duplicates(subset=['text'], keep=False, inplace=True)

In [8]:
# shuffle dataset
train_data = train_data.sample(frac=1)
test_data = test_data.sample(frac=1)

print("Shape of Training df: ", train_data.shape)
print("Shape of Testing df: ", test_data.shape)
train_data.head()

Shape of Training df:  (1449182, 5)
Shape of Testing df:  (411972, 4)


Unnamed: 0,tweet_id,hashtags,text,identification,emotion
520787,0x32f135,"[Attitude, GoodThoughts, BadThoughts, Thoughts...",Good Thoughts! #Attitude #GoodThoughts #BadTho...,train,joy
718084,0x2a0491,"[Pune, mumbai, Entrepreneurs, supercharged, hu...",<LH> weekend in #Pune and #mumbai. Interacted ...,train,joy
414772,0x22a84d,[newfollowers],"Welcome to all my new followers, you truly are...",train,joy
473271,0x254b8d,"[make, Brexit, work, British, Women]",Business is booming this October #make #Brexit...,train,anticipation
271245,0x1caae2,"[technolochica, Engineer]",@technolochicas I'm very proud of my #technolo...,train,joy


In [9]:
test_data.head()

Unnamed: 0,tweet_id,hashtags,text,identification
1681795,0x208006,[],"It just makes no f'n sense. ""So and so has be...",test
910628,0x37f3fa,[],Amazed at all the Lord did @gbc4me today! The...,test
1592760,0x27a55b,"[Loneliness, pain, solitude, alone, inspiration]",#Loneliness expresses the #pain of being alone...,test
1493159,0x355d6c,[],Was always confused what love was in my past b...,test
1767241,0x2e1432,[Jamaica55],Happy 55th year of Independence Jamaica. We st...,test


In [10]:
train_data_sample = train_data.sample(frac=0.002, random_state=42)

In [11]:
y_train_alter = train_data_sample['emotion']
y_train_data = pd.DataFrame(y_train_alter)
X_train_data = train_data_sample.drop(['tweet_id', 'emotion', 'identification', 'hashtags'], axis=1)
ans_data = test_data.drop(['tweet_id', 'identification', 'hashtags'], axis=1)

In [12]:
y_train_data.head()

Unnamed: 0,emotion
289016,joy
720916,trust
1192584,anticipation
1150400,sadness
1365383,joy


In [13]:
X_train_data.head()

Unnamed: 0,text
289016,#WednesdayWisdom The reason to love the opposi...
720916,@EllaHall617 @NKOTB @DonnieWahlberg @joeymcint...
1192584,im going to <LH> this #Sunday...not fa human <...
1150400,@heygregr Imagine all the weird/gross things w...
1365383,Proud of @livvypep first collegiate double dou...


In [14]:
ans_data.head()

Unnamed: 0,text
1681795,"It just makes no f'n sense. ""So and so has be..."
910628,Amazed at all the Lord did @gbc4me today! The...
1592760,#Loneliness expresses the #pain of being alone...
1493159,Was always confused what love was in my past b...
1767241,Happy 55th year of Independence Jamaica. We st...


# 2. BERT

### 2.1 Use BERT to analyze

In [15]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [16]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

In [17]:
X_train_data_list = X_train_data['text'].tolist()
ans_data_list = ans_data['text'].tolist()

In [18]:
# Tokenize the text
X_train_input = tokenizer(X_train_data_list, padding = True, truncation = True,
                   max_length=256, return_tensors='pt')
ans_data_input = tokenizer(ans_data_list, padding=True, truncation=True,
                    return_tensors='pt')


In [19]:
# extract BERT embeddings
with torch.no_grad():
    outputs = bert_model(**X_train_input)
    # Use hidden states of the last layer for the embeddings
    embeddings = outputs.last_hidden_state.mean(dim=1)

In [20]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(embeddings.numpy(), y_train_data, test_size=0.2, random_state=42)

In [31]:
# train a classifier
classifier = LogisticRegression(max_iter=10000)
classifier.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [32]:
# make predictions
y_pred = classifier.predict(X_test)

In [33]:
from sklearn.metrics import accuracy_score
#Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 38.28%


In [36]:
X_test

array([[ 0.16674782, -0.03980552,  0.19096605, ..., -0.2657351 ,
        -0.06241189,  0.12217647],
       [ 0.19033211,  0.36307645,  0.59384793, ..., -0.08489142,
         0.17117691,  0.00184707],
       [-0.03495505,  0.21789704,  0.25160277, ...,  0.15873216,
        -0.0398658 , -0.03463737],
       ...,
       [ 0.15700385, -0.06212171,  0.47918218, ..., -0.04246101,
         0.12365133,  0.08318207],
       [ 0.17240483, -0.10317243,  0.55691457, ..., -0.352835  ,
         0.02218355,  0.00732354],
       [ 0.14138871,  0.21656565,  0.49834517, ..., -0.3886343 ,
        -0.04773482,  0.08011241]], dtype=float32)

In [39]:
# extract BERT embeddings
with torch.no_grad():
    outputs = bert_model(**ans_data_input)
    # Use hidden states of the last layer for the embeddings
    embeddings = outputs.last_hidden_state.mean(dim=1)

RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 188571119616 bytes.

In [None]:
## predict
pred_result = classifier.predict(ans_data_input)
pred_result[:5]

ValueError: Expected 2D array, got 1D array instead:
array=['input_ids' 'token_type_ids' 'attention_mask'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
submission = pd.DataFrame({
    'id': test_data['tweet_id'],
    'emotion': pred_result,
})

In [None]:
submission.to_csv('./submission.csv', index=False)