# 1. Load Data

In [1]:
import json
import pandas as pd
import numpy as np
import nltk

In [2]:
data = []
with open('./dm-2024-isa-5810-lab-2-homework/tweets_DM.json', 'r') as f:
    for line in f:
        try:
            data.append(json.loads(line))  # Safeguard against malformed JSON
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")

f.close()

In [3]:
emotion_list = pd.read_csv('./dm-2024-isa-5810-lab-2-homework/emotion.csv')
data_identification = pd.read_csv('./dm-2024-isa-5810-lab-2-homework/data_identification.csv')

In [4]:
df = pd.DataFrame(data)

# Extract '_source' and validate structure
if '_source' not in df.columns:
    raise KeyError("'_source' column not found in the data")

_source = df['_source'].apply(lambda x: x['tweet'])
df = pd.DataFrame({
    'tweet_id': _source.apply(lambda x: x['tweet_id']),
    'hashtags': _source.apply(lambda x: x['hashtags']),
    'text': _source.apply(lambda x: x['text']),
})

# Ensure tweet_id is of a consistent type
df['tweet_id'] = df['tweet_id'].astype(str)

# Validate and prepare `data_identification`
data_identification['tweet_id'] = data_identification['tweet_id'].astype(str)
df = df.merge(data_identification, on='tweet_id', how='left')

train_data = df[df['identification'] == 'train']
test_data = df[df['identification'] == 'test']

In [5]:
train_data = train_data.merge(emotion_list, on='tweet_id', how='left')
train_data.head()

Unnamed: 0,tweet_id,hashtags,text,identification,emotion
0,0x376b20,[Snapchat],"People who post ""add me on #Snapchat"" must be ...",train,anticipation
1,0x2d5350,"[freepress, TrumpLegacy, CNN]","@brianklaas As we see, Trump is dangerous to #...",train,sadness
2,0x1cd5b0,[],Now ISSA is stalking Tasha 😂😂😂 <LH>,train,fear
3,0x1d755c,"[authentic, LaughOutLoud]",@RISKshow @TheKevinAllison Thx for the BEST TI...,train,joy
4,0x2c91a8,[],Still waiting on those supplies Liscus. <LH>,train,anticipation


In [6]:
test_data.head()

Unnamed: 0,tweet_id,hashtags,text,identification
2,0x28b412,[bibleverse],"Confident of your obedience, I write to you, k...",test
4,0x2de201,[],"""Trust is not the same as faith. A friend is s...",test
9,0x218443,"[materialism, money, possessions]",When do you have enough ? When are you satisfi...,test
30,0x2939d5,"[GodsPlan, GodsWork]","God woke you up, now chase the day #GodsPlan #...",test
33,0x26289a,[],"In these tough times, who do YOU turn to as yo...",test


In [7]:
train_data.drop_duplicates(subset=['text'], keep=False, inplace=True)

In [8]:
# shuffle dataset
train_data = train_data.sample(frac=1)
test_data = test_data.sample(frac=1)

print("Shape of Training df: ", train_data.shape)
print("Shape of Testing df: ", test_data.shape)
train_data.head()

Shape of Training df:  (1449182, 5)
Shape of Testing df:  (411972, 4)


Unnamed: 0,tweet_id,hashtags,text,identification,emotion
657314,0x226087,"[COYG, Firedup]",@Arsenal we can still win the league if we do ...,train,anticipation
634561,0x27cfd8,[],Making cheesecake numero 1938473928.2. Let's s...,train,sadness
450597,0x2a865f,[EVEARS],ozil have to cover for ramsey <LH> #EVEARS,train,disgust
867170,0x2c47d6,[],People lied to gain trust <LH> 🤷🏻‍♀️ Then wh...,train,sadness
1199107,0x27dde2,[coffee],Sunrise brings morning thoughts of rebirth and...,train,joy


In [9]:
test_data.head()

Unnamed: 0,tweet_id,hashtags,text,identification
550384,0x2e0d37,[],I have ten toes. I haven’t checked in a while ...,test
1742014,0x1f630e,[flyingmice],@carey_naomi 😉...can cope with a video. It's w...,test
1096779,0x2e065d,[],@DannaMulder It’s bad when u can’t tell if som...,test
899350,0x21da8a,[YouAreYourOwnSOULution],"Sometimes, love finds its SOUL in feelings of ...",test
780621,0x2333bd,[lovewins],Change doesn’t happen because of ever increasi...,test


In [10]:
train_data_sample = train_data.sample(frac=0.002, random_state=42)

In [11]:
y_train_alter = train_data_sample['emotion']
y_train_data = pd.DataFrame(y_train_alter)
X_train_data = train_data_sample.drop(['tweet_id', 'emotion', 'identification', 'hashtags'], axis=1)
ans_data = test_data.drop(['tweet_id', 'identification', 'hashtags'], axis=1)

In [12]:
y_train_data.head()

Unnamed: 0,emotion
233693,fear
884475,joy
1004845,sadness
32742,disgust
1369258,joy


In [13]:
X_train_data.head()

Unnamed: 0,text
233693,S give #oviya u rock..don't call me Julie.. se...
884475,One more day. Oonnee mmooreee ddaayy!!!! <LH> ...
1004845,The moment when you are at your masters gradua...
32742,"If I was a #turkey, I'd be doing everything I ..."
1369258,talked to @_jakegardner on the phone for 2 hou...


In [14]:
ans_data.head()

Unnamed: 0,text
550384,I have ten toes. I haven’t checked in a while ...
1742014,@carey_naomi 😉...can cope with a video. It's w...
1096779,@DannaMulder It’s bad when u can’t tell if som...
899350,"Sometimes, love finds its SOUL in feelings of ..."
780621,Change doesn’t happen because of ever increasi...


# 2. BERT

### 2.0 Split Train-Test

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_train_data, y_train_data, test_size=0.2, random_state=42)

### 2.1 Import necessary libraries

In [16]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, classification_report

### 2.2 Preprocess the data

In [17]:
def preprocess_data(df):
    # Convert text to lowercase (or apply other preprocessing steps as needed)
    df['text'] = df['text'].str.lower()
    return df

In [None]:
# Preprocess the datasets
X_train = preprocess_data(X_train)
X_test = preprocess_data(X_test)

### 2.3 Tokenize BERT

In [19]:
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [20]:
# Tokenize the text data for BERT input
def encode_data(df, tokenizer, max_length=128):
    return tokenizer(list(df['text']), padding=True, truncation=True, max_length=max_length, return_tensors='pt')

In [None]:
X_train_encodings = encode_data(X_train, tokenizer)
X_test_encodings = encode_data(X_test, tokenizer)

### 2.4 Create labels

In [22]:
from sklearn.preprocessing import LabelEncoder
# Step 1: Initialize the LabelEncoder
label_encoder = LabelEncoder()

In [23]:
# Step 2: Fit the label encoder on the training emotions and transform both train and test labels
y_train['encoded'] = label_encoder.fit_transform(y_train['emotion'])
y_test['encoded'] = label_encoder.transform(y_test['emotion'])


In [24]:
# Convert the labels into tensors
train_labels = torch.tensor(y_train['encoded'].values)
test_labels = torch.tensor(y_test['encoded'].values)

### 2.5 Create Dataloader

In [25]:
# Create DataLoader for training and testing
train_dataset = TensorDataset(X_train_encodings.input_ids, X_train_encodings.attention_mask, train_labels)
test_dataset = TensorDataset(X_test_encodings.input_ids, X_test_encodings.attention_mask, test_labels)

In [26]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

### 2.6 BERT model

In [27]:
# Load the pretrained BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(y_train['emotion'].unique()))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)



### 2.7 Train!

In [29]:
# Function to train the model
def train_model(model, train_loader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids, attention_mask, labels = [item.to(device) for item in batch]
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

In [30]:
# Function to evaluate the model
def evaluate_model(model, test_loader, device):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = [item.to(device) for item in batch]
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    return all_labels, all_preds

In [None]:
# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(3):  # Train for 3 epochs (can be adjusted)
    print(f"Epoch {epoch + 1}")
    
    model.train()  # Set the model to training mode
    total_loss = 0
    
    for batch in train_loader:
        # Move batch to device
        input_ids, attention_mask, labels = [item.to(device) for item in batch]
        
        # Ensure labels are of type Long
        labels = labels.long()  # Convert labels to torch.long
        
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        
        # Loss
        loss = outputs.loss
        total_loss += loss.item()
        
        # Backward pass
        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Training loss: {avg_train_loss}")

Epoch 1
Training loss: 1.809693743442667
Epoch 2
Training loss: 1.6570002407863222
Epoch 3
Training loss: 1.479933487546855


In [32]:
# Evaluate the model on the test dataset
y_test, y_pred = evaluate_model(model, test_loader, device)

### 2.8 Print result

In [33]:
# Print the accuracy and classification report
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.4224137931034483

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        14
           1       0.60      0.28      0.38        86
           2       0.17      0.02      0.03        55
           3       0.00      0.00      0.00        35
           4       0.49      0.79      0.61       205
           5       0.28      0.65      0.39        89
           6       0.00      0.00      0.00        16
           7       0.50      0.01      0.02        80

    accuracy                           0.42       580
   macro avg       0.26      0.22      0.18       580
weighted avg       0.39      0.42      0.34       580



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### 2.9 Predict answer on Test

In [38]:
ans_data = preprocess_data(ans_data)

In [39]:
ans_encodings = encode_data(ans_data, tokenizer)

In [40]:
# Convert the tokenized text into a DataLoader
new_dataset = TensorDataset(ans_encodings.input_ids, ans_encodings.attention_mask)
new_loader = DataLoader(new_dataset, batch_size=16)

In [41]:
# Function to make predictions on new data
def predict_emotions(model, new_loader, device):
    model.eval()
    all_preds = []
    with torch.no_grad():
        for batch in new_loader:
            input_ids, attention_mask = [item.to(device) for item in batch]
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
    return all_preds

In [42]:
# Make predictions on the new dataset
pred_result = predict_emotions(model, new_loader, device)

KeyboardInterrupt: 

In [None]:
# It runs for too long, so I killed it.
# According to ChatGPT, it will take around hours or days to do it.

In [None]:
submission = pd.DataFrame({
    'id': test_data['tweet_id'],
    'emotion': pred_result,
})

In [None]:
submission.to_csv('./submission.csv', index=False)