In [1]:
import pandas as pd
import json

# Load the split assignment and the emotion labels; both are joined on `tweet_id` below.
identification = pd.read_csv("data/data_identification.csv")
emotion = pd.read_csv("data/emotion.csv")

# The tweet dump is read line by line, one JSON document per line.
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# default locale encoding.
records = []
with open('data/tweets_DM.json', "r", encoding="utf-8") as f:
    for line in f:
        try:
            record = json.loads(line.strip())
            records.append(record)
        except json.JSONDecodeError:
            # Best effort: report the malformed line and keep going.
            print(f"Error decoding line: {line}")
            continue

# Keep only the id and text of each tweet; entries where either is empty are skipped.
extracted_data = []
for item in records:
    tweet_obj = item['_source']['tweet']
    tweet_id = tweet_obj['tweet_id']
    text = tweet_obj['text']
    if tweet_id and text:
        extracted_data.append({"tweet_id": tweet_id, "text": text})
# Explicit columns keep `tweet_id` present even when nothing was extracted,
# so the merge below cannot fail on a missing key.
tweet = pd.DataFrame(extracted_data, columns=["tweet_id", "text"])

# Outer joins keep every tweet_id seen in any of the three sources.
data = pd.merge(identification, emotion, on="tweet_id", how="outer")
data = pd.merge(data, tweet, on="tweet_id", how="outer")

In [4]:
from sklearn.model_selection import train_test_split

# Rows flagged as "train", renumbered from zero; every row whose text occurs
# more than once is then removed (keep=False drops all copies).
ori_train_df = (
    data.loc[data["identification"] == "train"]
    .reset_index(drop=True)
    .drop_duplicates(subset=['text'], keep=False)
)
# Sample 50% for training
ori_train_df_sample = ori_train_df.sample(frac=0.5, random_state=42)

## TF-IDF

In [None]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

# Hold out 20% of the sampled training rows for validation; the test rows come
# straight from the merged frame.
train_df, val_df = train_test_split(ori_train_df_sample, test_size=0.2, random_state=42)
test_df = data.loc[data["identification"] == "test"].reset_index(drop=True)

# The vectorizer is fitted on the training split only, then reused unchanged
# for the validation and test splits.
TFIDF_1000 = TfidfVectorizer(max_features=1000, tokenizer=nltk.word_tokenize).fit(train_df['text'])

X_train, y_train = TFIDF_1000.transform(train_df['text']), train_df['emotion']
X_val, y_val = TFIDF_1000.transform(val_df['text']), val_df['emotion']
X_test = TFIDF_1000.transform(test_df['text'])




In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random Forest baseline on the TF-IDF features.
# The forest is seeded (same seed as the data splits in this notebook) so the
# accuracies printed below can be reproduced on a re-run.
RF_model = RandomForestClassifier(random_state=42)
RF_model = RF_model.fit(X_train, y_train)

# Score the fitted model on the data it was trained on and on the held-out
# validation split; a large gap between the two indicates overfitting.
y_train_pred = RF_model.predict(X_train)
y_val_pred = RF_model.predict(X_val)
acc_train = accuracy_score(y_true=y_train, y_pred=y_train_pred)
acc_val = accuracy_score(y_true=y_val, y_pred=y_val_pred)
print('Training accuracy: {:.2f}'.format(acc_train))
print('Validation accuracy: {:.2f}'.format(acc_val))

Training accuracy: 0.99
Validation accuracy: 0.51


In [None]:
# Disabled cell: Multinomial Naive Bayes baseline on the same TF-IDF features.
# NOTE: the score it labels "Testing Accuracy" is computed on the validation
# split (y_val / X_val), not on the test set.
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.metrics import accuracy_score

# ## build model
# NB_model = MultinomialNB()
# ## training!
# NB_model.fit(X_train, y_train)

# ## predict!
# y_train_pred_NB = NB_model.predict(X_train)
# y_val_pred_NB = NB_model.predict(X_val)

# ## accuracy
# acc_train_NB = accuracy_score(y_true=y_train, y_pred=y_train_pred_NB)
# acc_test_NB = accuracy_score(y_true=y_val, y_pred=y_val_pred_NB)
# print('Training Accuracy: {}'.format(round(acc_train_NB, 2)))
# print('Testing Accuracy: {}'.format(round(acc_test_NB, 2)))

Training Accuracy: 0.46
Testing Accuracy: 0.46


In [13]:
# NOTE: despite the `_NB` suffix, these predictions come from the Random Forest model.
y_test_pred_NB = RF_model.predict(X_test)
test_df['emotion'] = y_test_pred_NB

# Write the submission file: columns `id` and `emotion`, without the index.
save_df = test_df.loc[:, ['tweet_id', 'emotion']].rename(columns=dict(tweet_id='id'))
save_df.to_csv("data/submission.csv", index=False)

## BERT

In [7]:
import pandas as pd
import torch
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 80/20 train/validation split over the full de-duplicated training frame
# (texts and emotion labels are split together so they stay aligned).
train_texts, val_texts, train_labels, val_labels = train_test_split(
    ori_train_df['text'],
    ori_train_df['emotion'],
    test_size=0.2,
    random_state=42,
)

# Tokenizer matching the pretrained checkpoint that is fine-tuned later.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

class EmotionDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: mapping from field name to an indexable collection; each
            value is indexed with the sample position.
        labels: indexable, sized collection of labels, one per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The number of samples is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample: every encoding field at `idx` plus a `labels` tensor."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

def tokenize_function(texts):
    """Tokenize an iterable of texts with the module-level `tokenizer`.

    The texts are materialised into a list and encoded in one call with
    padding and truncation enabled, a maximum length of 128, and PyTorch
    tensors as the return type.
    """
    encode_options = dict(padding=True, truncation=True, max_length=128, return_tensors="pt")
    return tokenizer(list(texts), **encode_options)

# Encode emotion names as integer ids: the encoder is fitted on the training
# labels and the same mapping is then applied to the validation labels.
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
val_labels = label_encoder.transform(val_labels)

# Tokenize each split in a single call.
train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)

# Pair encodings with their integer labels for the DataLoaders.
train_dataset = EmotionDataset(train_encodings, train_labels)
val_dataset = EmotionDataset(val_encodings, val_labels)

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
from transformers import BertForSequenceClassification, AdamW, get_scheduler
from torch.utils.data import DataLoader
from tqdm import tqdm


# Store the class names in the model config so that the checkpoint saved at the
# end of this cell carries its own id <-> emotion mapping instead of anonymous ids.
id2label = {idx: str(name) for idx, name in enumerate(label_encoder.classes_)}
label2id = {name: idx for idx, name in id2label.items()}
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_encoder.classes_),
    id2label=id2label,
    label2id=label2id,
)

# Only the training batches are shuffled; validation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# NOTE(review): `transformers.AdamW` is reported as deprecated in newer
# transformers releases in favour of `torch.optim.AdamW` - confirm against the
# installed version before upgrading.
optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 3
# The linear schedule decays the learning rate over every optimizer step of the run.
num_training_steps = len(train_loader) * epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

for epoch in range(epochs):
    # --- training pass ---
    model.train()
    # The description is constant for the whole epoch, so it is set once here
    # rather than on every batch.
    loop = tqdm(train_loader, leave=True, desc=f"Epoch {epoch}")
    for batch in loop:
        batch = {k: v.to(device) for k, v in batch.items()}

        # The batch contains "labels", so the model returns the loss directly.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        loop.set_postfix(loss=loss.item())

    # --- validation pass: accuracy over the whole validation loader ---
    model.eval()
    total, correct = 0, 0
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            preds = torch.argmax(outputs.logits, dim=1)
            correct += (preds == batch["labels"]).sum().item()
            total += batch["labels"].size(0)

    print(f"Validation Accuracy: {correct / total:.4f}")

# Save the weights as they are after the final epoch, together with the tokenizer.
model.save_pretrained("emotion_classifier_model")
tokenizer.save_pretrained("emotion_classifier_model")

2024-12-02 10:52:15.629657: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-02 10:52:16.359906: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733136736.531872 3700308 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733136736.586655 3700308 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-02 10:52:17.159694: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Validation Accuracy: 0.6427


Epoch 1: 100%|██████████| 72460/72460 [1:57:30<00:00, 10.28it/s, loss=0.00107]


Validation Accuracy: 0.6631


Epoch 2: 100%|██████████| 72460/72460 [1:56:35<00:00, 10.36it/s, loss=0.000839]


Validation Accuracy: 0.6672


('emotion_classifier_model/tokenizer_config.json',
 'emotion_classifier_model/special_tokens_map.json',
 'emotion_classifier_model/vocab.txt',
 'emotion_classifier_model/added_tokens.json')

In [11]:
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
import torch
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

# Fresh copy of the test rows from the merged frame.
test_df = data.loc[data["identification"] == "test"].reset_index(drop=True)

# Reload the fine-tuned model and its tokenizer from the saved directory.
model = BertForSequenceClassification.from_pretrained("emotion_classifier_model")
tokenizer = BertTokenizer.from_pretrained("emotion_classifier_model")

# Run on the GPU when one is available; switch to evaluation mode for inference.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

def tokenize_function(texts):
    """Tokenize an iterable of texts with the module-level `tokenizer`.

    The texts are materialised into a list and encoded in one call with
    padding and truncation enabled, a maximum length of 128, and PyTorch
    tensors as the return type.
    """
    encode_options = dict(padding=True, truncation=True, max_length=128, return_tensors="pt")
    return tokenizer(list(texts), **encode_options)

# Tokenize the whole test split, then batch it without shuffling so that the
# predictions stay aligned with the rows of test_df.
test_encodings = tokenize_function(test_df['text'])
test_dataset = torch.utils.data.TensorDataset(
    test_encodings["input_ids"], test_encodings["attention_mask"]
)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)

# Collect the arg-max class id for every test row.
predictions = []
with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids, attention_mask = [b.to(device) for b in batch]
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        predictions.extend(preds.cpu().tolist())

# Rebuild the id -> emotion mapping from the frame the BERT labels were drawn
# from (ori_train_df), not from the TF-IDF cell's sampled `train_df`: this cell
# should not depend on an unrelated cell, and a subsample could miss a class.
label_encoder = LabelEncoder()
label_encoder.fit(ori_train_df['emotion'])
# Fail loudly instead of silently decoding ids to the wrong emotion names.
if len(label_encoder.classes_) != model.config.num_labels:
    raise ValueError(
        f"Label set has {len(label_encoder.classes_)} classes but the model "
        f"predicts {model.config.num_labels}; cannot decode predictions safely."
    )
test_df['emotion'] = label_encoder.inverse_transform(predictions)

# Write the submission file: columns `id` and `emotion`, without the index.
save_df = test_df[['tweet_id', 'emotion']].rename(columns={'tweet_id': 'id'})
save_df.to_csv("data/submission.csv", index=False)

100%|██████████| 25749/25749 [27:52<00:00, 15.39it/s]
