In [None]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np

In [None]:
# Load datasets from text files
def load_data(file_path):
    """Read a ``text;label`` file into a DataFrame.

    Every line is stripped and split on its LAST semicolon: everything before
    it is the tweet, everything after it is the label. Splitting on the last
    separator keeps tweets that themselves contain ';' instead of silently
    dropping them. Lines without any semicolon (including blank lines) are
    skipped.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        pandas.DataFrame with the columns "tweet" and "label", one row per
        parsed line, in file order.
    """
    texts, labels = [], []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # maxsplit=1 from the right: only the final ';' separates text from label.
            parts = line.strip().rsplit(";", 1)
            if len(parts) == 2:
                texts.append(parts[0])
                labels.append(parts[1])
    return pd.DataFrame({"tweet": texts, "label": labels})

In [None]:
# Read the train / validation / test splits, in that order.
train_df, val_df, test_df = map(
    load_data,
    (
        "/kaggle/input/bertdataset/train.txt",
        "/kaggle/input/bertdataset/val.txt",
        "/kaggle/input/bertdataset/test.txt",
    ),
)

In [None]:
# Encode labels
# The raw string labels are preserved in an "emotion" column so this cell is
# idempotent: re-running it always fits the encoder on the emotion names, never
# on the already-encoded integers (which would make
# label_encoder.inverse_transform return numbers instead of names).
label_encoder = LabelEncoder()
for _split_df in (train_df, val_df, test_df):
    if "emotion" not in _split_df.columns:
        _split_df["emotion"] = _split_df["label"]

# Fit on the training split only; validation/test reuse the same mapping.
label_encoder.fit(train_df["emotion"])
for _split_df in (train_df, val_df, test_df):
    _split_df["label"] = label_encoder.transform(_split_df["emotion"])

In [None]:
# Load BERT tokenizer
# Uses the pretrained "bert-base-uncased" checkpoint, the same checkpoint name
# the classification model in this notebook is loaded from.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Tokenization function
def tokenize_data(texts, labels, tokenizer, max_len=128):
    """Tokenize a collection of texts and convert their labels to a tensor.

    Args:
        texts: Iterable of strings; materialised into a list before tokenizing.
        labels: Label values passed to ``tf.convert_to_tensor``.
        tokenizer: Callable tokenizer returning a mapping that contains
            "input_ids" and "attention_mask".
        max_len: Maximum sequence length; longer inputs are truncated.

    Returns:
        Tuple ``(input_ids, attention_mask, labels_tensor)``.
    """
    batch = tokenizer(
        list(texts),
        max_length=max_len,
        truncation=True,
        padding=True,
        return_tensors="tf",
    )
    label_tensor = tf.convert_to_tensor(labels)
    return batch["input_ids"], batch["attention_mask"], label_tensor

In [None]:
# Tokenize each split (tweet text -> ids/mask, encoded label -> tensor)
train_inputs, train_masks, train_labels = tokenize_data(
    texts=train_df["tweet"],
    labels=train_df["label"],
    tokenizer=tokenizer,
)
val_inputs, val_masks, val_labels = tokenize_data(
    texts=val_df["tweet"],
    labels=val_df["label"],
    tokenizer=tokenizer,
)
test_inputs, test_masks, test_labels = tokenize_data(
    texts=test_df["tweet"],
    labels=test_df["label"],
    tokenizer=tokenizer,
)

In [None]:
# Wrap every tokenized split in a batched tf.data pipeline.
# Only the training split is shuffled; its buffer spans the whole training set.
batch_size = 16
train_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (dict(input_ids=train_inputs, attention_mask=train_masks), train_labels)
    )
    .shuffle(len(train_df))
    .batch(batch_size)
)
val_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (dict(input_ids=val_inputs, attention_mask=val_masks), val_labels)
    )
    .batch(batch_size)
)
test_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (dict(input_ids=test_inputs, attention_mask=test_masks), test_labels)
    )
    .batch(batch_size)
)

In [None]:
# Build the classifier from the pretrained checkpoint, sized to one output
# per class known to the fitted label encoder.
num_labels = len(label_encoder.classes_)
model = TFBertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Configure training: sparse cross-entropy computed on logits
# (from_logits=True), Adam with a small learning rate, accuracy as the metric.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(epsilon=1e-8, learning_rate=2e-5)
model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=["accuracy"],
)

In [None]:
# Train the model
# A fixed 10-epoch run ignores the validation metrics computed every epoch and
# leaves the last-epoch weights in the model. Early stopping ends training once
# val_loss stops improving and restores the best weights seen, so the model
# evaluated afterwards is the best one rather than merely the final one.
epochs = 10  # upper bound; early stopping may end training sooner
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=2,
    restore_best_weights=True,
)
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=epochs,
    callbacks=[early_stopping],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Score the trained model on the held-out test split.
# The result is unpacked as (loss, accuracy), matching the single "accuracy"
# metric the model was compiled with.
test_loss, test_acc = model.evaluate(x=test_dataset)
print(f"Test Accuracy: {test_acc:.4f}")

Test Accuracy: 0.9300


In [None]:
# Function to predict emotion
def predict_emotion(model, tokenizer, label_encoder):
    """Interactively classify sentences typed by the user.

    Prompts in a loop, prints the predicted emotion for each sentence, and
    stops when the user types 'exit' (case-insensitive, surrounding whitespace
    ignored) or when stdin is exhausted. Blank input is ignored rather than
    being sent to the model.

    Args:
        model: Callable taking ``input_ids`` and an ``attention_mask`` keyword
            and returning an object whose ``.logits`` exposes ``.numpy()``.
        tokenizer: Callable returning a mapping with "input_ids" and
            "attention_mask".
        label_encoder: Object whose ``inverse_transform`` maps class indices
            back to label names.

    Returns:
        None. Predictions are printed.
    """
    while True:
        try:
            text = input("Enter a sentence (or type 'exit' to stop): ").strip()
        except EOFError:
            # stdin closed / no more input: leave the loop instead of crashing.
            break
        if text.lower() == 'exit':
            break
        if not text:
            # Nothing to classify; prompt again.
            continue
        tokens = tokenizer(text, truncation=True, padding=True, max_length=128, return_tensors="tf")
        input_ids, attention_mask = tokens["input_ids"], tokens["attention_mask"]
        logits = model(input_ids, attention_mask=attention_mask).logits
        # Single sentence -> one row of logits; take the index of its largest score.
        prediction = int(np.argmax(logits.numpy(), axis=1)[0])
        print(f"Predicted Emotion: {label_encoder.inverse_transform([prediction])[0]}")

In [None]:
# Launch the interactive prompt; it runs until the user types 'exit'.
predict_emotion(
    model=model,
    tokenizer=tokenizer,
    label_encoder=label_encoder,
)

Enter a sentence (or type 'exit' to stop):  I can't stop crying, everything feels so hopeless.


Predicted Emotion: sadness


Enter a sentence (or type 'exit' to stop):  I am so nervous about my exam results tomorrow.


Predicted Emotion: fear


Enter a sentence (or type 'exit' to stop):  I just received the best news of my life, I am thrilled!


Predicted Emotion: joy


Enter a sentence (or type 'exit' to stop):  I feel so loved and appreciated by my friends and family


Predicted Emotion: love


Enter a sentence (or type 'exit' to stop):  How could they betray me like this? I am so mad!


Predicted Emotion: anger


Enter a sentence (or type 'exit' to stop):  I feel so irritated when people don't listen to me


Predicted Emotion: anger


Enter a sentence (or type 'exit' to stop):  She planned a surprise party for my birthday, and I was shocked!


Predicted Emotion: surprise


Enter a sentence (or type 'exit' to stop):  exit
