In [62]:
# Import libraries
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import classification_report, accuracy_score
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')



[nltk_data] Downloading package punkt to /Users/chinnu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [64]:
files = [
    "/Users/chinnu/Desktop/project/Gopi/2017_English_final/2017_English_final/GOLD/Subtask_A/twitter-2013dev-A.txt",
    "/Users/chinnu/Desktop/project/Gopi/2017_English_final/2017_English_final/GOLD/Subtask_A/twitter-2013test-A.txt",
    "/Users/chinnu/Desktop/project/Gopi/2017_English_final/2017_English_final/GOLD/Subtask_A/twitter-2013train-A.txt",
    "/Users/chinnu/Desktop/project/Gopi/2017_English_final/2017_English_final/GOLD/Subtask_A/twitter-2014sarcasm-A.txt",
    "/Users/chinnu/Desktop/project/Gopi/2017_English_final/2017_English_final/GOLD/Subtask_A/twitter-2014test-A.txt",
    "/Users/chinnu/Desktop/project/Gopi/2017_English_final/2017_English_final/GOLD/Subtask_A/twitter-2015test-A.txt",
    "/Users/chinnu/Desktop/project/Gopi/2017_English_final/2017_English_final/GOLD/Subtask_A/twitter-2015train-A.txt",
    "/Users/chinnu/Desktop/project/Gopi/2017_English_final/2017_English_final/GOLD/Subtask_A/twitter-2016dev-A.txt",
    "/Users/chinnu/Desktop/project/Gopi/2017_English_final/2017_English_final/GOLD/Subtask_A/twitter-2016devtest-A.txt",
    "/Users/chinnu/Desktop/project/Gopi/2017_English_final/2017_English_final/GOLD/Subtask_A/twitter-2016test-A.txt",
    "/Users/chinnu/Desktop/project/Gopi/2017_English_final/2017_English_final/GOLD/Subtask_A/twitter-2016train-A.txt",
]

# Load and combine data safely
data = []
for file in files:
    with open(file, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) == 3 and parts[1].lower() in ["positive", "neutral", "negative"]:
                data.append(parts)

# Convert to DataFrame
df = pd.DataFrame(data, columns=["id", "label", "tweet"])


In [66]:
# Load and preprocess your dataset

df = df.dropna()
df = df[df['label'].isin(['positive', 'neutral', 'negative'])]  # sanity check


In [18]:
# Encode labels
label_encoder = LabelEncoder()
df["label_enc"] = label_encoder.fit_transform(df["label"])

# Split data
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["tweet"].tolist(), df["label_enc"].tolist(), test_size=0.2, random_state=42
)

# Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)


In [24]:
# Custom PyTorch Dataset
class TweetDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.encodings["input_ids"][idx]),
            "attention_mask": torch.tensor(self.encodings["attention_mask"][idx]),
            "labels": torch.tensor(self.labels[idx]),
        }

train_dataset = TweetDataset(train_encodings, train_labels)
test_dataset = TweetDataset(test_encodings, test_labels)


In [26]:
# Load BERT model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
# Evaluation metric function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    print("\n" + classification_report(labels, preds, target_names=label_encoder.classes_))
    return {"accuracy": accuracy_score(labels, preds)}

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
    eval_strategy="epoch",             
    save_strategy="epoch",             
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True
)


In [38]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [40]:
# Train and Evaluate
trainer.train()
trainer.evaluate()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.5222,0.585284,0.739593



              precision    recall  f1-score   support

    negative       0.68      0.66      0.67      1549
     neutral       0.74      0.72      0.73      4531
    positive       0.77      0.79      0.78      3985

    accuracy                           0.74     10065
   macro avg       0.73      0.72      0.72     10065
weighted avg       0.74      0.74      0.74     10065




              precision    recall  f1-score   support

    negative       0.68      0.66      0.67      1549
     neutral       0.74      0.72      0.73      4531
    positive       0.77      0.79      0.78      3985

    accuracy                           0.74     10065
   macro avg       0.73      0.72      0.72     10065
weighted avg       0.74      0.74      0.74     10065



{'eval_loss': 0.5852842330932617,
 'eval_accuracy': 0.7395926477893691,
 'eval_runtime': 179.0494,
 'eval_samples_per_second': 56.214,
 'eval_steps_per_second': 3.519,
 'epoch': 1.0}

In [86]:
#  Label Encoding
label_encoder = LabelEncoder()
df["label_enc"] = label_encoder.fit_transform(df["label"])

#  Basic whitespace tokenization
df["tokens"] = df["tweet"].apply(lambda x: x.lower().split())

all_tokens = [token for tokens in df["tokens"] for token in tokens]
vocab_counter = Counter(all_tokens)
vocab = {word: i + 2 for i, (word, freq) in enumerate(vocab_counter.items()) if freq >= 2}
vocab["<PAD>"] = 0
vocab["<UNK>"] = 1
vocab_size = max(vocab.values()) + 1  # Final vocab size for Embedding layer


In [88]:
#  Encode function
def encode(tokens, vocab, max_len=32):
    encoded = [vocab.get(token, vocab["<UNK>"]) for token in tokens]
    return encoded[:max_len] + [vocab["<PAD>"]] * (max_len - len(encoded))

df["input_ids"] = df["tokens"].apply(lambda x: encode(x, vocab))

#  Dataset
class TweetFastTextDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = torch.tensor(inputs, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.inputs[idx], self.labels[idx]


In [90]:

X_train, X_test, y_train, y_test = train_test_split(
    df["input_ids"].tolist(), df["label_enc"].tolist(), test_size=0.2, random_state=42
)

train_dataset = TweetFastTextDataset(X_train, y_train)
test_dataset = TweetFastTextDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)


In [92]:
# ✅ FastText Model
class FastTextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super(FastTextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        embeds = self.embedding(x)           # [batch, seq_len, embed_dim]
        pooled = embeds.mean(dim=1)          # [batch, embed_dim]
        return self.fc(pooled)               # [bat


In [96]:
#  Setup for training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = FastTextClassifier(vocab_size=vocab_size, embed_dim=100, num_classes=3).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [98]:
#  Training Loop
for epoch in range(5):
    model.train()
    total_loss = 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} - Loss: {total_loss:.4f}")


Epoch 1 - Loss: 1222.4379
Epoch 2 - Loss: 1028.7580
Epoch 3 - Loss: 886.8214
Epoch 4 - Loss: 779.8127
Epoch 5 - Loss: 692.7348


In [100]:
#  Evaluation
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

#  Final Report
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names=label_encoder.classes_))
print("Accuracy:", accuracy_score(all_labels, all_preds))


Classification Report:
              precision    recall  f1-score   support

    negative       0.57      0.40      0.47      1549
     neutral       0.63      0.70      0.67      4531
    positive       0.68      0.68      0.68      3985

    accuracy                           0.65     10065
   macro avg       0.63      0.59      0.60     10065
weighted avg       0.64      0.65      0.64     10065

Accuracy: 0.6455042225534029
