In [None]:
!pip install torch transformers datasets scikit-learn pandas openpyxl

In [None]:
import pandas as pd

# Hard-coded absolute location of the spreadsheet holding the labelled posts.
file_path = "/content/insta_data.xlsx"

# Read the workbook, keep only the caption and label columns, and discard any
# row where either of the two is missing.
df = (
    pd.read_excel(file_path)
    .loc[:, ["Post Description", "Sentiment"]]
    .dropna()
)

# Sanity check: first rows followed by the class distribution.
print(df.head(), df["Sentiment"].value_counts())


In [None]:
# Integer class ids expected by the classifier head (3 labels).
sentiment_mapping = {"positive": 0, "neutral": 1, "negative": 2}

# Guard so that re-running this cell does not wipe labels that have already been
# encoded as integers (mapping 0/1/2 through the dict would yield all-NaN).
if not pd.api.types.is_integer_dtype(df["Sentiment"]):
    # Normalize case and surrounding whitespace so that e.g. "Positive " still maps.
    normalized_sentiment = df["Sentiment"].astype(str).str.strip().str.lower()
    encoded_sentiment = normalized_sentiment.map(sentiment_mapping)

    # Anything outside the mapping becomes NaN; a NaN label would otherwise flow
    # into training as a float and break the loss computation. Report and drop.
    unmapped = encoded_sentiment.isna()
    if unmapped.any():
        print(
            f"Dropping {int(unmapped.sum())} rows with unrecognized sentiment labels: "
            f"{sorted(normalized_sentiment[unmapped].unique())}"
        )

    df = df.loc[~unmapped].copy()
    # With NaNs gone the column can be stored as real integers.
    df["Sentiment"] = encoded_sentiment.loc[~unmapped].astype(int)


In [None]:
import re
import unicodedata


def preprocess_text(text):
    """Normalize a post caption before tokenization.

    Steps, in order: lowercase, remove ``http...`` URLs, drop every character
    that is not a letter, combining mark, number or whitespace (in any script),
    then collapse whitespace runs to a single space and strip the ends.

    Unlike an ASCII-only filter, letters with accents and non-Latin scripts are
    kept, so multilingual captions are not erased. For pure-ASCII input the
    result is the same as keeping only ``[a-zA-Z0-9\\s]``.

    Args:
        text: The caption. Non-string values are converted with ``str()``;
            ``None`` and float NaN are treated as an empty caption.

    Returns:
        The cleaned caption as a ``str`` (possibly empty).
    """
    # Missing cells: None, or float NaN (NaN is the only value unequal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        # Spreadsheet cells may hold numbers; avoid AttributeError on .lower().
        text = str(text)

    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    # Keep Unicode letters (L*), combining marks (M*) and numbers (N*) plus
    # whitespace; punctuation, symbols, emoji and underscores are removed.
    text = "".join(
        ch for ch in text
        if ch.isspace() or unicodedata.category(ch)[0] in "LMN"
    )
    return re.sub(r"\s+", " ", text).strip()

# Overwrite the caption column with its cleaned form, one caption at a time.
post_column = "Post Description"
df[post_column] = df[post_column].map(preprocess_text)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
split_options = {"test_size": 0.2, "random_state": 42}
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["Post Description"],
    df["Sentiment"],
    **split_options,
)


In [None]:
from transformers import AutoTokenizer

# Tokenizer matching the checkpoint that is fine-tuned later in the notebook.
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

# Shared settings: cut sequences at 128 tokens and pad within each call.
tokenize_options = {"truncation": True, "padding": True, "max_length": 128}

train_encodings = tokenizer(list(train_texts), **tokenize_options)
test_encodings = tokenizer(list(test_texts), **tokenize_options)


In [None]:
import torch

class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: Mapping from field name (e.g. ``"input_ids"``) to a sequence
            with one entry per sample; anything exposing ``.items()`` works.
        labels: Iterable of class ids, one per sample. It is materialized into
            a list so indexing is always positional.

    Raises:
        ValueError: If any encoding field has a different number of entries
            than ``labels``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # A pandas Series indexed with an int does a *label* lookup, which is
        # wrong after a shuffled split; a plain list is always positional.
        self.labels = list(labels)

        for key, values in self.encodings.items():
            if len(values) != len(self.labels):
                raise ValueError(
                    f"encodings[{key!r}] has {len(values)} entries but "
                    f"{len(self.labels)} labels were given"
                )

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including a ``"labels"`` entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Force an int64 label: classification loss requires integer targets, and
        # int() fails loudly on NaN instead of silently producing a float tensor.
        item["labels"] = torch.tensor(int(self.labels[idx]), dtype=torch.long)
        return item

# Wrap each split's encodings together with its labels; the labels are turned
# into plain lists before being handed over.
train_dataset = SentimentDataset(encodings=train_encodings, labels=list(train_labels))
test_dataset = SentimentDataset(encodings=test_encodings, labels=list(test_labels))


In [None]:
from torch.utils.data import DataLoader

# Number of samples per batch, shared by both loaders.
batch_size = 16

# Training batches are reshuffled on every pass; evaluation order stays fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)


In [None]:
from transformers import AutoModelForSequenceClassification

# Use the GPU when one is available, otherwise fall back to the CPU instead of
# failing on a hard-coded "cuda" device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained multilingual BERT with a freshly initialized 3-way classification head.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=3)
# Assigning the result (Module.to returns the module itself) also keeps the cell
# from echoing the full model repr as its output.
model = model.to(device)


In [None]:
from torch.optim import AdamW
from transformers import get_scheduler

# AdamW over every model parameter with a 2e-5 learning rate.
optimizer = AdamW(model.parameters(), lr=2e-5)

# One scheduler step is taken per batch, so the schedule spans
# (batches per epoch) x 6 epochs.
num_training_steps = 6 * len(train_loader)

# Linear schedule with no warm-up phase.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)


In [None]:
# Module-level histories; train_model_with_tracking appends one entry per epoch.
train_loss_history = []
train_accuracy_history = []
error_rate_history = []


def train_model_with_tracking(model, dataloader, optimizer, scheduler, num_epochs=6):
    """Train ``model`` and record per-epoch loss, accuracy and error rate.

    For every epoch the mean batch loss, the training accuracy and
    ``1 - accuracy`` are appended to the module-level ``train_loss_history``,
    ``train_accuracy_history`` and ``error_rate_history`` lists, and a summary
    line is printed.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        dataloader: Sized iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        num_epochs: Number of passes over ``dataloader``.

    Returns:
        None.

    Raises:
        ValueError: If ``dataloader`` contains no batches.
    """
    if len(dataloader) == 0:
        # Fail with a clear message instead of a ZeroDivisionError after the loop.
        raise ValueError("dataloader yields no batches; nothing to train on")

    # Run wherever the model's weights already live rather than assuming a GPU.
    device = next(model.parameters()).device

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        correct, total = 0, 0

        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()
            correct += (outputs.logits.argmax(dim=1) == labels).sum().item()
            total += labels.size(0)

        avg_loss = total_loss / len(dataloader)
        accuracy = correct / total
        error_rate = 1 - accuracy

        train_loss_history.append(avg_loss)
        train_accuracy_history.append(accuracy)
        error_rate_history.append(error_rate)

        print(f"Epoch {epoch + 1}: Loss = {avg_loss:.4f}, Accuracy = {accuracy * 100:.2f}%")


In [None]:
# Start fine-tuning: six passes over the training loader, stepping the optimizer
# and the LR scheduler once per batch.
train_model_with_tracking(
    model=model,
    dataloader=train_loader,
    optimizer=optimizer,
    scheduler=lr_scheduler,
    num_epochs=6,
)


In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_curve,
    auc,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)



In [None]:
from sklearn.metrics import accuracy_score

def _macro_specificity(cm):
    """Return the macro-averaged one-vs-rest specificity, TN / (TN + FP), of a square confusion matrix."""
    cm = np.asarray(cm)
    true_pos = np.diag(cm)
    false_pos = cm.sum(axis=0) - true_pos
    false_neg = cm.sum(axis=1) - true_pos
    true_neg = cm.sum() - true_pos - false_pos - false_neg
    negatives = true_neg + false_pos
    # A class with no negative samples contributes 0 rather than dividing by zero.
    per_class = np.divide(
        true_neg,
        negatives,
        out=np.zeros(len(cm), dtype=float),
        where=negatives != 0,
    )
    return float(per_class.mean())


def evaluate_model_with_metrics(model, dataloader):
    """Evaluate a 3-class sentiment classifier and report metrics.

    Runs ``model`` over ``dataloader`` without gradients, then prints per-class
    accuracy, overall accuracy, the scikit-learn classification report and
    macro precision / recall / F1 / specificity, and shows a confusion-matrix
    heatmap plus one-vs-rest ROC curves.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits`` with one column per class.
        dataloader: Iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"`` tensors.

    Returns:
        None. Results are printed and plotted.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    # Order matches the integer encoding 0 / 1 / 2 used for the labels.
    class_labels = ["positive", "neutral", "negative"]
    label_ids = list(range(len(class_labels)))

    # Run wherever the model's weights already live rather than assuming a GPU.
    device = next(model.parameters()).device

    model.eval()
    y_true, y_pred, y_scores = [], [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"]

            logits = model(input_ids, attention_mask=attention_mask).logits

            y_true.extend(labels.cpu().numpy())
            y_pred.extend(logits.argmax(dim=1).cpu().numpy())
            # Softmax probabilities are comparable across samples, which is what
            # a one-vs-rest ROC curve ranks on; raw logits are not normalized.
            y_scores.extend(torch.softmax(logits, dim=1).cpu().numpy())

    if not y_true:
        raise ValueError("dataloader yielded no samples; nothing to evaluate")

    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    y_scores = np.asarray(y_scores)

    # Pin the label set so the matrix is always 3x3, even when a class is absent
    # from this split (otherwise rows/columns silently disappear).
    cm = confusion_matrix(y_true, y_pred, labels=label_ids)

    print("📊 Accuracy per Class:")
    for i, label in enumerate(class_labels):
        class_total = cm[i].sum()
        class_acc = (cm[i, i] / class_total) * 100 if class_total > 0 else 0
        print(f"  • {label.capitalize()}: {class_acc:.2f}%")

    overall_accuracy = accuracy_score(y_true, y_pred) * 100
    print(f"\n✅ Overall Accuracy: {overall_accuracy:.2f}%")

    print("\n📋 Classification Report:")
    print(
        classification_report(
            y_true,
            y_pred,
            labels=label_ids,
            target_names=class_labels,
            zero_division=0,
        )
    )

    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, cmap='Blues', fmt='d', xticklabels=class_labels, yticklabels=class_labels)
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

    # One-vs-rest ROC: binarize the true labels via one-hot rows.
    y_true_bin = np.eye(len(class_labels))[y_true]
    plt.figure(figsize=(6, 5))
    for i, label in enumerate(class_labels):
        targets = y_true_bin[:, i]
        if targets.min() == targets.max():
            # ROC is undefined without both positive and negative samples.
            print(f"Skipping ROC for '{label}': only one class present in y_true.")
            continue
        fpr, tpr, _ = roc_curve(targets, y_scores[:, i])
        plt.plot(fpr, tpr, label=f"{label} (AUC = {auc(fpr, tpr):.2f})")

    plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
    plt.title("ROC Curve")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend()
    plt.grid(True)
    plt.show()

    # Same fixed label set as the report above, so the macro figures agree with it.
    precision = precision_score(y_true, y_pred, labels=label_ids, average="macro", zero_division=0)
    recall = recall_score(y_true, y_pred, labels=label_ids, average="macro", zero_division=0)
    f1 = f1_score(y_true, y_pred, labels=label_ids, average="macro", zero_division=0)
    specificity = _macro_specificity(cm)

    print(f"\n🎯 Macro Precision: {precision:.4f}")
    print(f"📈 Macro Recall (Sensitivity): {recall:.4f}")
    print(f"💥 Macro F1 Score: {f1:.4f}")
    print(f"🧠 Macro Specificity: {specificity:.4f}")


In [None]:
# Score the fine-tuned model on the held-out test batches.
evaluate_model_with_metrics(model=model, dataloader=test_loader)
