In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the post dataset; the path is relative to the notebook's working
# directory. Later cells read the 'Post Description' and 'Sentiment' columns.
df = pd.read_csv('insta_data.csv')


In [None]:
# Strip leading/trailing whitespace from the column headers so the
# 'Post Description' / 'Sentiment' lookups below match exactly.
df.columns = df.columns.str.strip()

def clean_text(text):
    """Normalise a raw post caption before tokenisation.

    Steps, in order: remove URLs, remove @mentions and the ``#`` marker (the
    hashtag word itself is kept), remove every character that is neither a
    word character nor whitespace, collapse runs of whitespace, then trim and
    lower-case.

    Args:
        text: The caption as a ``str``.

    Returns:
        The cleaned, lower-cased caption (may be an empty string).
    """
    # A bare "www" followed by anything used to count as a link, which ate the
    # tail of ordinary words such as "awwww" or "awwwsome". Require a word
    # boundary and a dot after "www", and "://" after the scheme. The old
    # separate "https" alternative was unreachable ("http" already matched).
    text = re.sub(r"https?://\S+|\bwww\.\S+", '', text)
    text = re.sub(r"@\w+|#", '', text)
    text = re.sub(r"[^\w\s]", '', text)
    # Dropping a URL or mention from mid-sentence leaves doubled spaces.
    text = re.sub(r"\s+", ' ', text)
    return text.strip().lower()

# astype(str) would turn a missing caption into the literal text "nan", which
# would then be fed to the model as if it were real content; blank it instead.
df['Post Description'] = (
    df['Post Description'].fillna('').astype(str).apply(clean_text)
)

# Integer class ids for the three sentiment labels.
label_map = {
    'negative': 0,
    'neutral': 1,
    'positive': 2
}

# Also accept ids that are already encoded, so re-running this cell is
# harmless. Without this, a second run maps every (now integer) label to NaN
# and the dropna below empties the frame.
_label_lookup = {**label_map, **{str(code): code for code in label_map.values()}}

sentiment_codes = (
    df['Sentiment'].astype(str).str.strip().str.lower().map(_label_lookup)
)

# Report rows that are about to be discarded instead of dropping them silently.
n_unmapped = int(sentiment_codes.isna().sum())
if n_unmapped:
    print(f"Dropping {n_unmapped} row(s) with a missing or unrecognised Sentiment label")

# Build the filtered frame as one chain of new objects; assigning a column
# into the result of dropna() may trigger pandas' SettingWithCopyWarning.
df = (
    df.assign(Sentiment=sentiment_codes)
      .dropna(subset=['Sentiment'])
      .astype({'Sentiment': int})
)

pd.set_option('display.max_columns', None)

df.head()


In [None]:

# Display names, ordered by encoded class id (0, 1, 2).
label_names = ['Negative', 'Neutral', 'Positive']

# Count posts per class id. Reindexing onto the full id range keeps the counts
# aligned with label_names even if a class has no rows: value_counts() simply
# omits an absent class, which would shift or mismatch the positional labels.
sentiment_counts = (
    df['Sentiment']
    .value_counts()
    .reindex(range(len(label_names)), fill_value=0)
)


plt.figure(figsize=(6, 4))
sns.barplot(x=label_names, y=sentiment_counts.values, palette='Set2')
plt.title('Sentiment Distribution - Bar Chart')
plt.ylabel('Number of Posts')
plt.xlabel('Sentiment')
plt.show()


In [None]:

# Class shares as a pie chart, using the counts and names computed for the
# bar chart in the previous cell.
fig, ax = plt.subplots(figsize=(5, 5))
ax.pie(
    sentiment_counts.values,
    labels=label_names,
    autopct='%1.1f%%',
    startangle=140,
    colors=sns.color_palette('Set3'),
)
ax.set_title('Sentiment Distribution - Pie Chart')
ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()


In [None]:
from sklearn.model_selection import train_test_split

# Model inputs (cleaned captions) and targets (encoded sentiment ids).
texts, labels = df['Post Description'].values, df['Sentiment'].values

# 80/20 split with a fixed seed, stratified on the labels.
split_options = dict(test_size=0.2, random_state=42, stratify=labels)
X_train, X_test, y_train, y_test = train_test_split(texts, labels, **split_options)

for split_name, split_texts in (("Train", X_train), ("Test", X_test)):
    print(f"{split_name} size: {len(split_texts)}")


In [None]:
!pip install transformers datasets -q


In [None]:
from transformers import XLMRobertaTokenizer

tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

# Both splits are encoded with the same settings: truncation and padding
# enabled, with a 128-token maximum length.
encode_options = dict(truncation=True, padding=True, max_length=128)

train_encodings = tokenizer(list(X_train), **encode_options)
test_encodings = tokenizer(list(X_test), **encode_options)


In [None]:
import torch

class InstaSentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a
            per-example sequence, as returned by a tokenizer call.
        labels: Indexable sequence of integer class ids, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors plus a ``labels`` entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Force int64: torch.tensor() otherwise inherits the numpy dtype, and
        # astype(int) can yield int32 on some platforms, which classification
        # losses reject as a target dtype.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item


# Pair each split's encodings with its labels for the Trainer.
train_dataset, test_dataset = (
    InstaSentimentDataset(train_encodings, y_train),
    InstaSentimentDataset(test_encodings, y_test),
)


In [None]:

import inspect
import os

# Opt out of the Weights & Biases integration.
os.environ["WANDB_DISABLED"] = "true"

from transformers import TrainingArguments

# The per-epoch evaluation switch is spelled `evaluation_strategy` in older
# transformers releases and `eval_strategy` in newer ones (the old name was
# eventually removed). Use whichever keyword the installed version accepts so
# this cell runs regardless of which release the unpinned install pulled in.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

# NOTE: a later cell reassigns `training_args` (adding learning_rate,
# warmup_steps and weight_decay) before the Trainer is built, so this baseline
# configuration only takes effect if that cell is skipped.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=100,
    # Restore the checkpoint with the highest eval accuracy when training ends.
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    save_total_limit=2,
    push_to_hub=False,
    **{_eval_strategy_key: "epoch"},
)


In [None]:

from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Score one evaluation pass for the Trainer.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        Dict with ``accuracy`` and macro-averaged ``f1``, both expressed as
        percentages rounded to two decimals.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)

    scores = {
        'accuracy': accuracy_score(y_true, y_hat),
        'f1': f1_score(y_true, y_hat, average='macro'),
    }
    # Convert fractions to percentages before rounding.
    return {name: round(value * 100, 2) for name, value in scores.items()}


In [None]:
import inspect
import os

# Opt out of the Weights & Biases integration.
os.environ["WANDB_DISABLED"] = "true"

from transformers import TrainingArguments

# The per-epoch evaluation switch is spelled `evaluation_strategy` in older
# transformers releases and `eval_strategy` in newer ones (the old name was
# eventually removed). Use whichever keyword the installed version accepts so
# this cell runs regardless of which release the unpinned install pulled in.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

# Fine-tuning configuration actually used by the Trainer built below.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    # NOTE(review): 500 warm-up steps may cover most of training on a small
    # dataset — confirm against the number of steps per epoch.
    warmup_steps=500,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=100,
    # Restore the checkpoint with the highest eval accuracy when training ends.
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    save_total_limit=2,
    push_to_hub=False,
    **{_eval_strategy_key: "epoch"},
)


In [None]:
from transformers import Trainer, AutoModelForSequenceClassification

import inspect

# Three output classes: negative (0), neutral (1), positive (2).
model = AutoModelForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=3)

# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword. Pass it under whichever name the
# installed Trainer declares so the cell works on both old and new versions.
_tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

# NOTE(review): eval_dataset is the held-out test split, and training_args
# selects the best checkpoint on it (load_best_model_at_end), so the metrics
# reported as "test" later are optimistically biased. Consider carving a
# separate validation split out of the training data for checkpoint selection.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

In [None]:
# Run fine-tuning with the configuration in training_args (6 epochs, with
# evaluation and checkpointing set to "epoch").
trainer.train()


In [None]:
# Evaluate on the Trainer's eval_dataset and report the two metrics from
# compute_metrics, which are already expressed as percentages.
eval_results = trainer.evaluate()
final_accuracy = eval_results['eval_accuracy']
final_macro_f1 = eval_results['eval_f1']

print(f"\n Final Test Accuracy: {final_accuracy:.2f}%")
print(f" Final F1 Score (Macro): {final_macro_f1:.2f}%")


In [None]:
from sklearn.metrics import classification_report

# Encoded class ids and their display names, in matching order.
class_ids = [0, 1, 2]
class_names = ['Negative', 'Neutral', 'Positive']

predictions = trainer.predict(test_dataset)
y_pred = predictions.predictions.argmax(-1)  # highest-scoring class per example
y_true = predictions.label_ids

# Passing `labels` pins the report to all three classes. Without it, sklearn
# infers the classes from the data and raises when their number differs from
# the three target_names (e.g. a class absent from both y_true and y_pred).
# zero_division=0 reports 0 for a class with no predictions, without a warning.
report = classification_report(
    y_true,
    y_pred,
    labels=class_ids,
    target_names=class_names,
    zero_division=0,
)
print("\nClassification Report:\n", report)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Keep only the evaluation records from the Trainer's log history (training
# loss records have no eval_accuracy).
logs = pd.DataFrame(trainer.state.log_history)
logs = logs.dropna(subset=['eval_accuracy'])
# A trainer.evaluate() call made after training appends one more eval record
# stamped with the final epoch. Keep the first record per epoch so the curve
# has exactly one point per epoch instead of a doubled last point. The copy
# lets later cells add columns without a SettingWithCopyWarning.
logs = logs.drop_duplicates(subset='epoch', keep='first').copy()

plt.plot(logs['epoch'], logs['eval_accuracy'], label='Validation Accuracy')
plt.plot(logs['epoch'], logs['eval_f1'], label='Validation F1')
plt.xlabel("Epoch")
plt.ylabel("Metric")
plt.title("Accuracy & F1 over Epochs")
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# eval_accuracy is logged as a percentage (compute_metrics multiplies by 100),
# so the error rate is 100 - accuracy. The previous "1 - accuracy" produced
# large negative values. assign() returns a new frame, so re-running is safe.
logs = logs.assign(error_rate=100 - logs['eval_accuracy'])

plt.plot(logs['epoch'], logs['error_rate'], label='Validation Error Rate', color='red')
plt.xlabel("Epoch")
plt.ylabel("Error Rate (%)")
plt.title("Error Rate vs Epoch")
plt.grid(True)
plt.legend()
plt.show()


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Pin the matrix to all three class ids so it is always 3x3 and lines up with
# the three display labels, even if a class is absent from both y_true and
# y_pred (sklearn would otherwise infer a smaller matrix from the data).
cm = confusion_matrix(y_true, y_pred, labels=[0, 1, 2])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Negative', 'Neutral', 'Positive'])
disp.plot(cmap='Blues')
plt.title("Confusion Matrix")
plt.show()


In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
import numpy as np

# One-vs-rest ROC: turn the true labels into one indicator column per class
# and score each column against the model's output column for that class.
class_names = ['Negative', 'Neutral', 'Positive']
y_test_bin = label_binarize(y_true, classes=[0, 1, 2])
y_score = predictions.predictions

fpr, tpr, roc_auc = {}, {}, {}
for i, label in enumerate(class_names):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
    plt.plot(fpr[i], tpr[i], label=f'{label} (AUC = {roc_auc[i]:.2f})')

# Dashed diagonal as the chance-level reference.
plt.plot([0, 1], [0, 1], 'k--')
plt.title('ROC Curve (One-vs-Rest)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Encoded class ids and their display names, in matching order.
class_ids = [0, 1, 2]
class_names = ['Negative', 'Neutral', 'Positive']

# Pin every per-class metric to the same three ids so each result has exactly
# three entries. Without `labels`, a class absent from both y_true and y_pred
# shrinks the arrays/matrix and the indexing below raises.
sensitivity = recall_score(y_true, y_pred, labels=class_ids, average=None, zero_division=0)
precision = precision_score(y_true, y_pred, labels=class_ids, average=None, zero_division=0)
fmajor = f1_score(y_true, y_pred, average='macro')

# Specificity per class, one-vs-rest: TN / (TN + FP).
cm = confusion_matrix(y_true, y_pred, labels=class_ids)
specificity = []
for i in class_ids:
    fp = cm[:, i].sum() - cm[i, i]
    # Everything outside row i and column i is a true negative for class i.
    tn = cm.sum() - (cm[i, :].sum() + cm[:, i].sum() - cm[i, i])
    negatives = tn + fp
    # No negatives at all means specificity is undefined; report NaN rather
    # than dividing by zero.
    specificity.append(tn / negatives if negatives else float('nan'))

for i, cls in enumerate(class_names):
    print(f"\nClass: {cls}")
    print(f"  Precision   : {precision[i]:.2f}")
    print(f"  Sensitivity : {sensitivity[i]:.2f}")
    print(f"  Specificity : {specificity[i]:.2f}")

print(f"\nFmajor (Macro F1): {fmajor:.2f}")
