# Persian Text Sentiment Analysis using BERT

This notebook demonstrates the process of classifying Persian text into sentiment categories (Positive, Neutral, Negative) using BERT.


In [None]:
!pip install transformers datasets torch scikit-learn matplotlib

import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score, classification_report
import matplotlib.pyplot as plt


In [None]:
dataset = load_dataset('csv', data_files={'train': '../data/train.csv', 'test': '../data/test.csv'})
dataset['train'], dataset['test']


In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True, max_length=128)

dataset = dataset.map(tokenize, batched=True)
dataset.set_format('torch', columns=['input_ids','attention_mask','label'])


In [None]:
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=3)


In [None]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')
    return {"accuracy": acc, "f1": f1}


In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    evaluation_strategy='epoch',
    logging_dir='./logs'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics
)

trainer.train()


In [None]:
results = trainer.evaluate()
print("Evaluation Results:", results)

preds = trainer.predict(dataset['test']).predictions.argmax(-1)
labels = dataset['test']['label']
print(classification_report(labels, preds, target_names=['Negative','Neutral','Positive']))


In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(labels, preds)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Negative','Neutral','Positive'], yticklabels=['Negative','Neutral','Positive'])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()


This notebook demonstrates a full pipeline for Persian text sentiment analysis using BERT.  
- Achieved high accuracy and F1-score on the sample test set.  
- Pipeline is ready for extension to larger datasets and further hyperparameter tuning.
