In [1]:
# Install essential libraries for transformers, dataset loading, visualization, and ML evaluation
!pip install --upgrade transformers datasets scikit-learn matplotlib seaborn --quiet




In [None]:
import transformers

# Show which transformers release is active in this kernel.
installed_version = transformers.__version__
print("Transformers version:", installed_version)


In [None]:
import pandas as pd
from datasets import Dataset

# Load the review CSV and keep only the review text and the sentiment flag.
# - Columns are renamed by name (not by position) so the mapping cannot drift.
# - Text is cast to str and label to int: the classification head is given
#   integer class ids, and a float label column would not be treated as
#   single-label classification.
# - The index is reset because dropna() leaves gaps in it.
df = (
    pd.read_csv("amazon.csv")[['reviewText', 'Positive']]
    .dropna()
    .rename(columns={'reviewText': 'text', 'Positive': 'label'})
    .astype({'text': str, 'label': int})
    .reset_index(drop=True)
)

# Convert to a Hugging Face Dataset; preserve_index=False keeps the pandas
# index from being stored as an extra column.
dataset = Dataset.from_pandas(df, preserve_index=False)


In [None]:
from transformers import BertTokenizer

# Fetch the pre-trained tokenizer for the checkpoint named below.
TOKENIZER_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

# Tokenization callback used with Dataset.map
def tokenize_function(example):
    """Tokenize the ``'text'`` entry of ``example`` with the module-level tokenizer.

    Every sequence is padded to, and truncated at, 128 tokens.

    Args:
        example: Mapping with a ``'text'`` key (a batch of texts when the
            function is mapped with ``batched=True``).

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = example['text']
    encoded = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=128,
    )
    return encoded

# Tokenize every example in batches, then hold out 20% of the rows for evaluation.
dataset = dataset.map(tokenize_function, batched=True)
# A fixed seed makes the train/test split (and therefore the reported metrics)
# reproducible across kernel restarts.
dataset = dataset.train_test_split(test_size=0.2, seed=42)


In [None]:
from transformers import BertForSequenceClassification

# Instantiate BERT with a sequence-classification head sized for two labels.
MODEL_CHECKPOINT = "bert-base-uncased"
model = BertForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=2,
)


In [None]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Metric callback passed to the Trainer
def compute_metrics(eval_pred):
    """Score a batch of predictions against the reference labels.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        dict with ``"accuracy"``, ``"f1"``, ``"precision"`` and ``"recall"``,
        where precision/recall/F1 use ``average='binary'``.
    """
    scores, gold = eval_pred
    predicted = scores.argmax(-1)
    prec, rec, f_score, _support = precision_recall_fscore_support(
        gold, predicted, average='binary'
    )
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f_score,
        "precision": prec,
        "recall": rec,
    }

# Training configuration: evaluate and checkpoint once per epoch.
# NOTE: the keyword is `eval_strategy`. The older spelling
# `evaluation_strategy` is rejected by current transformers releases, which
# the install cell at the top of this notebook upgrades to.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)


In [None]:
# Wire the model, training configuration, data splits and metric callback together.
train_split = dataset['train']
eval_split = dataset['test']
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the call's return value is left as the cell output.
trainer.train()


In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Predict on the held-out split and reduce the logits to class ids.
predictions = trainer.predict(dataset['test'])
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids

# Pin the label order so the matrix is always 2x2 and lines up with the
# Negative/Positive tick labels, even if one class is absent from the split.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

# Plot the confusion matrix on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Negative', 'Positive'],
            yticklabels=['Negative', 'Positive'],
            ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix - BERT Sentiment Analysis')
fig.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("images", exist_ok=True)
fig.savefig("images/confusion_matrix.png")  # Save to file
plt.show()


In [None]:
# Persist the fine-tuned model and its tokenizer side by side in one directory.
output_dir = "sentiment_model"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
