# In [None]:  (notebook cell prompt left over from the export; kept as a comment so the file parses as Python)
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score, classification_report
import os
import matplotlib.pyplot as plt

# Hand-written sample dataset: 60 customer reviews with one sentiment label each.
# The "review" and "label" lists are aligned by position, i.e. label[i] is the
# sentiment of review[i]; the trailing comment on every review repeats that
# label so the two lists can be cross-checked by eye.
data_dict = {
    "review": [
        "Amazing product! The packaging was perfect and delivery was super fast. Will buy again! 😊",  # positive
        "Terrible service. My item arrived broken, and customer support was useless. Do not recommend! #frustrated",  # negative
        "Decent quality, but overpriced. The same item was cheaper on another website.",  # neutral
        "Absolutely love it! Great quality, and fantastic customer support. ❤️",  # positive
        "Product description was misleading. Not what I expected at all.",  # negative
        "The checkout process was smooth, but shipping was slower than expected. Still, not bad!",  # neutral
        "Best purchase ever! Highly recommend to anyone looking for value!",  # positive
        "The product is okay, but the customer service was exceptional!",  # neutral
        "I received a faulty item, but the return process was quick and easy.",  # negative
        "Not worth the price. I found similar items for much cheaper.",  # negative
        "Superb quality and fast delivery. Will definitely order again!",  # positive
        "Didn't meet my expectations. The description was misleading.",  # negative
        "Satisfied with the purchase. The item is exactly as described.",  # positive
        "The item was damaged upon arrival. Very disappointing experience.",  # negative
        "Excellent value for money. Highly recommended!",  # positive
        "The customer support was very helpful and resolved my issue quickly.",  # positive
        "Item arrived late, but it was worth the wait. Great quality!",  # positive
        "Not satisfied with the product. It didn't work as advertised.",  # negative
        "Very happy with my purchase. The quality is top-notch!",  # positive
        "The product is decent, but the packaging could be better.",  # neutral
        "This is my favorite product! I will definitely buy it again.",  # positive
        "Worst purchase ever. The product didn't work and support was unhelpful.",  # negative
        "Average product. There are better options available.",  # neutral
        "I'm in love with this product! The quality exceeded my expectations.",  # positive
        "The product description was inaccurate. Not what I expected.",  # negative
        "The checkout process was easy, but shipping took too long.",  # neutral
        "Great value for the price. Highly recommend this product!",  # positive
        "The product is good, but the delivery was late.",  # neutral
        "Received a defective item. Very disappointed with the purchase.",  # negative
        "Fast shipping and great quality. Will buy again!",  # positive
        "The product did not match the description. Very misleading.",  # negative
        "Happy with my purchase. The product meets my expectations.",  # positive
        "The item was broken when it arrived. Very poor experience.",  # negative
        "Fantastic product for the price. Would buy again.",  # positive
        "Customer service was excellent and resolved my issue quickly.",  # positive
        "The product arrived late, but it is of great quality.",  # positive
        "I'm not happy with the product. It didn't work as described.",  # negative
        "Absolutely thrilled with my purchase! Top-quality product.",  # positive
        "The product is fine, but the packaging was damaged.",  # neutral
        "Love this product! The quality is amazing and it arrived quickly.",  # positive
        "Terrible experience. The product was faulty and customer support was unhelpful.",  # negative
        "Good quality but overpriced. Found a cheaper alternative.",  # neutral
        "Very pleased with this purchase. Will recommend to others.",  # positive
        "The product didn't function as expected. Very disappointed.",  # negative
        "Smooth checkout process, but delivery was delayed.",  # neutral
        "Great product at a great price. Very satisfied!",  # positive
        "The item arrived damaged. Not happy with the purchase.",  # negative
        "The product exceeded my expectations. Fantastic quality!",  # positive
        "The customer support was very responsive and helpful.",  # positive
        "The product arrived late, but it was worth the wait.",  # positive
        "Not happy with the product. It didn't meet my expectations.",  # negative
        "Very satisfied with my purchase. The quality is excellent.",  # positive
        "The product is okay, but the packaging was subpar.",  # neutral
        "I'm extremely happy with this product! Highly recommend.",  # positive
        "The product didn't work and the return process was a hassle.",  # negative
        "Decent quality, but there are better options available.",  # neutral
        "I'm very impressed with the quality of this product.",  # positive
        "The description was misleading. Not what I was expecting.",  # negative
        "The checkout process was seamless, but shipping took too long.",  # neutral
        "Great value for the price. Will definitely buy again."  # positive
    ],
    # Sentiment of each review above, in the same order (seven labels per line).
    "label": [
        "positive", "negative", "neutral", "positive", "negative", "neutral", "positive",
        "neutral", "negative", "negative", "positive", "negative", "positive", "negative",
        "positive", "positive", "positive", "negative", "positive", "neutral", "positive",
        "negative", "neutral", "positive", "negative", "neutral", "positive", "neutral",
        "negative", "positive", "negative", "positive", "negative", "positive", "positive",
        "positive", "negative", "positive", "neutral", "positive", "negative", "neutral",
        "positive", "negative", "neutral", "positive", "negative", "positive", "positive",
        "positive", "negative", "positive", "neutral", "positive", "negative", "neutral",
        "positive", "negative", "neutral", "positive"
    ]
}

# One DataFrame row per review, with the columns "review" and "label".
data = pd.DataFrame(data_dict)

# Clean the text
# Patterns are compiled once because clean_text runs for every review.
_URL_PATTERN = re.compile(r"http\S+")
_NON_WORD_PATTERN = re.compile(r"[^\w\s]")


def clean_text(text):
    """Return a normalized copy of a raw review string.

    The text is lower-cased, anything starting with ``http`` up to the next
    whitespace is removed, every character that is neither a word character
    nor whitespace (punctuation, emoji, ``#`` ...) is removed, and finally
    runs of whitespace are collapsed to a single space and the ends trimmed.

    Whitespace is normalized last on purpose: stripping before the removals
    would leave a stray trailing space when the text ends with an emoji or
    punctuation preceded by a space, and a doubled space wherever a URL sat.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The cleaned text; an empty string if nothing but symbols was given.
    """
    # Lower-case first so that upper-case "HTTP..." links are caught as well.
    text = text.lower()
    text = _URL_PATTERN.sub("", text)  # Remove URLs
    text = _NON_WORD_PATTERN.sub("", text)  # Remove special characters
    # split() with no argument drops leading/trailing whitespace and treats
    # any run of whitespace as one separator.
    return " ".join(text.split())

# Normalized copy of every review; the raw text stays in the "review" column.
data["cleaned_review"] = data["review"].apply(clean_text)

# Convert labels to numerical values using an explicitly pinned order:
# negative=0, neutral=1, positive=2. Inferring the categories from the data
# would produce different ids whenever a class is absent or a label is
# misspelled, silently breaking any code that assumes this fixed mapping.
LABEL_NAMES = ["negative", "neutral", "positive"]
label_codes = pd.Categorical(data["label"], categories=LABEL_NAMES).codes

# pd.Categorical encodes values outside `categories` as -1; fail loudly
# instead of training on an invalid class id.
if (label_codes < 0).any():
    unknown_labels = sorted(set(data.loc[label_codes < 0, "label"].astype(str)))
    raise ValueError(f"Unknown sentiment labels: {unknown_labels}")

data["label"] = label_codes

# Check for class imbalance
print(data["label"].value_counts())

# Tokenizer that belongs to the "bert-base-uncased" checkpoint
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(text):
    """Encode ``text`` with the module-level ``tokenizer``.

    The tokenizer is asked to truncate and to pad to ``max_length``, with
    ``max_length`` set to 128.

    Args:
        text: The text to encode.

    Returns:
        Whatever the tokenizer call returns for ``text``.
    """
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(text, **encoding_options)

# Apply tokenization exactly once per review and keep the full encoding.
data["tokenized"] = data["cleaned_review"].apply(tokenize_function)

# Extract features as separate columns by reading them back out of the stored
# encodings. Re-running the tokenizer for each column would triple the
# tokenization work while producing exactly the same values.
data["input_ids"] = data["tokenized"].apply(lambda encoding: encoding["input_ids"])
data["attention_mask"] = data["tokenized"].apply(lambda encoding: encoding["attention_mask"])

print(data.head())

# Split data: hold out 20% of the rows for evaluation. Stratifying on the
# label keeps the class proportions the same in both parts, so a small,
# imbalanced dataset cannot end up with a class missing from the test split.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data["label"],
)

# Columns that must not be handed to the model: the raw encoding objects and
# the two text columns.
_NON_FEATURE_COLUMNS = ["tokenized", "review", "cleaned_review"]


def _to_model_dataset(frame):
    """Convert one split of the DataFrame into a ``datasets.Dataset``.

    Args:
        frame: A split of ``data`` as returned by ``train_test_split``.

    Returns:
        A ``Dataset`` holding every column of ``frame`` except the ones in
        ``_NON_FEATURE_COLUMNS``.
    """
    # errors="ignore" makes the drop a no-op for columns that are not there,
    # which replaces a separate "is the column present?" check per split.
    features = frame.drop(columns=_NON_FEATURE_COLUMNS, errors="ignore")
    # The split shuffles the rows, so the pandas index is no longer a plain
    # range; preserve_index=False stops it from being stored as an extra
    # "__index_level_0__" column in the Dataset.
    return Dataset.from_pandas(features, preserve_index=False)


train_dataset = _to_model_dataset(train_data)
test_dataset = _to_model_dataset(test_data)

# Collator that batches examples using the tokenizer's padding rules
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Hyper-parameters and checkpoint settings for the fine-tuning run
training_options = dict(
    output_dir="./results",
    save_strategy="epoch",
    num_train_epochs=5,  # Increase the number of epochs
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)
training_args = TrainingArguments(**training_options)

# Load model: BERT with a sequence-classification head for the 3 sentiment classes
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
)

# Tie model, arguments, both dataset splits and the collator together
trainer_components = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": test_dataset,
    "data_collator": data_collator,
}
trainer = Trainer(**trainer_components)

# Set environment variable to avoid tokenizers parallelism warning
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

# Fine-tune the model on the training split
trainer.train()

# Score the held-out split: the predicted class is the index of the largest
# value along the last axis of the returned predictions
predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(axis=-1)
labels = test_dataset["label"]

# Overall accuracy and support-weighted F1 across the classes
accuracy, f1 = (
    accuracy_score(labels, preds),
    f1_score(labels, preds, average="weighted"),
)

print(f"Accuracy: {accuracy}, F1 Score: {f1}")

# Visualize the results
# Class names in label-id order: id 0, 1, 2.
class_names = ["negative", "neutral", "positive"]
report = classification_report(
    labels,
    preds,
    target_names=class_names,
    labels=[0, 1, 2],
    output_dict=True,
)
# Rows: one per class plus the aggregate entries of the report;
# columns: precision, recall, f1-score, support.
df_report = pd.DataFrame(report).transpose()

# Plotting the classification report.
# Rows and columns are selected by name rather than by position: a positional
# slice that only drops the last row keeps aggregate rows such as "accuracy"
# and "macro avg", which would then be drawn as if they were classes on an
# axis labelled "Classes". "support" is a count, not a 0-1 score, so it is
# left out of the chart as well.
score_columns = ["precision", "recall", "f1-score"]
fig, ax = plt.subplots(figsize=(10, 6))
df_report.loc[class_names, score_columns].plot(kind='bar', ax=ax)
plt.title('Classification Report')
plt.xlabel('Classes')
plt.ylabel('Scores')
plt.xticks(rotation=45)
plt.ylim(0, 1)
plt.show()