In [7]:
#IMPORTS
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

### Loading and Filtering the Reviews Data
This code loads the reviews dataset, keeps only the reviews rated 1, 2, 4, or 5, and draws a random sample of 10 of them. Each sampled review is then labelled 'Positive' (rating 4 or 5) or 'Negative' (rating 1 or 2) and displayed.


In [None]:
# Load the reviews data with the correct encoding
file_path = 'HARD-Arabic-Dataset-master/data/balanced-reviews.txt'  # Update with your actual file path
df = pd.read_csv(
    file_path,
    sep='\t',
    names=['no', 'Hotel name', 'rating', 'user type', 'room type', 'nights', 'review'],
    skiprows=1,  # skip the first line of the file; column names are supplied via `names`
    encoding='utf-16',
)

# Keep only reviews rated 1, 2, 4 or 5; rows with any other rating are dropped
df_filtered = df[df['rating'].isin([1, 2, 4, 5])]

# Draw a small sample to look at. The fixed seed makes the displayed rows
# reproducible across kernel restarts, and the min() guard keeps the cell
# from raising when fewer than 10 rows survive the filter.
selected_reviews = df_filtered.sample(n=min(10, len(df_filtered)), random_state=42)

def classify_review(row):
    """Map a review row to a sentiment label based on its rating.

    Args:
        row: Mapping-like object (for example a DataFrame row) with a
            'rating' entry.

    Returns:
        'Positive' for a rating of 4 or 5, 'Negative' for a rating of 1 or 2,
        and None for any other rating.
    """
    rating = row['rating']
    if rating in (4, 5):
        return 'Positive'
    if rating in (1, 2):
        return 'Negative'
    return None

# Label every sampled review from its rating, then show only the columns
# that matter for a quick visual check.
display_columns = ['no', 'review', 'rating', 'classification']
selected_reviews['classification'] = selected_reviews.apply(classify_review, axis=1)
selected_reviews.loc[:, display_columns]


### Load, Filter, and Split Reviews Dataset
This code loads the reviews dataset, filters reviews based on specific ratings (1, 2, 4, and 5), and limits the total number of reviews to a maximum of 1000. It classifies the reviews as 'Positive' or 'Negative' based on their rating, saves the classified data to a CSV file, and then splits the dataset into 70% training and 30% testing data, saving them as separate CSV files.


In [None]:
# Set the maximum number of reviews to use (None keeps every filtered review)
max_reviews = 1000

# Load the reviews data with the correct encoding
file_path = 'HARD-Arabic-Dataset-master/data/balanced-reviews.txt'  # Update with your actual file path
df = pd.read_csv(
    file_path,
    sep='\t',
    names=['no', 'Hotel name', 'rating', 'user type', 'room type', 'nights', 'review'],
    skiprows=1,  # skip the first line of the file; column names are supplied via `names`
    encoding='utf-16',
)

# Filter reviews with ratings 1, 2, 4, and 5.
# .copy() makes df_filtered an independent frame, so adding a column to it
# later does not write into a slice of `df` (which pandas reports as a
# SettingWithCopyWarning).
df_filtered = df[df['rating'].isin([1, 2, 4, 5])].copy()

# Limit the number of reviews if max_reviews is set.
# NOTE(review): head() keeps the first rows in file order; if the file is
# grouped by rating this subset may not be class-balanced -- confirm.
if max_reviews is not None:
    df_filtered = df_filtered.head(max_reviews).copy()

# Function to classify reviews based on the rating
def classify_review(row):
    """Return the sentiment label implied by a review's rating.

    Args:
        row: Mapping-like object (for example a DataFrame row) that exposes
            the rating under the key 'rating'.

    Returns:
        'Positive' when the rating is 4 or 5, 'Negative' when it is 1 or 2,
        otherwise None.
    """
    rating = row['rating']
    label_by_ratings = (('Positive', [4, 5]), ('Negative', [1, 2]))
    for label, ratings in label_by_ratings:
        if rating in ratings:
            return label
    return None

# Apply classification function to the dataset.
# assign() builds a new DataFrame instead of setting a column in place, so
# this works without a SettingWithCopyWarning even when df_filtered was
# produced by filtering another frame.
df_filtered = df_filtered.assign(
    classification=df_filtered.apply(classify_review, axis=1)
)

# Select necessary columns (id, review text, classification)
df_classified = df_filtered[['no', 'review', 'classification']]

# Save the classified reviews to a CSV file
output_file = 'classified.csv'
df_classified.to_csv(output_file, index=False, encoding='utf-16')

# Split the data into 70% training and 30% testing (fixed seed => same split every run)
train_data, test_data = train_test_split(df_classified, test_size=0.3, random_state=42)

# Sanity check: the split must neither lose nor duplicate rows
assert len(train_data) + len(test_data) == len(df_classified)

# Save the split datasets
train_data.to_csv('train_reviews.csv', index=False, encoding='utf-16')
test_data.to_csv('test_reviews.csv', index=False, encoding='utf-16')

print(f"Data saved to {output_file}, and split into train_reviews.csv and test_reviews.csv.")


### Load, Tokenize, and Train Arabic BERT Model
This code loads the training and testing datasets, applies label encoding for 'Positive' and 'Negative' reviews, and tokenizes the review texts using a pre-trained Arabic BERT tokenizer. The data is then converted into Hugging Face dataset format. A BERT model for binary classification is loaded, training arguments are set, and a `Trainer` is defined for training and evaluating the model. The model is trained and evaluated, and then saved for future use.


In [None]:
# Load the classified train and test data
train_file = 'train_reviews.csv'
test_file = 'test_reviews.csv'

# Rows without review text are dropped: read_csv parses an empty field as
# NaN (a float), and the tokenizer used below only accepts strings.
train_df = (
    pd.read_csv(train_file, encoding='utf-16')
    .dropna(subset=['review'])
    .reset_index(drop=True)
)
test_df = (
    pd.read_csv(test_file, encoding='utf-16')
    .dropna(subset=['review'])
    .reset_index(drop=True)
)

# Labeling the data (1 for Positive, 0 for Negative)
train_df = train_df.assign(label=(train_df['classification'] == 'Positive').astype(int))
test_df = test_df.assign(label=(test_df['classification'] == 'Positive').astype(int))

# Split texts and labels (astype(str) guarantees plain strings for the tokenizer)
train_texts = train_df['review'].astype(str).tolist()
train_labels = train_df['label'].tolist()
test_texts = test_df['review'].astype(str).tolist()
test_labels = test_df['label'].tolist()

# Load pre-trained tokenizer for Arabic BERT model
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv02")

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of examples for the Arabic BERT model.

    Padding is deliberately not applied here. The Trainer in this notebook is
    given a DataCollatorWithPadding, which pads each batch to its own longest
    sequence; padding every example up front would make that collator a
    no-op and waste compute on padding tokens.

    Args:
        examples: Batch mapping with a 'text' entry holding a list of strings.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        (the same limit the prediction cell of this notebook uses).
    """
    return tokenizer(examples['text'], truncation=True, max_length=512)

# Build the Hugging Face datasets from the text/label lists and tokenize
# each of them in batches.
train_dataset = Dataset.from_dict(
    {'text': train_texts, 'label': train_labels}
).map(tokenize_function, batched=True)

test_dataset = Dataset.from_dict(
    {'text': test_texts, 'label': test_labels}
).map(tokenize_function, batched=True)

# Load the BERT checkpoint with a two-class classification head
model = AutoModelForSequenceClassification.from_pretrained(
    "aubmindlab/bert-base-arabertv02",
    num_labels=2,
)

# Define accuracy metric function
def compute_metrics(p):
    """Compute accuracy for one evaluation pass.

    Args:
        p: Object exposing `predictions` (per-class scores, one row per
            example) and `label_ids` (the true class indices).

    Returns:
        Dict with a single "accuracy" entry.
    """
    predicted_classes = np.argmax(p.predictions, axis=1)  # highest-scoring class per example
    return {"accuracy": accuracy_score(p.label_ids, predicted_classes)}

# Define training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Evaluate after each epoch
    save_strategy="epoch",        # Write a checkpoint after each epoch
    logging_dir='./logs',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=10
)

# Create a data collator that dynamically pads inputs
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics  # Add the metrics function here
)

# Train the model
trainer.train()

# Evaluate the model. The result is captured and printed explicitly: a
# notebook only auto-displays the last expression of a cell, so a bare
# trainer.evaluate() in the middle of the cell would show nothing.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Save the model and tokenizer side by side for future use
model.save_pretrained("arabic_bert_review_classifier")
tokenizer.save_pretrained("arabic_bert_review_classifier")

### Load Data, Predict Classifications, and Display Confusion Matrix
This code block loads the dataset of hotel reviews, filters them based on their ratings (1, 2 for 'Negative' and 4, 5 for 'Positive'), and applies the fine-tuned Arabic BERT model saved in the previous step to predict whether each review is 'Positive' or 'Negative'. It calculates the accuracy of the model's predictions, generates a confusion matrix to visualize the classification performance, and saves the final output with the actual and predicted classifications to a CSV file. Note that the predictions are made over all filtered reviews, which include the reviews the model was trained on, so the reported accuracy is not a purely held-out estimate.


In [None]:
# Load the reviews data with the correct encoding
file_path = 'HARD-Arabic-Dataset-master/data/balanced-reviews.txt'  # Update with your actual file path
df = pd.read_csv(
    file_path,
    sep='\t',
    names=['no', 'Hotel name', 'rating', 'user type', 'room type', 'nights', 'review'],
    skiprows=1,  # skip the first line of the file; column names are supplied via `names`
    encoding='utf-16',
)

# Filter reviews with ratings 1, 2, 4, and 5.
# .copy() makes df_filtered an independent frame, so the columns added to it
# further down do not write into a slice of `df` (SettingWithCopyWarning).
df_filtered = df[df['rating'].isin([1, 2, 4, 5])].copy()

# Optional cap on the number of reviews to score (None = score all of them)
eval_max_reviews = None
if eval_max_reviews is not None:
    df_filtered = df_filtered.head(eval_max_reviews).copy()

# Function to classify reviews based on the rating
def classify_review(row):
    """Derive the ground-truth sentiment label from a review's rating.

    Args:
        row: Mapping-like object (for example a DataFrame row) holding the
            rating under 'rating'.

    Returns:
        'Positive' if the rating is 4 or 5, 'Negative' if it is 1 or 2,
        and None for anything else.
    """
    rating = row['rating']
    return 'Positive' if rating in [4, 5] else ('Negative' if rating in [1, 2] else None)

# Apply classification function to the dataset.
# assign() returns a new DataFrame rather than setting a column in place, so
# no SettingWithCopyWarning is raised even when df_filtered was produced by
# filtering another frame.
df_filtered = df_filtered.assign(
    classification=df_filtered.apply(classify_review, axis=1)
)

# Load the tokenizer and the fine-tuned model from the same directory: the
# training cell saves both there, so this guarantees they match and avoids
# another download from the hub.
tokenizer = AutoTokenizer.from_pretrained("arabic_bert_review_classifier")
model = AutoModelForSequenceClassification.from_pretrained("arabic_bert_review_classifier")  # Load the trained model

# Use GPU if available for faster performance
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to predict classifications in batches with progress bar
def predict_in_batches(reviews, batch_size):
    """Predict a class index for every review, processing them in batches.

    Relies on the notebook-level `tokenizer`, `model` and `device`.

    Args:
        reviews: List of review strings.
        batch_size: Number of reviews per forward pass; must be positive.

    Returns:
        List with one predicted class index per review, in input order.

    Raises:
        ValueError: If `batch_size` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    predictions = []
    for start in tqdm(range(0, len(reviews), batch_size), desc="Processing Batches", unit="batch"):
        batch_reviews = reviews[start:start + batch_size]
        # padding=True pads only to the longest review in this batch (still
        # truncated at 512 tokens) instead of padding every batch to 512,
        # which avoids running the model over long runs of padding tokens.
        inputs = tokenizer(
            batch_reviews,
            padding=True,
            truncation=True,
            return_tensors='pt',
            max_length=512,
        ).to(device)
        with torch.no_grad():  # inference only: no gradients needed
            outputs = model(**inputs)
        preds = np.argmax(outputs.logits.cpu().numpy(), axis=1)  # Move to CPU before converting to NumPy
        predictions.extend(preds)
    return predictions

# Predict classifications for all reviews in batches
predictions = predict_in_batches(df_filtered['review'].tolist(), batch_size=64)
assert len(predictions) == len(df_filtered), "expected exactly one prediction per review"

# Map the predictions to the corresponding labels (class index 1 = Positive,
# the same encoding used when the training labels were built).
# assign() returns new frames, so no column is written in place into a frame
# that may be a filtered slice of `df`.
predicted_labels = ['Positive' if pred == 1 else 'Negative' for pred in predictions]
df_filtered = df_filtered.assign(model_classification=predicted_labels)
df_filtered = df_filtered.assign(
    model_correct=df_filtered['classification'] == df_filtered['model_classification']
)

# Calculate accuracy
accuracy = df_filtered['model_correct'].mean() * 100
print(f"Model Accuracy: {accuracy:.2f}%")

# Generate confusion matrix (rows = true label, columns = predicted label)
class_labels = ['Positive', 'Negative']
y_true = df_filtered['classification']
y_pred = df_filtered['model_classification']
cm = confusion_matrix(y_true, y_pred, labels=class_labels)

# Plot confusion matrix on an explicit Axes
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("Confusion Matrix")
plt.show()

# Select necessary columns for the final output
output_df = df_filtered[['no', 'Hotel name', 'rating', 'user type', 'room type', 'nights', 'review', 'classification', 'model_classification', 'model_correct']]

# Save the final DataFrame to a CSV file
output_file = 'model_predictions.csv'
output_df.to_csv(output_file, index=False, encoding='utf-16')

print(f"Predictions saved to {output_file}.")

### Display Examples of True Positive, True Negative, False Positive, and False Negative
This block defines conditions for four classification outcomes: True Positive, True Negative, False Positive, and False Negative. It then takes up to the first 5 review examples from each category and prints their review ID, rating, review text, model's predicted classification, and the true classification for comparison.


In [None]:
# Split the scored reviews into the four confusion-matrix outcomes
def _outcome_rows(actual, predicted):
    """Return the rows of df_filtered with the given true and predicted labels."""
    has_actual = df_filtered['classification'] == actual
    has_predicted = df_filtered['model_classification'] == predicted
    return df_filtered[has_actual & has_predicted]

true_positive = _outcome_rows('Positive', 'Positive')
true_negative = _outcome_rows('Negative', 'Negative')
false_positive = _outcome_rows('Negative', 'Positive')  # truly Negative, predicted Positive
false_negative = _outcome_rows('Positive', 'Negative')  # truly Positive, predicted Negative

# Function to display 5 examples for each case
def display_examples(df, case_name):
    """Print up to the first five rows of `df` under a heading for `case_name`.

    Args:
        df: DataFrame with the columns 'no', 'rating', 'review',
            'model_classification' and 'classification'.
        case_name: Outcome name used in the heading line.
    """
    print(f"\n{case_name} Examples:")
    separator = "-" * 80
    for _, example in df.head(5).iterrows():
        # One print call per example; sep="\n" puts each field on its own line
        print(
            f"Review ID: {example['no']}",
            f"Rating: {example['rating']}",
            f"Review: {example['review']}",
            f"Model Classification: {example['model_classification']}",
            f"True Classification: {example['classification']}",
            separator,
            sep="\n",
        )

# Show examples for each of the four outcomes, in a fixed order
outcome_cases = (
    ("True Positive", true_positive),
    ("True Negative", true_negative),
    ("False Positive", false_positive),
    ("False Negative", false_negative),
)
for case_name, case_rows in outcome_cases:
    display_examples(case_rows, case_name)
