In [None]:
import os
import logging
import pandas as pd
import matplotlib.pyplot as plt
import regex as re
import torch
import warnings
from typing import Dict, Any
from torch.utils.data import Dataset
from transformers import (
    pipeline,
    AutoModelForSequenceClassification, 
    AutoTokenizer, 
    TrainingArguments, 
    Trainer, 
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score, 
    classification_report
)

In [None]:
# Load the cleaned review data; later cells read its 'reviewText' column.
df = pd.read_csv(filepath_or_buffer='../data/cleaned_data.csv')

#### Data Preprocessing

In [None]:
# Histogram of per-review word counts. The x-axis is the review length
# (binned word count) and the y-axis is how many reviews fall in each bin.
df['reviewText'].str.split().str.len().plot(kind='hist', bins=50, range=(0, 500))
plt.xlabel('Review Length (words)')
plt.ylabel('Number of Reviews')
plt.title('Review Length Distribution')

In [None]:
# Word count of every review, computed once and reused by all checks below
# instead of re-splitting the whole column for each threshold.
word_counts = df['reviewText'].str.split().str.len()

# Check the maximum word count
max_word_count = word_counts.max()
print(f"Maximum word count: {max_word_count}")

# Check distribution at higher thresholds
for threshold in [50, 100, 500, 1000, 2000, 3000]:
    count = (word_counts > threshold).sum()
    print(f"Reviews with more than {threshold} words: {count}")

In [None]:
# Remove reviews with more than 100 words.
threshold = 100  # maximum word count a review may have to be kept

# Row count before filtering, used for the summary printed below
original_size = len(df)

# True for reviews at or below the threshold. Rows whose word count is
# missing compare as False and are therefore dropped as well.
mask = df['reviewText'].str.split().str.len() <= threshold

# Filter and renumber the index in one non-inplace chain. This yields a fresh
# DataFrame rather than mutating a filtered slice in place, which avoids the
# SettingWithCopyWarning pandas can raise on later column assignments.
df = df[mask].reset_index(drop=True)

# Row count after filtering
new_size = len(df)

print(f"Original data size: {original_size}")
print(f"New data size: {new_size}")
print(f"Total of {original_size - new_size} reviews were deleted")

In [None]:
# Regex cleaning
# Substitution steps, compiled once when this cell runs and applied in order.
# The order matters: URLs, hashtags and usernames must be removed while their
# punctuation markers ('/', '#', '@') are still present.
_CLEAN_PATTERNS = [
    (re.compile(r'http\S+'), ''),     # URLs: any token starting with "http"
    (re.compile(r'#\w+'), ''),        # hashtags
    (re.compile(r'@\w+'), ''),        # usernames
    (re.compile(r'[^\w\s]'), ''),     # remaining punctuation (underscores are kept)
    (re.compile(r'\s+'), ' '),        # collapse whitespace runs into a single space
]
# NOTE(review): the original list ended with (r'\s\.?\s', ' '), commented as
# "remove single characters between spaces". After the two steps above the text
# contains no '.' and no adjacent whitespace, so that pattern could never match;
# it is dropped here without changing any output. If single-character tokens
# really should be removed, that needs a deliberate, separate change.


def clean_text(text):
    """
    Normalize a review string for downstream sentiment labelling.

    Removes URLs, hashtags, @usernames and punctuation, collapses whitespace,
    strips leading/trailing spaces and lower-cases the result.

    Args:
        text (str): Input text to be cleaned. ``None`` and float NaN are
            treated as empty text; any other non-string value is converted
            with ``str()`` before cleaning.
    Returns:
        str: Cleaned, lower-cased text.
    """
    # Missing values (None / NaN) become an empty string instead of raising.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Apply all patterns in sequence
    for pattern, repl in _CLEAN_PATTERNS:
        text = pattern.sub(repl, text)

    return text.strip().lower()

In [None]:
# Replace each remaining review with its cleaned form
df['reviewText'] = df['reviewText'].apply(clean_text)

#### Label Data

In [None]:
# Create a Label column
# Load a sentiment analysis model
sentiment_pipeline = pipeline("sentiment-analysis", model="tabularisai/robust-sentiment-analysis")

# Label the whole column with one batched pipeline call instead of invoking the
# pipeline once per row. `truncation=True` guards against any review whose
# token count exceeds the model's maximum input length.
sentiment_results = sentiment_pipeline(
    df['reviewText'].tolist(),
    batch_size=32,
    truncation=True,
)

# Each result is a dict holding the predicted label; results come back in
# input order, so they line up with the DataFrame rows.
df['sentiment'] = [result["label"] for result in sentiment_results]

df.head()

In [None]:
# Keep only the columns whose name does not start with 'Unnamed'
is_unnamed = df.columns.str.match('Unnamed')
df = df.loc[:, ~is_unnamed]

In [None]:
# Share of each predicted sentiment label, drawn as a pie chart
sentiment_counts = df['sentiment'].value_counts()
sentiment_counts.plot.pie(autopct='%1.1f%%', startangle=45)
plt.title('Sentiment Distribution')

#### Convert text labels to numbers

In [None]:
# Collapse the five text labels into three numeric classes:
# 0 = negative, 1 = positive, 2 = neutral.
label_mapping = {
    "Very Negative": 0,
    "Negative": 0,
    "Very Positive": 1,
    "Positive": 1,
    "Neutral": 2,
}
df['label'] = df['sentiment'].map(label_mapping)

In [None]:
# Number of reviews per numeric label, drawn as a bar chart
label_counts = df['label'].value_counts()
label_counts.plot.bar(color=['green', 'blue', 'red'])
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment')
plt.ylabel('Count')

#### Split Data

In [None]:
# Split the data into training, validation, and testing sets (80/10/10).
# The targets are the numeric 'label' column rather than the text 'sentiment'
# column: the dataset class turns each target into a tensor, which is not
# possible for strings. `astype(int)` fails fast if any sentiment was left
# unmapped (NaN) by the label mapping.
train_text, temp_text, train_sentiment, temp_sentiment = train_test_split(
    df["reviewText"].tolist(), df["label"].astype(int).tolist(), test_size=0.2, random_state=42
)

# Halve the held-out 20% into validation and test sets
val_text, test_text, val_sentiment, test_sentiment = train_test_split(
    temp_text, temp_sentiment, test_size=0.5, random_state=42
)

#### Tokenization

In [None]:
# Pretrained checkpoint to fine-tune, and the directory for training outputs.
model_checkpoint = "distilbert-base-uncased"
# NOTE(review): 'semtiment' looks like a typo for 'sentiment'. It is left
# unchanged because other code may already load from this path.
output_dir = 'models/semtiment_model'

# Create the output directory up front (no error if it already exists)
os.makedirs(output_dir, exist_ok=True)

# Load the tokenizer, then the model with a 3-class classification head
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=3,
)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

#### Create a dataset class 

In [None]:
def tokenize_texts(texts):
    """
    Tokenize a list of texts with the module-level pretrained tokenizer.

    Sequences are truncated and padded, and the result is returned as
    PyTorch tensors inside a BatchEncoding object.
    """
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    return encoded

class SentimentDataset(Dataset):
    """
    Custom PyTorch Dataset for sentiment classification.

    Args:
        encodings (Dict): Tokenized input encodings; every value is indexed by
            sample position and may hold tensors or plain lists.
        labels (List): Corresponding sentiment labels, one per sample.
    """
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors, including a 'labels' entry."""
        # torch.as_tensor passes existing tensors through and converts lists.
        # Wrapping an existing tensor in torch.tensor() would emit a
        # copy-construct UserWarning for every item fetched.
        item = {key: torch.as_tensor(val[idx]) for key, val in self.encodings.items()}
        # Class indices are stored as int64 (torch.long).
        item['labels'] = torch.as_tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Number of samples, taken from the label list."""
        return len(self.labels)

# Tokenize each split and pair it with its targets (train, validation, test)
train_dataset, val_dataset, test_dataset = (
    SentimentDataset(tokenize_texts(split_texts), split_targets)
    for split_texts, split_targets in (
        (train_text, train_sentiment),
        (val_text, val_sentiment),
        (test_text, test_sentiment),
    )
)

#### Train the Model

In [None]:
def compute_metrics(pred):
    """
    Compute evaluation metrics for the model.

    Args:
        pred: Prediction results from Trainer; `label_ids` holds the true
            labels and `predictions` holds per-class scores.
    Returns:
        Dict with 'accuracy' and macro-averaged 'precision', 'recall', 'f1'.
    """
    labels = pred.label_ids
    # The predicted class is the index of the highest score on the last axis
    preds = pred.predictions.argmax(-1)

    # Compute accuracy
    accuracy = accuracy_score(labels, preds)

    # Generate classification report. zero_division=0 keeps the same values as
    # the default (undefined metrics count as 0) but does not emit a warning
    # each time a class receives no predictions.
    class_report = classification_report(labels, preds, output_dict=True, zero_division=0)
    macro_avg = class_report['macro avg']

    return {
        'accuracy': accuracy,
        'precision': macro_avg['precision'],
        'recall': macro_avg['recall'],
        'f1': macro_avg['f1-score']
    }

def train(
    train_dataset: Dataset, 
    val_dataset: Dataset,
    batch_size: int = 16,
    num_train_epochs: int = 3
):
    """
    Fine-tune the module-level `model` with the Hugging Face Trainer.

    Evaluation and checkpointing both run once per epoch; the checkpoint with
    the best 'accuracy' is reloaded when training ends.

    Args:
        train_dataset (Dataset): Training dataset
        val_dataset (Dataset): Validation dataset, evaluated every epoch
        batch_size (int): Per-device batch size for training and evaluation
        num_train_epochs (int): Number of training epochs

    Returns:
        The module-level `model` object that was passed to the Trainer
    """
    # Logs are written to a sub-directory of the output directory
    log_dir = f'{output_dir}/logs'

    # NOTE(review): recent transformers releases rename `evaluation_strategy`
    # to `eval_strategy` -- confirm against the installed version.
    training_args = TrainingArguments(
        output_dir=output_dir,
        logging_dir=log_dir,
        logging_steps=10,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=2e-5,
        warmup_steps=500,
        weight_decay=0.01,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='accuracy',
    )

    # Stop early after 3 evaluations without improvement
    early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping],
    )

    trainer.train()

    return model

#### Model Evaluation

In [None]:
def evaluate(test_dataset: Dataset):
    """
    Score the module-level `model` on a held-out dataset.

    Builds a Trainer with default arguments solely to run evaluation with
    `compute_metrics`.

    Args:
        test_dataset (Dataset): Test dataset
    Returns:
        Dictionary of evaluation metrics
    """
    eval_trainer = Trainer(model=model, compute_metrics=compute_metrics)
    return eval_trainer.evaluate(test_dataset)

#### Save & Load the Model for Future Use

In [None]:
def save_model(model, tokenizer):
    """Write the model, then the tokenizer, to `output_dir` via `save_pretrained`."""
    for artifact in (model, tokenizer):
        artifact.save_pretrained(output_dir)

In [None]:
# Fine-tune on the training split, validating on the validation split
trained_model = train(train_dataset=train_dataset, val_dataset=val_dataset)

In [None]:

# Report metrics on the held-out test split
evaluate(test_dataset=test_dataset)

In [None]:

# Save the model
# save_model requires both the model and the tokenizer; calling it with no
# arguments raises a TypeError.
save_model(trained_model, tokenizer)