In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, KFold
from datasets import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from peft import LoraConfig, TaskType, get_peft_model

os.environ["WANDB_DISABLED"] = "true"  # Disable WandB

# Load the dataset
file_path = '/notebooks/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv'
df = pd.read_csv(file_path)
columns_to_keep = ['name', 'brand', 'primaryCategories', 'reviews.text', 'reviews.rating']
df_selected = df[columns_to_keep]

# Balance the dataset
positive_reviews = df_selected[df_selected['reviews.rating'] >= 4]
neutral_reviews = df_selected[df_selected['reviews.rating'] == 3]
negative_reviews = df_selected[df_selected['reviews.rating'] <= 2]

max_class_size = max(len(positive_reviews), len(neutral_reviews), len(negative_reviews))
positive_upsampled = resample(positive_reviews, replace=True, n_samples=max_class_size, random_state=42)
neutral_upsampled = resample(neutral_reviews, replace=True, n_samples=max_class_size, random_state=42)
negative_upsampled = resample(negative_reviews, replace=True, n_samples=max_class_size, random_state=42)

df_balanced = pd.concat([positive_upsampled, neutral_upsampled, negative_upsampled]).sample(frac=1, random_state=42)

df_balanced['labels'] = df_balanced['reviews.rating'].map(lambda rating: 0 if rating <= 2 else (1 if rating == 3 else 2))

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['reviews.text'], df_balanced['labels'], test_size=0.2, random_state=42
)

# Tokenize the data
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding='max_length', max_length=512, return_tensors='pt')
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding='max_length', max_length=512, return_tensors='pt')

# Create HuggingFace Datasets for the train and test sets
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'].tolist(),
    'attention_mask': train_encodings['attention_mask'].tolist(),
    'labels': y_train.tolist()
})

# LoRA Configuration
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_lin", "v_lin"]
)

# Load DistilBERT model with LoRA
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)
model = get_peft_model(model, lora_config)

# Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,  # You can adjust the number of epochs
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=1,
    no_cuda=False  # Use GPU if available
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

# Cross-Validation using 5 folds for more accurate evaluation
kf = KFold(n_splits=3)  # Changed from 2 to 5 folds
fold_metrics = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f"Running Fold {fold + 1}")

    # Create fold-specific datasets
    X_fold_train, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_fold_train, y_fold_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # Tokenize the fold training and validation data
    fold_train_encodings = tokenizer(X_fold_train.tolist(), truncation=True, padding='max_length', max_length=512, return_tensors='pt')
    fold_val_encodings = tokenizer(X_fold_val.tolist(), truncation=True, padding='max_length', max_length=512, return_tensors='pt')

    # Create fold-specific datasets
    fold_train_dataset = Dataset.from_dict({
        'input_ids': fold_train_encodings['input_ids'].tolist(),
        'attention_mask': fold_train_encodings['attention_mask'].tolist(),
        'labels': y_fold_train.tolist()
    })

    fold_val_dataset = Dataset.from_dict({
        'input_ids': fold_val_encodings['input_ids'].tolist(),
        'attention_mask': fold_val_encodings['attention_mask'].tolist(),
        'labels': y_fold_val.tolist()
    })

    # Update trainer datasets for this fold
    trainer.train_dataset = fold_train_dataset
    trainer.eval_dataset = fold_val_dataset

    # Fine-tune the model on the fold training data
    trainer.train()

    # Predict for the validation set
    predictions = trainer.predict(fold_val_dataset)

    # Convert logits to predictions
    preds = np.argmax(predictions.predictions, axis=1)

    # Calculate metrics
    accuracy = accuracy_score(y_fold_val, preds)
    precision = precision_score(y_fold_val, preds, average='weighted')
    recall = recall_score(y_fold_val, preds, average='weighted')
    f1 = f1_score(y_fold_val, preds, average='weighted')

    fold_metrics.append({
        'fold': fold + 1,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    })

# Calculate and print average metrics
avg_accuracy = np.mean([f['accuracy'] for f in fold_metrics])
avg_precision = np.mean([f['precision'] for f in fold_metrics])
avg_recall = np.mean([f['recall'] for f in fold_metrics])
avg_f1 = np.mean([f['f1'] for f in fold_metrics])

print(f"Avg Accuracy: {avg_accuracy}")
print(f"Avg Precision: {avg_precision}")
print(f"Avg Recall: {avg_recall}")
print(f"Avg F1 score: {avg_f1}")


2024-10-17 13:53:11.959757: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-17 13:53:11.959818: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-17 13:53:11.961141: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-17 13:53:11.969305: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Running Fold 1


Epoch,Training Loss,Validation Loss
1,0.6943,0.553215
2,0.5125,0.46508
3,0.4856,0.411837
4,0.3431,0.382135
5,0.4052,0.370627


Running Fold 2


Epoch,Training Loss,Validation Loss
1,0.4333,0.289155
2,0.3249,0.243419
3,0.2823,0.210019
4,0.3458,0.198603
5,0.2474,0.194467


Running Fold 3


Epoch,Training Loss,Validation Loss
1,0.2586,0.146135
2,0.2006,0.127005
3,0.1678,0.119983
4,0.1685,0.115293
5,0.1985,0.111018


Avg Accuracy: 0.9198147060742481
Avg Precision: 0.920401176453891
Avg Recall: 0.9198147060742481
Avg F1 score: 0.9197869333589356


Saving the model

In [6]:
# Save the final model and tokenizer after all cross-validation and training is done
output_dir = r"C:\Users\fmrol\Documents\GitHub\RobotReviews\fredsmeds_classifier"
trainer.save_model(output_dir)  # Save the fine-tuned model
tokenizer.save_pretrained(output_dir)  # Save the tokenizer

print(f"Final model and tokenizer saved to {output_dir}")


Final model and tokenizer saved to C:\Users\fmrol\Documents\GitHub\RobotReviews\fredsmeds_classifier


In [None]:
model.save_pretrained('./saved_model/fredsmeds_classifier')

# Saving the tokenizer with the same custom name
tokenizer.save_pretrained('./saved_model/fredsmeds_classifier')

# Optionally, save the training arguments if needed
training_args.save('./saved_model/fredsmeds_classifier/training_args.bin')

In [4]:
print(os.getcwd())


/notebooks
