In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline
)
import numpy as np
import warnings

# Optional: silence library warnings so the notebook output stays readable.
warnings.filterwarnings("ignore")

print("Libraries imported successfully.")

# Choose the compute device in order of preference: CUDA GPU, Apple MPS, CPU.
if torch.cuda.is_available():
    device_name = "cuda"
    device_message = f"Using GPU: {torch.cuda.get_device_name(0)}"
elif torch.backends.mps.is_available():
    device_name = "mps"
    device_message = "Using MPS (Apple Silicon GPU)"
else:
    device_name = "cpu"
    device_message = "Using CPU"

# `device` is consumed later when the model is moved onto the accelerator.
device = torch.device(device_name)
print(device_message)

Libraries imported successfully.
Using GPU: Tesla T4


In [None]:
# --- Input data ---
csv_file_path = 'sentiment_analysis_100k.csv'
score_column = 'Score'  # column holding the star rating
text_column = 'Text'    # column holding the review text

# --- Train/test split ---
random_state = 42  # For reproducibility
test_size = 0.2    # fraction of rows held out as the test split

# --- Base checkpoint ---
model_name = 'distilbert-base-uncased'
# model_name = 'bert-base-uncased'

# --- Label mapping (1-5 stars) ---
num_labels = 5  # 5 classes for scores 1-5

# --- Output locations for this run ---
output_dir = './rating_model_100k_results'
logging_dir = './rating_model_100k_logs'

# --- Optimisation hyperparameters ---
num_train_epochs = 3
learning_rate = 2e-5  # Standard learning rate, keep unless tuning later
weight_decay = 0.01   # Standard weight decay

# Larger batches than the original 16 / 32, for potentially faster training.
per_device_train_batch_size = 128
per_device_eval_batch_size = 256

In [None]:
# Load the raw reviews CSV into `df` and print a quick overview.
# Only the read itself is wrapped in try/except, and failures are re-raised after
# the friendly message: the following cells use `df` unconditionally, so carrying
# on without it would only surface later as a NameError (or silently reuse a
# stale `df` left over from an earlier run of the kernel).
try:
    df = pd.read_csv(csv_file_path)
except FileNotFoundError:
    print(f"Error: File not found at {csv_file_path}. Please check the path.")
    raise
except Exception as e:
    print(f"An error occurred while loading the data: {e}")
    raise

print(f"Successfully loaded data from {csv_file_path}")
print(f"Dataset shape: {df.shape}")
print("\nFirst 5 rows:")
print(df.head())
print("\nScore distribution:")
print(df[score_column].value_counts())

Successfully loaded data from sentiment_analysis_100k.csv
Dataset shape: (100000, 2)

First 5 rows:
   Score                                               Text
0      1  These Grape Leaves are very thick and veiny an...
1      1  I ordered this product and only received one b...
2      1  I've been buying ZICO with mango in 11.2 oz. t...
3      1  I am no stranger to health foods and even use ...
4      1  Fisher Macadamia Nut No Salt, 2-Pound Package....

Score distribution:
Score
1    20000
2    20000
3    20000
4    20000
5    20000
Name: count, dtype: int64


In [None]:
# Preprocessing: clean the loaded frame, derive zero-indexed labels, then build
# the Hugging Face train/test datasets.

# 1. Drop rows with a missing text or score. `.copy()` gives an independent frame
#    so the column assignments below never write into a view of another frame.
df = df.dropna(subset=[text_column, score_column]).copy()
df[text_column] = df[text_column].astype(str)  # Ensure text is string

# 2. Scores must be integers in the expected range (1-5).
try:
    df[score_column] = df[score_column].astype(int)
except ValueError:
    print(f"Error: Could not convert '{score_column}' to integers. Please check data.")
    # Stop here: continuing would build labels from unvalidated scores.
    raise
df = df[df[score_column].between(1, 5)].copy()  # Keep only scores 1 through 5
print(f"Validated scores. Data shape after validation: {df.shape}")

# All five score values are kept; 3-star reviews are not filtered out.

# 3. Create the label column (mapping 1-5 to 0-4).
#    Zero-indexed labels are required downstream, so Score 's' becomes Label 's-1'.
df['labels'] = df[score_column] - 1

# 4. Select relevant columns
df_final = df[[text_column, 'labels']].rename(columns={text_column: 'text'})

print("\nProcessed DataFrame sample (for 1-5 stars):")
print(df_final.head())
print("\nLabel distribution (0 corresponds to 1 star, 4 to 5 stars):")
# Sort index for clarity (0, 1, 2, 3, 4)
print(df_final['labels'].value_counts().sort_index())

# 5. Convert pandas DataFrame to Hugging Face Dataset.
#    preserve_index=False keeps the pandas index out of the dataset; once rows
#    have been dropped the index is no longer a plain range and would otherwise
#    be carried along as an extra `__index_level_0__` column.
hg_dataset = Dataset.from_pandas(df_final, preserve_index=False)

# 6. Split into train and test sets
train_test_split_dataset = hg_dataset.train_test_split(test_size=test_size, seed=random_state)

# Create a DatasetDict structure
dataset_dict = DatasetDict({
    'train': train_test_split_dataset['train'],
    'test': train_test_split_dataset['test']
})

print("\nDataset structure:")
print(dataset_dict)

Validated scores. Data shape after validation: (100000, 2)

Processed DataFrame sample (for 1-5 stars):
                                                text  labels
0  These Grape Leaves are very thick and veiny an...       0
1  I ordered this product and only received one b...       0
2  I've been buying ZICO with mango in 11.2 oz. t...       0
3  I am no stranger to health foods and even use ...       0
4  Fisher Macadamia Nut No Salt, 2-Pound Package....       0

Label distribution (0 corresponds to 1 star, 4 to 5 stars):
labels
0    20000
1    20000
2    20000
3    20000
4    20000
Name: count, dtype: int64

Dataset structure:
DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 80000
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 20000
    })
})


In [None]:
# === Cell 5: Tokenization ===

if 'dataset_dict' not in locals():
    # The dataset-building cell did not leave a `dataset_dict` behind.
    print("\nSkipping tokenization as dataset creation failed.")
else:
    # 1. Fetch the tokenizer that belongs to the selected checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print(f"Tokenizer loaded for model: {model_name}")

    # 2. Batch-level tokenization callback used by `map` below.
    def tokenize_function(examples):
        """Tokenize the "text" field of a batch, padded/truncated to 256 tokens.

        padding="max_length" makes every sequence the same length and
        truncation=True cuts anything longer. max_length can be tuned
        (e.g. 128, 256, 512) to trade review coverage against memory.
        """
        return tokenizer(
            examples["text"],
            max_length=256,
            truncation=True,
            padding="max_length",
        )

    # 3. Tokenize both splits; batched=True hands several examples to the
    #    callback at once for speed.
    tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

    # The raw 'text' column is not needed once the token ids exist.
    tokenized_datasets = tokenized_datasets.remove_columns(["text"])
    # The label column is already named 'labels', so no rename is required here.
    # Have the datasets return PyTorch tensors.
    tokenized_datasets.set_format("torch")

    print("\nTokenized dataset structure:")
    print(tokenized_datasets)
    print("\nExample of tokenized input:")
    print(tokenized_datasets['train'][0])

Tokenizer loaded for model: distilbert-base-uncased


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]


Tokenized dataset structure:
DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 80000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 20000
    })
})

Example of tokenized input:
{'labels': tensor(1), 'input_ids': tensor([  101,  1045,  2435,  2023,  2048,  3340,  2069,  2138,  1996,  7829,
         2001,  2006,  2051,  1998,  1999,  1996,  2203,  9733, 25416,  8630,
         2098,  2026,  2769,  1012,  2034,  2292,  2033,  2360,  2008,  2057,
         4521,  5699,  2489,  9485,  2035,  1996,  2051,  1012,  1045,  1005,
         1049,  1037,  2502,  5470,  1997,  3053,  2151,  5699,  2489,  2304,
         5622, 27108,  6610,   999,  2026,  2391,  1999,  3038,  2023,  2003,
         2008,  2057,  2024,  2025,  4895,  7011,  4328,  8017,  2007,  1996,
         2825,  2124,  2217,  3896,  1997,  5699,  2489,  9485,  1998,  2070,
         1997,  1996,  7976,  4086, 2445

In [None]:
# === Cell 6: Load Model ===
if 'tokenized_datasets' not in locals():
    # Tokenization did not produce `tokenized_datasets`, so there is nothing to train on.
    print("\nSkipping model loading as tokenization failed.")
else:
    # Pre-trained checkpoint with a sequence-classification head;
    # num_labels sets how many output classes the head has.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
    )

    # Place the model on the device selected at startup (GPU/MPS/CPU).
    model.to(device)
    print(f"\nModel '{model_name}' loaded for sequence classification with {num_labels} labels (for 1-5 stars).")
    print(f"Model moved to device: {device}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model 'distilbert-base-uncased' loaded for sequence classification with 5 labels (for 1-5 stars).
Model moved to device: cuda


In [None]:
# === Cell 7: Define Metrics ===

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``. The predicted
            class is the argmax of ``logits`` over its last axis.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
        The last three use ``average='weighted'`` (each class weighted by its
        support); switch to ``'macro'`` for an unweighted mean over classes.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)

    accuracy = accuracy_score(references, predicted_ids)
    weighted_precision, weighted_recall, weighted_f1, _support = precision_recall_fscore_support(
        references, predicted_ids, average='weighted'
    )

    return dict(
        accuracy=accuracy,
        f1=weighted_f1,
        precision=weighted_precision,
        recall=weighted_recall,
    )

print("Metrics computation function defined (for multi-class).")

Metrics computation function defined (for multi-class).


In [None]:
# === Cell 8: Training Arguments ===
# Paths and hyperparameters come from the configuration cell. The best checkpoint
# is chosen by the 'f1' metric, which compute_metrics reports as weighted F1.

if 'model' in locals(): # Check if model loaded
    training_args = TrainingArguments(
        output_dir=output_dir,                     # Directory to save model checkpoints
        logging_dir=logging_dir,                   # Directory for storing logs
        num_train_epochs=num_train_epochs,         # Total number of training epochs
        per_device_train_batch_size=per_device_train_batch_size, # Batch size per device during training
        per_device_eval_batch_size=per_device_eval_batch_size,   # Batch size for evaluation
        learning_rate=learning_rate,               # Learning rate
        weight_decay=weight_decay,                 # Strength of weight decay regularization
        seed=random_state,                         # Same seed as the train/test split, so the configured value governs training too
        dataloader_num_workers=4,                  # Worker processes used for data loading
        logging_steps=50,                          # Log metrics every X update steps
        eval_strategy="epoch",                     # Evaluate performance at the end of each epoch
        save_strategy="epoch",                     # Save model checkpoint at the end of each epoch
        load_best_model_at_end=True,               # Load the best model found during training at the end
        metric_for_best_model="f1",                # Use weighted F1 score to determine the best model
        greater_is_better=True,                    # F1 score should be maximized
        report_to="tensorboard",                   # Report logs to TensorBoard
        fp16=torch.cuda.is_available(),            # Use mixed precision (faster training on NVIDIA GPUs)
    )

    print("TrainingArguments defined.")
else:
    print("\nSkipping TrainingArguments definition as model loading failed.")

TrainingArguments defined.


In [None]:
# === Cell 9: Initialize Trainer ===
# The Trainer is only built when the model, the tokenized datasets and the
# training arguments all exist in the notebook namespace.
namespace = locals()  # captured here: locals() inside the generator below would see a different scope
prerequisites = ('model', 'tokenized_datasets', 'training_args')

if not all(name in namespace for name in prerequisites):
    print("\nSkipping Trainer initialization due to previous errors.")
else:
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    print("Trainer initialized successfully.")

Trainer initialized successfully.


In [None]:
# === Cell 10: Fine-tuning ===
# Runs the training loop and logs the resulting training metrics.
if 'trainer' in locals():
    print("\nStarting model fine-tuning...")
    try:
        train_result = trainer.train()
        print("Training finished.")
        # Optional: Log some training metrics
        metrics = train_result.metrics
        trainer.log_metrics("train", metrics)
        # trainer.save_metrics("train", metrics) # Can save if needed
    except Exception as e:
        print(f"\nAn error occurred during training: {e}")
        # Re-raise after the message: `trainer` still exists at this point, so
        # swallowing the error would let the evaluation and save cells run
        # against a model whose training did not complete.
        raise
else:
    print("\nSkipping training.")


Starting model fine-tuning...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.9589,0.926248,0.6008,0.596934,0.59711,0.6008
2,0.8594,0.878027,0.63075,0.628942,0.629586,0.63075
3,0.8013,0.870814,0.6389,0.637379,0.63656,0.6389


Training finished.
***** train metrics *****
  epoch                    =        3.0
  total_flos               = 14805177GF
  train_loss               =     0.9143
  train_runtime            = 0:22:05.28
  train_samples_per_second =    181.093
  train_steps_per_second   =      1.415


In [None]:
# === Cell 11: Evaluation ===
if 'trainer' not in locals():
    print("\nSkipping manual evaluation as trainer not initialized.")
else:
    print("\nEvaluating the model MANUALLY after training...")
    try:
        # Explicit evaluation pass; the returned metrics are printed, logged and saved.
        eval_results = trainer.evaluate()
        print("\nEvaluation Results:")
        print(eval_results)
        trainer.log_metrics("eval", eval_results)
        trainer.save_metrics("eval", eval_results)
    except Exception as err:
        print(f"\nAn error occurred during manual evaluation: {err}")


Evaluating the model MANUALLY after training...



Evaluation Results:
{'eval_loss': 0.8708140850067139, 'eval_accuracy': 0.6389, 'eval_f1': 0.6373794969788612, 'eval_precision': 0.6365604928432186, 'eval_recall': 0.6389, 'eval_runtime': 34.2089, 'eval_samples_per_second': 584.643, 'eval_steps_per_second': 2.309, 'epoch': 3.0}
***** eval metrics *****
  epoch                   =        3.0
  eval_accuracy           =     0.6389
  eval_f1                 =     0.6374
  eval_loss               =     0.8708
  eval_precision          =     0.6366
  eval_recall             =     0.6389
  eval_runtime            = 0:00:34.20
  eval_samples_per_second =    584.643
  eval_steps_per_second   =      2.309


In [None]:
# === Cell 12: Save the Final Model (Manual Call) ===
if 'trainer' not in locals():
    print("\nSkipping manual model saving as trainer not initialized.")
else:
    final_model_dir = f"{output_dir}/final_model"
    print(f"\nSaving the final model MANUALLY to {final_model_dir}...")
    try:
        # Explicit save of the trainer's current model into final_model_dir.
        trainer.save_model(final_model_dir)
        # NOTE(review): the tokenizer is not saved explicitly here. Enable the line
        # below if the directory must be self-contained - TODO confirm whether
        # trainer.save_model already writes the tokenizer files.
        # tokenizer.save_pretrained(final_model_dir)
        print(f"\nFinal model state saved to {final_model_dir}")
    except Exception as err:
        print(f"\nAn error occurred during manual model saving: {err}")


Saving the final model MANUALLY to ./rating_model_50k_results/final_model...

Final model state saved to ./rating_model_50k_results/final_model
