In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# reloading data
from datasets import load_from_disk

new_ds = load_from_disk("/content/drive/MyDrive/LHL/llm-project/new_ds_with_distilbert_predictions")

In [4]:
# reloading model
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained(
    "/content/drive/MyDrive/LHL/llm-project/lightly_tuned_distilbert_model"
)
tokenizer = AutoTokenizer.from_pretrained(
    "/content/drive/MyDrive/LHL/llm-project/lightly_tuned_distilbert_model"
)

In [5]:
# Check dataset column names
print(new_ds.column_names)

# Check first few label values
print("First 10 labels:", new_ds["train"]["label"][:10])

# Ensure labels are binary (should only contain {0,1})
print("Unique label values:", set(new_ds["train"]["label"]))

{'train': ['artist', 'album', 'genre', 'author', 'score', 'review', 'augmented_review', 'label'], 'test': ['artist', 'album', 'genre', 'author', 'score', 'review', 'augmented_review', 'label']}
First 10 labels: [0, 0, 1, 1, 1, 1, 0, 0, 0, 1]
Unique label values: {0, 1}


In [6]:
print(new_ds["train"].column_names)

['artist', 'album', 'genre', 'author', 'score', 'review', 'augmented_review', 'label']


In [7]:
# Define tokenization function
def tokenize_function(examples):
    return tokenizer(examples["review"], truncation=True, padding="max_length", max_length=512)

# Apply tokenization to the dataset
tokenized_ds = new_ds.map(tokenize_function, batched=True)

# Check that tokenization was successful
print(tokenized_ds["train"].column_names)

# Save the tokenized dataset to avoid redoing it later
tokenized_ds.save_to_disk("/content/drive/MyDrive/LHL/llm-project/tokenized_ds")


['artist', 'album', 'genre', 'author', 'score', 'review', 'augmented_review', 'label', 'input_ids', 'attention_mask']


Saving the dataset (0/1 shards):   0%|          | 0/19305 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1566 [00:00<?, ? examples/s]

In [9]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score
import numpy as np

# Define a function to compute accuracy
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)  # Convert logits to label predictions
    return {"accuracy": accuracy_score(labels, predictions)}

# Define training parameters (Optimized for GPU)
training_args = TrainingArguments(
    output_dir="./distilbert_enhanced",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=16,  # ✅ Increased for GPU
    per_device_eval_batch_size=16,  # ✅ Increased for GPU
    num_train_epochs=3,  # Keeping at 3 for consistency
    learning_rate=5e-5,  # Fine-tuning learning rate
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none"  # Avoid W&B logging
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],  # ✅ Use full dataset
    eval_dataset=tokenized_ds["train"],   # ✅ Still evaluating on train for now
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Check Trainer setup
print(trainer)

  trainer = Trainer(


<transformers.trainer.Trainer object at 0x7dbdceb64d90>


In [10]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6134,0.448699,0.807822
2,0.4327,0.215688,0.926962
3,0.2255,0.089734,0.972287


TrainOutput(global_step=3621, training_loss=0.4238991725513929, metrics={'train_runtime': 814.418, 'train_samples_per_second': 71.112, 'train_steps_per_second': 4.446, 'total_flos': 7671849393162240.0, 'train_loss': 0.4238991725513929, 'epoch': 3.0})

In [11]:
# Run model predictions on the test set
test_predictions = trainer.predict(tokenized_ds["test"])

import numpy as np
from sklearn.metrics import accuracy_score

# Convert logits to label predictions (0 = Negative, 1 = Positive)
predicted_labels = np.argmax(test_predictions.predictions, axis=-1)

# Extract actual labels from test dataset
true_labels = tokenized_ds["test"]["label"]

# Compute accuracy
test_accuracy = accuracy_score(true_labels, predicted_labels)

print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.7292


In [12]:
# Get size of training and test sets
train_size = len(tokenized_ds["train"])
test_size = len(tokenized_ds["test"])

# Print results
print(f"Training Set Size: {train_size}")
print(f"Test Set Size: {test_size}")
print(f"Total Dataset Size: {train_size + test_size}")

Training Set Size: 19305
Test Set Size: 1566
Total Dataset Size: 20871


In [15]:
from transformers import AutoModelForSequenceClassification

# Reload the model from the last saved checkpoint (ensuring a fresh start)
model = AutoModelForSequenceClassification.from_pretrained(
    "/content/drive/MyDrive/LHL/llm-project/lightly_tuned_distilbert_model"
)

In [16]:
training_args = TrainingArguments(
    output_dir="./distilbert_enhanced",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=16,  # ✅ Keep batch size optimized for GPU
    per_device_eval_batch_size=16,
    num_train_epochs=2,  # ✅ Reduce from 3 → 2
    learning_rate=5e-5,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none"
)



In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["train"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6094,0.436453,0.81238
2,0.4184,0.250682,0.905828


TrainOutput(global_step=2414, training_loss=0.5139082787742077, metrics={'train_runtime': 543.8201, 'train_samples_per_second': 70.998, 'train_steps_per_second': 4.439, 'total_flos': 5114566262108160.0, 'train_loss': 0.5139082787742077, 'epoch': 2.0})

In [18]:
# Run model predictions on the test set
test_predictions = trainer.predict(tokenized_ds["test"])

import numpy as np
from sklearn.metrics import accuracy_score

# Convert logits to label predictions (0 = Negative, 1 = Positive)
predicted_labels = np.argmax(test_predictions.predictions, axis=-1)

# Extract actual labels from test dataset
true_labels = tokenized_ds["test"]["label"]

# Compute accuracy
test_accuracy = accuracy_score(true_labels, predicted_labels)

print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.7261


In [19]:
from transformers import AutoModelForSequenceClassification

# Reload DistilBERT with higher dropout
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english",
    num_labels=2
)

# Manually adjust dropout layers
model.config.hidden_dropout_prob = 0.2  # ✅ Increase from default 0.1 → 0.2
model.config.attention_probs_dropout_prob = 0.2  # ✅ Increase from 0.1 → 0.2

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [20]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["train"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6041,0.428287,0.818855
2,0.4103,0.245057,0.908521


TrainOutput(global_step=2414, training_loss=0.5071920981166179, metrics={'train_runtime': 543.7216, 'train_samples_per_second': 71.011, 'train_steps_per_second': 4.44, 'total_flos': 5114566262108160.0, 'train_loss': 0.5071920981166179, 'epoch': 2.0})

In [21]:
# Run model predictions on the test set
test_predictions = trainer.predict(tokenized_ds["test"])

import numpy as np
from sklearn.metrics import accuracy_score

# Convert logits to label predictions (0 = Negative, 1 = Positive)
predicted_labels = np.argmax(test_predictions.predictions, axis=-1)

# Extract actual labels from test dataset
true_labels = tokenized_ds["test"]["label"]

# Compute accuracy
test_accuracy = accuracy_score(true_labels, predicted_labels)

print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.7344


In [22]:
model.save_pretrained("/content/drive/MyDrive/LHL/llm-project/tuned_distilbert_model")
tokenizer.save_pretrained("/content/drive/MyDrive/LHL/llm-project/tuned_distilbert_model")

('/content/drive/MyDrive/LHL/llm-project/tuned_distilbert_model/tokenizer_config.json',
 '/content/drive/MyDrive/LHL/llm-project/tuned_distilbert_model/special_tokens_map.json',
 '/content/drive/MyDrive/LHL/llm-project/tuned_distilbert_model/vocab.txt',
 '/content/drive/MyDrive/LHL/llm-project/tuned_distilbert_model/added_tokens.json',
 '/content/drive/MyDrive/LHL/llm-project/tuned_distilbert_model/tokenizer.json')

In [23]:
tokenized_ds.save_to_disk("/content/drive/MyDrive/LHL/llm-project/tuned_tokenized_ds")

Saving the dataset (0/1 shards):   0%|          | 0/19305 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1566 [00:00<?, ? examples/s]

In [24]:
trainer.save_model("/content/drive/MyDrive/LHL/llm-project/tuned_trainer_checkpoint")

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_from_disk

# Load the fine-tuned model
model = AutoModelForSequenceClassification.from_pretrained(
    "/content/drive/MyDrive/LHL/llm-project/tuned_distilbert_model"
)
tokenizer = AutoTokenizer.from_pretrained(
    "/content/drive/MyDrive/LHL/llm-project/tuned_distilbert_model"
)

# Load the dataset
tokenized_ds = load_from_disk("/content/drive/MyDrive/LHL/llm-project/tuned_tokenized_ds")