<a href="https://colab.research.google.com/github/guanyaohan/HW1/blob/master/Fully_Finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from transformers import Trainer, TrainingArguments
import numpy as np
import torch
from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import numpy as np
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

import matplotlib.pyplot as plt

import torch
from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import numpy as np

from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Load SST2 dataset
print("Loading SST2 dataset...")
dataset = load_dataset("sst2")

# Log original dataset sizes
for split in dataset.keys():
    print(f"Original {split} dataset size: {len(dataset[split])}")

# Prepare datasets
print("Preparing datasets...")
train_dataset = dataset["train"]
original_val_dataset = dataset["validation"]

# Split the original validation split in half: the first half becomes the new
# validation set and the remainder becomes the held-out test set.
# `select` is used instead of `Dataset.from_dict(original_val_dataset[...])` so
# the rows are not materialised as Python lists and both halves keep the same
# feature schema as the split they were cut from.
val_size = len(original_val_dataset) // 2
val_dataset = original_val_dataset.select(range(val_size))
test_dataset = original_val_dataset.select(range(val_size, len(original_val_dataset)))

print(f"Train dataset size: {len(train_dataset)}")
print(f"New validation dataset size: {len(val_dataset)}")
print(f"New test dataset size: {len(test_dataset)}")

# Fetch the pretrained tokenizer and the sequence-classification model; both
# are loaded from the same checkpoint name.
print("Loading RoBERTa tokenizer and model...")
checkpoint_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(checkpoint_name)
model = RobertaForSequenceClassification.from_pretrained(checkpoint_name)

# Tokenize a batch of examples
def tokenize_function(examples):
    """Tokenize the ``"sentence"`` field of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 128 tokens, so all
    encoded examples have the same length.

    Args:
        examples: Mapping with a ``"sentence"`` entry holding the text to encode.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    sentences = examples["sentence"]
    encoded = tokenizer(
        sentences,
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    return encoded

# Tokenize a split and report its label distribution
def process_dataset(dataset, split_name):
    """Tokenize one dataset split and prepare its columns for training.

    The split is mapped through ``tokenize_function`` in batches, the
    ``sentence`` and ``idx`` columns are dropped, and ``label`` is renamed to
    ``labels``. The label distribution and the resulting size are printed.

    Args:
        dataset: Split to process; must expose ``sentence``, ``idx`` and
            ``label`` columns.
        split_name: Name used in the progress and summary messages.

    Returns:
        The tokenized split with a ``labels`` column.
    """
    print(f"Processing {split_name} dataset...")
    tokenized = dataset.map(tokenize_function, batched=True, remove_columns=['sentence', 'idx'])
    tokenized = tokenized.rename_column("label", "labels")

    # Log label distribution. The NumPy results are converted to plain Python
    # ints so the printed dict looks the same regardless of how the installed
    # NumPy version formats its scalar types.
    unique_labels, counts = np.unique(tokenized['labels'], return_counts=True)
    distribution = dict(zip(unique_labels.tolist(), counts.tolist()))
    print(f"{split_name} dataset label distribution: {distribution}")
    print(f"{split_name} dataset - Processed size: {len(tokenized)}")
    return tokenized

# Process the three splits in a fixed order (train, validation, test) and
# collect the results under their split names.
split_sources = (
    ("train", train_dataset),
    ("validation", val_dataset),
    ("test", test_dataset),
)
tokenized_datasets = {
    split_label: process_dataset(split_data, split_label)
    for split_label, split_data in split_sources
}


class CustomTrainer(Trainer):
    """Trainer that also logs the accuracy of the current training batch.

    The loss is a plain cross-entropy over the model's logits. Whenever the
    global step is a multiple of ``args.logging_steps`` the batch accuracy is
    logged under the ``train_accuracy`` key.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the cross-entropy loss and periodically log batch accuracy.

        Args:
            model: The model to run; called as ``model(**inputs)``.
            inputs: Batch mapping that contains a ``"labels"`` entry.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted so that Trainer versions which pass
                this keyword do not fail with a ``TypeError``; it is not used
                by this loss.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs`` is set.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        # This method is also reached from evaluation passes, where the model
        # is in eval mode. Restricting the logging to training mode keeps
        # validation/test batches out of the "train_accuracy" series.
        should_log_accuracy = (
            model.training
            and self.args.logging_steps > 0
            and self.state.global_step % self.args.logging_steps == 0
        )
        if should_log_accuracy:
            # The accuracy is only a diagnostic; keep it out of the autograd graph.
            with torch.no_grad():
                predictions = torch.argmax(logits, dim=-1)
                accuracy = torch.sum(predictions == labels).item() / len(labels)
            self.log({"train_accuracy": accuracy})
        return (loss, outputs) if return_outputs else loss

def compute_metrics(eval_pred):
    """Turn evaluation outputs into an accuracy metric.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        Dict with a single ``"accuracy"`` entry comparing the labels with the
        arg-max over the last axis of the logits.
    """
    scores, gold_labels = eval_pred
    predicted_labels = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(gold_labels, predicted_labels)
    return {"accuracy": accuracy}


# Training configuration: one epoch, metrics logged every 5 steps, evaluation
# every 25 steps and a checkpoint every 50 steps.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=5,
    evaluation_strategy="steps",
    eval_steps=25,
    save_steps=50,
    # Without a limit every checkpoint written during the run stays on disk.
    # Cap the number kept; per the TrainingArguments documentation the best
    # checkpoint is still retained when load_best_model_at_end is enabled.
    save_total_limit=2,
    load_best_model_at_end=True,
)


# Assemble the trainer: the train split drives optimisation, the validation
# split is used for the periodic evaluations.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_datasets["train"],
    "eval_dataset": tokenized_datasets["validation"],
    "compute_metrics": compute_metrics,
}
trainer = CustomTrainer(**trainer_kwargs)


print("Starting training...")
train_result = trainer.train()

# Evaluate the model on the test set.
# A dedicated "test" prefix is used so the resulting metrics are reported as
# test_* keys. With the default prefix they would be recorded as eval_* entries
# in trainer.state.log_history and be indistinguishable from the validation
# evaluations that ran during training.
print("Evaluating on test set...")
test_results = trainer.evaluate(tokenized_datasets["test"], metric_key_prefix="test")

# Print test results
print(f"Test set results: {test_results}")


# Rebuild per-step metric series from the trainer's log history.
# `steps`, `train_loss` and `train_acc` are parallel lists (None marks a value
# that was not logged for that step); `val_steps` and `val_acc` are parallel
# lists for the evaluation entries.
train_loss = []
train_acc = []
val_acc = []
steps = []
val_steps = []

# Most recent step seen in any log entry; used for entries without a usable
# position of their own.
last_step = 0
for log in trainer.state.log_history:
    if 'step' in log:
        last_step = log['step']

    if 'loss' in log and 'step' in log:
        steps.append(log['step'])
        train_loss.append(log['loss'])

        # Reserve the matching accuracy slot; it is filled in below if an
        # accuracy entry for the same step follows.
        if len(train_acc) < len(steps):
            train_acc.append(None)

    if 'train_accuracy' in log:

        if len(steps) > 0 and steps[-1] == last_step:
            # Same step as the latest recorded point: attach the accuracy to it.
            train_acc[-1] = log['train_accuracy']
        else:
            # Accuracy logged for a step that has no loss entry: add a new point.
            steps.append(last_step)
            train_acc.append(log['train_accuracy'])
            train_loss.append(None)

    if 'eval_accuracy' in log and 'step' in log:
        val_steps.append(log['step'])
        val_acc.append(log['eval_accuracy'])


# Pad the three training series to a common length. The padding step falls back
# to `last_step` when no step was collected, so an empty log history no longer
# raises IndexError on `steps[-1]`.
max_length = max(len(steps), len(train_loss), len(train_acc))
pad_step = steps[-1] if steps else last_step
steps = steps + [pad_step] * (max_length - len(steps))
train_loss = train_loss + [None] * (max_length - len(train_loss))
train_acc = train_acc + [None] * (max_length - len(train_acc))


# Three stacked panels sharing the x axis: training loss, training accuracy
# and validation accuracy.
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(10, 15), sharex=True)

# Keep only the points where the metric was actually recorded.
loss_steps = [step for step, loss in zip(steps, train_loss) if loss is not None]
loss_values = [loss for loss in train_loss if loss is not None]
acc_steps = [step for step, acc in zip(steps, train_acc) if acc is not None]
acc_values = [acc for acc in train_acc if acc is not None]

# One row per panel: axis, x values, y values, legend label, colour,
# y-axis label, title.
panels = (
    (ax1, loss_steps, loss_values, 'Train Loss', 'blue', 'Loss', 'Training Loss'),
    (ax2, acc_steps, acc_values, 'Train Accuracy', 'green', 'Accuracy', 'Training Accuracy'),
    (ax3, val_steps, val_acc, 'Validation Accuracy', 'red', 'Accuracy', 'Validation Accuracy'),
)
for axis, xs, ys, series_label, series_color, y_label, panel_title in panels:
    axis.plot(xs, ys, label=series_label, color=series_color)
    axis.set_ylabel(y_label)
    axis.set_title(panel_title)
    axis.legend()

# Only the bottom panel carries the x-axis label.
ax3.set_xlabel('Steps')
plt.tight_layout()
plt.savefig('training_metrics_fixed.png')
plt.close()

print("Training completed. Fixed metrics plot saved as 'training_metrics_fixed.png'.")


def _latest_recorded(values):
    """Return the last entry of *values* that is not None, or 'N/A' if there is none."""
    for value in reversed(values):
        if value is not None:
            return value
    return 'N/A'


# Final values of each series.
final_val_acc = val_acc[-1] if val_acc else 'N/A'
print(f"Final training loss: {_latest_recorded(train_loss)}")
print(f"Final training accuracy: {_latest_recorded(train_acc)}")
print(f"Final validation accuracy: {final_val_acc}")


# How many points each series holds (None placeholders are not counted).
loss_record_count = len([loss for loss in train_loss if loss is not None])
acc_record_count = len([acc for acc in train_acc if acc is not None])
print(f"Number of training steps: {len(steps)}")
print(f"Number of training loss records: {loss_record_count}")
print(f"Number of training accuracy records: {acc_record_count}")
print(f"Number of validation accuracy records: {len(val_acc)}")

# Dump the raw log entries for inspection.
print("Training history:")
for log in trainer.state.log_history:
    print(log)


Loading SST2 dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Original train dataset size: 67349
Original validation dataset size: 872
Original test dataset size: 1821
Preparing datasets...
Train dataset size: 67349
New validation dataset size: 436
New test dataset size: 436
Loading RoBERTa tokenizer and model...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing train dataset...


Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

train dataset label distribution: {0: 29780, 1: 37569}
train dataset - Processed size: 67349
Processing validation dataset...


Map:   0%|          | 0/436 [00:00<?, ? examples/s]

validation dataset label distribution: {0: 208, 1: 228}
validation dataset - Processed size: 436
Processing test dataset...


Map:   0%|          | 0/436 [00:00<?, ? examples/s]

test dataset label distribution: {0: 220, 1: 216}
test dataset - Processed size: 436




Starting training...


Step,Training Loss,Validation Loss,Accuracy
25,0.6762,0.695463,0.522936
50,0.6707,0.62027,0.612385
75,0.5665,0.450664,0.798165
100,0.4645,0.371816,0.899083
125,0.548,0.419356,0.830275
150,0.4651,0.271522,0.896789
175,0.3556,0.25211,0.90367
200,0.4665,0.359265,0.876147
225,0.2825,0.479236,0.834862
250,0.241,0.268005,0.894495


Evaluating on test set...


Test set results: {'eval_loss': 0.27453526854515076, 'eval_accuracy': 0.9174311926605505, 'eval_runtime': 0.7889, 'eval_samples_per_second': 552.668, 'eval_steps_per_second': 8.873, 'epoch': 1.0}
Training completed. Fixed metrics plot saved as 'training_metrics_fixed.png'.
Final training loss: 0.1397
Final training accuracy: 0.9423076923076923
Final validation accuracy: 0.9174311926605505
Number of training steps: 843
Number of training loss records: 842
Number of training accuracy records: 843
Number of validation accuracy records: 169
Training history:
{'train_accuracy': 0.5, 'epoch': 0, 'step': 0}
{'loss': 0.7218, 'grad_norm': 1.9975095987319946, 'learning_rate': 2.5e-06, 'epoch': 0.0011876484560570072, 'step': 5}
{'train_accuracy': 0.5, 'epoch': 0.0011876484560570072, 'step': 5}
{'loss': 0.6697, 'grad_norm': 1.52130126953125, 'learning_rate': 5e-06, 'epoch': 0.0023752969121140144, 'step': 10}
{'train_accuracy': 0.5625, 'epoch': 0.0023752969121140144, 'step': 10}
{'loss': 0.667, 'gr