<a href="https://colab.research.google.com/github/guanyaohan/HW1/blob/master/LoRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import numpy as np
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from peft import get_peft_model, LoraConfig, TaskType

print("Loading SST2 dataset...")
dataset = load_dataset("sst2")

# Prepare datasets
print("Preparing datasets...")
train_dataset = dataset["train"]
original_val_dataset = dataset["validation"]

# The original validation split is cut in half, in its stored order: the first
# half is used for validation during training, the second half is held out as
# a test set.
# NOTE(review): presumably done because the official test split has no usable
# labels -- confirm.
val_size = len(original_val_dataset) // 2
# Dataset.select keeps the original feature schema and avoids round-tripping
# the rows through a plain Python dict (as Dataset.from_dict(ds[:n]) does).
val_dataset = original_val_dataset.select(range(val_size))
test_dataset = original_val_dataset.select(range(val_size, len(original_val_dataset)))

print(f"Train dataset size: {len(train_dataset)}")
print(f"New validation dataset size: {len(val_dataset)}")
print(f"New test dataset size: {len(test_dataset)}")

# LoRA settings: rank-8 adapters (alpha 16, dropout 0.1, no bias training)
# attached to every module named "query", "key" or "value", for a
# sequence-classification task.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    target_modules=["query", "key", "value"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Fetch the pretrained tokenizer, then load the classifier and wrap it with
# the LoRA adapters in one go.
print("Loading RoBERTa tokenizer and model...")
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = get_peft_model(
    RobertaForSequenceClassification.from_pretrained("roberta-base"),
    peft_config,
)

# Report how many parameters remain trainable after wrapping.
model.print_trainable_parameters()

# Tokenize function
def tokenize_function(examples):
    """Tokenize the "sentence" field of a batch of examples.

    Every sequence is padded to, and truncated at, 128 tokens, so all
    encoded examples have the same length.

    Args:
        examples: Mapping with a "sentence" entry holding the raw text(s).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those sentences.
    """
    sentences = examples["sentence"]
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(sentences, **encoding_options)

# Process dataset function
def process_dataset(dataset, split_name):
    """Tokenize one dataset split and print a short summary of it.

    The split is mapped through ``tokenize_function`` in batches, its raw
    'sentence' and 'idx' columns are dropped, and the "label" column is
    renamed to "labels". The label distribution and the processed size are
    printed for inspection.

    Args:
        dataset: The split to process; must provide ``map`` and
            ``rename_column`` and contain 'sentence', 'idx' and 'label'
            columns.
        split_name: Human-readable split name used only in the printed
            messages.

    Returns:
        The tokenized split with a "labels" column.
    """
    print(f"Processing {split_name} dataset...")
    tokenized = dataset.map(tokenize_function, batched=True, remove_columns=['sentence', 'idx'])
    tokenized = tokenized.rename_column("label", "labels")

    unique_labels, counts = np.unique(tokenized['labels'], return_counts=True)
    # .tolist() turns NumPy scalars into builtin Python values so the printed
    # dict reads e.g. {0: 208, 1: 228} instead of {np.int64(0): np.int64(208), ...}
    # on NumPy >= 2.
    label_distribution = dict(zip(unique_labels.tolist(), counts.tolist()))
    print(f"{split_name} dataset label distribution: {label_distribution}")
    print(f"{split_name} dataset - Processed size: {len(tokenized)}")
    return tokenized

# Tokenize the three splits in a fixed order (train, validation, test) and
# index the results by split name.
tokenized_datasets = {
    split_name: process_dataset(split, split_name)
    for split_name, split in (
        ("train", train_dataset),
        ("validation", val_dataset),
        ("test", test_dataset),
    )
}

class CustomTrainer(Trainer):
    """Trainer that also logs the accuracy of the current training batch.

    The loss is a plain cross-entropy over the model's logits. Every
    ``logging_steps`` global steps, the accuracy of the batch being trained
    on is logged under the key "train_accuracy".
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the cross-entropy loss for one batch.

        Args:
            model: The model to run the batch through.
            inputs: Batch dict; must contain "labels" and is passed to the
                model as keyword arguments.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted for compatibility with Transformers
                releases that pass this keyword to ``compute_loss``; it is
                not used, so the loss remains the mean over the batch.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        logging_steps = self.args.logging_steps
        # Only log while the model is in training mode: the Trainer also calls
        # compute_loss on evaluation batches, and those must not be recorded
        # as "train_accuracy".
        if model.training and logging_steps > 0 and self.state.global_step % logging_steps == 0:
            # The accuracy is a diagnostic only; keep it out of the autograd graph.
            with torch.no_grad():
                predictions = torch.argmax(logits, dim=-1)
                accuracy = torch.sum(predictions == labels).item() / len(labels)
            self.log({"train_accuracy": accuracy})

        return (loss, outputs) if return_outputs else loss

def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``; the predicted class of each
            example is the argmax of its logits over the last axis.

    Returns:
        A dict with a single "accuracy" entry.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(references, predicted_classes)
    return {"accuracy": accuracy}

# Training configuration for a single-epoch LoRA run.
training_args = TrainingArguments(
    # Output locations for checkpoints and logs.
    output_dir="./results_lora",
    logging_dir="./logs_lora",
    # Optimisation schedule.
    num_train_epochs=1,
    warmup_steps=100,
    weight_decay=0.01,
    # Per-device batch sizes.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Cadence, in steps: log every 5, evaluate every 25, checkpoint every 50
    # (i.e. at every second evaluation), and reload the best checkpoint once
    # training finishes.
    logging_steps=5,
    evaluation_strategy="steps",
    eval_steps=25,
    save_steps=50,
    load_best_model_at_end=True,
)

# Wire the LoRA-wrapped model, the run configuration, the tokenized
# train/validation splits and the accuracy metric into the custom trainer.
trainer = CustomTrainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_datasets["validation"],
    train_dataset=tokenized_datasets["train"],
)

# Run the training loop; the returned summary is kept in train_result.
print("Starting training...")
train_result = trainer.train()

# Rebuild the plotting series from the trainer's log history.
# steps / train_loss / train_acc are parallel lists (one slot per recorded
# step, None where a value is missing); val_steps / val_acc are a separate
# pair of parallel lists for the evaluation entries.
train_loss = []
train_acc = []
val_acc = []
steps = []
val_steps = []

# Most recent step seen in any entry; used for accuracy entries that do not
# line up with the last recorded slot.
last_step = 0
for log in trainer.state.log_history:
    if 'step' in log:
        last_step = log['step']

    # A loss entry always opens a new slot; its accuracy is filled in later
    # if an accuracy entry for the same step follows.
    if 'loss' in log and 'step' in log:
        steps.append(log['step'])
        train_loss.append(log['loss'])
        if len(train_acc) < len(steps):
            train_acc.append(None)

    if 'train_accuracy' in log:
        if len(steps) > 0 and steps[-1] == last_step:
            # Same step as the newest slot: attach (or overwrite) its accuracy.
            train_acc[-1] = log['train_accuracy']
        else:
            # No slot for this step yet: open one without a loss value.
            steps.append(last_step)
            train_acc.append(log['train_accuracy'])
            train_loss.append(None)

    if 'eval_accuracy' in log and 'step' in log:
        val_steps.append(log['step'])
        val_acc.append(log['eval_accuracy'])

# Safety net: bring the three training series to a common length.
max_length = max(len(steps), len(train_loss), len(train_acc))
# steps[-1] would raise IndexError when nothing was collected (e.g. a history
# without any loss/accuracy entries), so fall back to the last seen step.
pad_step = steps[-1] if steps else last_step
steps = steps + [pad_step] * (max_length - len(steps))
train_loss = train_loss + [None] * (max_length - len(train_loss))
train_acc = train_acc + [None] * (max_length - len(train_acc))

# Three stacked panels sharing the step axis: training loss, training
# accuracy and validation accuracy.
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(10, 15), sharex=True)

# Keep only the (step, value) pairs that actually carry a measurement.
loss_points = [(step, loss) for step, loss in zip(steps, train_loss) if loss is not None]
acc_points = [(step, acc) for step, acc in zip(steps, train_acc) if acc is not None]

# One row per panel: axis, x values, y values, legend label, colour,
# y-axis label, title.
panels = (
    (ax1, [step for step, _ in loss_points], [loss for _, loss in loss_points],
     'Train Loss', 'blue', 'Loss', 'Training Loss'),
    (ax2, [step for step, _ in acc_points], [acc for _, acc in acc_points],
     'Train Accuracy', 'green', 'Accuracy', 'Training Accuracy'),
    (ax3, val_steps, val_acc,
     'Validation Accuracy', 'red', 'Accuracy', 'Validation Accuracy'),
)
for panel_axis, panel_x, panel_y, series_label, series_color, y_label, panel_title in panels:
    panel_axis.plot(panel_x, panel_y, label=series_label, color=series_color)
    panel_axis.set_ylabel(y_label)
    panel_axis.set_title(panel_title)
    panel_axis.legend()

# Only the bottom panel needs the shared x-axis label.
ax3.set_xlabel('Steps')
plt.tight_layout()
plt.savefig('training_metrics_lora.png')
plt.close()

print("Training completed. LoRA metrics plot saved as 'training_metrics_lora.png'.")

# Summarise the run: the most recent non-missing value of each training
# series, and the last validation accuracy ('N/A' when a series is empty).
final_train_loss = next((value for value in reversed(train_loss) if value is not None), 'N/A')
final_train_acc = next((value for value in reversed(train_acc) if value is not None), 'N/A')
final_val_acc = val_acc[-1] if val_acc else 'N/A'

print(f"Final training loss: {final_train_loss}")
print(f"Final training accuracy: {final_train_acc}")
print(f"Final validation accuracy: {final_val_acc}")

# Score the held-out test half with the trained model.
test_results = trainer.evaluate(tokenized_datasets["test"])
print(f"Test set results: {test_results}")

Loading SST2 dataset...
Preparing datasets...
Train dataset size: 67349
New validation dataset size: 436
New test dataset size: 436
Loading RoBERTa tokenizer and model...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,034,498 || all params: 125,681,668 || trainable%: 0.8231
Processing train dataset...
train dataset label distribution: {0: 29780, 1: 37569}
train dataset - Processed size: 67349
Processing validation dataset...


Map:   0%|          | 0/436 [00:00<?, ? examples/s]

validation dataset label distribution: {0: 208, 1: 228}
validation dataset - Processed size: 436
Processing test dataset...


Map:   0%|          | 0/436 [00:00<?, ? examples/s]

test dataset label distribution: {0: 220, 1: 216}
test dataset - Processed size: 436
Starting training...




Step,Training Loss,Validation Loss,Accuracy
25,0.6755,0.692064,0.522936
50,0.6846,0.69232,0.522936
75,0.6881,0.69113,0.522936
100,0.6862,0.691447,0.522936
125,0.7082,0.699434,0.522936
150,0.6899,0.683067,0.522936
175,0.6731,0.660867,0.527523
200,0.5317,0.472787,0.880734
225,0.297,0.331118,0.887615
250,0.253,0.335949,0.887615


Training completed. LoRA metrics plot saved as 'training_metrics_lora.png'.
Final training loss: 0.2315
Final training accuracy: 0.875
Final validation accuracy: 0.9311926605504587


Test set results: {'eval_loss': 0.21537663042545319, 'eval_accuracy': 0.9220183486238532, 'eval_runtime': 0.7884, 'eval_samples_per_second': 553.021, 'eval_steps_per_second': 8.879, 'epoch': 1.0}
