# transformers fine tune for model: bert-base-cased dataset: yelp_review_full
```
第二周作业一:
1、使用完整的 YelpReviewFull 数据集训练，对比看 Acc 最高能到多少。课程代码（ https://github.com/DjangoPeng/LLM-quickstart/blob/main/transformers/fine-tune-quickstart.ipynb ）------> this notebook is for this task.
2、加载本地保存的模型，进行评估和再训练更高的 F1 Score。课程代码（ https://github.com/DjangoPeng/LLM-quickstart/blob/main/transformers/fine-tune-QA.ipynb ）

第二周作业二: 
1、在“LoRA 低秩适配 OpenAI Whisper-Large-V2 语音识别任务”中，为中文语料的训练过程增加过程评估，观察 Train Loss 和 Validation Loss 变化。课程代码（ https://github.com/DjangoPeng/LLM-quickstart/blob/main/peft/peft_lora_whisper-large-v2.ipynb ） 
2、在“LoRA 低秩适配 OpenAI Whisper-Large-V2 语音识别任务”中，当 LoRA 模型训练完成后，使用测试集进行完整的模型评估。课程代码（ https://github.com/DjangoPeng/LLM-quickstart/blob/main/peft/peft_lora_whisper-large-v2.ipynb ） 


```

In [1]:
from datasets import load_dataset
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    pipeline
)
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

import torch
from datetime import datetime

# output dir prefix
ts = datetime.now().strftime("%Y-%m-%d_%H%M%S")
output_dir_base = f"./finetune/{ts}"
model_dir = f"{output_dir_base}/models/yelp_bert_finetuned"
logging_dir = f"{output_dir_base}/logs"
results_dir = f"{output_dir_base}/results"
use_full_dataset = True  # Set to False for quick testing with smaller subset
train_dataset_count = 10000
eval_dataset_count = 1000
batch_size = 64
print(f"output_dir_base={output_dir_base}, batch_size={batch_size}")

# =============================================
# 1. LOAD DATASET AND MODELS
# =============================================
dataset = load_dataset("yelp_review_full")
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

# Load BOTH models:
# - base_model: original BERT (no fine-tuning)
# - model: will be fine-tuned
base_model = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
model = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

# =============================================
# 2. PREPROCESS DATA
# =============================================
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

encoded_dataset = dataset.map(preprocess_function, batched=True)

# check if we are using the full dataset or a smaller subset
print(f"Dataset size: {len(encoded_dataset['train'])} training examples, {len(encoded_dataset['test'])} test examples")
if use_full_dataset:
    print("Using full dataset for training and evaluation.")
    train_dataset = encoded_dataset["train"]
    eval_dataset = encoded_dataset["test"]
else:
    print("Using smaller subset of 1000 examples for quick testing.")
    train_dataset = encoded_dataset["train"].shuffle(seed=42).select(range(train_dataset_count))
    eval_dataset = encoded_dataset["test"].shuffle(seed=42).select(range(eval_dataset_count))

# Use the official training split from the dataset
#full_train = encoded_dataset["train"]

# Use Hugging Face's built-in train_test_split() method
#split_data = full_train.train_test_split(test_size=0.1, seed=42)

# Extract the two parts
#train_data = split_data["train"]  # 90% of training data
#val_data   = split_data["test"]   # 10% of training data

# Now take smaller subsets (e.g. 1000 examples each) for quick testing
#train_dataset = train_data.shuffle(seed=42).select(range(1000))
#eval_dataset  = val_data.shuffle(seed=42).select(range(1000))

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# =============================================
# 3. METRICS FUNCTION
# =============================================
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average="weighted")
    }

# =============================================
# 4. TRAINING CONFIGURATION
# =============================================
training_args = TrainingArguments(
    output_dir=results_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_dir=logging_dir,
    logging_steps=500,
    logging_strategy="steps",
    fp16=True
)

# =============================================
# 5. BEFORE FINE-TUNING: BASELINE INFERENCE
# =============================================
print("=== BEFORE FINE-TUNING (BASE BERT) INFERENCE ===")

# Create a pipeline with the base (untrained) model
base_classifier = pipeline(
    "text-classification",
    model=base_model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

# Sample reviews (1-star to 5-star)
sample_reviews = [
    "Terrible service, worst food ever. Would not recommend.",  # Likely 1-star
    "It was okay. Nothing special but not bad either.",          # Likely 3-star
    "Absolutely amazing experience. Food was perfect!",          # Likely 5-star
    "The staff was rude and the place was dirty.",               # Likely 1-2 star
    "Excellent quality, friendly employees, will come back!"     # Likely 4-5 star
]

print("Sample Reviews with Base Model Predictions:")
for review in sample_reviews:
    result = base_classifier(review)
    print(f"review result={result}")
    label = int(result[0]['label'].split('_')[1])  # Extract number from 'LABEL_0' etc.
    print(f"Review: {review[:50]}... | Predicted: {label}-star")

# =============================================
# 6. FINE-TUNE THE MODEL
# =============================================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("=== STARTING FINE-TUNING ===")
trainer.train()

# Save the fine-tuned model
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)
print(f"finetuned models/tokenizer saved to model_dir={model_dir}")

eval_results = trainer.evaluate()
print(f"Final evaluation: {eval_results}")

# =============================================
# 7. AFTER FINE-TUNING: CUSTOM MODEL INFERENCE
# =============================================
print("=== AFTER FINE-TUNING (CUSTOM BERT) INFERENCE ===")
# reload the fine-tuned model to ensure clean inference
# Create pipeline with the fine-tuned model
fine_tuned_classifier = pipeline(
    "text-classification",
    model=model_dir,
    tokenizer=model_dir,
    device=0 if torch.cuda.is_available() else -1
)

print("Sample Reviews with Fine-Tuned Model Predictions:")
for review in sample_reviews:
    result = fine_tuned_classifier(review)
    print(f"review result={result}")
    label = int(result[0]['label'].split('_')[1])  # Extract number from 'LABEL_0' etc.
    print(f"Review: {review[:50]}... | Predicted: {label}-star")

# =============================================
# 8. OPTIONAL: Compare a specific review side-by-side
# =============================================
if len(sample_reviews) > 0:
    test_review = sample_reviews[0]  # First sample review
    print(f"=== SIDE-BY-SIDE COMPARISON FOR REVIEW ===")
    print(f"Review Text: {test_review}")

    base_result = base_classifier(test_review)
    base_label = int(base_result[0]['label'].split('_')[1])
    base_score = base_result[0]['score']
    print(f"[BASE BERT]   Predicted: {base_label}-star (confidence: {base_score:.2f})")

    fine_tuned_result = fine_tuned_classifier(test_review)
    fine_label = int(fine_tuned_result[0]['label'].split('_')[1])
    fine_score = fine_tuned_result[0]['score']
    print(f"[FINE-TUNED] Predicted: {fine_label}-star (confidence: {fine_score:.2f})")


output_dir_base=./finetune/2025-08-03_100634, batch_size=64


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Dataset size: 650000 training examples, 50000 test examples
Using full dataset for training and evaluation.
=== BEFORE FINE-TUNING (BASE BERT) INFERENCE ===
Sample Reviews with Base Model Predictions:
review result=[{'label': 'LABEL_1', 'score': 0.2645033895969391}]
Review: Terrible service, worst food ever. Would not recom... | Predicted: 1-star
review result=[{'label': 'LABEL_4', 'score': 0.2540808618068695}]
Review: It was okay. Nothing special but not bad either.... | Predicted: 4-star
review result=[{'label': 'LABEL_4', 'score': 0.2601820230484009}]
Review: Absolutely amazing experience. Food was perfect!... | Predicted: 4-star
review result=[{'label': 'LABEL_4', 'score': 0.26809513568878174}]
Review: The staff was rude and the place was dirty.... | Predicted: 4-star
review result=[{'label': 'LABEL_4', 'score': 0.2586650848388672}]
Review: Excellent quality, friendly employees, will come b... | Predicted: 4-star
=== STARTING FINE-TUNING ===


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.8184,0.80555,0.64748,0.646246
2,0.735,0.791481,0.65652,0.655851


finetuned models/tokenizer saved to model_dir=./finetune/2025-08-03_100634/models/yelp_bert_finetuned


Final evaluation: {'eval_loss': 0.7914810180664062, 'eval_accuracy': 0.65652, 'eval_f1': 0.6558513560839635, 'eval_runtime': 106.4757, 'eval_samples_per_second': 469.591, 'eval_steps_per_second': 7.344, 'epoch': 2.0}
=== AFTER FINE-TUNING (CUSTOM BERT) INFERENCE ===
Sample Reviews with Fine-Tuned Model Predictions:
review result=[{'label': 'LABEL_0', 'score': 0.9939100742340088}]
Review: Terrible service, worst food ever. Would not recom... | Predicted: 0-star
review result=[{'label': 'LABEL_2', 'score': 0.8115789890289307}]
Review: It was okay. Nothing special but not bad either.... | Predicted: 2-star
review result=[{'label': 'LABEL_4', 'score': 0.936283528804779}]
Review: Absolutely amazing experience. Food was perfect!... | Predicted: 4-star
review result=[{'label': 'LABEL_0', 'score': 0.8042449951171875}]
Review: The staff was rude and the place was dirty.... | Predicted: 0-star
review result=[{'label': 'LABEL_4', 'score': 0.8450307250022888}]
Review: Excellent quality, friendly e