# Homework: 使用完整的 YelpReviewFull 数据集训练

## 公共

In [49]:
# Filesystem roots on the training machine.
dataset_dir = "/mnt/workspace/dataset"
model_dir = "/mnt/workspace/models"

# Dataset path and the pretrained checkpoint that gets fine-tuned.
yelp_review_full = f"{dataset_dir}/yelp_review_full"
bert_base_cased = f"{model_dir}/google-bert/bert-base-cased"

# Destination for training checkpoints and the final saved model.
output_model_dir = f"{model_dir}/bert-base-cased-finetune-yelp"

# Token length every review is padded/truncated to during tokenization.
max_length = 512

## 数据集

### 加载

In [50]:
from datasets import load_dataset

# Load the Yelp Review Full dataset from the configured local path.
dataset = load_dataset(yelp_review_full)

In [51]:
# Show the available splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

### 随机抽样函数

In [52]:
import random
import pandas as pd
import datasets
from IPython.display import display, HTML

# import os
# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

def show_random_elements(dataset, num_examples=10):
    """Display a random sample of distinct rows from ``dataset`` as an HTML table.

    Columns whose feature type is a ``datasets.ClassLabel`` are shown with the
    label names instead of the integer class ids.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of row
            positions, and a ``features`` mapping of column name to feature type.
        num_examples: Number of distinct rows to show.

    Raises:
        AssertionError: If ``num_examples`` exceeds the number of rows.
        ValueError: If ``num_examples`` is negative (raised by ``random.sample``).
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so no manual retry loop or
    # duplicate check is needed.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            # Replace integer class ids with their human-readable names.
            label_names = typ.names
            df[column] = df[column].map(lambda i: label_names[i])
    display(HTML(df.to_html()))

In [53]:
# Inspect a single raw training example (row 12).
dataset["train"][12]

{'label': 3,
 'text': "I drove by yesterday to get a sneak peak.  It re-opens on July 14th and I can't wait to take my kids.  The new range looks amazing.  The entire range appears to be turf, which may or many not help your game, but it looks really nice.  The tee boxes look state of the art and the club house looks like something you'll see on a newer course.  Can't wait to experience it!"}

In [55]:
# Preview three randomly chosen training reviews.
show_random_elements(dataset["train"],3)

Unnamed: 0,label,text
0,3 stars,"I ended up eating at Taggia while staying at Firesky so it was a choice of convenience. I've had the food from here several times using room service and it's never anything to complain about. It was the same story the day I had lunch here. I had an organic greens salad and shared the margherita and goat cheese pizzas with my fellow lunchers. All of the food was good - the goat cheese pizza in particular with its thin, crispy crust.\n\nUnfortunately the day we ate here our service was MIA. We were told we could seat ourselves so we did. After about 10 minutes someone came by to take our drink order and maybe 10 minutes later our waters arrived. Well 2 out of 3 of them did anyway. Then we ordered two salads and two pizzas to share. One pizza came first. WTH? Where were the salads? Or the other pizza? The salads showed up a few minutes later and then our server realized that she had forgotten our second pizza. No biggie since we had salads and one pizza to eat. But the service was lackluster with a L. Like Andrea R says, I wouldn't go out of my way to eat here, but when in the area it's a good option to have."
1,2 star,"I recently had a work luncheon at Ricardo's, I had been before years ago and it was extremely unmemorable. This visit would be more memorable but for the wrong reasons. \n\nWhen given the choice, I prefer to order off the menu than choose a buffet. But the whole group went to the buffet and I didn't want to be the oddball. I had two carne asada tacos, cheese enchilada and chips & salsa. The enchilada was bland the only hint of flavor was the acidity from the tomatoes. The salsa, too, was bland and watery. The chips were pretty generic. The first taco was ok, a bit bland, but tender. The second was filled with grizzly meat. It really turned my stomach. Fortunately, the service was friendly and they were able to accomodate our large group."
2,4 stars,"We had a great time at this resort over the long weekend. The staff was super friendly, especially Adam, David and Cassie. Great job!!! And our suite was perfect to accommodate three women with lots of bags, make-up and shoes. The Hole in the Wall restaurant had a really good breakfast, friendly staff and an outdoors patio. Not so for the Rico Restaurant. They were a bit rude, overwhelmed and obviously didn't want our business. We also floated down the Lazy River, it was definitely Lazy...pretty slow but perfect temp. All in all, I'll be back."


## 数据预处理

### 预处理

In [57]:
from transformers import AutoTokenizer

# Load the tokenizer from the local bert-base-cased checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(bert_base_cased)

def tokenize_function(examples):
    """Tokenize a batch of reviews into fixed-length inputs.

    Reads the ``"text"`` entry of ``examples`` and passes it to the module-level
    ``tokenizer``, padding and truncating to ``max_length`` tokens.
    """
    # The course example omitted max_length (padding="max_length", truncation=True
    # only); the original author found that training would not run here without it.
    # NOTE(review): presumably the locally loaded tokenizer does not carry a
    # usable default maximum length — confirm.
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
    }
    return tokenizer(examples["text"], **encode_options)

# num_proc runs the tokenization in parallel worker processes.
# Single-process alternative: tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = dataset.map(tokenize_function, batched=True, num_proc=8)

Map (num_proc=8):   0%|          | 0/650000 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/50000 [00:00<?, ? examples/s]

In [58]:
# Shuffle both tokenized splits using a fixed seed.
train_dataset = tokenized_datasets["train"].shuffle(seed=42)
eval_dataset = tokenized_datasets["test"].shuffle(seed=42)

## 微调训练配置

### 加载 BERT 模型

In [59]:
from transformers import AutoModelForSequenceClassification

# Load the local BERT checkpoint with a sequence-classification head for 5 labels.
# The classifier weights are newly initialized, so the model must be trained before use.
llm_model = AutoModelForSequenceClassification.from_pretrained(bert_base_cased, num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /mnt/workspace/models/google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 训练过程中的指标评估（Evaluate）

In [61]:
import numpy as np
import evaluate

# Load the accuracy metric from a local copy of the metric script.
metric = evaluate.load("/mnt/workspace/evaluate/metrics/accuracy/accuracy.py")
# Alternative that loads the metric by name: metric = evaluate.load("accuracy")

In [62]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: A two-item sequence ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns when the predicted class ids
        (argmax over the last axis of the logits) are compared to ``labels``.
    """
    scores, gold_labels = eval_pred
    return metric.compute(
        predictions=np.argmax(scores, axis=-1),
        references=gold_labels,
    )

#### 训练过程指标监控

In [68]:
from transformers import TrainingArguments, Trainer
import os
import wandb
import torch


# Weights & Biases settings, exported as environment variables before the
# training arguments are built.
# NOTE(review): "true" for WANDB_LOG_MODEL may be a deprecated spelling in
# recent transformers releases — confirm against the installed version.
os.environ.update(
    {
        "WANDB_PROJECT": "llm-dev",
        "WANDB_LOG_MODEL": "true",
        "WANDB_WATCH": "false",
    }
)

# Allow TF32 for CUDA matmul and cuDNN operations.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

training_args = TrainingArguments(
    output_dir=output_model_dir,
    # Reporting / logging.
    report_to="wandb",
    logging_steps=500,
    # Training schedule and batch sizes.
    num_train_epochs=3,  # per the original author's note, 3 is also the default
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    tf32=True,
    # Evaluate once per epoch.
    evaluation_strategy="epoch",
    # Checkpoint every 500 steps; save_total_limit caps how many checkpoint
    # folders are kept.
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    # NOTE(review): hub_strategy presumably only matters when pushing to the
    # Hub, which is not enabled here — confirm whether it is needed.
    hub_strategy="checkpoint",
    ignore_data_skip=False,
)

print(training_args)

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
gradient_checkpointing_kwargs=None,
greater_is_better=None,
group_by_

## 开始训练



### 实例化训练器（Trainer）

In [69]:
# Bundle the model, training configuration, train/eval datasets and the
# metric callback into a Trainer.
trainer = Trainer(
    model=llm_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

Detected kernel version 4.19.24, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


### 使用 nvidia-smi 查看 GPU 使用

!watch -n 1 nvidia-smi

### 启动训练

In [70]:
import os

from transformers.trainer_utils import get_last_checkpoint

# Resume from the newest checkpoint in the output directory when one exists;
# otherwise start from scratch. The isdir guard protects get_last_checkpoint,
# which lists the directory and fails if it has not been created yet.
resume_from_checkpoint = (
    get_last_checkpoint(training_args.output_dir)
    if os.path.isdir(training_args.output_dir)
    else None
)

# Passing None is the same as calling train() with no argument; passing the
# discovered path avoids having the Trainer look the checkpoint up a second
# time. The result is assigned so the cell does not echo it as output.
train_output = trainer.train(resume_from_checkpoint=resume_from_checkpoint)

[34m[1mwandb[0m: Currently logged in as: [33mdouspeng[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
2,0.6646,0.714934,0.69136
3,0.5804,0.733519,0.69226


Detected kernel version 4.19.24, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


### 抽样验证

In [71]:
# Spot-check the trained model on 100 test examples drawn with a fixed shuffle seed.
small_test_dataset = tokenized_datasets["test"].shuffle(seed=64).select(range(100))
trainer.evaluate(small_test_dataset)

{'eval_loss': 0.8696637749671936,
 'eval_accuracy': 0.63,
 'eval_runtime': 0.6982,
 'eval_samples_per_second': 143.221,
 'eval_steps_per_second': 7.161,
 'epoch': 3.0}

## 保存模型和训练状态

In [72]:
# Write the fine-tuned model to the configured output directory.
trainer.save_model(output_model_dir)

In [73]:
# Save the Trainer's state.
trainer.save_state()

In [23]:
# trainer.model.save_pretrained("./")