# Fine-tune Chinese BERT with words

CS685 Spring 2022 <br />
Apr. 24, 2022<br />
Hongyu Tu <br />

In [1]:
# 
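# Environment setup: uncomment and run once on a fresh machine, then restart the kernel.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# %pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html
# %pip install datasets
# %pip install transformers==4.17.0
# %pip install numpy
# %pip install scikit-learn  # PyPI name of the package that provides the `sklearn` module
# `pickle` is part of the Python standard library and does not need to be installed.
#
# Imports left over from earlier experiments; the cells shown here do not rely on them
# (AutoTokenizer is imported again in the cell that uses it).
# from tqdm import tqdm
# import torch.nn as nn
# import torch.optim as optim
# import torch.nn.functional as F
# import matplotlib.pyplot as plt
# from torch.autograd import Variable
# from sklearn.model_selection import train_test_split
# from transformers import AutoTokenizer, AutoModelForMaskedLM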

In [2]:
import torch
import pickle
import datasets 
import numpy as np

In [3]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

### Load data

In [4]:
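# Optional sanity check: histogram of the length of each `danmu_token` entry.
# `danmu_token` is only defined by the loading cell that follows, so run that cell first.
# import matplotlib.pyplot as plt
# a = [len(i) for i in danmu_token]
# plt.hist(a, 50)
# plt.show()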

In [5]:
# Read the four pickles in a fixed order: for each source, its token file and
# then its dist file. The unpacking below depends on that order.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
path_templates = ('../data/{}_token_main.pkl', '../data/{}_dist_main.pkl')
loaded = []

for source in ['danmu', 'comment']:
    for template in path_templates:
        with open(template.format(source), 'rb') as handle:
            loaded.append(pickle.load(handle))

danmu_token, danmu_dist, comment_token, comment_dist = loaded

In [6]:
# One hard label per sample: the position of the largest value in each
# element of danmu_dist.
y = [np.argmax(dist) for dist in danmu_dist]

# Count how often each label occurs. Keys are inserted in first-seen order,
# and the next cell derives the class ids from that key order.
tmp = {}
for label in y:
    tmp[label] = tmp.get(label, 0) + 1

In [7]:
# Distinct labels in first-seen order; a label's position in this list is its class id.
t_l = list(tmp.keys())

# Look class ids up through a dict instead of calling t_l.index() for every
# sample: the ids are identical, but each lookup no longer scans the list.
label_to_id = {label: idx for idx, label in enumerate(t_l)}
yy = [label_to_id[label] for label in y]

# Positional 90/10 boundary: samples before split_idx go to train, the rest to test.
split_idx = int(len(yy) * 0.9)

In [8]:
# Build the two splits by slicing texts and class ids at the same boundary.
train_split = datasets.Dataset.from_dict(
    {"text": danmu_token[:split_idx], "label": yy[:split_idx]}
)
test_split = datasets.Dataset.from_dict(
    {"text": danmu_token[split_idx:], "label": yy[split_idx:]}
)

dataset = datasets.DatasetDict({"train": train_split, "test": test_split})
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 303652
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 33740
    })
})

### Model tuning

In [9]:
from transformers import AutoTokenizer

# Tokenizer belonging to the same pretrained checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")


def tokenize_function(examples):
    """Tokenize the "text" column of a batch of examples.

    Padding uses the "max_length" strategy and truncation is enabled, so the
    encoded rows all come out with the same length.

    NOTE(review): fixed-length padding may waste a lot of compute if the texts
    are short; dynamic padding could be worth trying. Confirm against the data.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded


# batched=True makes map() pass many rows to tokenize_function per call.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

  0%|          | 0/304 [00:00<?, ?ba/s]

  0%|          | 0/34 [00:00<?, ?ba/s]

In [10]:
# Both names hold the complete shuffled splits; the "small_" prefix dates from
# an earlier subset run. To iterate on a subset again, append
# `.select(range(50000))` (train) or `.select(range(5000))` (test) after the shuffle.
small_train_dataset, small_eval_dataset = (
    tokenized_datasets[split_name].shuffle(seed=42)
    for split_name in ("train", "test")
)

In [11]:
from transformers import AutoModelForSequenceClassification

# Size the classification head to the number of distinct labels found in the data.
num_labels = len(t_l)
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-chinese",
    num_labels=num_labels,
)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [12]:
from transformers import TrainingArguments

# Default-configured arguments. A later cell assigns `training_args` again with
# explicit settings before the Trainer is built, so this object is not the one
# that ends up being used for training.
training_args = TrainingArguments(output_dir="test_trainer")

In [13]:
import numpy as np
from datasets import load_metric

# Accuracy metric object; compute_metrics calls metric.compute(...) on it.
# NOTE(review): newer `datasets` releases deprecate `load_metric` in favour of
# the separate `evaluate` package. Confirm the installed version still provides it.
metric = load_metric("accuracy")

In [14]:
def compute_metrics(eval_pred):
    """Score one evaluation pass given as a (logits, labels) pair.

    The predicted class of each row is the position of its largest logit
    (argmax over the last axis). Predictions and reference labels are handed
    to the module-level ``metric`` object, and its result is returned unchanged.
    """
    logits, labels = eval_pred
    return metric.compute(
        references=labels,
        predictions=np.argmax(logits, axis=-1),
    )

In [16]:
from transformers import TrainingArguments, Trainer

# Earlier variant, kept for reference (evaluation once per epoch):
# training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch", num_train_epochs = 5)

# Evaluation and checkpointing share one step interval.
checkpoint_interval = 1000

training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=5,
    per_device_train_batch_size=23,
    evaluation_strategy="steps",
    eval_steps=checkpoint_interval,
    save_steps=checkpoint_interval,
    save_total_limit=5,
    load_best_model_at_end=True,
)

In [17]:
# Hand the model, training arguments, both splits and the metric callback to the Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,  # called with the evaluation predictions
    eval_dataset=small_eval_dataset,
    train_dataset=small_train_dataset,
)

In [18]:

# `resume_from_checkpoint=True` raises a ValueError when the output directory
# holds no checkpoint yet (for example on the first run after a kernel restart).
# Look up the newest checkpoint explicitly instead: resume from it when one
# exists, otherwise pass None so training starts from the pretrained weights.
import os

from transformers.trainer_utils import get_last_checkpoint

last_checkpoint = (
    get_last_checkpoint(training_args.output_dir)
    if os.path.isdir(training_args.output_dir)
    else None
)
trainer.train(resume_from_checkpoint=last_checkpoint)

Loading model from test_trainer/checkpoint-62500).
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 303652
  Num Epochs = 5
  Instantaneous batch size per device = 23
  Total train batch size (w. parallel, distributed & accumulation) = 23
  Gradient Accumulation steps = 1
  Total optimization steps = 66015
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 4
  Continuing training from global step 62500
  Will skip the first 4 epochs then the first 9688 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/9688 [00:00<?, ?it/s]

Step,Training Loss,Validation Loss,Accuracy
63000,1.1147,1.954473,0.486574
64000,1.1101,1.945613,0.487315
65000,1.1127,1.932341,0.490605
66000,1.1064,1.925018,0.491968


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 33740
  Batch size = 8
Saving model checkpoint to test_trainer/checkpoint-63000
Configuration saved in test_trainer/checkpoint-63000/config.json
Model weights saved in test_trainer/checkpoint-63000/pytorch_model.bin
Deleting older checkpoint [test_trainer/checkpoint-60500] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 33740
  Batch size = 8
Saving model checkpoint to test_trainer/checkpoint-64000
Con

TrainOutput(global_step=66015, training_loss=0.05938613459367441, metrics={'train_runtime': 3199.0432, 'train_samples_per_second': 474.598, 'train_steps_per_second': 20.636, 'total_flos': 3.9953196462114816e+17, 'train_loss': 0.05938613459367441, 'epoch': 5.0})