<a href="https://colab.research.google.com/github/daylightzjr/daylightzjr/blob/main/text_classfication.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install transformers
!pip install datasets
!pip install evaluate



###step1 导入相关库

In [6]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification,Trainer,TrainingArguments
from datasets import load_dataset

###step2 数据集的加载

In [7]:
dataset = load_dataset("csv",data_files="./ChnSentiCorp_htl_all.csv",split="train")
dataset = dataset.filter(lambda x:x["review"] is not None)
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/7766 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

###step3 数据集的切分

In [8]:
dataset = dataset.train_test_split(test_size=0.2)
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6212
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 1553
    })
})

In [9]:
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

In [10]:
train_dataset[0]

{'label': 0,
 'review': '酒店外表及大堂看上去挺好的，本来对它也挺有信心的，而且就在市中心，出行挺方便的，可是从进电梯开始就比较恶梦了，那电梯一点都不像是酒店的电梯，而且从电梯到房间都是一股浓浓的很难闻的味道，很恶心，我订的是标间，设施陈旧又简陋，实在不敢恭维，还说是三星，我觉得它只比招待所好一点点而且，248的收费简直是太过分了'}

###step4数据集的预处理

In [11]:
import torch
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-large")

def process_function(examples):
  tokenized_examples = tokenizer(examples["review"],max_length=128,truncation=True,padding="max_length")
  tokenized_examples["labels"] = examples["label"]
  return tokenized_examples

tokenized_datasets = dataset.map(process_function,batched=True,remove_columns=dataset["train"].column_names)
tokenized_datasets

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/19.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



Map:   0%|          | 0/6212 [00:00<?, ? examples/s]

Map:   0%|          | 0/1553 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6212
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1553
    })
})

In [12]:
tokenized_datasets["train"][0]

{'input_ids': [101,
  6983,
  2421,
  1912,
  6134,
  1350,
  1920,
  1828,
  4692,
  677,
  1343,
  2923,
  1962,
  4638,
  8024,
  3315,
  3341,
  2190,
  2124,
  738,
  2923,
  3300,
  928,
  2552,
  4638,
  8024,
  5445,
  684,
  2218,
  1762,
  2356,
  704,
  2552,
  8024,
  1139,
  6121,
  2923,
  3175,
  912,
  4638,
  8024,
  1377,
  3221,
  794,
  6822,
  4510,
  3461,
  2458,
  1993,
  2218,
  3683,
  6772,
  2626,
  3457,
  749,
  8024,
  6929,
  4510,
  3461,
  671,
  4157,
  6963,
  679,
  1008,
  3221,
  6983,
  2421,
  4638,
  4510,
  3461,
  8024,
  5445,
  684,
  794,
  4510,
  3461,
  1168,
  2791,
  7313,
  6963,
  3221,
  671,
  5500,
  3849,
  3849,
  4638,
  2523,
  7410,
  7319,
  4638,
  1456,
  6887,
  8024,
  2523,
  2626,
  2552,
  8024,
  2769,
  6370,
  4638,
  3221,
  3403,
  7313,
  8024,
  6392,
  3177,
  7357,
  3191,
  1348,
  5042,
  7358,
  8024,
  2141,
  1762,
  679,
  3140,
  2621,
  5335,
  8024,
  6820,
  6432,
  3221,
  676,
  3215,
  8024,
  2

###step5创建模型

In [13]:
model = AutoModelForSequenceClassification.from_pretrained("hfl/chinese-macbert-large")


pytorch_model.bin:   0%|          | 0.00/1.31G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-macbert-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


###step6 创建评估函数

In [14]:
import evaluate

acc_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [15]:
def eval_metric(eval_pred):
  predictions,labels = eval_pred
  predictions = predictions.argmax(axis=-1)
  acc = acc_metric.compute(predictions=predictions,references=labels)
  f1 = f1_metric.compute(predictions=predictions,references=labels)
  acc.update(f1)
  return acc

###step7 创建TrainingArguments

In [16]:
train_args = TrainingArguments(output_dir="./checkpoints",      # 输出文件夹
                 per_device_train_batch_size=2,   # 训练时的batch_size
                 gradient_accumulation_steps=32,  # *** 梯度累加 ***
                 gradient_checkpointing=True,     # *** 梯度检查点 ***
                 optim="adafactor",               # *** adafactor优化器 ***
                 per_device_eval_batch_size=4,    # 验证时的batch_size
                 num_train_epochs=1,              # 训练轮数
                 logging_steps=10,                # log 打印的频率
                 eval_strategy="epoch",         # 评估策略
                 save_strategy="epoch",        # 保存策略
                 save_total_limit=3,              # 最大保存数
                 learning_rate=2e-5,              # 学习率
                 weight_decay=0.001,              # weight_decay
                 metric_for_best_model="f1",          # 设定评估指标
                 load_best_model_at_end=True)          #训练完成后加载最优模型
train_args

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=epoch,
eval_use_gather_object=False,
evaluation_strategy=None,
fp1

###step8 创建Trainer

In [17]:
from transformers import DataCollatorWithPadding

for name, param in model.bert.named_parameters():
    param.requires_grad = False
    if not param.is_contiguous():
       param.data = param.data.contiguous()


trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=eval_metric
)

###step9模型训练

In [18]:
trainer.train()

  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.6721,0.67311,0.651642,0.781943


TrainOutput(global_step=97, training_loss=0.6955274798206448, metrics={'train_runtime': 236.3588, 'train_samples_per_second': 26.282, 'train_steps_per_second': 0.41, 'total_flos': 1446357475885056.0, 'train_loss': 0.6955274798206448, 'epoch': 0.9993560849967804})

In [19]:
trainer.evaluate(tokenized_datasets['test'])

{'eval_loss': 0.6731097102165222,
 'eval_accuracy': 0.6516419832582099,
 'eval_f1': 0.7819427650141072,
 'eval_runtime': 39.6665,
 'eval_samples_per_second': 39.151,
 'eval_steps_per_second': 9.807,
 'epoch': 0.9993560849967804}

In [20]:
trainer.predict(tokenized_datasets["test"])

PredictionOutput(predictions=array([[-0.35815844, -0.1615712 ],
       [-0.24207294, -0.10017511],
       [-0.46432927, -0.21695606],
       ...,
       [-0.28694606, -0.1150708 ],
       [-0.07830805,  0.03046926],
       [-0.09866145, -0.20493434]], dtype=float32), label_ids=array([1, 1, 0, ..., 1, 1, 0]), metrics={'test_loss': 0.6731097102165222, 'test_accuracy': 0.6516419832582099, 'test_f1': 0.7819427650141072, 'test_runtime': 40.1823, 'test_samples_per_second': 38.649, 'test_steps_per_second': 9.681})

In [21]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1

###step10 模型预测

In [24]:
sen = "这家店都挺好的，就是卫生不太好"
id2_label = {0:"negative",1:"positive"}
model.eval()
with torch.inference_mode():
  inputs = tokenizer(sen,return_tensors="pt")
  inputs = {k:v.to(device) for k,v in inputs.items()}
  logits = model(**inputs).logits
  pred = logits.argmax(dim=-1).item()
  print(f"输入：{sen}\n模型预测结果:{id2_label.get(pred)}")

输入：这家店都挺好的，就是卫生不太好
模型预测结果:positive


In [25]:
from transformers import pipeline

model.config.id2label = id2_label
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)

In [26]:
pipe(sen)

[{'label': 'positive', 'score': 0.5117309093475342}]