In [22]:
#导入相关的包
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer, AutoModelForSequenceClassification
from datasets import load_dataset

In [23]:
# Load the sentence-pair dataset from the local JSON file
dataset = load_dataset(
    "json",
    data_files="./train_pair_1w.json",
    split="train",
)
dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label'],
    num_rows: 10000
})

In [24]:
# Split the dataset: 20% is held out as the test set.
# A fixed seed keeps the split identical across kernel restarts, so results
# from different runs are measured on the same held-out examples.
datasets = dataset.train_test_split(test_size=0.2, seed=42)


In [25]:
# Dataset preprocessing
import torch
# Raw string: in a normal string literal the backslashes in this Windows path
# form invalid escape sequences (\H, \c), which Python warns about.
tokenizer = AutoTokenizer.from_pretrained(r"D:\Hugging Face Hub\chinese-macbert-base")
def process_function(examples):
    """Tokenize a batch of sentence pairs and attach float labels.

    Args:
        examples: A batch with "sentence1", "sentence2" and "label" columns.

    Returns:
        The tokenizer output for the pairs (truncated to at most 128 tokens),
        with an added "label" entry holding each label converted to float.
    """
    encoded = tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        truncation=True,
        max_length=128,
    )
    # Labels are stored as floats because the model is built with a single output.
    encoded["label"] = list(map(float, examples["label"]))
    return encoded
# Tokenize both splits in batches and drop the original text/label columns,
# keeping only what process_function returns.
tokenizer_datasets = datasets.map(
    process_function,
    batched=True,
    remove_columns=datasets["train"].column_names,
)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [26]:
# Create the model
# num_labels=1 requests a single-output head; labels are supplied as floats.
# Raw string: in a normal string literal the backslashes in this Windows path
# form invalid escape sequences (\H, \c), which Python warns about.
model = AutoModelForSequenceClassification.from_pretrained(
    r"D:\Hugging Face Hub\chinese-macbert-base",
    num_labels=1,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at D:\Hugging Face Hub\chinese-macbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# Create the evaluation function: first load the metrics it relies on
import evaluate
acc_metric, f1_metric = (
    evaluate.load(metric_name) for metric_name in ("accuracy", "f1")
)
def eval_metric(eval_predict):
    """Compute accuracy and F1 for the single-output similarity model.

    Args:
        eval_predict: A (predictions, labels) pair as handed to
            ``compute_metrics`` by the Trainer.

    Returns:
        A dict combining the results of the accuracy and F1 metrics.
    """
    import numpy as np

    predictions, labels = eval_predict
    # The model has one output per example, so flatten before thresholding;
    # presumably predictions arrive with a trailing dimension of size 1.
    scores = np.asarray(predictions).reshape(-1)
    predictions = (scores > 0.5).astype(int).tolist()
    labels = np.asarray(labels).reshape(-1).astype(int).tolist()
    # `evaluate` metrics take the ground truth as `references`, not `labels`.
    acc = acc_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels)
    # Merge the F1 result into the accuracy dict so both are reported together.
    acc.update(f1)
    return acc


In [28]:
# Create the TrainingArguments
# NOTE(review): no evaluation strategy is set here, so eval_metric presumably
# only runs when trainer.evaluate() is called explicitly — confirm.
train_args = TrainingArguments(
    output_dir="./cross_model",
    # per_gpu_eval_batch_size is deprecated; per_device_* is the supported name.
    per_device_eval_batch_size=2,
    per_device_train_batch_size=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    save_strategy="epoch",
)

In [29]:
# Create the Trainer
from transformers import DataCollatorWithPadding

# process_function does not pad, so padding is delegated to this collator.
padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)
trainer = Trainer(
    model=model,
    args=train_args,
    data_collator=padding_collator,
    compute_metrics=eval_metric,
    train_dataset=tokenizer_datasets["train"],
    eval_dataset=tokenizer_datasets["test"],
)


In [30]:
# Train the model
trainer.train()

Step,Training Loss
500,0.1612
1000,0.1297
1500,0.1128
2000,0.1287
2500,0.0823
3000,0.086
3500,0.0764


KeyboardInterrupt: 

In [11]:
from transformers import pipeline

# Human-readable label names: 0 -> "不相似" (not similar), 1 -> "相似" (similar).
# NOTE(review): the model was created with num_labels=1 while this map has two
# entries — confirm the mismatch is intended before saving the config.
model.config.id2label = {0:"不相似",1:"相似"}
# Follow the model's current device instead of hard-coding GPU 0, so this cell
# also works on machines without CUDA.
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=model.device)

Device set to use cuda:0


In [12]:
# The model has a single output, so the pipeline's own label lookup cannot
# choose between the two classes. Request the raw score and apply the same
# 0.5 threshold that eval_metric uses.
result = pipe({"text":"我喜欢北京","text_pair":"北京真好我喜欢这"}, function_to_apply="none")
result["label"] = "相似" if result["score"] > 0.5 else "不相似"
result

{'label': '相似', 'score': 0.9898111820220947}