# 文本相似度实例

#### Step1. 导入相关的包

In [None]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification,Trainer,TrainingArguments
from datasets import load_dataset

#### Step2. 加载数据集

In [None]:
# Read the local JSON file of sentence pairs and expose it as one "train" split.
dataset = load_dataset(
    "json",
    data_files="./train_pair_1w.json",
    split="train",
)
dataset

#### Step3. 划分数据集

In [None]:
# Hold out 20% of the examples; the result is indexed later by "train" and "test".
# NOTE(review): no seed is passed, so the split presumably differs between
# runs - confirm whether reproducibility matters here.
datasets = dataset.train_test_split(test_size=0.2)
datasets

#### Step4. 数据集预处理

In [None]:
import torch
# Load the tokenizer for the "hf1/chinese-macbert-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("hf1/chinese-macbert-base")

def process_function(examples):
    """Tokenize a batch of sentence pairs and attach integer labels.

    Args:
        examples: A batch mapping that provides "sentence1", "sentence2"
            and "label" columns.

    Returns:
        The tokenizer output for the paired sentences (truncated to at most
        128 tokens) with an extra "labels" entry holding every label cast
        to ``int``.
    """
    encoded = tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        max_length=128,
        truncation=True,
    )
    # Labels are converted with int() so they end up as plain integers.
    encoded["labels"] = list(map(int, examples["label"]))
    return encoded

# Apply process_function in batches and drop the columns present in the
# train split, leaving only what process_function returns.
tokenized_datasets = datasets.map(
    process_function,
    batched=True,
    remove_columns=datasets["train"].column_names,
)
tokenized_datasets

#### Step5. 创建模型

In [None]:
# Load the pretrained checkpoint through the sequence-classification auto class.
# NOTE(review): num_labels is not passed, so the library default applies -
# presumably 2, matching the two classes used later; confirm.
model = AutoModelForSequenceClassification.from_pretrained("hf1/chinese-macbert-base")

#### Step6. 创建评估函数

In [None]:
import evaluate

# Load the two metrics used by eval_metric: accuracy first, then F1.
acc_metric, f1_metric = (evaluate.load(name) for name in ("accuracy", "f1"))

In [None]:
def eval_metric(eval_predict):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_predict: A pair ``(predictions, labels)``. ``predictions`` is
            reduced with ``argmax`` over its last axis, so it presumably
            holds per-class scores - confirm against the caller.

    Returns:
        A single dict containing the accuracy metric's results merged with
        the F1 metric's results.
    """
    scores, references = eval_predict
    predicted_ids = scores.argmax(axis=-1)
    results = acc_metric.compute(predictions=predicted_ids, references=references)
    # Fold the F1 result into the same dict so both metrics are reported together.
    results.update(f1_metric.compute(predictions=predicted_ids, references=references))
    return results

#### Step7. 创建TrainingArguments

In [None]:
# Training configuration: evaluate and save once per epoch, and reload the
# best checkpoint (judged by "f1") when training finishes.
train_args = TrainingArguments(
    # Output directory.
    output_dir="./cross_model",
    # Batch size per device during training.
    per_device_train_batch_size=32,
    # Batch size per device during evaluation.
    per_device_eval_batch_size=32,
    # Logging frequency, in steps.
    logging_steps=10,
    # Evaluation strategy.
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",
    # Checkpoint saving strategy.
    save_strategy="epoch",
    # Maximum number of checkpoints to keep.
    save_total_limit=3,
    # Learning rate.
    learning_rate=2e-5,
    # Weight decay.
    weight_decay=0.01,
    # Metric used to pick the best model.
    metric_for_best_model="f1",
    # Load the best model once training is complete.
    load_best_model_at_end=True,
)
train_args

#### Step8. 创建Trainer

In [None]:
from transformers import DataCollatorWithPadding

# Wire together the model, the training configuration, the tokenized
# train/test splits, a padding collator and the metric function.
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    # Examples were tokenized without padding, so padding is left to the collator.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=eval_metric,
)

#### Step9. 模型训练

In [None]:
# Start training with the Trainer built in the previous step.
trainer.train()

#### Step10. 模型评估

In [None]:
# Evaluate on the held-out "test" split.
trainer.evaluate(tokenized_datasets["test"])

#### Step11. 模型预测

In [None]:
from transformers import pipeline

In [None]:
# Attach human-readable names to the two class ids on the model config
# (0 -> "not similar", 1 -> "similar").
model.config.id2label = {
    0: "不相似",
    1: "相似",
}

In [None]:
# Build a text-classification pipeline around the model and tokenizer.
# Use GPU 0 when CUDA is available and fall back to CPU (-1) otherwise, so
# this cell also runs on machines without a GPU instead of failing on a
# hard-coded device index.
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# Classify one sentence pair: "text" and "text_pair" carry the two sentences.
pipe({"text":"我喜欢北京", "text_pair":"北京是我喜欢的地方"})