In [1]:
import os

os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
# os.environ["HTTP_PROXY"] = "http://127.0.0.1:7890"
# os.environ["HTTPS_PROXY"] = "http://127.0.0.1:7890"

## 数据集加载

In [2]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
data_files = {"train": "./data/train.csv"}

dataset = load_dataset("csv", data_files=data_files, num_proc=8)
train_and_test = dataset["train"].train_test_split(test_size=0.2)

In [4]:
print(train_and_test)

DatasetDict({
    train: Dataset({
        features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 127656
    })
    test: Dataset({
        features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 31915
    })
})


In [5]:
label2id = {
    "toxic": 0,
    "severe_toxic": 1,
    "obscene": 2,
    "threat": 3,
    "insult": 4,
    "identity_hate": 5,
}

id2label = {
    0: "toxic",
    1: "severe_toxic",
    2: "obscene",
    3: "threat",
    4: "insult",
    5: "identity_hate",
}

## 数据预处理

In [6]:
from transformers import AutoTokenizer

In [7]:
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

In [8]:
def tokenized_fn(example):
    tokenized_example = tokenizer(
        example["comment_text"],
        truncation=True,
        padding="max_length",
        max_length=64,
    )
    return tokenized_example

In [9]:
def gen_labels(label1, label2, label3, label4, label5, label6):
    labels = []

    for i in range(len(label1)):
        labels.append(
            [
                float(label1[i]),
                float(label2[i]),
                float(label3[i]),
                float(label4[i]),
                float(label5[i]),
                float(label6[i]),
            ]
        )

    return {"labels": labels}

In [10]:
def datapipe(dataset):
    dataset = dataset.map(tokenized_fn, batched=True)
    dataset = dataset.map(
        gen_labels,
        input_columns=[
            "toxic",
            "severe_toxic",
            "obscene",
            "threat",
            "insult",
            "identity_hate",
        ],
        batched=True,
    )
    return dataset

In [11]:
train_and_test = datapipe(train_and_test)

Map: 100%|██████████| 127656/127656 [00:19<00:00, 6666.37 examples/s]
Map: 100%|██████████| 31915/31915 [00:04<00:00, 6698.83 examples/s]
Map: 100%|██████████| 127656/127656 [00:00<00:00, 163420.93 examples/s]
Map: 100%|██████████| 31915/31915 [00:00<00:00, 213154.46 examples/s]


In [12]:
train_and_test = train_and_test.select_columns(
    [
        "input_ids",
        "attention_mask",
        "labels",
    ]
)

In [13]:
train_and_test

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 127656
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 31915
    })
})

In [14]:
print(train_and_test["train"][0])

{'input_ids': [101, 1045, 2134, 1005, 1056, 5382, 1998, 1045, 2572, 2047, 2182, 2074, 2893, 1996, 6865, 1997, 2009, 1012, 4283, 2005, 1996, 4641, 2039, 2295, 999, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}


## 预训练模型

In [15]:
from transformers import AutoModelForSequenceClassification

In [16]:
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification",
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
model.requires_grad = False
model.classifier.requires_grad = True

## 评价指标

In [18]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

In [19]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [20]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = sigmoid(logits)
    preds = (preds > 0.5).astype(int).reshape(-1)
    labels = labels.reshape(-1)
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="macro")
    return {
        "accuracy": accuracy,
        "f1": f1,
    }

## 训练

In [21]:
from transformers import TrainingArguments, Trainer

2024-06-13 15:41:45.986988: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-13 15:41:46.030624: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-13 15:41:46.030653: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-13 15:41:46.030684: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-13 15:41:46.038994: I tensorflow/core/platform/cpu_feature_g

In [27]:
training_args = TrainingArguments(
    output_dir="./output/",
    eval_strategy="epoch",
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    weight_decay=0.01,
    num_train_epochs=5,
    learning_rate=2e-5,
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [28]:
print(train_and_test["test"][0])

{'input_ids': [101, 1045, 2123, 1005, 1056, 2113, 2040, 2023, 10812, 3559, 2003, 1010, 2021, 1045, 2123, 1005, 1056, 2903, 1996, 1996, 5464, 2003, 2025, 14485, 1012, 2009, 2003, 2899, 7058, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}


In [29]:
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_and_test["train"],
    eval_dataset=train_and_test["test"],
    compute_metrics=compute_metrics,
)

Detected kernel version 4.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [30]:
trainer.evaluate()

{'eval_loss': 0.05397845804691315,
 'eval_accuracy': 0.9817588385816491,
 'eval_f1': 0.8570132571465302,
 'eval_runtime': 100.4745,
 'eval_samples_per_second': 317.643,
 'eval_steps_per_second': 2.488}

In [31]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0466,0.042239,0.984312,0.882476
2,0.0363,0.041644,0.984067,0.881441
3,0.0299,0.043375,0.983816,0.883072
4,0.0248,0.048329,0.983503,0.882226
5,0.0211,0.049663,0.983153,0.881117


TrainOutput(global_step=4990, training_loss=0.03134886850574929, metrics={'train_runtime': 6646.9584, 'train_samples_per_second': 96.026, 'train_steps_per_second': 0.751, 'total_flos': 2.099306947802112e+16, 'train_loss': 0.03134886850574929, 'epoch': 5.0})

In [32]:
answer = (sigmoid(trainer.predict(train_and_test["test"]).predictions) > 0.5).astype(
    int
)

In [33]:
answer[2]

array([1, 0, 1, 0, 1, 0])

In [35]:
train_and_test["test"]["labels"][2]

[1.0, 0.0, 1.0, 0.0, 1.0, 1.0]