In [126]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from torch.nn import BCEWithLogitsLoss
from torch import nn, optim
import optuna
from sklearn.metrics import f1_score, accuracy_score

In [96]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [97]:
df = pd.read_csv('data/train.csv').sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,a7ced57c844edbf1,the validity of their message is not what Wiki...,0,0,0,0,0,0
1,2cb4606ad6a418b6,"""\nA few points:\n1) All cabals are bad ideas....",0,0,0,0,0,0
2,487eecc0dee4d797,"Ha \n\nThis is true, but I bet Godwin cries hi...",0,0,0,0,0,0
3,509aaa6f34396818,There we go again: east and west\nI called a B...,0,0,0,0,0,0
4,1c35d609bbe79a9b,"""Pand== Smile ==\n\n \n\n Smile \n\n == Smile ...",0,0,0,0,0,0


In [98]:
cols = df.columns
label_cols = list(cols[2:])
df['one_hot_labels'] = list(df[label_cols].values)
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,one_hot_labels
0,a7ced57c844edbf1,the validity of their message is not what Wiki...,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
1,2cb4606ad6a418b6,"""\nA few points:\n1) All cabals are bad ideas....",0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
2,487eecc0dee4d797,"Ha \n\nThis is true, but I bet Godwin cries hi...",0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
3,509aaa6f34396818,There we go again: east and west\nI called a B...,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
4,1c35d609bbe79a9b,"""Pand== Smile ==\n\n \n\n Smile \n\n == Smile ...",0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
...,...,...,...,...,...,...,...,...,...
159566,ce95003cecc9f762,put the Bills link in the,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
159567,6c8cb996be4cd0f2,"""\nAnd this thing with just 60000 redarmy serv...",0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
159568,6efdb5feac1a5c38,doesn't count when peole just revert and then ...,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
159569,1ee5e00c2b4677e8,I have no idea what this unblock request form ...,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"


In [99]:
labels = list(df.one_hot_labels.values)  # 标签
comments = list(df.comment_text.values)

In [100]:
max_length = 386  # 文本长度大部分(98%)在386以内
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)  # tokenizer 全部转小写
encodings = tokenizer(comments,
                      truncation=True,
                      max_length=max_length,
                      padding="max_length")
print(encodings.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


In [101]:
input_ids = encodings['input_ids']
token_type_ids = encodings['token_type_ids']
attention_masks = encodings['attention_mask']

In [102]:
label_counts = df.one_hot_labels.astype(str).value_counts()
label_counts  # [1 1 0 1 0 1], [1 1 0 1 1 0]样本数量为1

[0 0 0 0 0 0]    143346
[1 0 0 0 0 0]      5666
[1 0 1 0 1 0]      3800
[1 0 1 0 0 0]      1758
[1 0 0 0 1 0]      1215
[1 1 1 0 1 0]       989
[1 0 1 0 1 1]       618
[0 0 1 0 0 0]       317
[0 0 0 0 1 0]       301
[1 1 1 0 1 1]       265
[0 0 1 0 1 0]       181
[1 1 1 0 0 0]       158
[1 0 0 0 0 1]       136
[1 0 0 0 1 1]       134
[1 0 1 1 1 0]       131
[1 0 0 1 0 0]       113
[1 1 1 1 1 0]        64
[1 0 1 1 1 1]        56
[0 0 0 0 0 1]        54
[1 1 0 0 0 0]        41
[1 0 1 0 0 1]        35
[1 1 1 1 1 1]        31
[0 0 0 0 1 1]        28
[0 0 0 1 0 0]        22
[0 0 1 0 1 1]        18
[1 0 0 1 1 0]        16
[1 1 0 0 1 0]        14
[1 0 1 1 0 0]        11
[1 1 0 1 0 0]        11
[1 0 0 1 0 1]         7
[1 1 0 0 1 1]         7
[1 1 1 0 0 1]         6
[1 1 1 1 0 0]         4
[1 0 0 1 1 1]         3
[1 1 0 0 0 1]         3
[0 0 1 0 0 1]         3
[0 0 0 1 1 0]         3
[0 0 1 1 0 0]         2
[0 0 1 1 1 0]         2
[1 1 0 1 0 1]         1
[1 1 0 1 1 0]         1
Name: one_hot_la

In [104]:
one_freq = label_counts[label_counts == 1].keys()  # 类别样本数为1的类别(不能进行分层划分)
one_freq_idxs = sorted(list(df[df.one_hot_labels.astype(str).isin(one_freq)].index), reverse=True)
print(one_freq_idxs)

[136977, 120570]


In [105]:
one_freq_input_ids = [input_ids.pop(i) for i in one_freq_idxs]
one_freq_token_types = [token_type_ids.pop(i) for i in one_freq_idxs]
one_freq_attention_masks = [attention_masks.pop(i) for i in one_freq_idxs]
one_freq_labels = [labels.pop(i) for i in one_freq_idxs]

In [106]:
# 划分训练集和验证集划分
train_inputs, validation_inputs, train_labels, validation_labels, train_token_types, validation_token_types, train_masks, validation_masks = train_test_split(
    input_ids, labels, token_type_ids, attention_masks, random_state=2020, test_size=0.10, stratify=labels)

# 添加类别样本数为1样本
train_inputs.extend(one_freq_input_ids)
train_labels.extend(one_freq_labels)
train_masks.extend(one_freq_attention_masks)
train_token_types.extend(one_freq_token_types)

train_inputs = torch.tensor(np.array(train_inputs))
train_labels = torch.tensor(np.array(train_labels))
train_masks = torch.tensor(np.array(train_masks))
train_token_types = torch.tensor(np.array(train_token_types))

validation_inputs = torch.tensor(np.array(validation_inputs))
validation_labels = torch.tensor(np.array(validation_labels))
validation_masks = torch.tensor(np.array(validation_masks))
validation_token_types = torch.tensor(np.array(validation_token_types))

In [107]:
batch_size = 16  # 批次大小

# 训练集
train_data = TensorDataset(train_inputs, train_masks, train_labels, train_token_types)
train_dataloader = DataLoader(train_data, batch_size=batch_size)

# 验证数据集
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels, validation_token_types)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size)

In [108]:
class Classifier(nn.Module):
    def __init__(self, n_classes):
        super(Classifier, self).__init__()
        self.bert = AutoModel.from_pretrained('bert-base-uncased')  # 预训练模型
        self.drop = nn.Dropout(p=0.1)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)  # 下游分类任务

    def forward(self, input_ids, token_type_ids, attention_mask):
        _, pooled_output = self.bert(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            return_dict=False
        )
        output = self.drop(pooled_output)
        return self.out(output)

In [109]:
num_labels = 6  # 分类数量为6
model = Classifier(num_labels)
model = model.to(device)
model

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Classifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
 

In [110]:
loss_func = BCEWithLogitsLoss()  # 二分类任务损失函数(每个分类(类别数为6)上的取值为0或1)
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

In [111]:
epochs = 3  # 训练轮次


def train_epoch(model, train_dataloader, loss_func, optimizer, device):
    model.train()  # Sets the module in training mode
    tr_loss, nb_tr_steps = 0.0, 0
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels, b_token_types = batch
        # b_labels.shape=(batch_size, num_labels)
        # outputs.shape=(batch_size, num_labels)
        outputs = model(b_input_ids, token_type_ids=b_token_types, attention_mask=b_input_mask)
        loss = loss_func(outputs, b_labels.to(torch.float64))
        tr_loss += loss.item()
        nb_tr_steps += 1
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    return tr_loss, nb_tr_steps


def eval_epoch(model, validation_dataloader, device):
    model.eval()  # Sets the module in evaluation mode.
    pred_labels, true_labels = [], []
    for i, batch in enumerate(validation_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels, b_token_types = batch
        with torch.no_grad():
            # outputs.shape=(batch_size, num_labels)
            outs = model(b_input_ids, token_type_ids=b_token_types, attention_mask=b_input_mask)
            pred_label = torch.sigmoid(outs)
            pred_label = pred_label.to('cpu').numpy()
            b_labels = b_labels.to('cpu').numpy()
        pred_labels.append(pred_label)
        true_labels.append(b_labels)
    return pred_labels, true_labels


for epoch in range(epochs):
    print("epoch is:", epoch)

    tr_loss, nb_tr_steps = train_epoch(model, train_dataloader, loss_func, optimizer, device)
    print("Train loss: {}".format(tr_loss / nb_tr_steps))  # 每一轮的平均损失

    pred_labels, true_labels = eval_epoch(model, validation_dataloader, device)

    threshold = 0.5  # 0.5作为阈值
    # 拉平为一维向量(f1_score,accuracy_score要求输入为一维向量)
    pred_bools = np.concatenate([np.where(pl > 0.5, 1, 0) for pl in pred_labels], axis=0).reshape(-1, )
    true_bools = np.concatenate(true_labels, axis=0).reshape(-1, )

    val_f1_accuracy = f1_score(true_bools, pred_bools) * 100
    val_flat_accuracy = accuracy_score(true_bools, pred_bools) * 100
    print('F1 Validation Accuracy: ', val_f1_accuracy)  # 每一轮的平均F1值
    print('Flat Validation Accuracy: ', val_flat_accuracy)  # 每一轮的平均准确率

epoch is: 0
Train loss: 0.04673155557591328
F1 Validation Accuracy:  78.10133954571927
Flat Validation Accuracy:  98.42911157067954
epoch is: 1
Train loss: 0.03165133137187123
F1 Validation Accuracy:  78.17035625274886
Flat Validation Accuracy:  98.44477867602515
epoch is: 2
Train loss: 0.02441543422929628
F1 Validation Accuracy:  77.86826689527075
Flat Validation Accuracy:  98.38211025464268


In [119]:
test_df = pd.read_csv('data/test.csv')
test_labels_df = pd.read_csv('data/test_labels.csv')
test_df = test_df.merge(test_labels_df, on='id', how='left')
test_label_cols = list(test_df.columns[2:])
print('Null values: ', test_df.isnull().values.any())
print('Same columns between train and test: ', label_cols == test_label_cols)
test_df.head()

Null values:  False
Same columns between train and test:  True


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,":If you have a look back at the source, the in...",-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,I don't anonymously edit articles at all.,-1,-1,-1,-1,-1,-1


In [120]:
test_df = test_df[~test_df[test_label_cols].eq(-1).any(axis=1)]  # 删除不相关的评论(即评论中包含-1)
test_df['one_hot_labels'] = list(test_df[test_label_cols].values)
test_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,one_hot_labels
5,0001ea8717f6de06,Thank you for understanding. I think very high...,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
7,000247e83dcc1211,:Dear god this site is horrible.,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
11,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
13,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ...",0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
14,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l...",0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"


In [121]:
test_labels = list(test_df.one_hot_labels.values)  # 标签
test_comments = list(test_df.comment_text.values)

In [122]:
test_encodings = tokenizer(test_comments,
                           truncation=True,
                           max_length=max_length,
                           padding="max_length",
                           return_tensors='pt')
test_input_ids = test_encodings['input_ids']
test_token_type_ids = test_encodings['token_type_ids']
test_attention_masks = test_encodings['attention_mask']

test_labels = torch.tensor(np.array(test_labels))

test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels, test_token_type_ids)
# 测试数据集
test_dataloader = DataLoader(test_data, batch_size=batch_size)

In [123]:
pred_labels_test, true_labels_test = eval_epoch(model, test_dataloader, device)

# 拉平为一维向量
pred_labels_test = np.concatenate(pred_labels_test, axis=0).reshape(-1, )
true_labels_test = np.concatenate(true_labels_test, axis=0).reshape(-1, )

In [124]:
pred_bools_test = np.where(pred_labels_test > 0.5, 1, 0)  # 0.5作为阈值

# Print and save classification report
print('Test F1 Accuracy: ', f1_score(true_labels_test, pred_bools_test))
print('Test Flat Accuracy: ', accuracy_score(true_labels_test, pred_bools_test))

Test F1 Accuracy:  0.6688618468146027
Test Flat Accuracy:  0.969872456156804


### 最优阈值的搜索(网格搜索/optuna实现)

In [125]:
macro_thresholds = np.arange(0.1, 1, 0.1)  # 较大的阈值搜索范围(间隔为0.1)
print(macro_thresholds)

f1_results_macro, flat_acc_results_macro = [], []
for th in macro_thresholds:
    pred_bools_macro = np.where(pred_labels_test > th, 1, 0)
    test_f1_accuracy_macro = f1_score(true_labels_test, pred_bools_macro)
    test_flat_accuracy_macro = accuracy_score(true_labels_test, pred_bools_macro)
    f1_results_macro.append(test_f1_accuracy_macro)
    flat_acc_results_macro.append(test_flat_accuracy_macro)

best_macro_th = macro_thresholds[np.argmax(f1_results_macro)]
print(best_macro_th)  # 较大的阈值搜索范围搜索搜索得到的最优阈值(网格搜索)

micro_thresholds = best_macro_th + np.arange(-0.09, 0.1, 0.01)  # 较少的阈值搜索范围(间隔为0.01)
print(micro_thresholds)

f1_results_micro, flat_acc_results_micro = [], []
for th in micro_thresholds:
    pred_bools_micro = np.where(pred_labels_test > th, 1, 0)
    test_f1_accuracy_micro = f1_score(true_labels_test, pred_bools_micro)
    test_flat_accuracy_micro = accuracy_score(true_labels_test, pred_bools_micro)
    f1_results_micro.append(test_f1_accuracy_micro)
    flat_acc_results_micro.append(test_flat_accuracy_micro)

best_f1_idx = np.argmax(f1_results_micro)

print('Best Threshold: ', micro_thresholds[best_f1_idx])  # 进一步网格搜索f1值
print('Test F1 Accuracy: ', f1_results_micro[best_f1_idx])
print('Test Flat Accuracy: ', flat_acc_results_micro[best_f1_idx], '\n')

[0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9]
0.7000000000000001
[0.61 0.62 0.63 0.64 0.65 0.66 0.67 0.68 0.69 0.7  0.71 0.72 0.73 0.74
 0.75 0.76 0.77 0.78 0.79]
Best Threshold:  0.71
Test F1 Accuracy:  0.6778794572420321
Test Flat Accuracy:  0.9734075255035585 



In [131]:
def objective(trial):
    """待优化目标函数"""
    threshold_x = trial.suggest_float('x', 0.01, 0.99)
    pred_bools_optuna = np.where(pred_labels_test > threshold_x, 1, 0)
    test_f1_accuracy_micro = f1_score(true_labels_test, pred_bools_optuna)
    return test_f1_accuracy_micro


# 使用optuna寻找最优化阈值
study = optuna.create_study(direction="maximize")  # f1值越大越好
study.optimize(objective,
               n_trials=250)  # 实验250次

[32m[I 2022-04-27 18:01:44,380][0m A new study created in memory with name: no-name-d204f76c-0bef-4fe4-97bf-d8d86e8c5e7e[0m
[32m[I 2022-04-27 18:01:44,507][0m Trial 0 finished with value: 0.6779607744531053 and parameters: {'x': 0.7014568637586571}. Best is trial 0 with value: 0.6779607744531053.[0m
[32m[I 2022-04-27 18:01:44,636][0m Trial 1 finished with value: 0.5902281580411797 and parameters: {'x': 0.07864435210501272}. Best is trial 0 with value: 0.6779607744531053.[0m
[32m[I 2022-04-27 18:01:44,760][0m Trial 2 finished with value: 0.6740419978645154 and parameters: {'x': 0.5811550274528955}. Best is trial 0 with value: 0.6779607744531053.[0m
[32m[I 2022-04-27 18:01:44,882][0m Trial 3 finished with value: 0.6456383942046484 and parameters: {'x': 0.922282784850736}. Best is trial 0 with value: 0.6779607744531053.[0m
[32m[I 2022-04-27 18:01:45,010][0m Trial 4 finished with value: 0.6750041030690956 and parameters: {'x': 0.7793894187559838}. Best is trial 0 with valu

In [132]:
# 最优f1值(优于网格搜索结果)
print(study.best_value)

# 最优阈值结果
print(study.best_params)

0.6780601627041685
{'x': 0.7086083025633505}
