In [1]:
# List the dataset directories currently mounted under the input path

!ls /home/kesci/input/

In [2]:
# List the files in the personal persistent workspace
!ls /home/kesci/work/

In [3]:
# List the packages installed for the current kernel
!pip list --format=columns

In [4]:
# Show the run time of each cell
%load_ext klab-autotime

In [5]:
# Install the pinned transformers release (2.8.0) from the Tsinghua PyPI mirror
!pip install -i https://pypi.tuna.tsinghua.edu.cn/simple transformers==2.8.0

# 1.导入运行所需包

In [1]:
from tqdm import tqdm, trange
import numpy as np
import pandas as pd
import argparse
import logging
from sklearn.preprocessing import LabelEncoder
import time
import torch
import random
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import Dataset, Subset, DataLoader
import os

from transformers import BertTokenizer, AdamW, BertModel, BertPreTrainedModel, BertConfig,get_linear_schedule_with_warmup,XLNetModel,XLNetTokenizer,XLNetConfig
from transformers.optimization import get_linear_schedule_with_warmup
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.metrics import mean_absolute_error, accuracy_score, f1_score
from sklearn.utils import shuffle


# 2.超参数配置
其中 semi 为带有模型预测标签的测试集：这些标签由相同模型、相同参数得出，区别仅在于当时的训练集为原始训练集（该模型在测试集上的得分为 0.66411980）。此处将 semi 并入训练集，采用半监督方式进行再训练。

In [9]:


# Run on the GPU when one is available; otherwise fall back to the CPU so the
# notebook does not fail at the first `.to(device)` on a machine without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Directory of the pretrained checkpoint. Other checkpoints that were left
# commented out in the original cell:
#   '../chinese_wwm_pytorch/', '../chinese_xlnet_mid_pytorch/', "../MC-BERT/"
model_path = "/home/kesci/input/model2308/chinese_roberta_wwm_large_ext_pytorch/"


# output_hidden_states=True is forwarded to the config, so the encoder also
# returns its hidden states in addition to the sequence and pooled outputs.
bert_config = BertConfig.from_pretrained(model_path + 'bert_config.json', output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained(model_path + 'vocab.txt', config=bert_config)


# Seed every random number generator the notebook uses (Python, NumPy, torch
# CPU and all CUDA devices) so that fold splits and shuffling are repeatable.
seed = 2020
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)


# Hyper-parameters
MAX_LEN = 512            # padded / maximum token length of one sample
epoch = 5                # training epochs per fold
num_class = 6            # number of label columns (category_A .. category_F)
learn_rate = 2e-5        # AdamW learning rate
train_batch_size = 4
valid_batch_size = 4     # also used for test-time inference


# NOTE(review): file_path is not referenced anywhere else in the visible notebook.
file_path = './log/'
# Create a logger (no handler is attached in the visible notebook)
logger = logging.getLogger('mylogger')
logger.setLevel(logging.DEBUG)

# ---------------------------------------------------------------------------
# Load the competition tables.
# ---------------------------------------------------------------------------
train = pd.read_csv('/home/kesci/input/data5871/train.csv')
semi = pd.read_csv('/home/kesci/input/data5871/Semi-supervised_test.csv')
# Append the semi-supervised rows to the labelled training rows.
train = pd.concat([train, semi], sort=False)
test = pd.read_csv('/home/kesci/input/data5871/nlp_test.csv')
sub = pd.read_csv('/home/kesci/input/data5871/sample_submission.csv')

# Raw question texts as NumPy string arrays.
train_content = train['Question Sentence'].values.astype(str)
test_content = test['Question Sentence'].values.astype(str)

# One integer vector per label column of the training table.
category_A, category_B, category_C, category_D, category_E, category_F = (
    train[column].astype(int).values
    for column in ('category_A', 'category_B', 'category_C',
                   'category_D', 'category_E', 'category_F')
)

# The test table has no labels: use all-zero placeholders of matching length.
(test_category_A, test_category_B, test_category_C,
 test_category_D, test_category_E, test_category_F) = (
    [0] * len(test) for _ in range(6)
)

# Stack the per-column vectors into (n_samples, 6) label matrices.
train_label = np.column_stack(
    [category_A, category_B, category_C, category_D, category_E, category_F]
)
test_label = np.column_stack(
    [test_category_A, test_category_B, test_category_C,
     test_category_D, test_category_E, test_category_F]
)

# Buffers for out-of-fold predictions on train and accumulated predictions on test.
oof_train = np.zeros((len(train), num_class), dtype=np.float32)
oof_test = np.zeros((len(test), num_class), dtype=np.float32)

# 3.模型定义

In [10]:
class BertForClass(nn.Module):
    """BERT encoder with a single linear multi-label classification head.

    The head receives the concatenation of (a) the mean of the last-layer token
    vectors and (b) the encoder's pooled output, hence its input width of
    ``2 * hidden_size``. ``forward`` returns raw logits (no sigmoid).

    Args:
        n_classes: number of output logits per sample.
    """

    def __init__(self, n_classes=num_class):
        super(BertForClass, self).__init__()
        self.model_name = 'BertForClass'
        self.bert_model = BertModel.from_pretrained(model_path, config=bert_config)
        self.dropout = nn.Dropout(p=0.2)
        # Kept for compatibility: the multi-sample-dropout variant that used these
        # two attributes is disabled, so forward() does not read them.
        self.multi_drop = 5
        self.multi_dropouts = nn.ModuleList([nn.Dropout(0.2) for _ in range(self.multi_drop)])
        self.classifier = nn.Linear(bert_config.hidden_size * 2, n_classes)

    def forward(self, input_ids, input_masks, segment_ids):
        """Return classification logits for a batch.

        Args:
            input_ids: token ids, passed to the encoder as ``input_ids``.
            input_masks: passed to the encoder as ``attention_mask``.
            segment_ids: passed to the encoder as ``token_type_ids``.

        Returns:
            Tensor of logits with ``n_classes`` values per sample.
        """
        encoder_outputs = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids,
                                          attention_mask=input_masks)
        # Index the outputs instead of unpacking a fixed-length tuple, so forward()
        # does not break when the number of returned items changes (for example
        # when the config no longer requests the hidden states).
        sequence_output = encoder_outputs[0]
        pooler_output = encoder_outputs[1]

        # NOTE(review): this mean runs over every position, padded ones included;
        # consider weighting by input_masks if padding should be excluded. Left
        # unchanged so existing checkpoints keep producing the same outputs.
        seq_avg = torch.mean(sequence_output, dim=1)
        concat_out = torch.cat((seq_avg, pooler_output), dim=1)
        logit = self.classifier(self.dropout(concat_out))

        return logit


# 4.数据格式定义
模型输入的最大长度为 512；对于过长的文本，采用截取前 255 个字符与后 255 个字符并拼接的方式。

In [11]:
class data_generator:
    """Iterate over ``[texts, labels]`` in padded mini-batches.

    Each yielded batch is a 4-tuple of plain Python lists
    ``(input_ids, input_masks, segment_ids, labels)``. Every id / mask / segment
    row is right-padded with zeros up to ``max_length``. Texts are tokenized
    with the module-level ``tokenizer``.

    Args:
        data: two-element sequence ``[texts, labels]`` of equal length.
        batch_size: samples per batch; the final batch may be smaller.
        max_length: maximum and padded sequence length.
        shuffle: when True, the sample order is reshuffled with ``np.random``
            at the start of every iteration.
    """

    def __init__(self, data, batch_size=16, max_length=MAX_LEN, shuffle=False):
        self.data = data
        self.batch_size = batch_size
        self.max_length = max_length
        self.shuffle = shuffle
        # Number of batches per pass: ceil(n_samples / batch_size).
        self.steps = len(self.data[0]) // self.batch_size
        if len(self.data[0]) % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        """Return the number of batches produced by one full iteration."""
        return self.steps

    def _truncate(self, text):
        """Shorten an over-long text to its head plus its tail.

        The previous implementation used ``text[-255:-1]``, which silently dropped
        the final character, and hard-coded 512/255 regardless of ``max_length``.
        """
        # Reserve two positions for the special tokens the tokenizer is expected
        # to add around a single sentence -- TODO confirm for non-BERT tokenizers.
        budget = self.max_length - 2
        half = budget // 2
        if len(text) <= budget or half <= 0:
            # Short enough, or no room for a head/tail split: leave any remaining
            # truncation to tokenizer.encode(max_length=...).
            return text
        return text[:half] + text[-half:]

    def __iter__(self):
        """Yield ``(input_ids, input_masks, segment_ids, labels)`` batches."""
        c, y = self.data
        idxs = list(range(len(c)))
        if self.shuffle:
            np.random.shuffle(idxs)
        input_ids, input_masks, segment_ids, labels = [], [], [], []

        last_position = len(idxs) - 1
        for position, i in enumerate(idxs):
            text = self._truncate(c[i])

            input_id = tokenizer.encode(text, max_length=self.max_length)
            input_mask = [1] * len(input_id)      # 1 marks a real token
            segment_id = [0] * len(input_id)      # single-segment input
            padding_length = self.max_length - len(input_id)
            input_id += ([0] * padding_length)
            input_mask += ([0] * padding_length)  # 0 marks padding
            segment_id += ([0] * padding_length)

            input_ids.append(input_id)
            input_masks.append(input_mask)
            segment_ids.append(segment_id)
            labels.append(y[i])
            # Emit a batch when it is full, or when the last sample has been added.
            if len(input_ids) == self.batch_size or position == last_position:
                yield input_ids, input_masks, segment_ids, labels
                input_ids, input_masks, segment_ids, labels = [], [], [], []





# 5.采用5折交叉验证的方式训练

In [12]:
# 5-fold cross-validation. For every fold: fine-tune a fresh model, keep the
# checkpoint with the best validation macro-F1, store its validation
# probabilities in oof_train, and add its test probabilities to oof_test.
kf = KFold(n_splits=5, shuffle=True, random_state=seed)


for fold, (train_index, valid_index) in enumerate(kf.split(train_content, train_label)):
    print('\n\n------------fold:{}------------\n'.format(fold))
    c = train_content[train_index]
    y = train_label[train_index]

    val_c = train_content[valid_index]
    val_y = train_label[valid_index]

    train_D = data_generator([c, y], batch_size=train_batch_size, shuffle=True)
    val_D = data_generator([val_c, val_y], batch_size=valid_batch_size)

    model = BertForClass().to(device)
    # BCEWithLogitsLoss fuses the sigmoid and the binary cross-entropy in one step.
    loss_fn = nn.BCEWithLogitsLoss()

    # The schedule must be sized by the optimizer steps this fold really takes.
    # It was previously derived from len(train) (the whole data set), so the
    # linear decay never reached its end within the fold.
    steps_per_epoch = len(train_D)
    num_train_steps = steps_per_epoch * epoch
    param_optimizer = list(model.named_parameters())
    # Biases and LayerNorm parameters are excluded from weight decay.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_parameters, lr=learn_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=steps_per_epoch // 2,  # warm up over half an epoch
        num_training_steps=num_train_steps
    )

    best_f1 = 0
    PATH = '/home/kesci/work/bert_{}.pth'.format(fold)
    for e in range(epoch):
        print('\n------------epoch:{}------------'.format(e))
        model.train()
        f1_sum = 0.0
        loss_sum = 0.0
        num_batches = 0
        tq = tqdm(train_D)

        for input_ids, input_masks, segment_ids, labels in tq:
            input_ids = torch.tensor(input_ids).to(device)
            input_masks = torch.tensor(input_masks).to(device)
            segment_ids = torch.tensor(segment_ids).to(device)
            label_t = torch.tensor(np.array(labels), dtype=torch.float).to(device)

            y_pred = model(input_ids, input_masks, segment_ids)

            loss = loss_fn(y_pred, label_t)
            loss.backward()
            # (The PGD adversarial-training variant stays disabled, as before.)

            # Gradient step, then advance the learning-rate schedule.
            optimizer.step()
            scheduler.step()
            model.zero_grad()

            # Running training metrics for the progress bar. Both the loss and
            # the F1 are per-batch values, so they are averaged over the number
            # of batches (they used to be divided by the number of samples).
            batch_pred = (y_pred.sigmoid().detach().to("cpu").numpy() > 0.5).astype(int)
            f1_sum += f1_score(np.array(labels), batch_pred, average="macro")
            loss_sum += loss.item()
            num_batches += 1
            tq.set_postfix(fold=fold, epoch=e, loss=loss_sum / num_batches, f1=f1_sum / num_batches)

        # ---- validation -----------------------------------------------------
        model.eval()
        val_batches = []
        with torch.no_grad():
            for input_ids, input_masks, segment_ids, labels in tqdm(val_D):
                input_ids = torch.tensor(input_ids).to(device)
                input_masks = torch.tensor(input_masks).to(device)
                segment_ids = torch.tensor(segment_ids).to(device)

                y_pred = model(input_ids, input_masks, segment_ids)
                val_batches.append(y_pred.sigmoid().to("cpu").numpy())

        val_prob = np.vstack(val_batches)        # validation probabilities, in val_D order
        y_p = (val_prob > 0.5).astype(int)
        f1 = f1_score(val_y, y_p, average="macro")  # (y_true, y_pred)
        print("best_f1:{}  f1:{}\n".format(best_f1, f1))
        if f1 >= best_f1:
            best_f1 = f1
            oof_train[valid_index] = val_prob
            # The whole module is pickled (not just a state_dict); the reload
            # below relies on that format.
            torch.save(model, PATH)

    # Release the last-epoch weights and the optimizer state before loading the
    # best checkpoint. The optimizer and the parameter lists also hold
    # references to the weights, so deleting only the model would not free them.
    del model, optimizer, scheduler, optimizer_parameters, param_optimizer
    torch.cuda.empty_cache()

    # ---- test-time inference with the best checkpoint of this fold ----------
    test_D = data_generator([test_content, test_label], batch_size=valid_batch_size)
    model = torch.load(PATH).to(device)
    model.eval()
    test_batches = []
    with torch.no_grad():
        for input_ids, input_masks, segment_ids, labels in tqdm(test_D):
            input_ids = torch.tensor(input_ids).to(device)
            input_masks = torch.tensor(input_masks).to(device)
            segment_ids = torch.tensor(segment_ids).to(device)

            y_pred = model(input_ids, input_masks, segment_ids)
            test_batches.append(y_pred.sigmoid().to("cpu").numpy())

    # Accumulate this fold's test probabilities; they are averaged after the loop.
    oof_test += np.vstack(test_batches)

    del model
    torch.cuda.empty_cache()


# Load the klab-autotime extension (the same extension is already loaded near the top of the notebook)
%load_ext klab-autotime

# 6.导出结果

In [13]:
# Average the accumulated fold predictions. The result goes into a new array:
# dividing oof_test in place made the cell non-idempotent, because every re-run
# divided it again. The fold count comes from the splitter, not a literal.
n_folds = kf.get_n_splits()
test_prob = oof_test / n_folds

# Threshold the averaged probabilities into 0/1 labels.
res = (test_prob > 0.5).astype(int)

save_result_path = '/home/kesci/work/'
os.makedirs(save_result_path, exist_ok=True)

# Write one label column per category into the submission template.
label_columns = ["category_A", "category_B", "category_C",
                 "category_D", "category_E", "category_F"]
for column_position, column_name in enumerate(label_columns):
    sub[column_name] = res[:, column_position]
sub.to_csv(os.path.join(save_result_path, "result.csv"), index=False)


In [1]:
# Download the kesci_submit tool (non-verbose wget) and mark it executable
!wget -nv -O kesci_submit https://cdn.kesci.com/submit_tool/v4/kesci_submit&&chmod +x kesci_submit

In [2]:
!./kesci_submit -token 9723d983ffd8620e -file /home/kesci/work/result.csv