In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertConfig, BertModel, AdamW
import torch.utils.data as tud
import random
import torch.nn as nn
import torch.optim as optim
import torch
import torch.nn.functional as F
import torch as t
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, train_test_split
import time

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# --- Reproducibility -------------------------------------------------------
# Seed every random number generator this notebook touches.
SEED = 2020
random.seed(SEED)
np.random.seed(SEED)
t.manual_seed(SEED)
if t.cuda.is_available():
    t.cuda.manual_seed(SEED)

# --- Hardware --------------------------------------------------------------
# The first id is the primary device; DataParallel is enabled further down
# whenever more than one id is listed.
gpu_ids = [7, 5, 4]
MULTI_GPU = len(gpu_ids) > 1
if t.cuda.is_available():
    device = t.device('cuda:' + str(gpu_ids[0]))
else:
    device = t.device('cpu')

# --- Tokenizer, label space and hyper-parameters ---------------------------
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
MAX_LEN = 256
# Each row of topic2id.csv contributes one entry: column 1 -> column 0.
id2tag = {row[1]: row[0] for row in pd.read_csv('./data/topic2id.csv').values}
N_TAG = len(id2tag)
N_FOLD = 5
BATCH_SIZE = 60
N_EPOCH = 20

# --- Training data ---------------------------------------------------------
df_train = pd.read_csv("./data/train.csv")

In [3]:
class ZHDataset(tud.Dataset):
    """Map-style dataset yielding tokenised question text and a multi-hot tag label.

    Args:
        datas: Indexable collection of ``(index, row)`` pairs (for example
            ``list(df.iterrows())``). Only ``row`` is used; it must expose
            ``question_detail`` and ``question_title`` and may expose
            ``tag_ids``.
        max_len: Value forwarded to the tokenizer as ``max_length``.

    Each item is a tuple ``(input_ids, atten_mask, label)``:
        * ``input_ids`` / ``atten_mask`` -- ``LongTensor`` built from the
          tokenizer's ``input_ids`` and ``attention_mask``.
        * ``label`` -- ``FloatTensor`` of length ``N_TAG``; position ``i - 1``
          is 1 for every tag id ``i`` listed in the ``|``-separated
          ``tag_ids`` field. Rows without a ``tag_ids`` attribute, or whose
          ``tag_ids`` is null, get an all-zero label.

    Relies on the module-level ``tokenizer`` and ``N_TAG``.
    """

    def __init__(self, datas, max_len):
        self.datas = datas
        self.max_len = max_len

    def __len__(self):
        return len(self.datas)

    def __getitem__(self, idx):
        data = self.datas[idx][1]

        # Prefer the detailed question body; fall back to the title when it is missing.
        if not pd.isnull(data.question_detail):
            text = data.question_detail
        else:
            text = data.question_title

        seq_dict = tokenizer.encode_plus(text, max_length=self.max_len, pad_to_max_length=True, add_special_tokens=True)
        input_ids = t.LongTensor(seq_dict['input_ids'])
        atten_mask = t.LongTensor(seq_dict['attention_mask'])

        label = [0] * N_TAG
        tag_ids = getattr(data, "tag_ids", None)
        # A missing column (e.g. unlabeled test data) or a null cell means "no tags".
        # Previously a NaN cell reached int('nan') and raised ValueError.
        if tag_ids is not None and not pd.isnull(tag_ids):
            for piece in str(tag_ids).split('|'):
                piece = piece.strip()
                if not piece:
                    # Tolerate stray separators such as a trailing '|'.
                    continue
                # Tag ids are 1-based in the data; label positions are 0-based.
                label[int(piece) - 1] = 1
        label = t.FloatTensor(label)
        return input_ids, atten_mask, label

In [4]:
class OurBert(nn.Module):
    """BERT encoder ('bert-base-chinese') followed by a linear classification head.

    Args:
        n_classes: Number of output logits produced by the head.
    """

    def __init__(self, n_classes):
        super().__init__()
        # Ask the encoder to also return its per-layer hidden states.
        bert_config = BertConfig.from_pretrained('bert-base-chinese', output_hidden_states=True)
        self.bert = BertModel.from_pretrained('bert-base-chinese', config=bert_config)
        self.fc = nn.Linear(768, n_classes)

    def forward(self, input_ids, atten_mask):
        """Return one logit vector per input sequence ([batch, n_classes] per the original notes)."""
        encoder_out = self.bert(input_ids, atten_mask)
        # Element 2 of the encoder output is treated as the sequence of hidden
        # states; only the last four entries are kept.
        last_four = encoder_out[2][-4:]
        # Stack them on a new trailing axis and take the element-wise maximum
        # across the four (original note: [batch, seqlen, hidden_size]).
        pooled, _ = torch.stack(last_four, dim=-1).max(dim=-1)
        # Classify from the vector at sequence position 0.
        first_token = pooled[:, 0, :]
        return self.fc(first_token)

In [5]:
def train(data_iter, model, loss_fn, optimizer):
    """Run one optimisation pass over ``data_iter``.

    Args:
        data_iter: Iterable of ``(input_ids, atten_mask, label)`` batches.
        model: Callable as ``model(input_ids, atten_mask)``; switched to train mode.
        loss_fn: Callable as ``loss_fn(output, label)`` returning a scalar tensor.
        optimizer: Optimizer stepped once per batch.

    Returns:
        Sum of the per-batch loss values divided by the number of samples seen.
    """
    model.train()
    running_loss = 0.
    n_seen = 0.
    for input_ids, atten_mask, label in data_iter:
        # Move the whole batch onto the module-level target device.
        input_ids = input_ids.to(device)
        atten_mask = atten_mask.to(device)
        label = label.to(device)

        batch_loss = loss_fn(model(input_ids, atten_mask), label)

        # Standard step: clear stale gradients, back-propagate, update.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        n_seen += len(input_ids)
    return running_loss / n_seen

def evaluate(data_iter, model, loss_fn):
    """Compute the loss over ``data_iter`` without updating the model.

    Args:
        data_iter: Iterable of ``(input_ids, atten_mask, label)`` batches.
        model: Callable as ``model(input_ids, atten_mask)``.
        loss_fn: Callable as ``loss_fn(output, label)`` returning a scalar tensor.

    Returns:
        Sum of the per-batch loss values divided by the number of samples seen.

    Note:
        The model is put in eval mode for the pass and is always left in
        train mode afterwards.
    """
    model.eval()
    loss_sum = 0.
    n_seen = 0.
    with torch.no_grad():
        for input_ids, atten_mask, label in data_iter:
            input_ids = input_ids.to(device)
            atten_mask = atten_mask.to(device)
            label = label.to(device)

            logits = model(input_ids, atten_mask)
            loss_sum += loss_fn(logits, label).item()
            n_seen += len(input_ids)
    # Hand the model back ready for further training.
    model.train()
    return loss_sum / n_seen

# Single-Model Training

In [6]:
# Keep the rows as a plain Python list of (index, Series) pairs.
# Wrapping them in np.array() builds a ragged object array, which recent
# NumPy releases reject with a ValueError; train_test_split and ZHDataset
# both work on lists, and the split indices are unchanged.
df_train_iter = list(df_train.iterrows())
train_data, val_data = train_test_split(df_train_iter, test_size=0.01, random_state=SEED)
train_data = ZHDataset(train_data, MAX_LEN)
val_data = ZHDataset(val_data, MAX_LEN)

train_iter = tud.DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE)
val_iter = tud.DataLoader(val_data, shuffle=False, batch_size=BATCH_SIZE)

model = OurBert(N_TAG)
if MULTI_GPU:
    model = nn.DataParallel(model, device_ids=gpu_ids)
model = model.to(device)
# Summed (not averaged) loss; train()/evaluate() divide by the sample count.
loss_fn = nn.BCEWithLogitsLoss(reduction='sum')

# (name, parameter) pairs of the model.
param_optimizer = list(model.named_parameters())
# Parameters whose name contains any of these substrings get no weight decay.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)

val_history = []
for e in range(N_EPOCH):
    start = time.time()
    train_loss = train(train_iter, model, loss_fn, optimizer)
    val_loss = evaluate(val_iter, model, loss_fn)
    end = time.time()

    # Checkpoint only when this epoch beats every previous validation loss.
    if len(val_history) == 0 or min(val_history) > val_loss:
        # Under DataParallel the real network lives in .module; save that so
        # the checkpoint loads into a bare OurBert.
        state_dict = model.module.state_dict() if MULTI_GPU else model.state_dict()
        t.save(state_dict, './models/model_alldetail.pt')
    val_history.append(val_loss)
    print("Epoch: {}, Cost: {} m".format(e+1, (int)((end-start)/60)))
    print("train loss:{:.4f}, val loss:{:.4f}".format(train_loss, val_loss))

  """Entry point for launching an IPython kernel.


Epoch: 1, Cost: 102 m
train loss:175.5247, val loss:22.8703
Epoch: 2, Cost: 102 m
train loss:19.6697, val loss:17.4917
Epoch: 3, Cost: 102 m
train loss:16.4822, val loss:16.0587
Epoch: 4, Cost: 102 m
train loss:15.1026, val loss:15.3922
Epoch: 5, Cost: 103 m
train loss:14.2153, val loss:15.0587
Epoch: 6, Cost: 102 m
train loss:13.5262, val loss:14.8989
Epoch: 7, Cost: 103 m
train loss:12.9516, val loss:14.8116
Epoch: 8, Cost: 103 m
train loss:12.4478, val loss:14.8028
Epoch: 9, Cost: 103 m
train loss:11.9969, val loss:14.7421
Epoch: 10, Cost: 103 m
train loss:11.5745, val loss:14.7913
Epoch: 11, Cost: 102 m
train loss:11.1844, val loss:14.8812
Epoch: 12, Cost: 102 m
train loss:10.8096, val loss:14.9377
Epoch: 13, Cost: 102 m
train loss:10.4552, val loss:15.0077


KeyboardInterrupt: 

# Cross Validation

In [None]:
# random_state only has an effect (and is only accepted by current
# scikit-learn) when shuffling is enabled.
skf = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)
# Plain list of (index, Series) pairs; np.array() over these builds a ragged
# object array that recent NumPy releases reject.
df_train_iter = list(df_train.iterrows())
# Validation is run every EVAL_EVERY epochs to save time.
EVAL_EVERY = 2

# Split on positions so the folds can be gathered from the list by index.
for fold, (train_idx, val_idx) in enumerate(skf.split(np.arange(len(df_train_iter)))):
    train_data = ZHDataset([df_train_iter[i] for i in train_idx], MAX_LEN)
    val_data = ZHDataset([df_train_iter[i] for i in val_idx], MAX_LEN)

    train_iter = tud.DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE)
    val_iter = tud.DataLoader(val_data, shuffle=False, batch_size=BATCH_SIZE)

    # Fresh model and optimizer for every fold.
    model = OurBert(N_TAG)
    if MULTI_GPU:
        model = nn.DataParallel(model, device_ids=gpu_ids)
    model = model.to(device)
    loss_fn = nn.BCEWithLogitsLoss(reduction='sum')
    optimizer = optim.Adam(model.parameters(), lr=2e-5)

    val_history = []
    for e in range(N_EPOCH):
        start = time.time()
        train_loss = train(train_iter, model, loss_fn, optimizer)
        evaluated = (e + 1) % EVAL_EVERY == 0
        if evaluated:
            val_loss = evaluate(val_iter, model, loss_fn)
        end = time.time()

        # Only epochs that were actually evaluated may checkpoint or extend
        # the history. Previously the first epoch read an undefined val_loss,
        # or one left over from an earlier cell or fold.
        if evaluated:
            if len(val_history) == 0 or min(val_history) > val_loss:
                # Under DataParallel the real network lives in .module.
                state_dict = model.module.state_dict() if MULTI_GPU else model.state_dict()
                t.save(state_dict, './models/model_{}.pt'.format(fold))
            val_history.append(val_loss)

        print("Fold: {}, Epoch: {}, Cost: {} m".format(fold, e+1, (int)((end-start)/60)))
        if evaluated:
            print("train loss:{:.4f}, val loss:{:.4f}".format(train_loss, val_loss))
        else:
            print("train loss:{:.4f}".format(train_loss))


# Test

In [None]:
df_test = pd.read_csv('./data/test.csv')
test_data = ZHDataset(list(df_test.iterrows()), MAX_LEN)
# shuffle=False keeps predictions aligned with the row order of df_test.
test_iter = tud.DataLoader(test_data, shuffle=False, batch_size=BATCH_SIZE)

# Checkpoints whose logits are summed into the final prediction. Add more
# paths here to ensemble several models.
# NOTE(review): no cell visible in this notebook writes 'model_sigle2.pt'
# (the training cells save 'model_alldetail.pt' and 'model_{fold}.pt');
# confirm this is the intended checkpoint.
model_paths = ['./models/model_sigle2.pt']

test_labels = []
for model_path in model_paths:
    best_model = OurBert(N_TAG).to(device)
    best_model.load_state_dict(t.load(model_path, map_location=device))
    # Explicitly switch to inference mode so layers such as dropout are
    # disabled regardless of how the model was constructed.
    best_model.eval()
    test_label_fold = []
    with torch.no_grad():
        for input_ids, atten_mask, _ in test_iter:
            input_ids, atten_mask = input_ids.to(device), atten_mask.to(device)
            output = best_model(input_ids, atten_mask)  # [batch, n_classes]
            test_label_fold.extend(output.detach().cpu().numpy())
    test_labels.append(test_label_fold)
test_labels = np.array(test_labels)  # [n_models, N, n_class]
test_labels = test_labels.sum(0)  # [N, n_class]
test_labels = torch.from_numpy(test_labels)
value, arg_idx = torch.topk(test_labels, 5, dim=-1)  # [N, 5]
# Label positions are 0-based while tag ids are 1-based (see ZHDataset).
test_labels = arg_idx.cpu().numpy()+1

# One column per predicted tag, best first.
submit_columns = {'question_id': df_test.question_id}
for rank in range(5):
    submit_columns[str(rank)] = test_labels[:, rank]
submit = pd.DataFrame(submit_columns)
submit.to_csv("./res/submit_test.csv", index=False, header=False)