In [6]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertConfig, BertModel, AdamW
import torch.utils.data as tud
import random
import torch.nn as nn
import torch.optim as optim
import torch
import torch.nn.functional as F
import torch as t
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, train_test_split
import time

In [7]:
# ---------------------------------------------------------------------------
# Reproducibility: seed every random number generator used by the notebook.
# ---------------------------------------------------------------------------
SEED = 2020
random.seed(SEED)
np.random.seed(SEED)
t.manual_seed(SEED)
if t.cuda.is_available():
    t.cuda.manual_seed(SEED)

# ---------------------------------------------------------------------------
# Device selection: the first entry of `gpu_ids` is the device everything is
# moved to; MULTI_GPU is simply "more than one id was listed".
# ---------------------------------------------------------------------------
gpu_ids = [2]
MULTI_GPU = len(gpu_ids) > 1
device = t.device(f'cuda:{gpu_ids[0]}' if t.cuda.is_available() else 'cpu')

# ---------------------------------------------------------------------------
# Tokenizer, hyper-parameters and input data.
# ---------------------------------------------------------------------------
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
MAX_LEN = 256      # max_length handed to the tokenizer for every sample
# Maps the second column of topic2id.csv to its first column
# (presumably tag id -> topic name; verify against the CSV).
id2tag = {row[1]: row[0] for row in pd.read_csv('./data/topic2id.csv').values}
N_TAG = len(id2tag)  # size of the multi-hot label vector / classifier output
N_FOLD = 5
BATCH_SIZE = 350
N_EPOCH = 20
df_train = pd.read_csv("./data/train.csv")

In [8]:
class ZHDataset(tud.Dataset):
    """Dataset that turns question rows into BERT inputs plus a multi-hot label.

    Relies on the notebook-level ``tokenizer`` and ``N_TAG`` globals.

    Args:
        datas: indexable collection of ``(index, row)`` pairs, e.g.
            ``list(df.iterrows())``. Each row must expose ``question_title``
            and ``question_detail``; ``tag_ids`` is optional (it is absent
            for the test set).
        max_len: value passed as ``max_length`` to ``tokenizer.encode_plus``
            (together with ``pad_to_max_length=True``).
    """

    def __init__(self, datas, max_len):
        self.datas = datas
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.datas)

    def __getitem__(self, idx):
        """Encode one sample.

        Returns:
            tuple: ``(input_ids, atten_mask, label)`` where the first two are
            ``LongTensor`` built from the tokenizer output and ``label`` is a
            ``FloatTensor`` of length ``N_TAG`` with a 1 at ``tag_id - 1`` for
            every id listed in ``tag_ids`` (all zeros when the row carries no
            tags).
        """
        data = self.datas[idx][1]

        # A missing title is read by pandas as NaN (a float); treat it as an
        # empty string so concatenation and tokenization do not fail.
        title = data.question_title
        text = "" if pd.isnull(title) else str(title)
        detail = data.question_detail
        if not pd.isnull(detail):
            text += " " + str(detail)

        # NOTE: `pad_to_max_length` is kept for compatibility with the
        # transformers release this notebook was written against; newer
        # releases spell it `padding='max_length', truncation=True`.
        seq_dict = tokenizer.encode_plus(text, max_length=self.max_len, pad_to_max_length=True, add_special_tokens=True)
        input_ids = t.LongTensor(seq_dict['input_ids'])
        atten_mask = t.LongTensor(seq_dict['attention_mask'])

        label = [0] * N_TAG
        # `tag_ids` is a '|'-separated list of 1-based tag ids. Skip it when
        # the column is absent (test data) or the cell is empty (NaN), which
        # would otherwise become the string 'nan' and crash int().
        tag_ids = getattr(data, "tag_ids", None)
        if tag_ids is not None and not pd.isnull(tag_ids):
            for raw_id in str(tag_ids).split('|'):
                label[int(raw_id) - 1] = 1
        label = t.FloatTensor(label)
        return input_ids, atten_mask, label

In [9]:
class OurBert(nn.Module):
    """BERT encoder followed by a linear multi-label classification head.

    Args:
        n_classes: number of output logits (one per tag).
        pretrained_name: name or path of the pretrained BERT checkpoint.
            Defaults to ``'bert-base-chinese'``, the checkpoint the notebook
            was written for.
    """

    def __init__(self, n_classes, pretrained_name='bert-base-chinese'):
        super().__init__()
        # output_hidden_states=True makes the encoder also return the hidden
        # states of every layer, which forward() pools over.
        config = BertConfig.from_pretrained(pretrained_name, output_hidden_states=True)
        self.bert = BertModel.from_pretrained(pretrained_name, config=config)
        # Take the width from the config instead of hard-coding 768 so other
        # checkpoint sizes work too (bert-base-chinese still yields 768).
        self.fc = nn.Linear(config.hidden_size, n_classes)

    def forward(self, input_ids, atten_mask):
        """Return raw (pre-sigmoid) logits for a batch.

        Args:
            input_ids: token ids, passed straight to the BERT encoder.
            atten_mask: attention mask matching ``input_ids``.

        Returns:
            Tensor produced by the linear head from the first-token vector,
            i.e. one row of ``n_classes`` logits per sample.
        """
        # `output` is a tuple-like; index 2 holds the per-layer hidden states
        # because output_hidden_states=True was set on the config.
        output = self.bert(input_ids, attention_mask=atten_mask)
        hidden_states = output[2][-4:]
        # Element-wise max over the last four layers.
        hidden = torch.stack(hidden_states, dim=-1).max(dim=-1)[0]  # [batch, seqlen, hidden_size]
        # Classify from the first token ([CLS]) representation.
        return self.fc(hidden[:, 0, :])  # [batch, n_classes]

# Test: predict tags for the test set and write the submission file

In [11]:
# --- Inference settings ----------------------------------------------------
TOP_K = 5                 # number of tag slots per question in the submission
PROB_THRESHOLD = 0.55     # slots after the first are kept only above this probability
MODEL_PATHS = ['./models/model_sigle1.pt']  # one checkpoint per ensemble member

# --- Data --------------------------------------------------------------------
df_test = pd.read_csv('./data/test.csv')
test_data = ZHDataset(list(df_test.iterrows()), MAX_LEN)
test_iter = tud.DataLoader(test_data, shuffle=False, batch_size=BATCH_SIZE)

# --- Collect logits from every checkpoint -----------------------------------
test_labels = []
for model_path in MODEL_PATHS:
    best_model = OurBert(N_TAG).to(device)
    best_model.load_state_dict(t.load(model_path, map_location=device))
    # Make inference mode explicit for the whole wrapper (disables dropout)
    # instead of relying on how the sub-modules happened to be constructed.
    best_model.eval()
    test_label_fold = []
    with torch.no_grad():
        for input_ids, atten_mask, _ in test_iter:
            input_ids, atten_mask = input_ids.to(device), atten_mask.to(device)
            output = best_model(input_ids, atten_mask)  # [batch, n_classes]
            test_label_fold.append(output.cpu().numpy())
    test_labels.append(np.concatenate(test_label_fold, axis=0))  # [N, n_class]

# Average (not sum) the logits across checkpoints so the sigmoid threshold
# below keeps its meaning when more than one model is ensembled; with a
# single checkpoint the result is identical to the sum.
test_labels = np.stack(test_labels).mean(0)  # [N, n_class]
test_labels = torch.from_numpy(test_labels)

# --- Pick the top-k tags and blank out low-confidence ones -------------------
value, arg_idx = torch.topk(test_labels, TOP_K, dim=-1)  # both [N, TOP_K]
n_sample = test_labels.shape[0]
below_threshold = t.sigmoid(value) < PROB_THRESHOLD
below_threshold[:, 0] = False  # the best tag is always kept
# Tag ids are 1-based (the dataset writes label[tag_id - 1]), hence the +1;
# rejected slots are filled with -1.
test_res = torch.where(below_threshold, torch.full_like(arg_idx, -1), arg_idx + 1).numpy()

# --- Write the submission ----------------------------------------------------
submit = pd.DataFrame({'question_id': df_test.question_id,
                       **{str(j): test_res[:, j] for j in range(TOP_K)}})
# header=False: the file is written without a header row.
submit.to_csv("./res/submit_test_question0.55.csv", index=False, header=False)

In [6]:
# Shell check: list a submission file on disk.
# NOTE(review): nothing visible in this notebook writes submit_test_pad-1.csv,
# and this cell's execution count is lower than the cells above it — it looks
# like a leftover from an earlier session; confirm before relying on it.
!ls -alh ./res/submit_test_pad-1.csv


# Turn the top-k indices from the inference cell into ids by adding 1.
# This overwrites `test_labels` (previously the logits tensor).
test_labels = arg_idx.cpu().numpy()+1
# Four extra columns per row, all filled with -1.
pad_res = np.full((test_labels.shape[0], 4), -1)
# Append the -1 columns to the right of the id columns.
test_labels = np.concatenate((test_labels, pad_res), axis=1)

-rw-rw-r-- 1 zhongjc zhongjc 3.3M Jul 18 19:43 ./res/submit_test_pad-1.csv


# Tips: optimizer and learning-rate schedule snippets (reference only; this cell was never executed)

In [None]:
def group_parameters_for_weight_decay(model, weight_decay=0.01,
                                      no_decay=('bias', 'LayerNorm.bias', 'LayerNorm.weight')):
    """Split a model's parameters into a decayed and a non-decayed group.

    Args:
        model: any ``torch.nn.Module``.
        weight_decay: decay applied to the first group.
        no_decay: substrings; a parameter whose name contains any of them is
            placed in the second group, which gets ``weight_decay=0.0``.

    Returns:
        list[dict]: two optimizer parameter groups, decayed first.
    """
    decay_params, no_decay_params = [], []
    for name, param in model.named_parameters():
        if any(marker in name for marker in no_decay):
            no_decay_params.append(param)
        else:
            decay_params.append(param)
    return [
        {'params': decay_params, 'weight_decay': weight_decay},
        {'params': no_decay_params, 'weight_decay': 0.0},
    ]


def build_optimizer_and_scheduler(model, steps_per_epoch, n_epochs=4, lr=2e-5,
                                  weight_decay=0.01, warmup_ratio=0.05):
    """Create AdamW plus a linear warmup/decay learning-rate schedule.

    Replaces the ``BertAdam(..., warmup=0.05, t_total=...)`` recipe with the
    equivalent pair of an optimizer and ``get_linear_schedule_with_warmup``.

    Args:
        model: the module to optimise.
        steps_per_epoch: number of batches per epoch, e.g. ``len(train_loader)``.
            Note that this is the number of batches, not training samples.
        n_epochs: number of training epochs. The BERT authors recommend
            between 2 and 4; 4 may already over-fit the training data.
        lr: peak learning rate.
        weight_decay: decay for all parameters except biases and LayerNorm.
        warmup_ratio: fraction of the total steps used for linear warmup.
            Pass ``0`` to get the ``run_glue.py`` default of no warmup.

    Returns:
        tuple: ``(optimizer, scheduler)``. Call ``scheduler.step()`` after
        every ``optimizer.step()``.

    Example:
        optimizer, scheduler = build_optimizer_and_scheduler(model, len(train_loader))
    """
    # Imported here because only this helper needs it.
    from transformers import get_linear_schedule_with_warmup

    # Total number of training steps is [number of batches] x [number of epochs].
    total_steps = steps_per_epoch * n_epochs

    optimizer = optim.AdamW(group_parameters_for_weight_decay(model, weight_decay), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(warmup_ratio * total_steps),
        num_training_steps=total_steps,
    )
    return optimizer, scheduler