In [3]:
import easydict
def get_train_args():
    args = easydict.EasyDict({
            "data_folder": "../../data/train",
            "vocab_file": "../../data/tags/commonTags_post2vec.csv",
            "logging_steps": 500,
            "num_train_epochs": 20,
            "per_gpu_train_batch_size": 2,
            "seed": 42,
            "gradient_accumulation_steps": 4,
            "weight_decay": 0.0,
            "adam_epsilon": 1e-8,
            "warmup_steps":0,
            "code_bert":'microsoft/codebert-base',
            "max_grad_norm":1.0,
            "save_steps": 1000,
            "output_dir": "../results",
            "learning_rate":1e-6,
})
    return args

In [4]:
import sys
sys.path.append("../")
sys.path.append("/usr/src/bert")
import torch
from torch.optim import AdamW
from transformers import BertConfig, get_linear_schedule_with_warmup, AutoTokenizer
from torch.utils.tensorboard import SummaryWriter
import gc
from sklearn import metrics, preprocessing
from datetime import datetime
import pandas as pd
from util.util import seed_everything
from torch.utils.data import DataLoader
from model.loss import loss_fn
from model.model import TBertT
import numpy as np
import logging

import os
import random
import argparse
from data_structure.question import Question, QuestionDataset
import pandas as pd
from util.util import write_tensor_board

logger = logging.getLogger(__name__)

def get_optimizer_scheduler(args, model, train_steps):
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(
            nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate, eps=args.adam_epsilon)
    # optimizer = optim.SGD(model.parameters(), lr=args.learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=train_steps
    )
    return optimizer, scheduler

def get_tag_encoder(vocab_file):
    tab_vocab_path = vocab_file
    tag_vocab = pd.read_csv(tab_vocab_path)
    tag_list = tag_vocab["tag"].astype(str).tolist()
    mlb = preprocessing.MultiLabelBinarizer()
    mlb.fit([tag_list])
    return mlb, len(mlb.classes_)


def init_train_env(args, tbert_type):
    device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()

    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.warning(
        "device: %s, n_gpu: %s",
        device,
        args.n_gpu,
    )
    
    # Set seed
    seed_everything(args.seed)
    
    # get the encoder for tags
    mlb, num_class = get_tag_encoder(args.vocab_file)
    args.mlb = mlb
    args.num_class = num_class
    
    # get the model
    if tbert_type == 'trinity':
        model = TBertT(BertConfig(), args.code_bert, args.num_class)
    else:
        raise Exception("TBERT type not found")
    
    args.tbert_type = tbert_type
    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    return model


In [5]:
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3,4,5,6,7'
args = get_train_args()
model = init_train_env(args, tbert_type='trinity') 

11/22/2021 06:19:45 - INFO - __main__ -   Training/evaluation parameters {'data_folder': '../../data/train', 'vocab_file': '../../data/tags/commonTags_post2vec.csv', 'logging_steps': 500, 'num_train_epochs': 20, 'per_gpu_train_batch_size': 2, 'seed': 42, 'gradient_accumulation_steps': 4, 'weight_decay': 0.0, 'adam_epsilon': 1e-08, 'warmup_steps': 0, 'code_bert': 'microsoft/codebert-base', 'max_grad_norm': 1.0, 'save_steps': 1000, 'output_dir': '../results', 'learning_rate': 1e-06, 'n_gpu': 8, 'device': device(type='cuda'), 'mlb': MultiLabelBinarizer(classes=None, sparse_output=False), 'num_class': 23686, 'tbert_type': 'trinity'}


# Get Files

In [6]:
def get_files_paths_from_directory(input_dir):
    file_paths = []
    for root, dirs, files in os.walk(input_dir):
        for file_name in files:
            file_paths.append(os.path.join(root, file_name))
    file_paths.sort()
    return file_paths

In [7]:
files = get_files_paths_from_directory(args.data_folder)

In [8]:
# total training examples 10279014
train_numbers = 9765063
# 每个epoch有几个batch
epoch_batch_num = train_numbers / args.train_batch_size
# 一共有几个step更新参数
t_total = epoch_batch_num * args.num_train_epochs

optimizer, scheduler = get_optimizer_scheduler(args, model, t_total)

In [9]:
def load_data_to_dataset(mlb,file):
    tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base", local_files_only=True)
    train = pd.read_pickle(file)
    training_set = QuestionDataset(train[:100], mlb, tokenizer)
    return training_set

In [10]:
def get_dataloader(dataset, batch_size):
    data_loader = DataLoader(dataset,
                                batch_size=batch_size,
                                shuffle=True
                                )
    return data_loader

In [11]:
for file_cnt in range(len(files)):
    tr_loss = 0
    # Load dataset and dataloader
    training_set = load_data_to_dataset(args.mlb, files[file_cnt])
    train_size = int(0.95 * len(training_set))
    valid_size = len(training_set) - train_size
    train_dataset, valid_dataset = torch.utils.data.random_split(training_set, [train_size, valid_size])
    args.train_batch_size = 2
    train_data_loader = get_dataloader(train_dataset, args.train_batch_size)
    valid_data_loader = get_dataloader(valid_dataset, args.train_batch_size)
    break
    # print(args.train_batch_size)
    # print('############# FILE {}: Training Start   #############'.format(file_cnt))
    # # Train!
    # for epoch in range(args.num_train_epochs):
    #     print('############# Epoch {}: Training Start   #############'.format(epoch))
    #     model.train()
    #     for step, data in enumerate(train_data_loader):
    #         title_ids = data['titile_ids'].to(args.device, dtype=torch.long)
    #         title_mask = data['title_mask'].to(args.device, dtype=torch.long)
    #         text_ids = data['text_ids'].to(args.device, dtype=torch.long)
    #         text_mask = data['text_mask'].to(args.device, dtype=torch.long)
    #         code_ids = data['code_ids'].to(args.device, dtype=torch.long)
    #         code_mask = data['code_mask'].to(args.device, dtype=torch.long)
    #         targets = data['labels'].to(args.device, dtype=torch.float)
    #         model.zero_grad()
    #         outputs = model(title_ids=title_ids,
    #                         title_attention_mask=title_mask,
    #                         text_ids=text_ids,
    #                         text_attention_mask=text_mask,
    #                         code_ids=code_ids,
    #                         code_attention_mask=code_mask)

    #         loss = loss_fn(outputs, targets)
    #         loss.backward()
    #         tr_loss += loss.item()

            
    #         torch.nn.utils.clip_grad_norm_(
    #                 model.parameters(), args.max_grad_norm)
    #         optimizer.step()
    #         scheduler.step()            
    #         if args.logging_steps > 0 and step % args.logging_steps == 0:
    #             tb_data = {
    #                 'lr': scheduler.get_last_lr()[0],
    #                 'loss': tr_loss / args.logging_steps
    #             }
    #             print(f'tb_data, {tb_data}')
    #             print(f'output size: {outputs.size()}')
    #             print(f'target: target size: {targets.size()}')
    #             print(
    #                 f'Epoch: {epoch}, Batch: {step}， Loss:  {tr_loss / args.logging_steps}')
    #             current_time = datetime.now().strftime("%H:%M:%S")
    #             print("Current Time =", current_time)
    #             tr_loss = 0.0
    #     # evaluation
    #     print('############# Epoch {}: Training End     #############'.format(epoch))
    #     print(
    #         '############# Epoch {}: Validation Start   #############'.format(epoch))
    #     model.eval()
    #     fin_targets = []
    #     fin_outputs = []
    #     with torch.no_grad():
    #         for batch_idx, data in enumerate(valid_data_loader, 0):
    #             title_ids = data['titile_ids'].to(
    #                 args.device, dtype=torch.long)
    #             title_mask = data['title_mask'].to(
    #                 args.device, dtype=torch.long)
    #             text_ids = data['text_ids'].to(
    #                 args.device, dtype=torch.long)
    #             text_mask = data['text_mask'].to(
    #                 args.device, dtype=torch.long)
    #             code_ids = data['code_ids'].to(
    #                 args.device, dtype=torch.long)
    #             code_mask = data['code_mask'].to(
    #                 args.device, dtype=torch.long)
    #             targets = data['labels'].to(
    #                 args.device, dtype=torch.float)

    #             outputs = model(title_ids=title_ids,
    #                             title_attention_mask=title_mask,
    #                             text_ids=text_ids,
    #                             text_attention_mask=text_mask,
    #                             code_ids=code_ids,
    #                             code_attention_mask=code_mask)
    #             # target = targets.cpu().detach().numpy().tolist()
    #             # output = torch.sigmoid(
    #             #     outputs).cpu().detach().numpy().tolist()
    #             print("output")
    #             print(outputs)
    #             print(outputs.size())
    #             print(f'target: target size: {targets.size()}')
    #             fin_targets.extend(targets.cpu().detach().numpy().tolist())
    #             fin_outputs.extend(torch.sigmoid(
    #                 outputs).cpu().detach().numpy().tolist())
    #             [pre, rc, f1, cnt] = evaluate_batch(
    #                 fin_outputs, fin_targets, [1, 2, 3, 4, 5])
    #         print(f"F1 Score = {pre}")
    #         print(f"Recall Score  = {rc}")
    #         print(f"Precision Score  = {f1}")
    #         print(f"Count  = {cnt}")
    #         print(
    #             '############# Epoch {}: Validation End     #############'.format(epoch))
        
    #     logger.info("Training finished")

In [2]:
import gc
import torch
gc.collect()
torch.cuda.empty_cache()


In [81]:
model.eval()
fin_targets = []
fin_outputs = []
with torch.no_grad():
      fin_targets = []
      fin_outputs = []
      for batch_idx, data in enumerate(valid_data_loader, 0):
            title_ids = data['titile_ids'].to(
                  args.device, dtype=torch.long)
            title_mask = data['title_mask'].to(
                  args.device, dtype=torch.long)
            text_ids = data['text_ids'].to(
                  args.device, dtype=torch.long)
            text_mask = data['text_mask'].to(
                  args.device, dtype=torch.long)
            code_ids = data['code_ids'].to(
                  args.device, dtype=torch.long)
            code_mask = data['code_mask'].to(
                  args.device, dtype=torch.long)
            targets = data['labels'].to(
                  args.device, dtype=torch.float)

            outputs = model(title_ids=title_ids,
                              title_attention_mask=title_mask,
                              text_ids=text_ids,
                              text_attention_mask=text_mask,
                              code_ids=code_ids,
                              code_attention_mask=code_mask)
            target = targets.cpu().detach().numpy().tolist()
            output = torch.sigmoid(
                  outputs).cpu().detach().numpy().tolist()
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(
                  outputs).cpu().detach().numpy().tolist())
            print(len(fin_targets))
            print(len(fin_outputs))
            [pre, rc, f1, cnt] = evaluate_batch(
                  fin_outputs, fin_targets, [1, 2, 3, 4, 5])
            print(f"F1 Score = {pre}")
            print(f"Recall Score  = {rc}")
            print(f"Precision Score  = {f1}")
            print(f"Count  = {cnt}")


2
2
[0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0]
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
F1 Score = [0.0, 0.0, 0.0, 0.0, 0.0]
Recall Score  = [0.0, 0.0, 0.0, 0.0, 0.0]
Precision Score  = [0.0, 0.0, 0.0, 0.0, 0.0]
Count  = 2
4
4
[0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0]
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
F1 Score = [0.0, 0.0, 0.0, 0.0, 0.0]
Recall Score  = [0.0, 0.0, 0.0, 0.0, 0.0]
Precision Score  = [0.0, 0.0, 0.0, 0.0, 0.0]
Count  = 4
5
5
[0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0]
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.

In [84]:
for i in fin_targets:
    print(sum(i))

4.0
3.0
4.0
4.0
3.0


In [83]:
sum(fin_targets[0])

4.0

In [60]:
def evaluate_ori(pred, label, topk):
    """
    dimension of pred and label should be equal.
    :param pred: a list of prediction
    :param label: a list of true label
    :param topk:
    :return: a dictionary: {'precision': pre_k, 'recall': rec_k, 'f1': f1_k}
    """
    top_idx_list = sorted(range(len(pred)), key=lambda i: pred[i])[-topk:]
    num_of_true_in_topk = len([idx for idx in top_idx_list if label[idx] == 1])
    # precision@k = #true label in topk / k
    pre_k = num_of_true_in_topk / float(topk)
    # recall@k = #true label in topk / #true label
    num_of_true_in_all = sum(label)
    if num_of_true_in_all > topk:
        rec_k = num_of_true_in_topk / float(topk)
    else:
        rec_k = num_of_true_in_topk / float(num_of_true_in_all)
    # f1@k = 2 * precision@k * recall@k / (precision@k + recall@k)
    if pre_k == 0 and rec_k == 0:
        f1_k = 0.0
    else:
        f1_k = 2 * pre_k * rec_k / (pre_k + rec_k)
    # return {'precision': pre_k, 'recall': rec_k, 'f1': f1_k}
    print(pre_k, rec_k, f1_k)
    return pre_k, rec_k, f1_k

In [73]:
def evaluate_batch1(pred, label, topk_list=[1, 2, 3, 4, 5]):
    pre = [0.0] * len(topk_list)
    rc = [0.0] * len(topk_list)
    f1 = [0.0] * len(topk_list)
    cnt = 0
    print("pre")
    print(pre)
    for i in range(0, len(pred)):
        for idx, topk in enumerate(topk_list):
            pre_val, rc_val, f1_val = evaluate_ori(
                pred=pred[i], label=label[i], topk=topk)
            pre[idx] += pre_val
            rc[idx] += rc_val
            f1[idx] += f1_val
        cnt += 1
    print("result")
    print([pre, rc, f1, cnt])
    pre[:] = [x / cnt for x in pre]
    rc[:] = [x / cnt for x in rc]
    f1[:] = [x / cnt for x in f1]
    return [pre, rc, f1, cnt]

In [74]:
[pre, rc, f1, cnt] = evaluate_batch1([[0,0,0,0,0],[0,0,0,0,1]], [[1,0,0,0,1],[1,0,0,1,1]], [1, 2, 3, 4, 5])


pre
[0.0, 0.0, 0.0, 0.0, 0.0]
1.0 1.0 1.0
0.5 0.5 0.5
0.3333333333333333 0.5 0.4
0.25 0.5 0.3333333333333333
0.4 1.0 0.5714285714285715
1.0 1.0 1.0
1.0 1.0 1.0
0.6666666666666666 0.6666666666666666 0.6666666666666666
0.5 0.6666666666666666 0.5714285714285715
0.6 1.0 0.7499999999999999
result
[[2.0, 1.5, 1.0, 0.75, 1.0], [2.0, 1.5, 1.1666666666666665, 1.1666666666666665, 2.0], [2.0, 1.5, 1.0666666666666667, 0.9047619047619049, 1.3214285714285714], 2]


In [75]:
[pre, rc, f1, cnt]

[[1.0, 0.75, 0.5, 0.375, 0.5],
 [1.0, 0.75, 0.5833333333333333, 0.5833333333333333, 1.0],
 [1.0, 0.75, 0.5333333333333333, 0.45238095238095244, 0.6607142857142857],
 2]