In [1]:
import os
import sys
sys.path.append('.')
import argparse
import torch
import numpy as np
from transformers import BertTokenizer, BertModel, AlbertModel, BertForSequenceClassification, \
    AlbertForSequenceClassification


In [2]:

from cblue.models import CDNForCLSModel
from cblue.trainer import CDNForCLSTrainer, CDNForNUMTrainer
from cblue.utils import init_logger, seed_everything
from cblue.data import CDNDataset, CDNDataProcessor
from cblue.models import save_zen_model, ZenModel, ZenForSequenceClassification, ZenNgramDict


In [3]:
# --- Dataset and task ---
DATA_DIR = "CBLUEDatasets"
TASK_NAME = "cdn"

# --- Pretrained encoder ---
MODEL_TYPE = "bert"
MODEL_DIR = "data/model_data"
MODEL_NAME = "chinese-macbert-large"

# --- Output locations ---
OUTPUT_DIR = "data/output"
RESULT_OUTPUT_DIR = "data/result_output"

# --- Sequence length and sampling settings ---
MAX_LENGTH = 64
RECALL_K = 200
NUM_NEGATIVE_SAMPLES = 5
DO_AUGMENT = 6

In [4]:

# Lower-case working copies of the configuration constants; the cells below
# read these names rather than the UPPER_CASE constants.
output_dir = OUTPUT_DIR
task_name = TASK_NAME
model_name = MODEL_NAME
model_type = MODEL_TYPE
model_dir = MODEL_DIR
data_dir = DATA_DIR
recall_k = RECALL_K
num_neg = NUM_NEGATIVE_SAMPLES
# Read the augmentation setting from the config constant instead of repeating
# the literal 6, so editing DO_AUGMENT actually takes effect.
do_aug = DO_AUGMENT

# Random seed passed to seed_everything().
seed = 1

In [5]:
# Run directory: <OUTPUT_DIR>/<task_name>/<model_name>.
# It is derived from the OUTPUT_DIR constant instead of being re-joined onto
# `output_dir` itself, so re-running this cell no longer nests the path one
# level deeper each time. makedirs(exist_ok=True) creates all missing levels
# at once and tolerates directories that already exist.
output_dir = os.path.join(OUTPUT_DIR, task_name, model_name)
os.makedirs(output_dir, exist_ok=True)

# Log file lives inside the run directory.
logger = init_logger(os.path.join(output_dir, f'{task_name}_{model_name}.log'))

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
seed_everything(seed)


# model_type -> (tokenizer class, encoder class).
MODEL_CLASS = {
    'bert': (BertTokenizer, BertModel),
    'roberta': (BertTokenizer, BertModel),
    'albert': (BertTokenizer, AlbertModel),
    'zen': (BertTokenizer, ZenModel)
}

# model_type -> sequence-classification model class.
CLS_MODEL_CLASS = {
    'bert': BertForSequenceClassification,
    'roberta': BertForSequenceClassification,
    'albert': AlbertForSequenceClassification,
    'zen': ZenForSequenceClassification
}
tokenizer_class, model_class = MODEL_CLASS[model_type]


In [6]:
logger.info('Training CLS model...')
# Resolve the tokenizer directory from the configured model_dir / model_name
# (the same location the training cell uses for the encoder) instead of a
# hard-coded path, so changing the model in the config cell is sufficient.
# local_files_only=True keeps the lookup on disk (no download attempt).
tokenizer = tokenizer_class.from_pretrained(
    os.path.join(model_dir, model_name),
    local_files_only=True,
)


09/14/2021 15:44:55 - INFO - root -   Training CLS model...


In [7]:
# No n-gram dictionary is used in this run; the name is still handed to the
# trainers and to save_zen_model() further down.
ngram_dict = None

# Processor over the dataset root, configured with the recall size and the
# negative-sample count from the config cells.
data_processor = CDNDataProcessor(
    root=data_dir,
    recall_k=recall_k,
    negative_sample=num_neg,
)


Building prefix dict from the default dictionary ...
09/14/2021 15:45:02 - DEBUG - jieba -   Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\DD045C~1.DES\AppData\Local\Temp\jieba.cache
09/14/2021 15:45:04 - DEBUG - jieba -   Dumping model to file cache C:\Users\DD045C~1.DES\AppData\Local\Temp\jieba.cache
Loading model cost 1.797 seconds.
09/14/2021 15:45:04 - DEBUG - jieba -   Loading model cost 1.797 seconds.
Prefix dict has been built successfully.
09/14/2021 15:45:04 - DEBUG - jieba -   Prefix dict has been built successfully.
09/14/2021 15:45:07 - INFO - gensim.corpora.dictionary -   adding document #0 to Dictionary(0 unique tokens: [])
09/14/2021 15:45:07 - INFO - gensim.corpora.dictionary -   adding document #10000 to Dictionary(4399 unique tokens: ['霍乱', ',', '01', '型', '所致']...)
09/14/2021 15:45:07 - INFO - gensim.corpora.dictionary -   adding document #20000 to Dictionary(6737 unique tokens: ['霍乱', ',', '01', '型', '所致']...)
09/14/2021 

In [8]:
# Unpack the three values returned by get_train_sample() for the 'cls' stage.
# The recorded output for this cell shows a progress bar of roughly 16 minutes.
(
    train_samples,
    recall_orig_train_samples,
    recall_orig_train_samples_scores,
) = data_processor.get_train_sample(dtype='cls', do_augment=do_aug)


100%|██████████| 6000/6000 [15:57<00:00,  6.27it/s]


In [None]:
# Scratch cell: a stray bare `a` expression was removed here. `a` is never
# defined in this notebook, so the cell raised NameError on Restart & Run All.

In [52]:
# Unpack the three values returned by get_dev_sample() for the 'cls' stage.
# NOTE(review): the third value is stored under
# `recall_orig_train_samples_scores`, replacing what the train-sample cell put
# there. The training cell passes this name as the *eval* scores, so the name
# is kept as-is here; rename it in both cells together if it is cleaned up.
(
    eval_samples,
    recall_orig_eval_samples,
    recall_orig_train_samples_scores,
) = data_processor.get_dev_sample(dtype='cls', do_augment=do_aug)

# Log the processor's recall value only when it is truthy.
if data_processor.recall:
    logger.info('first recall score: %s', data_processor.recall)


NameError: name 'data_processor' is not defined

In [None]:

# --- Trainer configuration --------------------------------------------------
# Both trainers and save_zen_model() receive an `args` object, but no earlier
# cell defines one (argparse is imported and never used), so this cell used to
# stop with a NameError. Build it here from the notebook settings.
# NOTE(review): the attribute names and the optimisation hyperparameters below
# are assumptions about what the cblue trainers read -- confirm them against
# cblue.trainer and tune the values before a real run.
args = argparse.Namespace(
    data_dir=data_dir,
    task_name=task_name,
    model_type=model_type,
    model_dir=model_dir,
    model_name=model_name,
    output_dir=output_dir,  # checkpoints are read back from this directory below
    result_output_dir=RESULT_OUTPUT_DIR,
    device=device,
    seed=seed,
    max_length=MAX_LENGTH,
    recall_k=recall_k,
    num_neg=num_neg,
    do_aug=do_aug,
    epochs=3,
    train_batch_size=8,
    eval_batch_size=32,
    learning_rate=3e-5,
    weight_decay=0.01,
    adam_epsilon=1e-8,
    max_grad_norm=0.0,
    warmup_proportion=0.1,
    earlystop_patience=100,
    logging_steps=250,
    save_steps=250,
)

# Directory of the pretrained encoder used to initialise both stages.
encoder_path = os.path.join(model_dir, model_name)

# --- Stage 1: CLS model -----------------------------------------------------
train_dataset = CDNDataset(train_samples, data_processor, dtype='cls', mode='train')
eval_dataset = CDNDataset(eval_samples, data_processor, dtype='cls', mode='eval')

model = CDNForCLSModel(model_class, encoder_path=encoder_path,
                       num_labels=data_processor.num_labels_cls)
cls_model_class = CLS_MODEL_CLASS[model_type]
trainer = CDNForCLSTrainer(
    args=args, model=model, data_processor=data_processor,
    tokenizer=tokenizer, train_dataset=train_dataset, eval_dataset=eval_dataset,
    logger=logger, recall_orig_eval_samples=recall_orig_eval_samples,
    model_class=cls_model_class,
    # Despite its name, this variable holds the scores returned by
    # get_dev_sample(): the dev-sample cell assigns them to this name.
    recall_orig_eval_samples_scores=recall_orig_train_samples_scores,
    ngram_dict=ngram_dict,
)

global_step, best_step = trainer.train()

# Reload the best checkpoint reported by the trainer and export it.
best_checkpoint_dir = os.path.join(output_dir, f'checkpoint-{best_step}')
model = CDNForCLSModel(model_class, encoder_path=best_checkpoint_dir,
                       num_labels=data_processor.num_labels_cls)
# map_location='cpu' lets a checkpoint written on a GPU machine be loaded on a
# CPU-only one; load_state_dict() then copies the values into the model.
state_dict = torch.load(os.path.join(best_checkpoint_dir, 'pytorch_model.pt'),
                        map_location='cpu')
model.load_state_dict(state_dict)
tokenizer = tokenizer_class.from_pretrained(best_checkpoint_dir)
torch.save(model.state_dict(), os.path.join(output_dir, 'pytorch_model_cls.pt'))

cls_export_dir = os.path.join(output_dir, 'cls')
os.makedirs(cls_export_dir, exist_ok=True)

if model_type == 'zen':
    save_zen_model(cls_export_dir, model.encoder, tokenizer, ngram_dict, args)
else:
    model.encoder.save_pretrained(cls_export_dir)

tokenizer.save_vocabulary(save_directory=cls_export_dir)
logger.info('Saving models checkpoint to %s', cls_export_dir)

# --- Stage 2: NUM model -----------------------------------------------------
logger.info('Training NUM model...')
# These two values were previously assigned to plain variables that nothing
# read. NOTE(review): presumably they were meant to override the trainer's
# logging/checkpoint interval for this stage -- confirm.
logging_steps = 30
save_steps = 30
args.logging_steps = logging_steps
args.save_steps = save_steps

# This stage reuses the names train_samples / eval_samples / train_dataset /
# eval_dataset / model / trainer, so the sample-building cells must be re-run
# before this cell is executed a second time.
train_samples = data_processor.get_train_sample(dtype='num', do_augment=1)
eval_samples = data_processor.get_dev_sample(dtype='num')
train_dataset = CDNDataset(train_samples, data_processor, dtype='num', mode='train')
eval_dataset = CDNDataset(eval_samples, data_processor, dtype='num', mode='eval')

model = cls_model_class.from_pretrained(encoder_path,
                                        num_labels=data_processor.num_labels_num)
trainer = CDNForNUMTrainer(
    args=args, model=model, data_processor=data_processor,
    tokenizer=tokenizer, train_dataset=train_dataset, eval_dataset=eval_dataset,
    logger=logger, model_class=cls_model_class, ngram_dict=ngram_dict,
)

global_step, best_step = trainer.train()