In [None]:
from google.colab import drive
import sys
import os

In [None]:
# Shell escape: run nvidia-smi to report the GPU(s) visible to this runtime.
!/opt/bin/nvidia-smi

In [None]:
# Install the `transformers` package; a later cell imports BertConfig and
# ElectraConfig from it.
# NOTE(review): the version is not pinned, so a fresh runtime may pull a
# different release than the one this notebook was developed against.
!pip install transformers

In [None]:
# Mount Google Drive into the Colab runtime.
# NOTE(review): the mount point is an empty string in this copy and must be
# filled in before the cell can run.
drive.mount('')
# Project directory to work from (blank here — presumably a folder on the
# mounted drive; TODO confirm and fill in).
path = ''
# Make the project directory the current working directory.
os.chdir(path)
# The listing is computed but not shown: only the last expression of a cell is
# displayed, and this is not the last one.
os.listdir(path)
# An empty sys.path entry means "the current working directory", so the
# project-local modules imported in the next cell can be resolved from `path`.
sys.path.append('')

In [None]:
import os
import torch
from transformers import BertConfig, ElectraConfig
from NEZHA.model_NEZHA import NEZHAConfig
from NEZHA.NEZHA_utils import torch_init_model

import random
# 参数解析器
import argparse
import logging
from tqdm import trange

import utils
from optimization import BertAdam
# from utils import FGM
from evaluate import evaluate
from dataloader import NERDataLoader
from model import BertForTokenClassification

from train import train,train_and_evaluate

from pathlib import Path
import copy
import random 
from preprocess import get_train_val,get_testset

from test import test
from postprocess import postprocess
from predict import predict

In [None]:
# Training configuration, expressed as command-line options. A notebook has no
# real argv, so the argument list is handed to parse_args() explicitly below.
parser = argparse.ArgumentParser()
parser.add_argument('--seed', type=int, default=2020, help="random seed for initialization")
parser.add_argument('--ex_index', type=int, default=1, help="实验名称索引")
parser.add_argument('--device_id', type=int, default=0, help="GPU index")
parser.add_argument('--restore_file', default=None,
                    help="Optional, name of the file containing weights to reload before training")
parser.add_argument('--epoch_num', required=True, type=int, help="number of epochs")
parser.add_argument('--multi_gpu', action='store_true', help="是否多GPU")
parser.add_argument('--pre_model_type', type=str, help="预训练模型类型")
parser.add_argument('--ds_encoder_type', type=str, help="下游编码器类型")
parser.add_argument('--rate', type=float, help="伪标签权重")

# Every element of the argv list must be a string. '--restore_file' is left out
# on purpose: omitting the flag yields its declared default (None), whereas
# passing a literal None inside the list only works by accident of argparse
# internals.
args = parser.parse_args(args=['--seed', '2020',
                               '--ex_index', '2',
                               '--device_id', '0',
                               '--epoch_num', '10',
                               '--pre_model_type', 'RoBERTa',
                               '--ds_encoder_type', 'LSTM',
                               '--rate', '0.8'])

# Echo the resolved configuration so the run is self-documenting.
print(args)

In [None]:
# Build the run configuration for the selected pre-trained model / experiment.
params = utils.Params(args.pre_model_type, args.ex_index)
# Propagate the CLI choices onto params, as the prediction cell of this
# notebook does; otherwise '--ds_encoder_type' is parsed but never reaches
# params (or the model constructed from it below).
if args.ds_encoder_type is not None:
    params.ds_encoder_type = args.ds_encoder_type

# Seed Python's and PyTorch's RNGs for reproducible runs, and record the seed
# on params (same as the prediction cell).
random.seed(args.seed)
torch.manual_seed(args.seed)
params.seed = args.seed

# Log to <params_path>/train.log.
utils.set_logger(save=True, log_path=os.path.join(params.params_path, 'train.log'))
logging.info(f"Model type: {params.pre_model_type}_{params.ds_encoder_type}_CRF")
logging.info("device: {}".format(params.device))

logging.info('Init pre-train model...')

if params.pre_model_type == 'NEZHA':
    # NEZHA: build the model from its config, then load the weights with the
    # project's torch_init_model helper.
    bert_config = NEZHAConfig.from_json_file(os.path.join(params.bert_model_dir, 'config.json'))
    model = BertForTokenClassification(config=bert_config, params=params)
    torch_init_model(model, os.path.join(params.bert_model_dir, 'pytorch_model.bin'))
elif params.pre_model_type in ('RoBERTa', 'BERT'):
    # RoBERTa and BERT are loaded identically: BertConfig + from_pretrained.
    bert_config = BertConfig.from_json_file(os.path.join(params.bert_model_dir, 'config.json'))
    model = BertForTokenClassification.from_pretrained(config=bert_config,
                                                       pretrained_model_name_or_path=params.bert_model_dir,
                                                       params=params)
else:
    raise ValueError('Pre-train Model type must be NEZHA or BERT or RoBERTa!')
logging.info('-done')

In [None]:
# Fine-tune the model and evaluate it, optionally resuming from
# args.restore_file.
start_message = "Starting training for {} epoch(s)".format(args.epoch_num)
logging.info(start_message)
train_and_evaluate(model, params, args, args.restore_file)

In [None]:
# ---- Argument parser for the prediction run ----
parser = argparse.ArgumentParser()
# Declare the supported options.
parser.add_argument('--seed', type=int, default=2020,
                    help="random seed for initialization")
parser.add_argument('--ex_index', type=int, default=1,
                    help="实验名称索引")
parser.add_argument('--device_id', type=int, default=0,
                    help="GPU index")
parser.add_argument('--restore_file', type=str, default='best', required=False,
                    help="Optional, name of the file containing weights to reload before training")
parser.add_argument('--mode', default='test',
                    help="'val', 'test' or 'pseudo'")
parser.add_argument('--multi_gpu', action='store_true',
                    help="是否多GPU")
parser.add_argument('--pre_model_type', type=str,
                    help="预训练模型类型")
parser.add_argument('--ds_encoder_type', type=str,
                    help="下游编码器类型")

# No real argv in a notebook: supply the argument list explicitly.
args = parser.parse_args([
    '--seed', '2020',
    '--ex_index', '2',
    '--device_id', '0',
    '--restore_file', 'best',
    '--mode', 'test',
    '--pre_model_type', 'RoBERTa',
    '--ds_encoder_type', 'LSTM',
])

# ---- Prediction setup ----
params = utils.Params(args.pre_model_type, args.ex_index)
# Apply the requested downstream encoder type.
params.ds_encoder_type = args.ds_encoder_type

# Which split to predict on (taken from --mode).
mode = args.mode

# Seed the RNGs so repeated runs are reproducible, and record the seed.
random.seed(args.seed)
torch.manual_seed(args.seed)
params.seed = args.seed

# Configure logging with the helper's defaults.
utils.set_logger()

# Data loader factory for this configuration.
dataloader = NERDataLoader(params)

# ---- Restore the trained model ----
logging.info('Loading the model...')
# Checkpoint file is <model_dir>/<restore_file>.pth.tar.
checkpoint_path = os.path.join(params.model_dir, args.restore_file + '.pth.tar')
model, optimizer = utils.load_checkpoint(checkpoint_path)
model.to(params.device)
logging.info('-done.')

# ---- Load the requested split ----
logging.info("Loading the dataset...")
loader = dataloader.get_dataloader(data_sign=mode)
logging.info('-done')

# ---- Run prediction ----
logging.info("Starting prediction...")
predict(model, loader, params, mode)
logging.info('-done')

In [None]:

# Post-process and score the predictions of experiment 2.
parser = argparse.ArgumentParser()
parser.add_argument('--ex_index', type=int, default=1,
                    help="实验名称索引")
parser.add_argument('--mode', type=str, default='test',
                    help="后处理结果类型")

# No real argv in a notebook: supply the argument list explicitly.
args = parser.parse_args([
    '--ex_index', '2',
    '--mode', 'test',
])

# Configuration for the chosen experiment index.
params = utils.Params(ex_index=args.ex_index)
# Post-process first, then configure logging and score the result.
postprocess(params, mode=args.mode)
utils.set_logger()
test_metrics = test(params, mode=args.mode)

In [None]:
# Run labelled "0" — presumably the pseudo-label weight (--rate) used for this
# experiment; TODO confirm.
# NOTE(review): this cell is identical to the preceding post-processing cell
# (both use ex_index 2).
parser = argparse.ArgumentParser()
parser.add_argument('--ex_index', type=int, default=1,
                    help="实验名称索引")
parser.add_argument('--mode', type=str, default='test',
                    help="后处理结果类型")

# No real argv in a notebook: supply the argument list explicitly.
args = parser.parse_args([
    '--ex_index', '2',
    '--mode', 'test',
])

# Configuration for the chosen experiment index.
params = utils.Params(ex_index=args.ex_index)
# Post-process first, then configure logging and score the result.
postprocess(params, mode=args.mode)
utils.set_logger()
test_metrics = test(params, mode=args.mode)

In [None]:
# Run labelled "0.8" — presumably the pseudo-label weight (--rate) used for
# this experiment; TODO confirm.
parser = argparse.ArgumentParser()
parser.add_argument('--ex_index', type=int, default=1,
                    help="实验名称索引")
parser.add_argument('--mode', type=str, default='test',
                    help="后处理结果类型")

# No real argv in a notebook: supply the argument list explicitly.
args = parser.parse_args([
    '--ex_index', '4',
    '--mode', 'test',
])

# Configuration for the chosen experiment index.
params = utils.Params(ex_index=args.ex_index)
# Post-process first, then configure logging and score the result.
postprocess(params, mode=args.mode)
utils.set_logger()
test_metrics = test(params, mode=args.mode)

In [None]:
# Run labelled "1" — presumably the pseudo-label weight (--rate) used for this
# experiment; TODO confirm.
# NOTE(review): this cell uses ex_index 4, the same index as the "0.8" cell —
# verify that a different experiment index was not intended.
parser = argparse.ArgumentParser()
parser.add_argument('--ex_index', type=int, default=1,
                    help="实验名称索引")
parser.add_argument('--mode', type=str, default='test',
                    help="后处理结果类型")

# No real argv in a notebook: supply the argument list explicitly.
args = parser.parse_args([
    '--ex_index', '4',
    '--mode', 'test',
])

# Configuration for the chosen experiment index.
params = utils.Params(ex_index=args.ex_index)
# Post-process first, then configure logging and score the result.
postprocess(params, mode=args.mode)
utils.set_logger()
test_metrics = test(params, mode=args.mode)