In [None]:
import pandas as pd
import torch
from datasets import Dataset
import datasets
import os
import random
import numpy as np
import re
from copy import deepcopy
import json
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint
from nlpcda import Similarword


def seedeverything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and every CUDA
    device), exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic
    kernel selection.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all visible GPUs, not only the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode autotunes kernels per input shape and may pick different
    # algorithms between runs, which would undo the deterministic flag above.
    torch.backends.cudnn.benchmark = False

# Fix all random state before anything else runs so the notebook is repeatable.
seedeverything(seed=233)

# Read the three CHIP-CTC splits; the order of the paths decides which frame
# receives which file.
traindata, valdata, testdata = (
    pd.read_json(split_path)
    for split_path in (
        './CHIP-CTC/CHIP-CTC_train.json',
        './CHIP-CTC/CHIP-CTC_dev.json',
        './CHIP-CTC/CHIP-CTC_test.json',
    )
)
# Independent copy of the test frame, taken before its text column is cleaned.
testdata_temp = deepcopy(testdata)


# From the first Chinese character or ASCII letter through the end of the text.
# A raw string keeps ``\S``/``\s`` from being read as (invalid) string escapes.
_TEXT_START = re.compile(r'[\u4e00-\u9fa5A-Za-z][\S\s]*')


def textclean(x):
    """Strip everything that precedes the first Chinese character or letter.

    Args:
        x: Input text.

    Returns:
        The suffix of ``x`` starting at its first character in the range
        ``\\u4e00-\\u9fa5`` or ``A-Za-z``; an empty string when ``x`` contains
        no such character. A text whose only such character is its last one
        is kept (it is no longer reduced to an empty string).
    """
    match = _TEXT_START.search(x)
    return match.group(0) if match else ""

# Run the cleaner over the text column of every split (the function is passed
# directly; no wrapping lambda is needed).
traindata['text'] = traindata['text'].apply(textclean)
valdata['text'] = valdata['text'].apply(textclean)
testdata['text'] = testdata['text'].apply(textclean)


# Category table; its row order defines the integer id of every label.
examplepreddata = pd.read_excel('./CHIP-CTC/category.xlsx')
examplepreddata['label2idx'] = range(len(examplepreddata))

# Forward (name -> id) and reverse (id -> name) lookups built from row position.
label2idx = {
    name: idx for idx, name in enumerate(examplepreddata['Label Name'])
}
idx2label = dict(enumerate(examplepreddata['Label Name']))

# Persist the reverse mapping next to the notebook.
with open("idx2label.json", "w", encoding="utf-8") as fp:
    json.dump(idx2label, fp, ensure_ascii=False, indent=4)

print(idx2label)

# Encode the string labels of the labelled splits as integer class ids;
# an unknown label name raises KeyError.
traindata['labels'] = list(map(label2idx.__getitem__, traindata['label']))
valdata['labels'] = list(map(label2idx.__getitem__, valdata['label']))

# Row counts of train / validation / test, one per line.
print(len(traindata), len(valdata), len(testdata), sep='\n')

# Reach the Hugging Face class through its module: a later cell imports
# torch.utils.data.Dataset under the same bare name, so re-running this cell
# in the same kernel with the unqualified ``Dataset`` would fail.
traindataset = datasets.Dataset.from_pandas(traindata)
valdataset = datasets.Dataset.from_pandas(valdata)
testdataset = datasets.Dataset.from_pandas(testdata)

# Bundle the three splits under the keys the rest of the notebook indexes by.
dataset = datasets.DatasetDict({
    'train': traindataset,
    'validation': valdataset,
    'test': testdataset
})

print(dataset)

# Quick look at the schema and at the first training example.
train_dataset = dataset['train']
print(train_dataset.features)

print(train_dataset[0])

In [None]:
from transformers import BertTokenizer, BertModel, AutoModelForSequenceClassification

# Identifier handed to ``from_pretrained``; the same value is reused later in
# the notebook to load the classifier and to name the TensorBoard log directory.
checkpoint = "hflchinese-bert-wwm-withpretrain-ext"
# Tokenizer matching the checkpoint above.
tokenizer = BertTokenizer.from_pretrained(checkpoint)

def tokenize_function(sample):
    """Tokenise the ``'text'`` entry of ``sample`` with the notebook tokenizer.

    Args:
        sample: Mapping with a ``'text'`` entry (a batch of rows when used
            through ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer returns for the text, with truncation enabled.
    """
    texts = sample['text']
    return tokenizer(texts, truncation=True)


# Tokenise every split in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Remove the raw columns from each split. Train and validation also lose the
# string 'label' column; only 'id' and 'text' are removed from the test split.
for split_name, raw_columns in (
    ('train', ['id', 'text', 'label']),
    ('validation', ['id', 'text', 'label']),
    ('test', ['id', 'text']),
):
    tokenized_datasets[split_name] = tokenized_datasets[
        split_name].remove_columns(raw_columns)

from transformers import DataCollatorWithPadding  # pads automatically, batch by batch

# Collate function handed to every DataLoader below; built around the same
# tokenizer that produced the encoded columns.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Show the splits and the columns that remain after tokenisation.
print(tokenized_datasets)

In [None]:
from torch.utils.data import DataLoader, Dataset
# One loader per split, all with batches of 8 and the padding collator.
# Only the training loader shuffles.
train_dataloader = DataLoader(
    tokenized_datasets['train'],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
val_dataloader = DataLoader(
    tokenized_datasets['validation'],
    batch_size=8,
    collate_fn=data_collator,
)
test_dataloader = DataLoader(
    tokenized_datasets['test'],
    batch_size=8,
    collate_fn=data_collator,
)

# Print the tensor shapes of the first test batch, then stop.
for batch in test_dataloader:
    print({name: tensor.shape for name, tensor in batch.items()})
    break

In [None]:
from transformers import BertTokenizer, BertModel, AutoModelForSequenceClassification,AutoModel
from transformers import BertForSequenceClassification,BertForMaskedLM

# Classifier with one output per row of the category table.
net = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(examplepreddata),
)

# Smoke test: push the first training batch through the model and show the
# raw output, then stop.
for batch in train_dataloader:
    outputs = net(**batch)
    print(outputs)
    break

In [None]:
'''
3.8546

-1.4115e-01

4.1694
'''
from transformers import AdamW, get_scheduler
from datasets import load_metric
from statistics import mean
from sklearn import metrics
from torch import nn
import json
import warnings
from tensorboardX import SummaryWriter

# Silences every warning for the rest of the session.
warnings.filterwarnings("ignore")


# Training configuration.
num_epochs = 5
lr = 2.5e-5
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# One class per row of the category table.
num_labels = len(examplepreddata)
# Total optimisation steps: batches per epoch times number of epochs.
num_training_steps = len(train_dataloader) * num_epochs
print(num_training_steps)

class Mlp(nn.Module):
    """Two-layer perceptron that returns softmax probabilities.

    Args:
        in_features: Size of the last dimension of the input.
        hidden_features: Width of the hidden layer.
        out_features: Number of outputs. ``None`` (the default) falls back to
            ``in_features``; previously the default crashed while building
            the second linear layer.
        act_layer: Activation class, instantiated without arguments.
        drop: Dropout probability, applied after the activation and again
            after the second linear layer.
    """

    def __init__(self,
                 in_features,
                 hidden_features=1000,
                 out_features=None,
                 act_layer=nn.GELU,
                 drop=0.):
        super().__init__()
        # nn.Linear cannot be built with ``None`` as a size.
        if out_features is None:
            out_features = in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.softmax = nn.Softmax(dim=-1)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        """Return class probabilities (softmax over the last dimension).

        The result is already normalised, so it must not be fed to a loss
        that expects raw logits.
        """
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return self.softmax(x)

class Bert4textclassification_lightningsystem(pl.LightningModule):
    """LightningModule that fine-tunes and evaluates a sequence classifier.

    Relies on module-level names defined elsewhere in the notebook:
    ``device``, ``checkpoint``, ``idx2label`` and ``testdata_temp``.

    Args:
        net: Model called as ``net(**batch)``; its output must expose
            ``.loss`` and ``.logits``.
        lr: Initial learning rate.
        epoch: Number of epochs (stored only; not read by this class).
        len: Total number of optimisation steps, used by both learning-rate
            schedules. The name shadows the builtin inside ``__init__`` only
            and is kept so existing callers keep working.
    """

    def __init__(self,net,lr,epoch,len):
        super(Bert4textclassification_lightningsystem, self).__init__()
        self.net = net.to(device)
        self.lr = lr
        self.epoch = epoch
        self.num_training_steps = len
        # TensorBoard event files go to a directory named after the checkpoint.
        self.writer = SummaryWriter('./log-'+checkpoint)
        # Global step counter; drives the manual decay and the scalar logging.
        self.iteration = 0

    def configure_optimizers(self):
        """Create AdamW plus a linear schedule without warm-up.

        NOTE(review): ``training_step`` also overwrites the learning rate of
        every parameter group on each step, so two schedules act on the same
        optimizer -- confirm which one is intended.
        """
        self.optimizer = AdamW(self.net.parameters(), lr=self.lr)
        lr_scheduler = get_scheduler(
                'linear',
                optimizer=self.optimizer,
                num_warmup_steps=0,
                num_training_steps=self.num_training_steps)
        optim_dict = {'optimizer': self.optimizer, 'lr_scheduler': lr_scheduler}
        return optim_dict

    def metrics_compute(self,mode,outputs):
        """Gather per-step results of one epoch.

        Args:
            mode: Prefix of the loss key, e.g. ``'val'`` for ``'val_loss'``.
            outputs: List of step dictionaries holding ``mode + '_loss'``,
                ``'predictions'`` and ``'labels'``.

        Returns:
            Tuple ``(loss, predictions, labels)``: a tensor with one loss per
            step and two NumPy arrays concatenated over all steps.
        """
        loss = torch.tensor([step[mode+'_loss'] for step in outputs])
        # One concatenation per field instead of re-concatenating every step.
        predictions = torch.cat([step['predictions'] for step in outputs], dim=0)
        labels = torch.cat([step['labels'] for step in outputs], dim=0)
        predictions = predictions.cpu().detach().numpy()
        labels = labels.cpu().detach().numpy()
        return loss,predictions,labels

    def training_step(self, batch, batch_idx):
        """Run one training batch, apply the manual decay and log scalars."""
        batch = {k:v.to(device) for k,v in batch.items()}
        outputs = self.net(**batch)
        loss = outputs.loss
        # Polynomial decay. Progress is capped at 1.0 because a negative base
        # raised to 0.9 would produce a complex number once training runs past
        # ``num_training_steps``.
        progress = min(self.iteration / self.num_training_steps, 1.0)
        lr_ = self.lr * (1.0 - progress)**0.9
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr_
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        y_true = batch['labels'].cpu().detach().numpy()
        y_pred = predictions.cpu().detach().numpy()
        # scikit-learn expects (y_true, y_pred); the support used for the
        # weighted average is taken from the first argument.
        metrics_dict = metrics.classification_report(y_true,y_pred,digits = 4,output_dict=True)
        self.writer.add_scalar('info/loss',loss,self.iteration)
        self.writer.add_scalar('info/weighted_avg',metrics_dict['weighted avg']['f1-score'],self.iteration)
        self.iteration += 1
        return loss

    def validation_step(self, batch, batch_idx):
        """Evaluate one validation batch and log its macro-averaged F1."""
        batch = {k:v.to(device) for k,v in batch.items()}
        outputs = self.net(**batch)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        y_true = batch['labels'].cpu().detach().numpy()
        y_pred = predictions.cpu().detach().numpy()
        metrics_dict = metrics.classification_report(y_true,y_pred,digits = 4,output_dict=True)
        # Logged under 'macro_avg' for each batch.
        self.log('macro_avg',metrics_dict['macro avg']['f1-score'])
        return {'val_loss':outputs.loss,'predictions':predictions,'labels':batch['labels']}

    def test_step(self, batch, batch_idx):
        """Predict class ids for one test batch.

        ``test_loss`` is whatever the model reports as ``.loss`` for the batch.
        """
        batch = {k:v.to(device) for k,v in batch.items()}
        outputs = self.net(**batch)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        return {'test_loss':outputs.loss,'predictions':predictions}

    def training_epoch_end(self,outputs):
        """Nothing to do at the end of a training epoch."""
        pass

    def validation_epoch_end(self, outputs):
        """Print the mean validation loss and a report over the whole epoch."""
        print(outputs[0]['predictions'].shape)
        print(len(outputs))
        val_loss ,predictions,labels= self.metrics_compute('val',outputs)
        print(predictions.shape)
        print('\n',"val_loss: ",val_loss.mean())
        # (y_true, y_pred) order so precision and recall are not swapped.
        print(metrics.classification_report(labels, predictions,digits = 4))

    def test_epoch_end(self, outputs):
        """Attach predicted labels to the test frame and write ``result.json``.

        Stores the predicted label names in the module-level ``testdata_temp``
        frame as column ``'label'`` and dumps one ``{id, label, text}`` record
        per row.
        """
        predictions = torch.cat([step['predictions'] for step in outputs], dim=0)
        predictions = predictions.cpu().detach().numpy().tolist()
        test_labels =[ idx2label[idx] for idx in predictions]
        testdata_temp['label'] = test_labels
        # Columns are read by name, so the result does not depend on column
        # order; iterating the Series yields plain Python scalars for json.
        test_pred_list = [
            {'id': row_id, 'label': label, 'text': text}
            for row_id, label, text in zip(testdata_temp['id'],
                                           testdata_temp['label'],
                                           testdata_temp['text'])
        ]
        print('\n',testdata_temp.head())
        with open("result.json", "w", encoding="utf-8") as fp:
            json.dump(test_pred_list, fp, ensure_ascii=False, indent=4)
        

In [6]:
'''
output_baseline: 0.7829 0.8543 0.8060 3 
output_baseline + process: 0.7718 0.8712 0.8092 3
output_baseline + process + aug: 0.7892 0.8523 0.8136 3
output_baseline + process + TAPT: 0.8135 0.8525 0.8274 5
output_baseline + process + aug + TAPT: 0.7866 0.8773 0.8230
output_baseline + process + aug + TAPT + 10layer: 0.7918 0.8714 0.8254
'''
# Both frames should have the same number of rows: predictions are written
# back into ``testdata_temp`` row by row.
print(testdata.shape)
print(testdata_temp.shape)

# Restore the fine-tuned weights; the constructor arguments are passed again
# explicitly alongside the checkpoint path.
model = Bert4textclassification_lightningsystem.load_from_checkpoint(
    checkpoint_path='./output_baseline/baseline+process+aug+TAPT/hflchinese-bert-wwm-ext-CHIP-CTC-epoch=02-macro_avg=0.7814.ckpt',
    net=net,
    lr=lr,
    epoch=num_epochs,
    len=num_training_steps,
)

trainer = Trainer(
    logger=False,
    # Follow the same fallback as ``device``: a hard-coded gpus=1 makes the
    # Trainer fail on a machine without CUDA.
    gpus=1 if torch.cuda.is_available() else 0,
)
trainer.test(model=model, dataloaders=test_dataloader)