In [7]:
import pandas as pd
import torch
from datasets import Dataset
import datasets
import os
import random
import numpy as np

def seedeverything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True  ##
    torch.backends.cudnn.benchmark = True

seedeverything(seed=233)

traindata = pd.read_json('./CHIP-CTC/CHIP-CTC_train.json')
valdata = pd.read_json('./CHIP-CTC/CHIP-CTC_dev.json')
testdata = pd.read_json('./CHIP-CTC/CHIP-CTC_test.json')

examplepreddata = pd.read_excel('./CHIP-CTC/category.xlsx')

examplepreddata['label2idx'] = range(examplepreddata.shape[0])

label2idx = dict(
    zip(examplepreddata['Label Name'], examplepreddata['label2idx']))
idx2label = dict(
    zip(examplepreddata['label2idx'], examplepreddata['Label Name']))

print(idx2label)

traindata['labels'] = [label2idx[item] for item in traindata['label']]
valdata['labels'] = [label2idx[item] for item in valdata['label']]

print(len(traindata))
print(len(valdata))
print(len(testdata))


traindataset = Dataset.from_pandas(traindata)
valdataset = Dataset.from_pandas(valdata)
testdataset = Dataset.from_pandas(testdata)

dataset = datasets.DatasetDict({
    'train': traindataset,
    'validation': valdataset,
    'test': testdataset
})

print(dataset)

train_dataset = dataset['train']
val_dataset = dataset['validation']
test_dataset = dataset['test']

print(train_dataset.features)

print(train_dataset[0])

{0: 'Disease', 1: 'Symptom', 2: 'Sign', 3: 'Pregnancy-related Activity', 4: 'Neoplasm Status', 5: 'Non-Neoplasm Disease Stage', 6: 'Allergy Intolerance', 7: 'Organ or Tissue Status', 8: 'Life Expectancy', 9: 'Oral related', 10: 'Pharmaceutical Substance or Drug', 11: 'Therapy or Surgery', 12: 'Device', 13: 'Nursing', 14: 'Diagnostic', 15: 'Laboratory Examinations', 16: 'Risk Assessment', 17: 'Receptor Status', 18: 'Age', 19: 'Special Patient Characteristic', 20: 'Literacy', 21: 'Gender', 22: 'Education', 23: 'Address', 24: 'Ethnicity', 25: 'Consent', 26: 'Enrollment in other studies', 27: 'Researcher Decision', 28: 'Capacity', 29: 'Ethical Audit', 30: 'Compliance with Protocol', 31: 'Addictive Behavior', 32: 'Bedtime', 33: 'Exercise', 34: 'Diet', 35: 'Alcohol Consumer', 36: 'Sexual related', 37: 'Smoking Status', 38: 'Blood Donation', 39: 'Encounter', 40: 'Disabilities', 41: 'Healthy', 42: 'Data Accessible', 43: 'Multiple'}
22962
7682
10192
DatasetDict({
    train: Dataset({
        fe

In [8]:
import argparse
import json
from typing import List

from ltp import LTP
from transformers.models.bert.tokenization_bert import BertTokenizer


def _is_chinese_char(cp):
    """Checks whether CP is the codepoint of a CJK character."""
    # This defines a "chinese character" as anything in the CJK Unicode block:
    #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
    #
    # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
    # despite its name. The modern Korean Hangul alphabet is a different block,
    # as is Japanese Hiragana and Katakana. Those alphabets are used to write
    # space-separated words, so they are not treated specially and handled
    # like the all of the other languages.
    if (
        (cp >= 0x4E00 and cp <= 0x9FFF)
        or (cp >= 0x3400 and cp <= 0x4DBF)  #
        or (cp >= 0x20000 and cp <= 0x2A6DF)  #
        or (cp >= 0x2A700 and cp <= 0x2B73F)  #
        or (cp >= 0x2B740 and cp <= 0x2B81F)  #
        or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
        or (cp >= 0xF900 and cp <= 0xFAFF)
        or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
    ):  #
        return True

    return False


def is_chinese(word: str):
    # word like '180' or '身高' or '神'
    for char in word:
        char = ord(char)
        if not _is_chinese_char(char):
            return 0
    return 1


def get_chinese_word(tokens: List[str]):
    word_set = set()

    for token in tokens:
        chinese_word = len(token) > 1 and is_chinese(token)
        if chinese_word:
            word_set.add(token)
    word_list = list(word_set)
    return word_list


def add_sub_symbol(bert_tokens: List[str], chinese_word_set: set()):
    if not chinese_word_set:
        return bert_tokens
    max_word_len = max([len(w) for w in chinese_word_set])

    bert_word = bert_tokens
    start, end = 0, len(bert_word)
    while start < end:
        single_word = True
        if is_chinese(bert_word[start]):
            l = min(end - start, max_word_len)
            for i in range(l, 1, -1):
                whole_word = "".join(bert_word[start : start + i])
                if whole_word in chinese_word_set:
                    for j in range(start + 1, start + i):
                        bert_word[j] = "##" + bert_word[j]
                    start = start + i
                    single_word = False
                    break
        if single_word:
            start += 1
    return bert_word


def prepare_ref(lines: List[str], ltp_tokenizer: LTP, bert_tokenizer: BertTokenizer):
    ltp_res = []

    for i in range(0, len(lines), 100):
        res = ltp_tokenizer.seg(lines[i : i + 100])[0]
        res = [get_chinese_word(r) for r in res]
        ltp_res.extend(res)
    assert len(ltp_res) == len(lines)

    bert_res = []
    for i in range(0, len(lines), 100):
        res = bert_tokenizer(lines[i : i + 100], add_special_tokens=True, truncation=True, max_length=512)
        bert_res.extend(res["input_ids"])
    assert len(bert_res) == len(lines)

    ref_ids = []
    for input_ids, chinese_word in zip(bert_res, ltp_res):

        input_tokens = []
        for id in input_ids:
            token = bert_tokenizer._convert_id_to_token(id)
            input_tokens.append(token)
        input_tokens = add_sub_symbol(input_tokens, chinese_word)
        ref_id = []
        # We only save pos of chinese subwords start with ##, which mean is part of a whole word.
        for i, token in enumerate(input_tokens):
            if token[:2] == "##":
                clean_token = token[2:]
                # save chinese tokens' pos
                if len(clean_token) == 1 and _is_chinese_char(ord(clean_token)):
                    ref_id.append(i)
        ref_ids.append(ref_id)

    assert len(ref_ids) == len(bert_res)

    return ref_ids

In [9]:
from transformers.data.data_collator import DataCollatorForLanguageModeling, DataCollatorForWholeWordMask
from transformers import BertForMaskedLM

path = "hflchinese-bert-wwm-ext"
tokenizer = BertTokenizer.from_pretrained(path)
model = BertForMaskedLM.from_pretrained(path)

from ltp import LTP
ltp = LTP()

sent = [item['text'] for item in train_dataset] + [item['text'] for item in val_dataset] + [item['text'] for item in test_dataset]

ref = prepare_ref(sent, ltp, tokenizer)
print(len(ref))


Some weights of the model checkpoint at hflchinese-bert-wwm-ext were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


40836


In [10]:
from transformers import BertTokenizer, BertModel, AutoModelForSequenceClassification
from datasets import Dataset

tokenizer = BertTokenizer.from_pretrained("hflchinese-bert-wwm-ext")

def tokenize_function(sample):
    return tokenizer(sample['text'],truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

tokenized_datasets['train'] = tokenized_datasets['train'].remove_columns(
    ['id', 'text', 'label'])
tokenized_datasets['validation'] = tokenized_datasets[
    'validation'].remove_columns(['id', 'text', 'label'])
tokenized_datasets['test'] = tokenized_datasets['test'].remove_columns(
    ['id', 'text'])

  0%|          | 0/23 [00:00<?, ?ba/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 23/23 [00:03<00:00,  5.87ba/s]
100%|██████████| 8/8 [00:01<00:00,  5.86ba/s]
100%|██████████| 11/11 [00:01<00:00,  6.18ba/s]


In [11]:
encoder_dict = tokenized_datasets['train']['input_ids'] + tokenized_datasets['validation']['input_ids'] + tokenized_datasets['test']['input_ids']

# 加上子字信息，而且传入的是List，不是tensor。
train_mlm_dataset = [{'input_ids':encoder_dict[i],'chinese_ref':ref[i]} for i in range(len(ref))]

datacollecter = DataCollatorForWholeWordMask(tokenizer)

In [12]:


from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding  #实现按batch自动padding

train_mlm_dataloader = DataLoader(train_mlm_dataset, shuffle=True, batch_size=12, collate_fn=datacollecter)  
for batch in train_mlm_dataloader:
    print({k: v.shape for k, v in batch.items()})
    break

{'input_ids': torch.Size([12, 101]), 'labels': torch.Size([12, 101])}


In [13]:
for batch in train_mlm_dataloader:
    outputs = model(**batch)
    print(outputs)
    break

MaskedLMOutput(loss=tensor(4.5038, grad_fn=<NllLossBackward0>), logits=tensor([[[ -9.6789,  -8.2672,  -8.9909,  ...,  -9.1919, -10.4901,  -9.9213],
         [-13.6715, -11.8114, -11.4480,  ..., -11.4196, -14.3877, -11.0136],
         [-12.7726, -12.8846, -12.5715,  ..., -11.4791, -14.2202, -15.3056],
         ...,
         [-10.0236,  -6.6894,  -7.7634,  ..., -10.6219, -11.8329,  -6.0387],
         [ -9.9980,  -6.7233,  -7.7489,  ..., -10.6563, -11.5828,  -5.7179],
         [ -9.7295,  -6.7921,  -7.6953,  ..., -10.2023, -10.8552,  -5.5733]],

        [[ -9.5558,  -8.1402,  -9.4869,  ...,  -8.2798,  -9.7860,  -9.6576],
         [ -9.2906,  -8.3148,  -8.3864,  ...,  -7.7399,  -7.8785,  -7.2531],
         [-11.2227, -12.7251, -11.8909,  ...,  -6.8628,  -8.1228, -13.7081],
         ...,
         [ -6.8631,  -5.3354,  -7.1390,  ...,  -7.8046,  -6.5177,  -2.9470],
         [ -6.7111,  -5.3284,  -7.2786,  ...,  -7.6783,  -6.0707,  -2.3571],
         [ -6.7783,  -5.5183,  -7.3706,  ...,  -7.35

In [14]:
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint
from transformers import AdamW, get_scheduler
from datasets import load_metric
from statistics import mean
from sklearn import metrics
from torch import nn
import json
import warnings
warnings.filterwarnings("ignore")


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_epochs = 8
lr = 3e-5
num_labels = examplepreddata.shape[0] 
num_training_steps = num_epochs * len(train_mlm_dataloader)  # num of batches * num of epochs
print(num_training_steps)

class Bert4wwmtask_lightningsystem(pl.LightningModule):

    def __init__(self,net,lr,epoch,len):
        super(Bert4wwmtask_lightningsystem, self).__init__()
        self.net = net.to(device)
        self.lr = lr
        self.epoch = epoch
        self.num_training_steps = len
        #self.metric = load_metric("glue", "mrpc",mirror="tuna")

    def configure_optimizers(self):

        self.optimizer = AdamW(self.net.parameters(), lr=self.lr)
        lr_scheduler = get_scheduler(
                'linear',
                optimizer=self.optimizer, 
                num_warmup_steps=0,
                num_training_steps=self.num_training_steps)
        optim_dict = {'optimizer': self.optimizer, 'lr_scheduler': lr_scheduler}
        return optim_dict
        
    def metrics_compute(self,mode,outputs):
        loss = []
        loss.append(outputs[0][mode+'_loss'])
        predictions = outputs[0]['predictions']
        labels = outputs[0]['labels']
        for i in range(1,len(outputs)):
            loss.append(outputs[i][mode+'_loss'])
            predictions = torch.concat([predictions,outputs[i]['predictions']],dim=0)
            labels = torch.concat([labels,outputs[i]['labels']],dim=0)
        loss = torch.tensor(loss)
        predictions = predictions.cpu().detach().numpy()
        labels = labels.cpu().detach().numpy()
        return loss,predictions,labels

    def training_step(self, batch, batch_idx):
        batch = {k:v.to(device) for k,v in batch.items()}
        loss = self.net(**batch).loss
        return loss

    def validation_step(self, batch, batch_idx):
        batch = {k:v.to(device) for k,v in batch.items()}
        outputs = self.net(**batch)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metrics_dict = metrics.classification_report(predictions.cpu().detach().numpy(),batch['labels'].cpu().detach().numpy(),digits = 4,output_dict=True)
        self.log('val_f1',(metrics_dict['macro avg']['f1-score']+metrics_dict['weighted avg']['f1-score']+metrics_dict['accuracy'])/3.0,on_epoch=True, prog_bar=True, logger=True)
        #self.metric.add_batch(predictions=predictions, references=batch["labels"])
        return {'val_loss':outputs.loss,'predictions':predictions,'labels':batch['labels']}
    
       
    def test_step(self, batch, batch_idx):
        batch = {k:v.to(device) for k,v in batch.items()}
        outputs = self.net(**batch)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        return {'test_loss':outputs.loss,'predictions':predictions}
            
    def training_epoch_end(self,outputs):
        pass

    def validation_epoch_end(self, outputs):
        print(outputs[0]['predictions'].shape)
        print(len(outputs))
        val_loss ,predictions,labels= self.metrics_compute('val',outputs)
        print(predictions.shape)
        print('\n',"val_loss: ",val_loss.mean())
        print(metrics.classification_report(predictions, labels,digits = 4))

NameError: name 'train_dataloader' is not defined