In [46]:
import pandas as pd
import torch
from datasets import Dataset
import datasets


# --- Load the three CHIP-CTC splits from JSON -------------------------------
traindata = pd.read_json('./CHIP-CTC/CHIP-CTC_train.json')
valdata = pd.read_json('./CHIP-CTC/CHIP-CTC_dev.json')
testdata = pd.read_json('./CHIP-CTC/CHIP-CTC_test.json')

# --- Build the label-name -> integer-id lookup ------------------------------
# The row order of the category sheet defines each label's integer id.
examplepreddata = pd.read_excel('./CHIP-CTC/category.xlsx')
examplepreddata['label2idx'] = range(examplepreddata.shape[0])
label2idx = {
    name: idx
    for name, idx in zip(examplepreddata['Label Name'],
                         examplepreddata['label2idx'])
}

# Only the train and dev frames get an integer 'labels' column here;
# the test frame is left as loaded. An unknown label name raises KeyError.
for labelled_frame in (traindata, valdata):
    labelled_frame['labels'] = [
        label2idx[name] for name in labelled_frame['label']
    ]

# Row counts: train, dev, test.
for split_frame in (traindata, valdata, testdata):
    print(len(split_frame))

# --- Wrap the frames as Hugging Face datasets -------------------------------
traindataset = Dataset.from_pandas(traindata)
valdataset = Dataset.from_pandas(valdata)
testdataset = Dataset.from_pandas(testdata)

dataset = datasets.DatasetDict({
    'train': traindataset,
    'validation': valdataset,
    'test': testdataset,
})
print(dataset)

# Inspect the schema and the first record of the training split.
train_dataset = dataset['train']
print(train_dataset.features)
print(train_dataset[0])

22962
7682
10192
DatasetDict({
    train: Dataset({
        features: ['id', 'label', 'text', 'labels'],
        num_rows: 22962
    })
    validation: Dataset({
        features: ['id', 'label', 'text', 'labels'],
        num_rows: 7682
    })
    test: Dataset({
        features: ['id', 'text'],
        num_rows: 10192
    })
})
{'id': Value(dtype='string', id=None), 'label': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None), 'labels': Value(dtype='int64', id=None)}
{'id': 's1', 'label': 'Therapy or Surgery', 'text': ' 研究开始前30天内，接受过其他临床方案治疗；', 'labels': 11}


In [47]:
from transformers import BertTokenizer, BertModel, AutoModelForSequenceClassification

# Tokenizer loaded by name from the 'uer/chinese_roberta_L-8_H-512' checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    'uer/chinese_roberta_L-8_H-512',
    mirror='tuna',
)


def tokenize_function(sample):
    """Tokenize the ``'text'`` entry of ``sample`` with truncation enabled.

    Used as the callback for ``dataset.map(..., batched=True)``, so
    ``sample['text']`` is handed to the tokenizer exactly as received.

    Args:
        sample: Mapping that provides a ``'text'`` entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text.
    """
    text = sample['text']
    return tokenizer(text, truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Strip the raw columns from every split so only 'labels' (where present) and
# the tokenizer outputs remain. Only 'id' and 'text' are removed from 'test'.
for split_name, raw_columns in (
        ('train', ['id', 'text', 'label']),
        ('validation', ['id', 'text', 'label']),
        ('test', ['id', 'text']),
):
    tokenized_datasets[split_name] = (
        tokenized_datasets[split_name].remove_columns(raw_columns))

from transformers import DataCollatorWithPadding  # pads automatically, batch by batch

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

print(tokenized_datasets)

100%|██████████| 23/23 [00:03<00:00,  7.59ba/s]
100%|██████████| 8/8 [00:01<00:00,  7.87ba/s]
100%|██████████| 11/11 [00:01<00:00,  8.47ba/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 22962
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7682
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10192
    })
})





In [48]:
from torch.utils.data import DataLoader, Dataset
# One DataLoader per split, all collated by `data_collator`;
# only the training loader shuffles.
train_dataloader = DataLoader(
    tokenized_datasets['train'],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
val_dataloader = DataLoader(
    tokenized_datasets['validation'],
    batch_size=8,
    collate_fn=data_collator,
)
test_dataloader = DataLoader(
    tokenized_datasets['test'],
    batch_size=8,
    collate_fn=data_collator,
)

# Peek at the tensor shapes of the first test batch, then stop iterating.
for batch in test_dataloader:
    batch_shapes = {name: tensor.shape for name, tensor in batch.items()}
    print(batch_shapes)
    break

{'input_ids': torch.Size([8, 36]), 'token_type_ids': torch.Size([8, 36]), 'attention_mask': torch.Size([8, 36])}


In [49]:
from transformers import BertTokenizer, BertModel, AutoModelForSequenceClassification

checkpoint = './results/checkpoint-7000'

# The classification head gets one output per row of the category sheet.
net = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(examplepreddata),
)

# Smoke test: run a single training batch through the network and show the
# raw model output, then stop.
for batch in train_dataloader:
    outputs = net(**batch)
    print(outputs)
    break

SequenceClassifierOutput(loss=tensor(0.0945, grad_fn=<NllLossBackward0>), logits=tensor([[ 2.3595, -0.8232, -0.5250, -0.1728, -0.6569, -0.2318, -0.5023, -1.2957,
         -0.6132, -1.1516, -0.4859, -0.2183, -1.7783, -1.8908, -1.2756, -1.4352,
         -0.2314, -1.0946, -0.5662, -0.6864, -0.2688, -1.2383, -2.1606, -1.1244,
         -2.4606,  0.5503, -1.1100, -0.3110,  1.5770, -1.9208,  4.5506, -0.4792,
         -2.2947, -1.9314, -0.5908, -1.6749, -1.1083, -1.6656, -1.4088, -0.8657,
         -1.0437, -1.8180, -0.9556,  5.3148],
        [-0.3459, -0.5569, -0.3527,  8.1914, -0.0153,  0.3508, -0.5420, -0.8616,
          0.0102,  0.4932,  0.4420,  0.3072, -0.6349, -0.8259, -0.2744,  0.2811,
         -0.7900, -0.2973,  0.7945, -1.1079, -1.1131, -0.1070,  0.0590, -0.6125,
         -0.4432, -0.9526, -0.0090, -0.5777, -0.9088,  0.2086,  0.3597, -1.7714,
         -0.1879, -0.6060, -0.0837, -0.4378,  0.3161,  0.2662,  0.5132, -0.3632,
         -0.0821,  0.2848, -0.3562,  0.6461],
        [-0.3828,

In [55]:
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint
from transformers import AdamW, get_scheduler
from datasets import load_metric


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class Bert4textclassification_lightningsystem(pl.LightningModule):
    """Lightning wrapper that fine-tunes a sequence-classification network.

    The wrapped ``net`` is called as ``net(**batch)`` and its result must
    expose ``.loss`` and ``.logits``.

    Lightning only invokes hooks that carry its own names, so methods called
    ``train_epoch_end`` / ``val_epoch_end`` are never triggered by the
    Trainer. They are kept with their original names and signatures and are
    driven from the real ``on_train_epoch_end`` / ``on_validation_epoch_end``
    hooks, using step losses collected during the epoch.

    Args:
        net: Model to train; it is moved to the module-level ``device``.
        lr: Learning rate handed to the optimizer.
        epoch: Number of epochs; stored on the instance but not used by any
            method of this class.
    """

    def __init__(self, net, lr, epoch):
        super().__init__()
        self.net = net.to(device)
        self.lr = lr
        self.epoch = epoch
        # Detached per-step losses of the current epoch; drained and reset by
        # the on_*_epoch_end hooks below.
        self._train_losses = []
        self._val_losses = []

    def configure_optimizers(self):
        """Create, remember and return an AdamW optimizer over ``net``'s parameters."""
        self.optimizer = AdamW(self.net.parameters(), lr=self.lr)
        return self.optimizer

    def training_step(self, batch, batch_idx):
        """Run one training batch and return its loss."""
        batch = {k: v.to(device) for k, v in batch.items()}
        loss = self.net(**batch).loss
        # Keep only a detached copy so the autograd graph is not retained.
        self._train_losses.append(loss.detach())
        return loss

    def validation_step(self, batch, batch_idx):
        """Run one validation batch and log its loss as ``'val_loss'``."""
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = self.net(**batch)
        self.log('val_loss', outputs.loss)
        self._val_losses.append(outputs.loss.detach())

    def on_train_epoch_end(self):
        """Lightning hook: report the finished training epoch and reset the buffer."""
        epoch_losses, self._train_losses = self._train_losses, []
        self.train_epoch_end(epoch_losses)

    def on_validation_epoch_end(self):
        """Lightning hook: report the finished validation epoch and reset the buffer."""
        epoch_losses, self._val_losses = self._val_losses, []
        self.val_epoch_end(epoch_losses)

    def train_epoch_end(self, outputs):
        """Print the mean of ``outputs``, a sequence of per-step loss values.

        Nothing is printed for an empty sequence (avoids a division by zero).
        """
        if not outputs:
            return
        print(sum(outputs) / len(outputs))

    def val_epoch_end(self, outputs):
        """Print a marker at the end of a validation epoch; ``outputs`` is unused."""
        print('1')

    def test_step(self, batch, batch_idx):
        """Return the arg-max class index for every example in ``batch``."""
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = self.net(**batch)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        return predictions

# Hyper-parameters for this fine-tuning run.
num_epoches = 3
lr = 3e-5

model = Bert4textclassification_lightningsystem(net, lr, num_epoches)

# Save checkpoints under ./output, ranked by the lowest logged 'val_loss'.
checkpoint_callback = ModelCheckpoint(
            monitor='val_loss',
            dirpath='./output',
            filename=
            'chinese_roberta_L-8_H-512-CHIP-CTC-{epoch:02d}-{val_loss:.3f}',
            mode='min')
trainer = Trainer(
            logger=False,
            max_epochs=num_epoches,
            # Request a GPU only when one is present, so the Trainer follows
            # the same CUDA-or-CPU fallback as `torch.cuda.is_available()`
            # instead of failing on CPU-only machines.
            gpus=1 if torch.cuda.is_available() else 0,
            # This option is an integer epoch count; 0 means "never reload".
            reload_dataloaders_every_n_epochs=0,
            num_sanity_val_steps=0,  # Skip Sanity Check
            callbacks=[checkpoint_callback],
            #precision=16,
            #accumulate_grad_batches=2,
            #gradient_clip_val=0.5,
        )

trainer.fit(model, train_dataloader, val_dataloader)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name | Type                          | Params
-------------------------------------------------------
0 | net  | BertForSequenceClassification | 36.6 M
-------------------------------------------------------
36.6 M    Trainable params
0         Non-trainable params
36.6 M    Total params
146.344   Total estimated model params size (MB)


Epoch 0:   0%|          | 0/3832 [00:00<?, ?it/s]tensor(0.7279, device='cuda:0', grad_fn=<NllLossBackward0>)
Epoch 0:   0%|          | 1/3832 [00:00<05:45, 11.08it/s, loss=0.728]tensor(0.0555, device='cuda:0', grad_fn=<NllLossBackward0>)
Epoch 0:   0%|          | 2/3832 [00:00<04:20, 14.70it/s, loss=0.392]tensor(0.7714, device='cuda:0', grad_fn=<NllLossBackward0>)
Epoch 0:   0%|          | 3/3832 [00:00<03:50, 16.60it/s, loss=0.518]tensor(0.5484, device='cuda:0', grad_fn=<NllLossBackward0>)
Epoch 0:   0%|          | 4/3832 [00:00<03:37, 17.63it/s, loss=0.526]tensor(0.3881, device='cuda:0', grad_fn=<NllLossBackward0>)
Epoch 0:   0%|          | 5/3832 [00:00<03:27, 18.44it/s, loss=0.498]tensor(0.3662, device='cuda:0', grad_fn=<NllLossBackward0>)
Epoch 0:   0%|          | 6/3832 [00:00<03:20, 19.12it/s, loss=0.476]tensor(0.0795, device='cuda:0', grad_fn=<NllLossBackward0>)
Epoch 0:   0%|          | 7/3832 [00:00<03:14, 19.71it/s, loss=0.42] tensor(1.2802, device='cuda:0', grad_fn=<NllLoss