In [1]:
! pip install transformers
! pip install tensorflow
! pip install keras
! pip install seqeval
! pip install torch





In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# ! nvidia-smi

In [4]:
import os
import pandas as pd
import numpy as np

# MODEL = 'bert-base-cased'
MODEL = '../../models/word_embeddings/fine_tuned/NER/ACD_15epochs'
# MODEL = '../../models/word_embeddings/fine_tuned/NER/ACD_1epoch'

DATASET = 'ACD'


## Load Data

In [5]:
train = pd.read_csv('../../datasets/NER/'+ DATASET + '/train.tsv', delimiter='\t', names=['word', 'label'], quoting=3, error_bad_lines=False)
test = pd.read_csv('../../datasets/NER/'+ DATASET + '/test.tsv', delimiter='\t', names=['word', 'label'], quoting=3, error_bad_lines=False)
devel = pd.read_csv('../../datasets/NER/'+ DATASET + '/devel.tsv', delimiter='\t', names=['word', 'label'], quoting=3, error_bad_lines=False)
print('Train labels: ', train.groupby('label').size())
print()
print('Devel labels: ', devel.groupby('label').size())
print()
print('Test labels: ', test.groupby('label').size())

Train labels:  label
B-Anatomy       6946
B-Chemical      5203
B-Disease       4182
I-Anatomy       4891
I-Chemical      1900
I-Disease       2918
O             245927
dtype: int64

Devel labels:  label
B-Anatomy       2139
B-Chemical      5347
B-Disease       4246
I-Anatomy       1416
I-Chemical      1748
I-Disease       2723
O             158617
dtype: int64

Test labels:  label
B-Anatomy       4616
B-Chemical      5378
B-Disease       4424
I-Anatomy       3243
I-Chemical      1628
I-Disease       2737
O             202684
dtype: int64


In [6]:
print(train.head(10))

             word label
0  Immunostaining     O
1             and     O
2        confocal     O
3        analysis     O
4             DNA     O
5       labelling     O
6             and     O
7        staining     O
8            with     O
9               5     O


In [7]:
# Separa o dataframe por PONTO-FINAL
def separar_frases(dataframe):
  sentences = []
  labels = []

  sentences_aux = []
  labels_aux = []

  inicio = True

  for word, label in zip(dataframe.word.values, dataframe.label.values):
    if inicio:
        sentences_aux.append('[CLS]')
        labels_aux.append('O')
        inicio = False
    sentences_aux.append(word)
    labels_aux.append(label)

    if (word == '.'):
        sentences_aux.append('[SEP]')
        labels_aux.append('O')
        
        sentences.append(sentences_aux)
        labels.append(labels_aux)
        
        sentences_aux = []
        labels_aux = []
        inicio = True

  return sentences, labels

In [8]:
train_sentences, train_labels = separar_frases(train)
test_sentences, test_labels = separar_frases(test)
devel_sentences, devel_labels = separar_frases(devel)

### Map tags to id 

In [9]:
tag_values = list(set(train["label"].values))
tag_values.append("PAD")
print(sorted(tag_values))
tag2idx = {t: i for i, t in enumerate(tag_values)}
print(tag2idx)

['B-Anatomy', 'B-Chemical', 'B-Disease', 'I-Anatomy', 'I-Chemical', 'I-Disease', 'O', 'PAD']
{'B-Disease': 0, 'I-Anatomy': 1, 'B-Anatomy': 2, 'B-Chemical': 3, 'O': 4, 'I-Disease': 5, 'I-Chemical': 6, 'PAD': 7}


### Prepare sentences and labels

In [10]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig

from keras.preprocessing.sequence import pad_sequences
from sklearn.utils import shuffle

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

print('Dispositivo:', device)
# torch.cuda.get_device_name(0)

Dispositivo: cpu


In [11]:
# if path:
from typing import Dict
from transformers import AutoConfig
# List, Optional, Tuple

def get_labels(path):
    
    # path= '../../NER/ACD/labels.txt'
    with open(path, "r") as f:
        labels = f.read().splitlines()
        labels = [i if i != 'O' else 'O' for i in labels]
    if "O" not in labels:
        labels = ["O"] + labels
    return labels

labels = get_labels('../../datasets/NER/ACD/labels.txt')
num_labels = len(labels)
label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)}

config = AutoConfig.from_pretrained(MODEL,
    num_labels=num_labels,
    id2label=label_map,
    label2id={label: i for i, label in enumerate(labels)},
#     cache_dir=model_args.cache_dir,
#     max_seq_length=128
)
print(config)

BertConfig {
  "_name_or_path": "dmis-lab/biobert-base-cased-v1.1",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "B-Anatomy",
    "1": "B-Chemical",
    "2": "B-Disease",
    "3": "I-Anatomy",
    "4": "I-Chemical",
    "5": "I-Disease",
    "6": "O"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-Anatomy": 0,
    "B-Chemical": 1,
    "B-Disease": 2,
    "I-Anatomy": 3,
    "I-Chemical": 4,
    "I-Disease": 5,
    "O": 6
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.5.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}



In [12]:
# # Configura o tamanho máximo da sentenca e tamanho do batch de processamento
# MAX_LEN = 75
# # bs = 32
# bs = 4

## Tokenizar a entrada

In [13]:
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []
    for word, label in zip(sentence, text_labels):
#         print('sentence ', word)
        # Tokenize the word and count # of subwords the word is broken into
#         print(word)
#         break
        tokenized_word = tokenizer.tokenize(str(word))
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        if label.startswith("B"):
            labels.extend([label])
            new_label = "I-" + label[2:]

            labels.extend([new_label] * (n_subwords-1))
        else:
            labels.extend([label] * n_subwords)


    return tokenized_sentence, labels


In [14]:
tokenizer = BertTokenizer.from_pretrained(MODEL, do_lower_case=False)

train_tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(train_sentences, train_labels)
]
test_tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(test_sentences, test_labels)
]
devel_tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(devel_sentences, devel_labels)
]

print(train_tokenized_texts_and_labels[0])


(['[CLS]', 'I', '##mm', '##uno', '##sta', '##ining', 'and', 'con', '##fo', '##cal', 'analysis', 'DNA', 'label', '##ling', 'and', 'stain', '##ing', 'with', '5', '-', 'br', '##omo', '-', '2', "'", '-', 'de', '##ox', '##yu', '##rid', '##ine', '(', 'B', '##rd', '##U', 'label', '##ling', 'and', 'detection', 'kit', 'I', ';', 'Bo', '##eh', '##ring', '##er', 'Mann', '##heim', ',', 'Germany', ')', 'was', 'performed', 'according', 'to', 'the', 'manufacturer', "'", 's', 'instructions', '.', '[SEP]'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'])


In [15]:
#Antigo
# train_input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in train_tokenized_texts],
#                           maxlen=MAX_LEN, dtype="long", value=0.0,
#                           truncating="post", padding="post")
# test_input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in test_tokenized_texts],
#                           maxlen=MAX_LEN, dtype="long", value=0.0,
#                           truncating="post", padding="post")
# devel_input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in devel_tokenized_texts],
#                           maxlen=MAX_LEN, dtype="long", value=0.0,
#                           truncating="post", padding="post")

In [16]:
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class DataTrainingArguments:
    data_dir: str = field(
        metadata={"help": "The input data dir. Should contain the .txt files for a CoNLL-2003-formatted task."}
    )
    labels: Optional[str] = field(
        default=None,
        metadata={"help": "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used."},
    )
    max_seq_length: int = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )

    # per_device_eval_batch_size: int = field(
    #     default=4, metadata={"help": "Overwrite the cached training and evaluation sets"}
    # )
    block_size: int = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )

data_args = DataTrainingArguments(MODEL)
data_args.max_seq_length = 128

In [17]:
# refatorado
train_input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(token[0]) for token in train_tokenized_texts_and_labels],
                          maxlen=data_args.max_seq_length, dtype="long", value=0.0,
                          truncating="post", padding="post")
test_input_ids = pad_sequ
        print(type(devel_input_ids))ences([tokenizer.convert_tokens_to_ids(token[0]) for token in test_tokenized_texts_and_labels],
                          maxlen=data_args.max_seq_length, dtype="long", value=0.0,
                          truncating="post", padding="post")
devel_input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(token[0]) for token in devel_tokenized_texts_and_labels],
                          maxlen=data_args.max_seq_length, dtype="long", value=0.0,
                          truncating="post", padding="post")
train_input_ids[0]

array([  101,   146,  6262, 26761,  8419, 16534,  1105, 14255, 14467,
        7867,  3622,  5394,  3107,  1979,  1105, 24754,  1158,  1114,
         126,   118,  9304, 18445,   118,   123,   112,   118,  1260,
       10649,  9379, 10132,  2042,   113,   139,  2956,  2591,  3107,
        1979,  1105, 11432, 11622,   146,   132,  9326, 10486,  3384,
        1200, 10852,  6797,   117,  1860,   114,  1108,  1982,  2452,
        1106,  1103,  7400,   112,   188,  7953,   119,   102,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0]

In [18]:
## presentação com PADids da tag das tags em tamanho de 75 (5 é PAD, 0 É O)

In [19]:
train_tags = pad_sequences([[tag2idx.get(l) for l in lab[1]] for lab in train_tokenized_texts_and_labels],
                     maxlen=data_args.max_seq_length, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")
test_tags = pad_sequences([[tag2idx.get(l) for l in lab[1]] for lab in test_tokenized_texts_and_labels],
                     maxlen=data_args.max_seq_length, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")
devel_tags = pad_sequences([[tag2idx.get(l) for l in lab[1]] for lab in devel_tokenized_texts_and_labels],
                     maxlen=data_args.max_seq_length, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")
print(tag2idx)
print(train_tags[0])

{'B-Disease': 0, 'I-Anatomy': 1, 'B-Anatomy': 2, 'B-Chemical': 3, 'O': 4, 'I-Disease': 5, 'I-Chemical': 6, 'PAD': 7}
[4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 7 7 7 7 7 7 7 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7]


> train_tag[i] é a fita de tags normalizada (i.e., adicionando PAD tokens) para um tamanho = 75

In [20]:
train_attention_masks = [[float(i != 0.0) for i in ii] for ii in train_input_ids]
test_attention_masks = [[float(i != 0.0) for i in ii] for ii in test_input_ids]
devel_attention_masks = [[float(i != 0.0) for i in ii] for ii in devel_input_ids]
print(train_attention_masks[0])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


> train_attention_masks[i]  é o vetor que diz ao BERT onde prestar atençao 

In [41]:
# tr_inputs, val_inputs, tr_tags, val_tags, tr_masks, val_masks = shuffle(train_input_ids, test_input_ids, train_tags, test_tags, train_attention_masks, test_attention_masks, random_state=2020)
# devel_inputs, devel_tags, devel_masks = shuffle(devel_input_ids, devel_tags, devel_attention_masks, random_state=2020)
# # pŕint(tr_inputs[0])
print(train_input_ids[0])
print(train_tags[0])
tr_inputs, tr_tags, tr_masks = shuffle(train_input_ids, train_tags, train_attention_masks, random_state=2020)
val_inputs, val_tags, val_masks = shuffle(test_input_ids, test_tags, test_attention_masks, random_state=2020)
devel_inputs, devel_tags, devel_masks = shuffle(devel_input_ids, devel_tags, devel_attention_masks, random_state=2020)

[  101   146  6262 26761  8419 16534  1105 14255 14467  7867  3622  5394
  3107  1979  1105 24754  1158  1114   126   118  9304 18445   118   123
   112   118  1260 10649  9379 10132  2042   113   139  2956  2591  3107
  1979  1105 11432 11622   146   132  9326 10486  3384  1200 10852  6797
   117  1860   114  1108  1982  2452  1106  1103  7400   112   188  7953
   119   102     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]
[4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 7 7 7 7 7 7 7 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7

In [22]:
# converter para ternsores
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)
devel_inputs = torch.tensor(devel_inputs)
devel_tags = torch.tensor(devel_tags)
devel_masks = torch.tensor(devel_masks)

In [42]:
type(devel_inputs)

numpy.ndarray

In [23]:
import transformers
from transformers import BertForTokenClassification, AdamW

transformers.__version__


'4.5.1'

In [24]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    MODEL,
    from_tf=bool(".ckpt" in MODEL),
    config=config
)

In [25]:
# construir os DataLoader
# train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
# train_sampler = RandomSampler(train_data)
# # train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=config.block_size)
# train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=128)

# valid_data = TensorDataset(val_inputs, val_masks, val_tags)
# valid_sampler = SequentialSampler(valid_data)
# valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=128)

devel_data = TensorDataset(devel_inputs, devel_masks, devel_tags)
devel_sampler = SequentialSampler(devel_data)
devel_dataloader = DataLoader(devel_data, sampler=devel_sampler, batch_size=128)


In [26]:
# model.cuda()

In [27]:
# from transformers import get_linear_schedule_with_warmup

# epochs = 1
# max_grad_norm = 1.0

# # Total number of training steps is number of batches * number of epochs.
# total_steps = len(train_dataloader) * epochs

# # Create the learning rate scheduler.
# scheduler = get_linear_schedule_with_warmup(
#     optimizer,
#     num_warmup_steps=0,
#     num_training_steps=total_steps
# )


In [28]:
from seqeval.metrics import f1_score, accuracy_score


In [39]:
model.eval()
# Reset the validation loss for this epoch.
devel_loss, devel_accuracy = 0, 0
nb_devel_steps, nb_devel_examples = 0, 0
predictions , true_labels = [], []
print(devel_dataloader)
print('len(devel_dataloader) ',len(devel_dataloader))
for batch in devel_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    print('len(b_input_ids) ', len(b_input_ids))
    
    # Telling the model not to compute or store gradients,
    # saving memory and speeding up validation
    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
    logits = outputs[1].detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    
#     print('outputs[0] ', outputs[0])
#     print('outputs[0].mean() ', outputs[0].mean())

    # Calculate the accuracy for this batch of test sentences.
    devel_loss += outputs[0].mean().item()
#     print(devel_loss)
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    true_labels.extend(label_ids)
#     break



<torch.utils.data.dataloader.DataLoader object at 0x7f1b662cf340>
len(devel_dataloader)  67
len(b_input_ids)  128


KeyboardInterrupt: 

In [36]:
print('devel_loss antes de tirar a media ',devel_loss)

devel_loss = devel_loss / len(devel_dataloader)
print('len(devel_dataloader) ',len(devel_dataloader))

print("devel_loss depois de tirar a media: {}".format(devel_loss))
pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels)
                              for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
valid_tags = [tag_values[l_i] for l in true_labels
                              for l_i in l if tag_values[l_i] != "PAD"]

print("Devel Accuracy: {}".format(accuracy_score([pred_tags], [valid_tags])))
print("Devel F1-Score: {}".format(f1_score([pred_tags], [valid_tags])))
print()


devel_loss antes de tirar a media  0.1455353551836156
devel_loss depois de tirar a media 0.0021721694803524716
len(devel_dataloader)  67
devel_loss depois de tirar a media: 0.0021721694803524716
Devel Accuracy: 0.05664488017429194
Devel F1-Score: 0.0061823802163833074



In [None]:
# test_sentence = "This expression of NT-3 in supporting cells in embryos and neonates may even preserve in Brn3c null mutants the numerous spiral sensory neurons in the apex of 8-day old animals.Hypertensive pneumothorax is more common in tall and thin young adults (primary pneumothorax) or in patients with chronic pulmonary diseases or chest trauma (secondary pneumothorax)."
# test_sentence = 'The cardiac and pulmonary auscultation are normal; chest pain does not worse with palpation of the thorax; there is no jugular stasis nor lower limb edema.'

# test_sentence = 'The pain is continuous and is located just in the middle of my chest, worsening when I breathe and when I lay down on my bed, I suffer from arterial hypertension and smoke 20 cigarettes every day.'
# test_sentence = 'Sepsis is the second most common cause of death in non-coronary intensive care units and the tenth leading cause of death overall in high-income countries. During the past two decades, the incidence of sepsis has increased annually by 9% to reach 240 per 100 000 population in the USA by 2000.'
# test_sentence = 'The true incidence of sepsis in any given country is unknown. The reported incidence is dependent on the specific definition used, the infecting organism, the reporting mechanism (such as the use of the International Classification of Diseases-9 coding systems) and the requirement for either organ support or intensive care. These factors result in marked differences between estimates and discrete geographical locations. Most data describing the incidence of sepsis are from high-income countries, where 2.8 million deaths per year are attributable to sepsis. In 2001, Angus and colleagues reported in, the USA the, that incidence of severe sepsis was more than 750 000 cases per annum (300 cases per 100 000 population), equivalent to 2.26 cases per 100 hospital discharges. In the UK, the reported prevalence of sepsis in ICU-derived cohorts is 27% of all ICU admissions, whereas the prevalence is 12% in the USA. 14 This difference could partly be explained by the sub stantially greater numbers of ICU beds available in the USA than in the UK, and thus the differing triage patterns and admission criteria. It is also possible that, in institutions where clinical staff are trained in sepsis recognition, the previous use of the less-specific SIRS criteria could have led to an over-reporting of sepsis cases. Overall, however, there is probably a substantial under-reporting of the incidence of sepsis and with an ageing population, the incidence will continue to increase. This pattern might be further accentuated by campaigns to increase the awareness of and screening for the condition. Except for maternal and neonatal sepsis, the condition is usually considerably under-reported in the global burden of disease statistics. The true scale of the problem is probably much higher than what has been reported. Data suggest that sepsis contributes to between a third and a half of all in-hospital deaths in the USA. Although these data represent the incidence of sepsis in high-resource countries, most deaths due to sepsis happen in low-resource countries, where the exact incidence of sepsis is difficult to accurately estimate. The available literature suggests that an estimated 90% of worldwide deaths from chest infections occur in low-resource settings; about 70% of the 9 million deaths due to chest infections in neonates and infants are associated with sepsis, and most cases occur in Asia and Africa.'

# test_sentence = " Her eye is green. A 58-year-old African-American woman presents to the ER with episodic pressing/burning anterior chest pain that began two days earlier for the first time in her life. The pain started while she was walking, radiates to the eyes, and is accompanied by nausea, diaphoresis and mild dyspnea, but is not increased on inspiration. The latest episode of pain ended half an hour prior to her arrival. She is known to have hypertension and obesity. She denies smoking, diabetes, hypercholesterolemia, or a family history of heart disease. She currently takes no medications. Physical examination is normal. The EKG shows nonspecific changes. Alprazolan was administred."
# test_sentence = 'My eye is blue. I feel nausea. '
test_sentence = 'Doctor, I am feeling chest pain since yesterday. The pain is continuous and is located just in the middle of my chest, worsening when I breathe and when I lay down on my bed. I suffer from arterial hypertension and smoke 20 cigarettes every day. My father had a “heart attack” at my age and I am very worried about it.'
tokenized_sentence = tokenizer.encode(test_sentence)
input_ids = torch.tensor([tokenized_sentence]).cuda()
with torch.no_grad():
    output = model(input_ids)
# print(output)
label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)
# join bpe split tokens
tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])
new_tokens, new_labels = [], []
for token, label_idx in zip(tokens, label_indices[0]):
    print(token, label_idx)
    if token.startswith("##"):
        new_tokens[-1] = new_tokens[-1] + token[2:]
    else:
        new_labels.append(tag_values[label_idx])
        new_tokens.append(token)
for token, label in zip(new_tokens, new_labels):
    print("{}\t\t{}".format(label, token))

    

In [None]:
label_indices.shape

In [None]:
# # label_list = ["O", "B-MISC", "I-MISC",  "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "[CLS]", "[SEP]"]

# import json, os

# # Save a trained model and the associated configuration
# model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
# model_to_save.save_pretrained('model')
# tokenizer.save_pretrained('model')
# label_map = {i : label for i, label in enumerate(tag_values,1)}        
# model_config = {"bert_model":"bert-base-cased","do_lower":False,"max_seq_length":128,"num_labels":len(tag_values)+1,"label_map":label_map}
# json.dump(model_config,open(os.path.join("model","model_config.json"),"w"))

In [None]:
model.save_pretrained(OUTPUT_MODEL_PATH)
tokenizer.save_pretrained(OUTPUT_MODEL_PATH)

# !wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1R84voFKHfWV9xjzeLzWBbmY1uOMYpnyD' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1R84voFKHfWV9xjzeLzWBbmY1uOMYpnyD" -O biobert_weights && rm -rf /tmp/cookies.txt