In [1]:
from sentence_transformers import SentenceTransformer, models

word_embedding_model = models.Transformer('../models/clinical_bert')

tokens = ["dyspnea", "fagner", "jessica"]
word_embedding_model.tokenizer.add_tokens(tokens, special_tokens=True)
word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))

pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
word_embedding_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
word_embedding_model.save('../models/clinical_bert/finetuned/input')

In [None]:
print(word_embedding_model)

In [3]:
from sentence_transformers import InputExample
from torch.utils.data import DataLoader

# model = SentenceTransformer('distilbert-base-nli-mean-tokens')
train_examples = [InputExample(texts=['My first sentence', 'My second sentence'], label=0.8),
   InputExample(texts=['Another pair', 'Unrelated sentence'], label=0.3)]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)


In [4]:
len(word_embedding_model.tokenizer)

28999

In [5]:
from sentence_transformers import losses

train_loss = losses.CosineSimilarityLoss(word_embedding_model)

#Tune the model
word_embedding_model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)

word_embedding_model.save('./models/clinical_bert/finetuned/output')
word_embedding_model.tokenizer.save_vocabulary('./models/clinical_bert/finetuned/output')
word_embedding_model.tokenizer.save_pretrained('./models/clinical_bert/finetuned/output')

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1 [00:00<?, ?it/s]

('./models/clinical_bert/finetuned/output/tokenizer_config.json',
 './models/clinical_bert/finetuned/output/special_tokens_map.json',
 './models/clinical_bert/finetuned/output/vocab.txt',
 './models/clinical_bert/finetuned/output/added_tokens.json')

In [6]:
added_vocab = word_embedding_model.tokenizer.get_added_vocab()
print(list(added_vocab.keys())[list(added_vocab.values()).index(28996)])  # Prints george

dyspnea


In [10]:
import pdb
import torch

md = torch.load('../models/clinical_bert/finetuned/output/0_Transformer/pytorch_model.bin',map_location='cpu')
# md = torch.load('../models/clinical_bert/pytorch_model.bin',map_location='cpu')

for k in md:
#     print(k)
# modelos finetunados tão vindo sem o prefixo bert.
    if (k == 'bert.embeddings.word_embeddings.weight' or k == 'embeddings.word_embeddings.weight'):
        embeds = md[k]

vectors = []

for l in range(len(embeds)):
    vector = embeds[l]
    tsv_row = ''
    for m in range(len(vector)):
        tsv_row += str(vector[m].tolist()) + '\t'

    vectors.append(tsv_row)

len(vectors)

28999

In [12]:
with open('tsv_files/clinical_bert/finetuned/output/word_embeddings.tsv', "w") as f, open('tsv_files/clinical_bert/finetuned/output/labels.tsv', "a") as labels_file:
    i = 0
    for e in vectors:
#         print(i)
        i = i+1
        try:
            label = list(added_vocab.keys())[list(added_vocab.values()).index(i)]  # Prints george
            print(label)
            print (e, file=f)
            
            labels_file.write(label+'\n')

        except:
            print (e, file=f)


dyspnea
fagner
jessica


In [9]:
# from sentence_transformers import evaluation
# sentences1 = ['This list contains the first column', 'With your sentences', 'You want your model to evaluate on']
# sentences2 = ['Sentences contains the other column', 'The evaluator matches sentences1[i] with sentences2[i]', 'Compute the cosine similarity and compares it to scores[i]']
# scores = [0.3, 0.6, 0.2]

# evaluator = evaluation.EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)

# # ... Your other code to load training data

# model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100, evaluator=evaluator, evaluation_steps=500)


In [40]:
import csv, logging

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")

sts_dataset_path = '../datasets/snli/'


train_samples = []
dev_samples = []
test_samples = []

with open(sts_dataset_path + 'snli_1.0_train.txt', "r") as f:
#     labels = f.readlines()
#     print(f.readlines())
    reader = csv.DictReader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
#         print(row)
#         score = float(row['score']) / 5.0  # Normalize score to range 0 ... 1

        gold_label = row['gold_label']  # Normalize score to range 0 ... 1
        float_gold_label = 0
        
        if gold_label == 'contradiction':
            float_gold_label = 0 / 3.0
        if gold_label == 'entailment':
            float_gold_label = 1 / 3.0
        if gold_label == 'neutral':
            float_gold_label = 2 / 3.0
        
        
#         print(gold_label)
        inp_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=float_gold_label)

#         if row['split'] == 'dev':
        train_samples.append(inp_example)
#         elif row['split'] == 'test':
#             test_samples.append(inp_example)
#         else:
#             train_samples.append(inp_example)

    
print(train_samples[0])
# with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
# reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)

<InputExample> label: 0.6666666666666666, texts: A person on a horse jumps over a broken down airplane.; A person is training his horse for a competition.


In [41]:
import math 

from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

num_epochs = 4
train_batch_size = 16

# model_save_path = 'output/training_stsbenchmark_continue_training-'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_save_path = './models/clinical_bert/finetuned/output2'

train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=word_embedding_model)


# Development set: Measure correlation between cosine score and gold labels
logging.info("Read SNLIbenchmark dev dataset")
# evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')


# Configure the training. We skip evaluation in this example
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))


# Train the model
word_embedding_model.fit(train_objectives=[(train_dataloader, train_loss)],
#           evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)


##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################

# model = SentenceTransformer(model_save_path)
# test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
# test_evaluator(model, output_path=model_save_path)


Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/34385 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [1]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig

from keras.preprocessing.sequence import pad_sequences
from sklearn.utils import shuffle

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

print('Dispositivo:', device)
torch.cuda.get_device_name(0)

Dispositivo: cpu


AssertionError: 
Found no NVIDIA driver on your system. Please check that you
have an NVIDIA GPU and installed a driver from
http://www.nvidia.com/Download/index.aspx

In [3]:
import torch