# Generate embeddings for the textual descriptors (concept strings and relation names) in SNOMED


In [3]:
from transformers import *
import torch
import os
import pandas as pd
import numpy as np
import json

from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
from tqdm import tqdm, trange

from sklearn.decomposition import PCA
import time

In [5]:
# Select the compute device used by every later `.to(device)` call.
# GPU 0 is used when CUDA is available (same as before); otherwise fall back
# to the CPU instead of crashing in `torch.cuda.set_device`.
if torch.cuda.is_available():
    device = 0
    torch.cuda.set_device(device)
else:
    device = torch.device('cpu')

In [42]:
# Encoder whose embeddings will be generated. The model-loading code below
# handles exactly these values: 'bert-base-uncased', 'clinicalbert', 'ccbert'.
model_type = 'ccbert'

In [43]:
# Resolve where the config, vocabulary and weights for `model_type` live, then
# build the tokenizer and encoder once (the three original branches only
# differed in these locations).
if model_type == 'bert-base-uncased':
    # Hub identifier: config, vocab and weights are all resolved from it.
    pretrained_weights = 'bert-base-uncased'
    config_name = tokenizer_name = model_name = pretrained_weights
elif model_type in ('clinicalbert', 'ccbert'):
    # Locally stored checkpoints: one directory per model type.
    model_dir = os.path.join('/home/dc925/project/data/models', model_type)
    config_name = os.path.join(model_dir, 'config.json')
    tokenizer_name = os.path.join(model_dir, 'vocab.txt')
    model_name = os.path.join(model_dir, 'pytorch_model.bin')
else:
    # Fail loudly: otherwise `model`/`tokenizer` would be undefined, or stale
    # from a previous run of this cell, and embeddings would silently come
    # from the wrong encoder.
    raise ValueError(
        "unsupported model_type {!r}; expected 'bert-base-uncased', "
        "'clinicalbert' or 'ccbert'".format(model_type)
    )

config = BertConfig.from_pretrained(config_name)
tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=True)
model = BertModel.from_pretrained(model_name, config=config)
model.to(device)

In [44]:
# we need cui2string and relations
relations = pd.read_csv('snomed_relations.csv')
relation_examples = list(relations['relations'])
encoded_relations = torch.tensor([tokenizer.encode(t, max_length=64, pad_to_max_length=True) for t in relation_examples], dtype=torch.long)
relation_dataset = TensorDataset(encoded_relations)

In [46]:
# Load the CUI -> string mapping, keep a CSV copy of it (index = JSON keys,
# single column 'string'), and pull out the strings in row order.
with open('snomed_cui2string.json', 'r') as fin:
    concepts = json.load(fin)

concept_df = pd.DataFrame.from_dict(
    concepts,
    orient='index',
    columns=['string'],
)
concept_df.to_csv('concepts_df.csv')
concept_examples = [text for text in concept_df['string']]

In [47]:
# Tokenize every concept string to a fixed length of 64 ids and stack them.
# (Optional cache, disabled: torch.save / torch.load of `encoded_concepts`
#  under 'cached_encoded_concepts_clinical'.)
concept_token_ids = [
    tokenizer.encode(text, max_length=64, pad_to_max_length=True)
    for text in concept_examples
]
encoded_concepts = torch.tensor(concept_token_ids, dtype=torch.long)
concept_dataset = TensorDataset(encoded_concepts)

In [49]:
# Concepts first, relations last: the embeddings are later split by position,
# so this order must not change.
dataset = torch.utils.data.ConcatDataset([concept_dataset, relation_dataset])

In [51]:

# Iterate the combined dataset in its stored order, 128 rows per batch.
batch_size = 128
sampler = SequentialSampler(dataset)
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    sampler=sampler,
)

In [52]:
# Run every tokenized string through the encoder and collect, per row, element
# 1 of the model output (for BertModel this is the pooled [CLS] vector) as one
# NumPy array named `logits` (name kept for the cells that consume it).
model.eval()  # inference mode (e.g. disables dropout); only needs setting once

pooled_chunks = []
for batch in tqdm(dataloader, desc='extracting features'):
    input_ids = batch[0].to(device)
    # Inputs were padded to a fixed length; without a mask the encoder would
    # attend to the padding tokens and the pooled vector would depend on them.
    attention_mask = (input_ids != tokenizer.pad_token_id).long()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    pooled_chunks.append(outputs[1].detach().cpu().numpy())

# Concatenate once at the end: appending to a growing array inside the loop
# recopies everything accumulated so far on every batch.
logits = np.concatenate(pooled_chunks, axis=0) if pooled_chunks else None
torch.cuda.empty_cache()


extracting features: 100%|██████████| 2298/2298 [10:08<00:00,  3.78it/s]


In [54]:
#dimensionality reduction
start = time.time()
pca = PCA(n_components=512)
pca_out = pca.fit_transform(logits)
print('pca took {}'.format(time.time()-start))

pca took 21.039462566375732


In [55]:
# Split the reduced matrix back into concept rows and relation rows.
# The combined dataset was built as concepts followed by relations, so the
# relation rows are the trailing `len(relation_dataset)` rows; deriving the
# count (instead of hard-coding it) keeps the split right if the relations
# file changes. Computing an explicit split index also behaves correctly when
# there are zero relations (a `[:-0]` slice would return nothing).
n_relations = len(relation_dataset)
split_at = len(pca_out) - n_relations
concept_embeddings = pca_out[:split_at]
relation_embeddings = pca_out[split_at:]

In [58]:
# Write the concept and relation embeddings to the .npy files that belong to
# the active model type. An unrecognised model type saves nothing.
print(model_type)
output_paths_by_model = {
    'bert-base-uncased': (
        'data/case4/concept_embeddings_bert',
        'data/case4/relation_embeddings_bert',
    ),
    'clinicalbert': (
        'data/case4/concept_embeddings_clinicalbert',
        'data/case4/relation_embeddings_clinicalbert',
    ),
    'ccbert': (
        'data/case4/concept_embeddings_ccbert',
        'data/case4/relation_embeddings_ccbert',
    ),
}
output_paths = output_paths_by_model.get(model_type)
if output_paths is not None:
    concept_path, relation_path = output_paths
    np.save(concept_path, concept_embeddings)
    np.save(relation_path, relation_embeddings)

ccbert
