In [4]:
# Enable IPython's autoreload extension so that edited modules are re-imported
# automatically (mode 2) instead of requiring a kernel restart.
%load_ext autoreload
%autoreload 2

### Basic imports

In [2]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
import datetime

Set paths

In [11]:
# Base data directory; the BioKG metadata file is read from its `raw/` subfolder.
DATA_DIR = Path("/home/jovyan/BioBLP/data")
# Disease-attribute files are resolved relative to the current working directory.
# (Alternative location: DATA_DIR / "raw/disease_attributes".)
DISEASE_ATTR_DATA_DIR = Path(".")

# BioKG disease metadata (TSV).
biokg_disease_path = DATA_DIR / "raw/biokg.metadata.disease.tsv"
# MeSH disease notes table (TSV).
mesh_disease_notes_merged_path = DISEASE_ATTR_DATA_DIR / "mesh_disease_notes_merged.tsv"
# BioKG diseases joined with their MeSH notes (TSV).
biokg_mesh_df_path = DISEASE_ATTR_DATA_DIR / "biokg_w_dis_mesh_notes.tsv"


## Retrieve disease textual properties for biokg entities
The precursor notebook is located at `./disease_mesh_notes_retrieval.ipynb`; it covers data inspection, retrieval of MeSH textual notes via SPARQL queries against the MeSH query endpoint, and postprocessing and serialising the results to disk.
The dataframe stored at the above-mentioned path `mesh_disease_notes_merged_path` is a product of that notebook.

In [6]:
# Load the MeSH notes table; the first column of the TSV becomes the index.
mesh_notes_merged_df = pd.read_csv(
    mesh_disease_notes_merged_path,
    delimiter="\t",
    index_col=0,
)
print(mesh_notes_merged_df.shape[0])  # row count
mesh_notes_merged_df.head()

31617


Unnamed: 0_level_0,note
mesh_id,Unnamed: 1_level_1
D014525,Narrowing of any part of the URETHRA. It is ch...
D017262,Low-molecular-weight compounds produced by mic...
D001321,A disorder beginning in childhood. It is marke...
D015730,"A republic in eastern Africa, on the Gulf of A..."
D002330,A cell-cycle phase nonspecific alkylating anti...


In [9]:
# Read the BioKG disease metadata: a headerless TSV with three columns.
biokg_dis_df = pd.read_csv(
    biokg_disease_path,
    delimiter="\t",
    header=None,
    names=["mesh_id", "type", "entity"],
)
biokg_dis_init_len = biokg_dis_df.shape[0]
biokg_dis_counts = biokg_dis_df["entity"].value_counts()
print(
    f"mesh ids are duplicated with extra rows coming from rdf type triples.\nTotal rows: {biokg_dis_init_len}\n"
    f"# DISEASE type nodes: {biokg_dis_counts['DISEASE']} \n# SCR_DISEASE nodes {biokg_dis_counts['SCR_DISEASE']}"
)

# Discard the rows whose `entity` value is one of the two type labels.
is_type_row = biokg_dis_df["entity"].isin(["DISEASE", "SCR_DISEASE"])
biokg_dis_df = biokg_dis_df.loc[~is_type_row]
print(f"Biokg rows on dropping rdf type rows: {biokg_dis_init_len} --> {len(biokg_dis_df)} ")

biokg_dis_df.head()

mesh ids are duplicated with extra rows coming from rdf type triples.
Total rows: 22694
# DISEASE type nodes: 4868 
# SCR_DISEASE nodes 6479
Biokg rows on dropping rdf type rows: 22694 --> 11347 


Unnamed: 0,mesh_id,type,entity
11347,D000006,NAME,"Abdomen, Acute"
11348,D000007,NAME,Abdominal Injuries
11349,D000008,NAME,Abdominal Neoplasms
11350,D000012,NAME,Abetalipoproteinemia
11351,D000013,NAME,Congenital Abnormalities


### Merge mesh and biokg dfs on id

In [17]:
# Inner join on the MeSH id: only diseases present in both tables are kept.
biokg_dis_df = pd.merge(
    biokg_dis_df,
    mesh_notes_merged_df,
    how="inner",
    left_on="mesh_id",
    right_on="mesh_id",
)
print(biokg_dis_df.shape)
biokg_dis_df.head(2)

(6865, 4)


Unnamed: 0,mesh_id,type,entity,note
0,D000006,NAME,"Abdomen, Acute",A clinical syndrome with acute abdominal pain ...
1,D000007,NAME,Abdominal Injuries,General or unspecified injuries involving orga...


In [18]:
# Toggle: True  -> write the merged frame to `biokg_mesh_df_path`;
#         False -> load the copy already stored at that path.
WRITE_TO_DISK = False
if WRITE_TO_DISK:
    biokg_dis_df.to_csv(biokg_mesh_df_path, sep="\t", header=True, index=False)
elif biokg_mesh_df_path.exists():
    biokg_dis_df = pd.read_csv(biokg_mesh_df_path, delimiter="\t", header=0, index_col=None)
else:
    # No stored copy yet (e.g. first run on a fresh checkout): keep the frame
    # that is already in memory instead of failing with FileNotFoundError.
    print(f"{biokg_mesh_df_path} not found; keeping the in-memory merged dataframe")

## Get BERT-base embeddings

In [19]:
import torch
from torch import nn
from torch.nn import CrossEntropyLoss

from transformers import BertModel, BertPreTrainedModel
from transformers import BertTokenizer

In [20]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint.
bert_checkpoint = "bert-base-cased"
tz = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
# Look at one note and the tokenizer output for it.
i = 0
note_column = biokg_dis_df["note"]
sample_sent = note_column.iloc[i]
sentences = note_column.head(3).to_list()
tz(sample_sent, padding=True, return_tensors="pt")

{'input_ids': tensor([[  101,   138,  7300,  9318,  1114, 12104, 24716,  2489,  1115,  1110,
          5199,   117, 25813,   117,  1105,  6099,  1107, 15415,   119,   138,
         23987, 14701,  1336,  1129,  2416,  1118,   170,  2783,  1104, 11759,
           117,  5917,   117,  1137,  8131,   119,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [22]:
# Materialise one (mesh_id, note) record per row for the encoding loop.
id_and_note = biokg_dis_df.loc[:, ["mesh_id", "note"]]
disease_id_note_pairs = list(id_and_note.to_records(index=False))
len(disease_id_note_pairs)

6865

In [15]:
# Destination archive for the disease embeddings (relative to the working directory).
encodings_out_path = Path("disease_encodings") / "disease_embeddings.npz"

In [25]:
from tqdm import tqdm

# Encode every disease note with BERT and keep the final-layer hidden state of
# the first token (the [CLS] position) as the embedding of that note.
mesh_ids = []
embeddings = []

model.eval()  # inference only: make sure dropout is disabled
with torch.no_grad():  # no backward pass is needed, so skip building the autograd graph
    for mesh_id, note in tqdm(disease_id_note_pairs):
        # truncation=True clips notes that exceed the model's maximum input
        # length; without it such a note would make the forward pass fail.
        encoded = tz(note, padding=True, truncation=True, return_tensors="pt")
        outputs = model(**encoded)
        # outputs[0] is the last hidden state; [0, 0] picks the single sequence
        # in the batch and its first token. The copy detaches the vector from
        # the full hidden-state buffer so that buffer can be freed each step.
        embedding = outputs[0][0, 0].numpy().copy()
        mesh_ids.append(mesh_id)
        embeddings.append(embedding)

100%|██████████| 6865/6865 [11:33<00:00,  9.90it/s]


In [27]:
# Bundle the embeddings into a single .npz archive keyed by MeSH id.
out_dict = dict(zip(mesh_ids, embeddings))
# np.savez does not create missing folders, so make sure the target directory
# exists before writing (otherwise the save fails after the long encoding run).
encodings_out_path.parent.mkdir(parents=True, exist_ok=True)
np.savez(encodings_out_path, **out_dict)
print("Saved embeddings to", encodings_out_path)

Saved embeddings to disease_encodings/disease_embeddings.npz


In [32]:
import numpy as np
import torch

# Re-open the saved archive and show the first few stored keys as a sanity check.
x = np.load(encodings_out_path)
list(x)[:5]

['D000006', 'D000007', 'D000008', 'D000012', 'D000013']

In [39]:
# Spot-check the second stored embedding. Indexing the archive by key loads only
# that array, whereas list(x.values()) would read every array in the file first.
second_key = list(x.keys())[1]
y = x[second_key]
torch.from_numpy(y)

In [None]:
# TODO: get data in one place and share
