In [1]:
import json
from tqdm import tqdm
import numpy as np

from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

import pickle
from matplotlib import pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# JSON file that is loaded into `data_fda` further down
# (openFDA human Rx drug labels, judging by the file name).
data_file_fda = 'data/output/human-rx-openfda-drug.json'
# Diagnostic toggle. NOTE(review): not referenced by any visible cell — confirm it is still needed.
RUN_DIAGNOSTIC = False

In [3]:
# Parse the whole label export into memory; the result is a dict keyed by label id.
# The encoding is pinned so decoding does not depend on the platform's locale default.
with open(data_file_fda, encoding='utf-8') as f:
    data_fda = json.load(f)

In [4]:
# Sentence-embedding model shared by the cells below; `compute_section_embedding`
# calls `model.encode(...)` on each text chunk.
model = SentenceTransformer('pritamdeka/S-PubMedBert-MS-MARCO')

In [44]:
# Peek at the loaded structure: how many labels there are and which fields
# the first record (and its metadata) carries.
keys = list(data_fda)
print(len(keys))
k = keys[0]
print(data_fda[k].keys())
print(data_fda[k]['metadata'].keys())

# Prototype on the first 500 labels only.
sample_keys = keys[:500]

32103
dict_keys(['metadata', 'Label Text'])
dict_keys(['application_number', 'brand_name', 'generic_name', 'manufacturer_name', 'product_ndc', 'product_type', 'route', 'substance_name', 'rxcui', 'spl_id', 'spl_set_id', 'package_ndc', 'is_original_packager', 'upc', 'unii'])


In [None]:
# Split the mapping into parallel tuples of label ids and label records.
keys_fda, drugs_fda = zip(*data_fda.items())
# Every distinct section name that appears in any label, in sorted order.
sections_fda = sorted({name for drug in drugs_fda for name in drug['Label Text']})

In [None]:
# Show the sorted list of distinct section names collected from all labels.
sections_fda

In [6]:
def compute_section_embedding(text, word_count=256):
    """Embed a section of text as the mean of its chunk embeddings.

    The text is split on whitespace into consecutive chunks of at most
    ``word_count`` words. Each chunk is encoded with the notebook-level
    ``model`` and the chunk vectors are averaged element-wise.

    Args:
        text: Section text to embed.
        word_count: Maximum number of whitespace-separated words per chunk.
            Must be a positive integer.

    Returns:
        A 1-D float64 ``numpy.ndarray`` holding the mean chunk embedding.
        Text that contains no words is embedded as the empty string.

    Raises:
        ValueError: If ``word_count`` is smaller than 1.
    """
    if word_count < 1:
        raise ValueError(f"word_count must be a positive integer, got {word_count!r}")

    # Tokenise once instead of re-splitting the full text for every chunk.
    words = text.split()

    # Stepping through the words in strides of `word_count` never yields a
    # trailing empty chunk. Previously, text whose length was an exact multiple
    # of `word_count` got an extra "" chunk whose embedding skewed the mean.
    starts = range(0, len(words), word_count) if words else (0,)
    chunks = [' '.join(words[begin:begin + word_count]) for begin in starts]

    # Stack the chunk vectors; the width comes from the model output rather
    # than a hard-coded constant. float64 matches the previous return dtype.
    vecs = np.asarray([model.encode(chunk) for chunk in chunks], dtype=np.float64)
    return vecs.mean(axis=0)

In [9]:
!pip install ray

Collecting ray
  Downloading ray-2.3.1-cp310-cp310-macosx_11_0_arm64.whl (28.6 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m28.6/28.6 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m
Collecting msgpack<2.0.0,>=1.0.0
  Downloading msgpack-1.0.5-cp310-cp310-macosx_11_0_arm64.whl (70 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.2/70.2 KB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting frozenlist
  Downloading frozenlist-1.3.3-cp310-cp310-macosx_11_0_arm64.whl (34 kB)
Collecting grpcio<=1.49.1,>=1.42.0
  Downloading grpcio-1.49.1.tar.gz (22.1 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.1/22.1 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m0m eta [36m0:00:01[0m0:01[0m01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting aiosignal
  Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Building wheels for collected packa

In [10]:
import ray  # used below (ray.init / ray.put) but not imported anywhere else in the visible cells
from transformers import pipeline
import psutil

# Size the Ray runtime to the number of logical cores on this machine.
num_cpus = psutil.cpu_count(logical=True)
print('Number of available CPUs:', num_cpus)

# ignore_reinit_error allows this cell to be re-run without restarting the kernel.
ray.init(num_cpus=num_cpus, ignore_reinit_error=True)

# Feature-extraction pipeline for the same checkpoint as `model`; device=-1 selects the CPU.
pipe = pipeline(task = 'feature-extraction', model='pritamdeka/S-PubMedBert-MS-MARCO', batch_size=1, device=-1)

# Store the pipeline with Ray and keep the returned reference for the remote tasks.
pipe_id = ray.put(pipe)

@ray.remote
def vectorize(pipeline, 

Number of available CPUs: 10


In [26]:
# Standard library.
import asyncio
from datetime import datetime

# Third-party. NOTE(review): nest_asyncio is presumably applied so that
# run_until_complete() can be called from inside the notebook's own event loop — confirm.
import nest_asyncio

nest_asyncio.apply()

In [45]:
def background(f):
    """Decorator that runs the blocking callable ``f`` in the loop's default executor.

    Calling the decorated function does not run ``f`` inline. It submits the
    call to the current event loop's default executor and returns the
    resulting awaitable future.

    Args:
        f: The blocking callable to offload.

    Returns:
        A wrapper with the same name and docstring as ``f`` that accepts the same
        positional and keyword arguments and returns an awaitable future.
    """
    import functools  # local import keeps this cell self-contained

    @functools.wraps(f)
    def wrapped(*args, **kwargs):
        # run_in_executor() forwards positional arguments only; passing
        # keyword arguments to it raises TypeError. Bind everything up front.
        bound_call = functools.partial(f, *args, **kwargs)
        return asyncio.get_event_loop().run_in_executor(None, bound_call)

    return wrapped

@background
def compute_vector_wrapper(key):
    """Embed every label section of one drug and store the result in ``vectors``.

    Because of ``@background``, calling this returns an awaitable and the body
    runs on an executor thread.

    Args:
        key: Key into ``data_fda`` identifying the drug label to process.

    Side effects:
        Sets ``vectors[key]`` to a dict mapping section name to embedding, e.g.
        ``{"4fdf3b7e-...": {"spl_product_data_elements": <VECTOR>}}``.
    """
    sections = data_fda[key]['Label Text']

    # Emit the key and its newline in a single write. print(key) issues them
    # separately, which let output from concurrent workers fuse onto one line.
    print(f"{key}\n", end="")

    # Each section value is a list of subsection strings. Joining handles one,
    # many, or zero entries, so an empty list no longer raises IndexError.
    section_vectors = {
        name: compute_section_embedding(" ".join(parts))
        for name, parts in sections.items()
    }

    # Publish only the finished mapping, so readers of `vectors` never see a
    # partially filled entry.
    vectors[key] = section_vectors

# Result store filled in by the background workers, keyed by label id.
vectors = {}

# Time the fan-out: schedule one executor job per sampled key, then block
# until every job has finished.
start = datetime.now()
loop = asyncio.get_event_loop()
looper = asyncio.gather(*map(compute_vector_wrapper, sample_keys))
results = loop.run_until_complete(looper)
end = datetime.now()
elapsed = end - start

# Number of sections across the sampled labels, for the per-section rate.
total_sections = sum(len(data_fda[key]['Label Text']) for key in sample_keys)

print(f"fin ------------- { int(elapsed.total_seconds()) } seconds")
print(f"{len(sample_keys)} drug labels processed: { int(elapsed.total_seconds()) / len(sample_keys) } seconds per drug")
print(f"{total_sections} sections processed: { int(elapsed.total_seconds()) / total_sections } seconds per section")

4fdf3b7e-f6ff-4d78-a928-c9d47c5bc9d95a9a589f-2712-485c-8a45-f506f06ebffa
fdf0f91e-6b00-4f86-a54d-92ef9dca0fb8

2a496565-0f45-49a8-b49b-e4a734aee439
a618c51f-5e21-4c41-9607-417fd04ee8ba
c98f9d56-0b21-1a8d-bebe-644ff00e19f1
05da038d-6763-49c3-b3ac-c2cc76155af4
44b80a39-9206-4f6f-82c5-12bb8ef0530e
ca8bfc0a-d43b-1072-e053-2995a90a66f0
d44efada-eee3-747e-e053-2995a90af561
d423971d-34c8-c7d2-e053-2a95a90abe74
d481e75c-0020-a218-e053-2995a90a19c5
4a08b6cf-7ba0-54a9-14e0-a6e8d1e4854e
a883780c-1bc1-4435-a651-3bde6694b6bd
6b26218d-52a6-4109-90e4-412a17d14fcf
c1eb13ec-76e7-4967-91f8-793b97843709
d22b4c9b-ba18-2eb7-e053-2a95a90a7c6a
632cb507-5675-2392-0460-b09c0dd14650
78f196d4-110e-45b1-813a-725f52c1a4eb
84b12129-cf10-4a3e-889c-990ebabdece6
8e5be3e5-1916-4d19-87ab-e2092f5d4c5a
b2eabec8-4fae-4441-b4dd-9a01b0f91c4a
f6548126-faa1-4f53-8e0b-bc1b743de04d
8ca8b7a2-72ef-4029-9b87-42afd298bb97
a29db87b-3f50-40ca-a5ab-406028ff19f0
8c404ea3-e56f-123c-e053-2a95a90a65fb
16c60b47-01b6-45bf-987f-95bd4de060ac
1

In [42]:
# ASYNCIO Tests
# 100 drug labels processed: 2.85 seconds per drug
# 2223 sections processed: 0.1282051282051282 seconds per section

# 500 drug labels processed: 3.202 seconds per drug
# 11593 sections processed: 0.13810057793496075 seconds per section

100 drug labels processed: 2.85 seconds per drug
2223 sections processed: 0.1282051282051282 seconds per section


In [41]:
# Show one full record (metadata plus label text) to check its structure.
data_fda['4fdf3b7e-f6ff-4d78-a928-c9d47c5bc9d9']

{'metadata': {'application_number': ['ANDA210175'],
  'brand_name': ['OXYMORPHONE HYDROCHLORIDE'],
  'generic_name': ['OXYMORPHONE HYDROCHLORIDE'],
  'manufacturer_name': ['XLCare Pharmaceuticals, Inc.'],
  'product_ndc': ['72865-130', '72865-131'],
  'product_type': ['HUMAN PRESCRIPTION DRUG'],
  'route': ['ORAL'],
  'substance_name': ['OXYMORPHONE HYDROCHLORIDE'],
  'rxcui': ['977939', '977942'],
  'spl_id': ['4fdf3b7e-f6ff-4d78-a928-c9d47c5bc9d9'],
  'spl_set_id': ['1a1d5061-b62a-49fa-a5b2-4970f51416ae'],
  'package_ndc': ['72865-130-01', '72865-131-01'],
  'is_original_packager': [True],
  'upc': ['0372865131015', '0372865130018'],
  'unii': ['5Y2EI94NBC']},
 'Label Text': {'spl_product_data_elements': ['OXYMORPHONE HYDROCHLORIDE OXYMORPHONE HYDROCHLORIDE OXYMORPHONE HYDROCHLORIDE OXYMORPHONE ANHYDROUS LACTOSE MAGNESIUM STEARATE CELLULOSE, MICROCRYSTALLINE STARCH, PREGELATINIZED CORN White to off white) T277 OXYMORPHONE HYDROCHLORIDE OXYMORPHONE HYDROCHLORIDE OXYMORPHONE HYDROCHLOR

In [None]:
# Sequential baseline: embed every section of every sampled label in a plain
# loop and report the throughput.
vectors = {}
section_count = 0
start = datetime.now()
# `tqdm` is the class imported via `from tqdm import tqdm`, so call it directly.
for key in tqdm(sample_keys):
    drug = data_fda[key]
    sections = drug['Label Text']
    vectors[key] = {}
    for k,v in sections.items():
        # Layout: { "4fdf3b7e-f6ff-4d78-a928-c9d47c5bc9d9": { "spl_product_data_elements": <VECTOR> } }
        # Each value is a list of subsection strings. Joining handles one,
        # many, or zero entries without indexing into the list.
        vectors[key][k] = compute_section_embedding(" ".join(v))
        # Count every section; previously only single-subsection ones were counted.
        section_count += 1

end = datetime.now()
elapsed = end - start

print(f"{len(sample_keys)} drug labels processed: { int(elapsed.total_seconds()) / len(sample_keys) } seconds per drug")
print(f"{section_count} sections processed: { int(elapsed.total_seconds()) / section_count } seconds per section")

 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                 | 303/500 [29:54<23:05,  7.04s/it]

In [None]:
from multiprocessing import Pool
import os
# os.environ["TOKENIZERS_PARALLELISM"] = "false"

def compute_vector_wrapper(key):
    """Embed every label section of one drug (multiprocessing variant).

    This definition replaces the earlier ``@background``-decorated function of
    the same name.

    Args:
        key: Key into ``data_fda`` identifying the drug label to process.

    Returns:
        A ``(key, section_vectors)`` tuple, where ``section_vectors`` maps each
        section name to its embedding. Returning the result lets a parent
        process collect it, since a write to ``vectors`` made inside a worker
        process is not visible to the parent.
    """
    sections = data_fda[key]['Label Text']
    # Layout: { "4fdf3b7e-f6ff-4d78-a928-c9d47c5bc9d9": { "spl_product_data_elements": <VECTOR> } }
    # Joining handles one, many, or zero subsection strings.
    section_vectors = {
        name: compute_section_embedding(" ".join(parts))
        for name, parts in sections.items()
    }
    # Kept for direct in-process calls, which previously relied on this side effect.
    vectors[key] = section_vectors
    print(f"{key} completed", flush=True)
    return key, section_vectors

# NOTE(review): with the "spawn" start method, workers must be able to import
# this function; functions defined in a notebook cell may not qualify — confirm.
with Pool(4) as pool:
    # imap() is lazy: the results must be consumed inside the `with` block,
    # otherwise the pool is terminated before the work is done.
    for done_key, done_vectors in tqdm(pool.imap(compute_vector_wrapper, sample_keys), total=len(sample_keys)):
        vectors[done_key] = done_vectors

In [None]:
# Report the running total kept by the sequential embedding loop.
print(section_count)

In [None]:
# Length of one stored embedding (number of components in the vector).
len(vectors['4fdf3b7e-f6ff-4d78-a928-c9d47c5bc9d9']['spl_product_data_elements'])

In [None]:
# Round-trip one embedding through JSON to confirm it can be serialised:
# array -> Python list -> JSON string -> Python list.
test_vector = vectors['4fdf3b7e-f6ff-4d78-a928-c9d47c5bc9d9']['spl_product_data_elements']
test_vector_list = test_vector.tolist()
json_vector = json.dumps(test_vector_list)
json.loads(json_vector)

In [None]:
# Inspect the 'warnings' section of one label: first how many subsection
# strings it holds, then the raw list itself.
print(f"num subsections: {len(data_fda['ca8bfc0a-d43b-1072-e053-2995a90a66f0']['Label Text']['warnings'])}")
data_fda['ca8bfc0a-d43b-1072-e053-2995a90a66f0']['Label Text']['warnings']

In [None]:
# Show the complete record for the same label (metadata and all sections).
data_fda['ca8bfc0a-d43b-1072-e053-2995a90a66f0']

In [None]:
# Check the Python type of a stored embedding.
type(vectors['4fdf3b7e-f6ff-4d78-a928-c9d47c5bc9d9']['spl_product_data_elements'])

In [8]:
# List installed packages via a shell escape.
# NOTE(review): `!pip` runs whichever pip is first on PATH, which may not be the kernel's environment — confirm.
!pip list

Package                       Version
----------------------------- -----------
alabaster                     0.7.13
ansible                       5.2.0
ansible-core                  2.12.2
anyio                         3.5.0
appnope                       0.1.3
argon2-cffi                   21.3.0
argon2-cffi-bindings          21.2.0
asgiref                       3.5.0
asttokens                     2.0.5
attrs                         21.4.0
Babel                         2.10.1
backcall                      0.2.0
beautifulsoup4                4.11.1
black                         22.8.0
bleach                        4.1.0
boto3                         1.24.66
botocore                      1.27.96
certifi                       2021.10.8
cffi                          1.15.0
cfgv                          3.3.1
charset-normalizer            2.0.12
click                         8.1.0
colorama                      0.4.4
coverage                      7.2.2
cryptography                  36.0.1
c