In [None]:
# various imports and setup

import os
import json
import requests
from zipfile import ZipFile

from tqdm import tqdm

In [None]:

# Local folders for the raw openFDA download. Every path keeps its trailing
# slash because file paths are built from these by plain string concatenation.
data_dir = 'c:/data/drug-labels/openfda/'
zip_dir = data_dir + 'zip/'
json_dir = data_dir + 'json/'


def json_print(j):
    """Pretty-print a JSON-serialisable object (4-space indent, sorted keys)."""
    print(json.dumps(j, indent=4, sort_keys=True))


# When True, a small excerpt of the final output is also written to disk
# for manual inspection.
RUN_DIAGNOSTICS = False

#%% download raw data (11 zip files)

# Fetch the openFDA download index and pick out the drug-label partitions.
dl_json_url = 'https://api.fda.gov/download.json'
index_response = requests.get(dl_json_url, timeout=60)
# Fail loudly on an HTTP error instead of trying to parse an error page.
index_response.raise_for_status()
dl_json = index_response.json()
json_print(dl_json['meta'])
print(f"Overall categories: {list(dl_json['results'])}")

drugs_json = dl_json['results']['drug']
print(f"Drug data categories: {list(drugs_json)}")

labels_json = drugs_json['label']
print(f"\nFound {labels_json['total_records']} labels, last updated {labels_json['export_date']}\n")

urls = [x['file'] for x in labels_json['partitions']]

# makedirs (unlike mkdir) also creates data_dir when it does not exist yet.
os.makedirs(zip_dir, exist_ok=True)

for url in urls:
    filename = url.split('/')[-1]
    zip_path = zip_dir + filename
    if os.path.exists(zip_path):
        print(f"Already downloaded file {url}")
        continue

    # Stage the download next to (not inside) zip_dir: every file found in
    # zip_dir is later opened as a zip archive, and a half-written file there
    # would also be mistaken for a finished download on the next run.
    part_path = data_dir + filename + '.part'
    try:
        # Stream to disk in chunks so a whole archive is never held in memory.
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(part_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        # Only a complete download is moved to its final name.
        os.replace(part_path, zip_path)
    finally:
        # No-op after a successful move; removes the partial file otherwise.
        if os.path.exists(part_path):
            os.remove(part_path)
    print(f'Downloaded file {url}')
                  

#%% unzip those files (result is several jsons)

# makedirs (unlike mkdir) also creates missing parent folders.
os.makedirs(json_dir, exist_ok=True)

for file in os.listdir(zip_dir):
    with ZipFile(zip_dir + file, 'r') as zf:
        for zobj in zf.infolist():
            target = json_dir + zobj.filename
            # A file left truncated by an interrupted run exists but has the
            # wrong size; comparing against the size recorded in the archive
            # makes sure it is extracted again instead of being skipped.
            already_extracted = os.path.exists(target) and (
                zobj.is_dir() or os.path.getsize(target) == zobj.file_size
            )
            if already_extracted:
                print(f'Already extracted file {zobj.filename}')
            else:
                zf.extract(zobj, json_dir)
                print(f'Extracted file {zobj.filename}')


#%% read all extracted jsons into master list

# Every file in json_dir is parsed and its 'results' list is appended to one
# combined list.
results = []

for json_name in tqdm(os.listdir(json_dir)):
    with open(json_dir + json_name, encoding='utf-8') as handle:
        payload = json.load(handle)
    results.extend(payload['results'])


#%% loop through master list, sort results into rx/otc/other

# note: many "uncategorized" drugs are simply missing their metadata,
#  which is usually stored in the "openfda" field (a dict)
def check_type(res):
    """Return the normalised product type of one label record.

    The type is read from ``res['openfda']['product_type']``. A one-element
    list is lower-cased and its spaces are replaced by underscores (for
    example a value that normalises to ``'human_prescription_drug'``).

    ``'uncategorized_drug'`` is returned when the metadata or the product
    type is missing, empty, a float (presumably a NaN placeholder), or of
    any other unexpected shape; the last case is also reported on stdout.

    Raises:
        ValueError: if the product type list holds more than one entry, since
            the record could then not be assigned to a single category.
    """
    metadata = res.get('openfda') or {}
    pt = metadata.get('product_type')
    if pt is None or isinstance(pt, float):
        return 'uncategorized_drug'
    if isinstance(pt, list):
        if not pt:
            return 'uncategorized_drug'
        if len(pt) != 1:
            # An explicit raise survives `python -O`, unlike an assert.
            raise ValueError(f'Expected exactly one product type, got {pt!r}')
        return pt[0].lower().replace(' ', '_')
    print('Problem determining type: ', pt)
    # Always return a string so no record ends up in a `None` category.
    return 'uncategorized_drug'
        
# Bucket every record under the category reported by check_type.
results_by_type = {}
for entry in results:
    bucket = check_type(entry)
    results_by_type.setdefault(bucket, []).append(entry)

        
#%% we only care about the prescription drugs, save to disk

drugs = results_by_type['human_prescription_drug']

errors = []   # records that could not be restructured
records = {}  # record id -> {'metadata': ..., 'Label Text': ...}
# restructure to match ema format and save to disk
for drug in tqdm(drugs):
    try:
        info = {'metadata': drug['openfda']}
        # Keep only the list-valued fields; tables are skipped because they
        # were not needed here (mostly html formatting).
        info['Label Text'] = {
            # dict.fromkeys de-duplicates while keeping the original order,
            # so the output is identical from one run to the next.
            key: list(dict.fromkeys(val))
            for key, val in drug.items()
            if isinstance(val, list) and 'table' not in key
        }
        records[drug['id']] = info
    except Exception:
        # Best effort: set the record aside and carry on. Unlike a bare
        # `except:`, this lets Ctrl-C / SystemExit stop the loop.
        errors.append(drug)

print(f'Encountered problems reading {len(errors)} records')

# Create the output folder up front so the dump cannot fail on a missing
# directory after all the processing is done.
os.makedirs('output', exist_ok=True)
with open('output/human-rx-openfda-drug.json', 'w', encoding='utf-8') as f:
    json.dump(records, f, indent=4)

# for manual inspection, save the first few separately (full file too big)
if RUN_DIAGNOSTICS:
    with open('output/human-rx-drug-openfda-excerpt.json', 'w', encoding='utf-8') as f:
        json.dump({k: records[k] for k in list(records)[:100]}, f, indent=4)

