In [1]:
#%% imports and setup

import pandas as pd
import pdfplumber

import re
import os

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

from collections import Counter
from Levenshtein import distance as levdistance
# warning: possible code licensing issue with this package?
# if that's a problem can replace Levenshtein.distance with nltk.edit_distance

import json


In [2]:
data_dir = 'data/ema/'
pdf_dir = data_dir + 'pdf/'
txt_dir = data_dir + 'txt/'

RUN_DIAGNOSTICS = False

#%% get master spreadsheet from EMA listing all product pages

url = 'https://www.ema.europa.eu/sites/default/files/Medicines_output_european_public_assessment_reports.xlsx'
filename = url.split('/')[-1]
with open(filename, 'wb') as f:
    f.write(requests.get(url).content)
    

#%% reload from disk (no need to download every time)

data = pd.read_excel('Medicines_output_european_public_assessment_reports.xlsx', skiprows=8, header=0)
print(f'Found records for {data.URL.nunique()} products')
print(data.Category.value_counts())

# we don't need the veterinary products
data = data[data['Category']=='Human']

#%% find all linked english pdfs with "product-information" in the filename,
#   save each one with the product number (e.g. as the local filename

new_files, existing_files, errors = 0, 0, []

for index,row in tqdm(data.iterrows()):
    product_number = re.sub('/', '_', row['Product number']).strip()
    filename = pdf_dir + product_number + '.pdf'
    if os.path.exists(filename):
        existing_files += 1
        continue
    
    url = row['URL']
    soup = BeautifulSoup(requests.get(url).text, features='html.parser')
    links = soup.find_all('a', href=True)
    pdfs = [l['href'] for l in links if l['href'].endswith('_en.pdf')]
    pdfs = [p for p in pdfs if '/product-information/' in p]
    
    if len(pdfs)==1: # there should be one product info document
        link = pdfs[0]
        with open(filename, 'wb') as f:
            f.write(requests.get(link).content)
            new_files += 1            
    else:
        errors.append(index)

print(f'Wrote {new_files} files with {len(errors)} errors, {existing_files} already downloaded')


#%% optional: this code verifies that the drugs with no product information are withdrawn/refused

if RUN_DIAGNOSTICS:
    products = [re.sub('/', '_', p).strip() for p in data['Product number']]
    files = os.listdir(pdf_dir)
    missing = [p for p in products if p+'.pdf' not in files]
    print(f'Missing product information sheet from {len(missing)}/{len(data)} pages')
    
    # Missing files are all for drugs that have been refused/withdrawn, normally that box says "Authorized"
    ema_status = []
    for p in missing:
        row = data[data['Product number'] == re.sub('_','/',p)]
        url = row['URL'].values[0]
        soup = BeautifulSoup(requests.get(url).text, features='html.parser')
        status = soup.find('div',{'class':'ema-status-title'})
        text = 'Status not listed' if not status else status.text
        ema_status.append(text)
        
    summary = pd.DataFrame({'file':missing, 'status':ema_status})
    summary['code'] = data.loc[errors]['Product number'].values
    summary['link'] = data.loc[errors]['URL'].values
    print(summary.status.value_counts())
    

#%% functions to...
#   strip tables and margins, pdfplumber is very clunky about this :/
#   find headers within text of single file
#   slice files with multiple entries into multiple outputs (e.g. check for repeated instances of section 1)
#   write output to disk

# helper function for pdfplumber
def remove_tables(page):
    ts = {"vertical_strategy": "lines", "horizontal_strategy": "lines"}
    bboxes = [table.bbox for table in page.find_tables(table_settings=ts)]
    
    def not_within_bboxes(obj):
        #Check if the object is in any of the table's bbox.
        def obj_in_bbox(_bbox):
            #See https://github.com/jsvine/pdfplumber/blob/stable/pdfplumber/table.py#L404
            v_mid = (obj["top"] + obj["bottom"]) / 2
            h_mid = (obj["x0"] + obj["x1"]) / 2
            x0, top, x1, bottom = _bbox
            return (h_mid >= x0) and (h_mid < x1) and (v_mid >= top) and (v_mid < bottom)
        return not any(obj_in_bbox(__bbox) for __bbox in bboxes)
    
    return page.filter(not_within_bboxes)

# helper function for pdfplumber    
def remove_margins(page, dpi=72, size=0.7):
    # strip 0.7 inches from top and bottom (page numbers, header text if any), A4 is 8.25 x 11.75
    # syntax is page.crop((x0, top, x1, bottom))
    w = float(page.width)/dpi
    h = float(page.height)/dpi
    return page.crop((0, (size)*dpi, w*dpi, (h-size)*dpi))


# function: input file, output text of annex 1
def read_smpc(filename, no_blanks=True, no_tables=True):
    text = []
    if filename.endswith('.pdf'):
        with pdfplumber.open(filename) as pdf:
            for page in pdf.pages:
                page = remove_margins(page)
                
                if no_tables:
                    page = remove_tables(page)
                    
                page_text = page.extract_text().split('\n')
                text += page_text
    elif filename.endswith('.txt'):
        with open(filename, 'r', encoding='utf-8') as f:
            text = f.readlines()

    annex_lines = [re.match('.*ANNEX\s+I.*', line) is not None for line in text]
    annex_index = [i for i,v in enumerate(annex_lines) if v]
    
    text = text[annex_index[0]:annex_index[1]]
    if no_blanks:
        text = [line for line in text if not line.isspace()]
    
    return text


# function: input text, output list of section headers and content
def get_smpc_sections(text):
    idx, headers, sections = [], [], []
    for i,line in enumerate(text):
        if re.match('^[0-9]+\.[0-9]*\s+.*[A-Z].*', line):
            idx += [i]
            headers += [line.strip()]
    
    # in headers, must increment or restart, and not end in punctuation
    idx_valid, headers_valid = [idx[0]], [headers[0]]
    for n in range(1,len(headers)):
        prev = float(headers[n-1].split()[0])
        curr = float(headers[n].split()[0])
        lastchar = headers[n].strip()[-1].lower()
        valid = (prev < curr <= prev+1) or (curr==1)
        valid = valid and (lastchar in 'qwertyuiopasdfghjklzxcvbnm()')
        if valid:
            idx_valid.append(idx[n])
            headers_valid.append(headers[n])
    idx, headers = idx_valid, headers_valid
    
    for n,h in enumerate(headers):
        if (n+1)<len(headers):
            contents = text[idx[n]+1:idx[n+1]]
        else:
            contents = text[idx[n]+1:]
        sections += ['\n'.join(contents)]
    
    return headers, sections


def split_entries(headers, sections):
    num_entries = sum([h.startswith('1.') for h in headers])
    entries = [[] for _ in range(num_entries)]
    entry = -1
    for h,s in zip(headers, sections):
        if h.startswith('1. '):
            entry += 1
        entries[entry] += [[h,s]]
    return entries        
    

# function: save to file (by drug)
def write_smpc_by_drug(entries, filename):
    entries = [e for e in entries if len(e)]
    for i,entry in enumerate(entries):
        name = filename.split('/')[-1][:-4]
        if len(entries)>1:
            name = name + '_' + str(i+1)
        with open('./output/'+name+'.txt', 'w', encoding='utf-8') as f:
            for h,s in entry:
                f.write(h+'\n\n')
                f.write(s+'\n\n')

# original workflow: run these four in series, no longer using fourth one


#%% alternate method CLI conversion pdftotext.exe https://www.xpdfreader.com/download.html

from subprocess import run

errors, old_files, new_files = [], [], []
txt_dir = data_dir + 'txt/'
# See: https://www.xpdfreader.com/download.html
exe_path = '~/Downloads/xpdf-tools-mac-4.04/bin64/pdftotext'
flags = ['-layout', '-nodiag', '-enc', 'UTF-8', '-nopgbrk', '-marginb', '-54']
for src in tqdm(os.listdir(pdf_dir)):
    src = pdf_dir + src
    tgt = src.replace(pdf_dir, txt_dir).replace('.pdf', '.txt')
    command = [exe_path, *flags, src, tgt]
    if not os.path.exists(tgt):
        output = run(command, capture_output=True, shell=True)
        if len(output.stderr):
            errors += [(src, output.stderr)]
        else:
            new_files += [src]
    else:
        old_files += [src]
        
print(f'Converted {len(new_files)} PDFs to TXT, ({len(old_files)+len(new_files)}/{len(os.listdir(pdf_dir))}) complete, {len(errors)} errors')
if(len(errors) > 0):
    print(errors[0])
        


#%% initial pass at finding all headers
# warning, this is very slow for pdf input, 
# txt input is faster (and more accurate), but we can't strip tables

if RUN_DIAGNOSTICS:
    all_headers = []
    errors = []
    input_dir = txt_dir
    files = os.listdir(input_dir)
    for file in tqdm(files):
        try:
            text = read_smpc(input_dir+file)
            headers, sections = get_smpc_sections(text)
            all_headers += headers
        except:
            errors += [file]
    
    all_headers = [re.sub('\s+', ' ', h).strip() for h in all_headers]
    all_headers = [h for h in all_headers if h[-1].lower() in 'qwertyuiopasdfghjklzxcvbnm()']
    header_num = [float(h.split()[0]) for h in all_headers]
    header_txt = [' '.join(h.split()[1:]).title() for h in all_headers]
    counter = Counter(header_txt)
    # 100 cutoff for header counts determined manually, if you inspect this dict
    # you'll see obvious good ones with 1000+ and obvious bad ones with 1-10
    centers = [h for h in counter if counter[h]>100]
    centers.sort()
    print(f'Found {len(centers)} common headers: \n{centers}')


#%% manually specify headers, use edit distance to do clustering
 
centers = [
    'Clinical Particulars',
    'Contraindications',
    'Date Of First Authorisation/Renewal Of The Authorisation',
    'Date Of Revision Of The Text',
    'Effects On Ability To Drive And Use Machines',
    'Fertility, Pregnancy And Lactation',
    'Incompatibilities',
    'Interaction With Other Medicinal Products And Other Forms Of Interaction',
    'List Of Excipients',
    'Marketing Authorisation Holder',
    'Marketing Authorisation Number',
    'Name Of The Medicinal Product',
    'Nature And Contents Of Container',
    'Overdose',
    'Pharmaceutical Form',
    'Pharmaceutical Particulars',
    'Pharmacodynamic Properties',
    'Pharmacokinetic Properties',
    'Pharmacological Properties',
    'Posology And Method Of Administration',
    'Preclinical Safety Data',
    'Pregnancy And Lactation',
    'Qualitative And Quantitative Composition',
    'Shelf Life',
    'Special Precautions For Disposal',
    'Special Precautions For Disposal And Other Handling',
    'Special Precautions For Storage',
    'Special Warnings And Precautions For Use',
    'Therapeutic Indications',
    'Undesirable Effects'
    ]
# note: maybe we should manually merge these pairs:
#   FERTILITY, PREGNANCY AND LACTATION
#   PREGNANCY AND LACTATION
#   SPECIAL PRECAUTIONS FOR DISPOSAL AND OTHER HANDLING
#   SPECIAL PRECAUTIONS FOR DISPOSAL
# but not doing so lets the similarity computation do its thing

# improved initial text parsing step so this clustering problem wasn't so messy
def get_fixed_header(text, centers=centers):
    # return center with the lowest edit distance, 
    #   or placeholder (last entry) if no there's good match
    dists = [levdistance(text.lower(),c.lower()) for c in centers]
    #ix = np.argmin(dists)
    ix = dists.index(min(dists))
    if dists[ix] > 0.6*len(text):
        return None
    else:
        return centers[ix]


#%% use all the above to parse pdfs into sections

input_dir = txt_dir
files = os.listdir(input_dir)
data = pd.read_excel('Medicines_output_european_public_assessment_reports.xlsx', skiprows=8, header=0)

errors = []
records = {}
for file in tqdm(files):
    try:
        info = {}
        product_code = file.split('.')[0]
        row = data[data['Product number']==re.sub('_','/', file.split('.')[0])]
        info['metadata'] = row.iloc[0].apply(str).to_dict()
        
        label_text = {} # next level = product page w/ metadata
        text = read_smpc(input_dir+file)
        headers, sections = get_smpc_sections(text)
    
        for h,s in zip(headers,sections):
            header = get_fixed_header(h)
            if (header is not None) and (len(s)>0):
                if header not in label_text.keys():
                    label_text[header] = [s]
                else:
                    label_text[header].append(s)
        
        info['Label Text'] = label_text
        
        records[row['Product number'].iloc[0]] = info

    except:
        errors += [file]

print(f'Encountered problems reading {len(errors)} files')
with open('data/output/human-rx-drug-ema.json', 'w') as f:
    json.dump(records, f, indent=4)
    
    

Found records for 1988 products
Human         1706
Veterinary     282
Name: Category, dtype: int64


1706it [00:24, 69.04it/s] 


Wrote 0 files with 108 errors, 1598 already downloaded


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1598/1598 [00:37<00:00, 43.08it/s]


Converted 0 PDFs to TXT, (0/1598) complete, 1598 errors
('data/ema/pdf/EMEA_H_C_000913.pdf', b"pdftotext version 4.04 [www.xpdfreader.com]\nCopyright 1996-2022 Glyph & Cog, LLC\nUsage: pdftotext [options] <PDF-file> [<text-file>]\n  -f <int>               : first page to convert\n  -l <int>               : last page to convert\n  -layout                : maintain original physical layout\n  -simple                : simple one-column page layout\n  -simple2               : simple one-column page layout, version 2\n  -table                 : similar to -layout, but optimized for tables\n  -lineprinter           : use strict fixed-pitch/height layout\n  -raw                   : keep strings in content stream order\n  -fixed <number>        : assume fixed-pitch (or tabular) text\n  -linespacing <number>  : fixed line spacing for LinePrinter mode\n  -clip                  : separate clipped text\n  -nodiag                : discard diagonal text\n  -enc <string>          : output text encodi

0it [00:00, ?it/s]

Encountered problems reading 0 files



