In [55]:
import os
import pandas as pd
import time
import math
import re

def getCombinationTherapiesAndSingleTherapiesEMA(emaList, exclusions):
    """Partition EMA drug names into combination and single therapies.

    A name is classified as a combination therapy when it is a string that
    contains ",", "/" or the substring "AND" and is not listed in
    ``exclusions``. Note that the "AND" test is a plain substring match, so
    a name that merely contains those letters also qualifies unless it is
    excluded. Every other entry, including non-string values (e.g. a float
    NaN placeholder), is returned as a single therapy.

    Args:
        emaList: Iterable of drug names, possibly mixed with non-string
            placeholder values.
        exclusions: Container of names that must not be treated as
            combinations even though they contain a separator.

    Returns:
        A ``(combinations, singles)`` tuple of lists, each preserving the
        order of ``emaList``.
    """
    separators = (",", "/", "AND")
    emaCombinationTherapies = []
    emaSingleTherapies = []
    for item in emaList:
        # Only strings can be searched for separators. Checking for str
        # (instead of "not exactly float") keeps None, ints and numpy
        # scalars from raising TypeError on the ``in`` test below.
        is_combination = (
            isinstance(item, str)
            and any(sep in item for sep in separators)
            and item not in exclusions
        )
        if is_combination:
            emaCombinationTherapies.append(item)
        else:
            emaSingleTherapies.append(item)
    return emaCombinationTherapies, emaSingleTherapies

# Build the list of EMA-authorised human drugs, split combination products
# into their individual ingredients, drop excluded names and write the
# result to "ema_list.xlsx".
print("Ingesting EMA list")

# The first eight rows of the spreadsheet are skipped before the header row.
ema = pd.read_excel("EPAR_table_4.xlsx", skiprows=list(range(8)))
humanDrugs = ema[ema.Category == 'Human']
approvedDrugs = humanDrugs[humanDrugs['Authorisation status'] == 'Authorised']

# Upper-case the names and remove embedded line breaks. Non-string cells
# (e.g. NaN for an empty cell) are passed through unchanged.
drugnames = [
    name.upper().replace('\n', '').replace('\r', '') if isinstance(name, str) else name
    for name in approvedDrugs['International non-proprietary name (INN) / common name']
]

# Names that contain a separator but must not be split into ingredients.
splitExclusions = set(pd.read_excel("ema_split_exclusions.xlsx")['name'])
emaCombinationTherapies, emaSingleTherapies = getCombinationTherapiesAndSingleTherapiesEMA(drugnames, splitExclusions)

# Set used for O(1) membership tests; emaSingleSet is kept as a list as before.
knownSingles = set(emaSingleTherapies)
emaSingleSet = list(knownSingles)
print("Splitting combination therapies (currently ", len(emaSingleSet), "unique compounds in list)")

# NOTE(review): the pattern only splits on ", ", " / " and " AND", while the
# classifier also matches bare "," and "/". Left unchanged on purpose, since
# this may be what keeps names containing an unspaced "/" intact -- confirm.
splitPattern = re.compile(', | / | AND')
for item in emaCombinationTherapies:
    # re.split returns a list, so each fragment is stripped individually.
    ingList = [part.strip() for part in splitPattern.split(item)]
    for ingredient in ingList:
        # Skip empty fragments left by a trailing separator.
        if ingredient and ingredient not in knownSingles:
            emaSingleTherapies.append(ingredient)

print(len(set(emaSingleTherapies)), "single-component therapies after splitting")
print(len(set(emaSingleTherapies+emaCombinationTherapies)), " total therapies after splitting")

print("Removing excluded therapies")
exclusions = pd.read_excel('ema_exclusions.xlsx')['name']
drugList = set(emaSingleTherapies+emaCombinationTherapies).difference(exclusions)
print(len(drugList), " total therapies after exclusions")

# Sort by string form so the output order is reproducible between runs and
# mixed str/float entries can still be ordered.
sheetData = pd.DataFrame({'Drug Name': sorted(drugList, key=str)}, dtype=object)
sheetData.to_excel("ema_list.xlsx")


Ingesting EMA list
Splitting combination therapies (currently  786 unique compounds in list)
893 single-component therapies after splitting
1009  total therapies after splitting
Removing excluded therapies
994  total therapies after exclusions
