In [2]:
import os
import pandas as pd
import time
import math
import re
from io import StringIO
import requests

def preferRXCUI(curieList, labelList):
    for idx, item in enumerate(curieList):
        if "RXCUI" in item:
            return item, labelList[idx]
    return curieList[0], labelList[0]

def getCombinationTherapiesAndSingleTherapiesEMA(emaList, exclusions):
    emaCombinationTherapies = []
    emaSingleTherapies = []
    for item in emaList:
        if type(item)!=float and (("," in item) or ("/" in item) or ("AND" in item)) and item not in exclusions:
            emaCombinationTherapies.append(item)
        else:
            emaSingleTherapies.append(item)
    return emaCombinationTherapies, emaSingleTherapies

def isBasicCation(item):
    basic_cations = ['FERROUS', 
                     'CALCIUM', 
                     'SODIUM', 
                     'MAGNESIUM', 
                     'MANGANESE', 
                     'POTASSIUM', 
                     'ALUMINUM', 
                     'TITANIUM', 
                     'COPPER', 
                     'CUPRIC', 
                     'LYSINE']
    
    if item in basic_cations:
        return True

    return False

def isBasicAnion(item):
    basic_anions = ['CHLORIDE', 
                    'DIOXIDE', 
                    'OXIDE', 
                    'ACETATE', 
                    'SULFATE', 
                    'PHOSPHATE', 
                    'HYDROXIDE', 
                    'HYDROCHLORIDE',
                    'CITRATE', 
                    'DIACETATE', 
                    'TRIACETATE', 
                    'ADIPATE', 
                    'TARTRATE', 
                    'BITARTRATE', 
                    'FUMARATE', 
                    'HEMIFUMARATE',
                    'MALEATE', 
                    'BROMIDE', 
                    'MEGLUMINE', 
                    'BICARBONATE', 
                    'MESYLATE', 
                    'DISULFIDE', 
                    'FLUORIDE', 
                    'GLYCEROPHOSPHATE']

    if item in basic_anions:
        return True

    return False

def isOtherBasicTerm(item):
    other_identifiers = ['HYDRATE', 
                         'DIHYDRATE', 
                         'MONOHYDRATE', 
                         'TRIHYDRATE', 
                         'ANHYDROUS', 
                         'MONOBASIC', 
                         'DIBASIC', 
                         'LYSINE', 
                         'ARGININE',
                         'HEPTAHYDRATE']

    if item in other_identifiers:
        return True
        
    return False

def isBasicSaltOrMetalOxide(inString):
    components = inString.strip().split()
    
    for item in components:
        item = item.replace(';', '').replace(',','')
        if not isBasicCation(item) and not isBasicAnion(item) and not isOtherBasicTerm(item):
            return False
            
    return True

def removeCationsAnionsAndBasicTerms(ingredientString):
    if not isBasicSaltOrMetalOxide(ingredientString):
        components = ingredientString.strip().split()
        for ind,i in enumerate(components):
            if isBasicAnion(i) or isBasicCation(i) or isOtherBasicTerm(i):
                components[ind] = ''
        newString = ''
        for i in components:
            newString = newString + i + " "
        newString = newString[:-1]
        return newString
    return ingredientString



def getCurie(name):
    itemRequest = 'https://name-resolution-sri.renci.org/lookup?string=' + name + '&autocomplete=false&offset=0&limit=10&biolink_type=ChemicalOrDrugOrTreatment'
    returned = (pd.read_json(StringIO(requests.get(itemRequest).text)))
    resolvedName = returned.curie
    resolvedLabel = returned.label
    return resolvedName, resolvedLabel


print("Ingesting EMA list")

ema = pd.read_excel("EPAR_table_4.xlsx",skiprows=[0,1,2,3,4,5,6,7])
humanDrugs = ema[ema.Category=='Human']
approvedDrugs = humanDrugs[humanDrugs['Authorisation status']=='Authorised']
drugnames = list(approvedDrugs['International non-proprietary name (INN) / common name'])

for idx, name in enumerate(drugnames):
    try:
        drugnames[idx] = name.upper().replace('\n','').replace('\r','')
    except:
        drugnames[idx] = drugnames[idx]

splitExclusions = set(list(pd.read_excel("ema_split_exclusions.xlsx")['name']))
emaCombinationTherapies, emaSingleTherapies = getCombinationTherapiesAndSingleTherapiesEMA(drugnames, splitExclusions)
emaSingleTherapies = list(set(emaSingleTherapies))
emaSingleSet = set(emaSingleTherapies)
print("Splitting combination therapies (currently ", len(emaSingleSet), "unique compounds in list)")


Approved_EMA = []
combination_therapy = []
therapyName = []
name_in_ema = []
curie_ID = []
curie_label = []
ingredientCuriesList = []

print("Removing excluded therapies")
exclusions = pd.read_excel('ema_exclusions.xlsx')['name']
drugList = list(set(emaCombinationTherapies + emaSingleTherapies).difference(exclusions))


for index, item in enumerate(drugList):
    print(index)
    if item in emaCombinationTherapies:
        name_in_ema.append(item) #1
        Approved_EMA.append("True") #2
        combination_therapy.append("True") #3
        
        newIngList = []
        curr_ingredient_curies = []
        
        ingList = re.split(', | / | AND', item)
        
        for idx2, i in enumerate(ingList):
            if i not in emaSingleTherapies:
                print(i, " not found in single therapies. Adding it.")
                newName = removeCationsAnionsAndBasicTerms(i.strip())
                emaSingleTherapies.append(i.strip())
                drugList.append(i.strip())
            ingList[idx2] = i.upper()
            
        ingList.sort()
        
        for i in ingList:
            newItem = removeCationsAnionsAndBasicTerms(i)
            newIngList.append(newItem)
            print(newItem)
            try:
                curie,label = getCurie(newItem)
                curr_ingredient_curies.append(curie[0])
            except:
                print("curie extraction for ", newItem, " failed")
        ingredientCuriesList.append(curr_ingredient_curies) #4

        newName = ""
        for i in newIngList:
            if i is not None:
                newName += i + "; "
                
        therapyName.append(newName[:-2]) #5

        newName = newName[:-2]
        print("old name: ", item, ". new name: ", newName)
        curie, label = getCurie(newName)
        preferred_curie, preferred_label = preferRXCUI(curie, label) #prefer RXCUI labels only if combination therapy.
        curie_ID.append(preferred_curie) #6
        curie_label.append(preferred_label) #7


    elif item in emaSingleTherapies:
        name_in_ema.append(item) #1
        Approved_EMA.append("True") #2
        therapyName.append(removeCationsAnionsAndBasicTerms(item)) #3
        combination_therapy.append("False") #4

        try:
            curie,label = getCurie(item)
            curie_ID.append(curie[0]) #5
            curie_label.append(label[0])#6
        except:
            print("curie extraction for ", item, ' failed')
            curie_ID.append("Error") #5
            curie_label.append("Error") #6
        
        ingredientCuriesList.append("NA")#7
                                   
        

print(len(set(emaSingleTherapies)), "single-component therapies after splitting")
print(len(drugList), " total therapies after splitting")


Ingesting EMA list
Splitting combination therapies (currently  786 unique compounds in list)
Removing excluded therapies
0
1
2
3
METFORMIN  not found in single therapies. Adding it.
METFORMIN
SITAGLIPTIN
old name:  SITAGLIPTIN, METFORMIN . new name:  METFORMIN; SITAGLIPTIN
4
5
ABACAVIR (AS SULFATE)  not found in single therapies. Adding it.
ZIDOVUDINE  not found in single therapies. Adding it.
ABACAVIR (AS SULFATE)
LAMIVUDINE
ZIDOVUDINE
old name:  ABACAVIR (AS SULFATE) / LAMIVUDINE / ZIDOVUDINE . new name:  ABACAVIR (AS SULFATE); LAMIVUDINE; ZIDOVUDINE
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
SITAGLIPTIN HYDROCHLORIDE MONOHYDRATE  not found in single therapies. Adding it.
METFORMIN HYDROCHLORIDE  not found in single therapies. Adding it.
METFORMIN 
SITAGLIPTIN  
old name:  SITAGLIPTIN HYDROCHLORIDE MONOHYDRATE, METFORMIN HYDROCHLORIDE . new name:  METFORMIN ; SITAGLIPTIN  
41
42
43
44
FORMOTEROL  not found in single therapies.

In [22]:
print(len(name_in_ema))
print(len(therapyName))
print(len(Approved_EMA))
print(len(combination_therapy))
print(len(curie_ID))
print(len(curie_label))
print(len(ingredientCuriesList))



data = pd.DataFrame({'single_ID':curie_ID,
                     'ID_Label':curie_label,
                     'Name_EMA':name_in_ema,
                     'Therapy_Name':therapyName, 
                     'Approved_Europe': Approved_EMA, 
                     'Combination_Therapy':combination_therapy, 
                     'Ingredient_IDs':ingredientCuriesList})

data.to_excel("ema_list.xlsx")


996
996
996
996
996
996
996
