In [22]:
import pandas as pd
from io import StringIO
pd.options.mode.copy_on_write = True
import requests
import re

def getCurie(name):
    itemRequest = 'https://name-resolution-sri.renci.org/lookup?string=' + name + '&autocomplete=false&offset=0&limit=10&biolink_type=ChemicalEntity'
    returned = (pd.read_json(StringIO(requests.get(itemRequest).text)))
    resolvedName = returned.curie
    resolvedLabel = returned.label
    return resolvedName, resolvedLabel

def getCombinationTherapiesAndSingleTherapiesPMDA(pmdaList, exclusions):
    pmdaCombinationTherapies = []
    pmdaSingleTherapies = []
    for item in pmdaList:
        if type(item)!=float and (("," in item) or ("/" in item) or (" AND " in item)) and item not in exclusions:
            newItem = item.replace(",","; ").replace(" AND ", "; ").replace("/","; ").replace(";;",";").replace(";  ", "; ").replace("  ;", ";").replace(" ;",";").strip()
            if "(" in newItem:
                newItem = newItem.replace("(1)","").replace("(2)","").replace("(3)","").replace("(4)","").replace("(5)","").replace("(6)","").replace("(7)","").replace("(8)","")\
                .replace("(9)","").replace("(10)","").replace("(11)","").replace("(12)","").replace("(13)","").replace("(14)","").replace("(15)","").replace("(16)","")\
                .replace("(17)","").replace("(18)","").replace("(19)","").replace("(20)","").replace("(20)","").replace("(21)","").replace("1)","").replace("2)","").replace("5)","")

            print(newItem)
            pmdaCombinationTherapies.append(newItem)
        else:
            pmdaSingleTherapies.append(item)
    return pmdaCombinationTherapies, pmdaSingleTherapies

def makeUppercase(list):
    for index, item in enumerate(list):
        if not type(item)==float:
            list[index] = item.upper().replace('\n',' ').strip()
    return list



def isBasicCation(item):
    basic_cations = ['FERROUS', 
                     'CALCIUM', 
                     'SODIUM', 
                     'MAGNESIUM', 
                     'MANGANESE', 
                     'POTASSIUM', 
                     'ALUMINUM', 
                     'TITANIUM', 
                     'COPPER', 
                     'CUPRIC', 
                     'LYSINE']
    
    if item in basic_cations:
        return True

    return False

def isBasicAnion(item):
    basic_anions = ['CHLORIDE', 
                    'DIOXIDE', 
                    'OXIDE', 
                    'ACETATE', 
                    'SULFATE', 
                    'PHOSPHATE', 
                    'HYDROXIDE', 
                    'HYDROCHLORIDE',
                    'CITRATE', 
                    'DIACETATE', 
                    'TRIACETATE', 
                    'ADIPATE', 
                    'TARTRATE', 
                    'BITARTRATE', 
                    'FUMARATE', 
                    'HEMIFUMARATE',
                    'MALEATE', 
                    'BROMIDE', 
                    'MEGLUMINE', 
                    'BICARBONATE', 
                    'MESYLATE',
                    'MESILATE',
                    'DISULFIDE', 
                    'FLUORIDE', 
                    'GLYCEROPHOSPHATE']

    if item in basic_anions:
        return True

    return False

def isOtherBasicTerm(item):
    other_identifiers = ['HYDRATE', 
                         'DIHYDRATE', 
                         'MONOHYDRATE', 
                         'TRIHYDRATE', 
                         'ANHYDROUS', 
                         'MONOBASIC', 
                         'DIBASIC', 
                         'LYSINE', 
                         'ARGININE',
                         'HEPTAHYDRATE',
                         'TETRAHYDRATE']

    if item in other_identifiers:
        return True
        
    return False

def isBasicSaltOrMetalOxide(inString):
    components = inString.strip().split()
    
    for item in components:
        item = item.replace(';', '').replace(',','')
        if not isBasicCation(item) and not isBasicAnion(item) and not isOtherBasicTerm(item):
            return False
            
    return True

def removeCationsAnionsAndBasicTerms(ingredientString):
    if not isBasicSaltOrMetalOxide(ingredientString):
        components = ingredientString.strip().split()
        for ind,i in enumerate(components):
            if isBasicAnion(i) or isBasicCation(i) or isOtherBasicTerm(i):
                components[ind] = ''
        newString = ''
        for i in components:
            newString = newString + i + " "
        newString = newString[:-1]
        return newString
    return ingredientString


print("Ingesting PMDA list")
pmda_approvals_df = pd.read_csv("pmda_approvals.csv")
drugList = pmda_approvals_df['Active Ingredient (underlined: new active ingredient)']

drugList = makeUppercase(drugList)
splitExclusions = set(list(pd.read_excel("pmda_split_exclusions.xlsx")['name']))
pmdaCombinationTherapies, pmdaSingleTherapies = getCombinationTherapiesAndSingleTherapiesPMDA(drugList, splitExclusions)

exclusions = pd.read_excel('exclusions_pmda.xlsx')['name']
drugList = list(set(pmdaSingleTherapies + pmdaCombinationTherapies).difference(exclusions))


Approved_Japan = []
combination_therapy = []
therapyName = []
name_in_pmda_list = []
curie_ID = []
curie_label = []
ingredient_curies = []

for index, item in enumerate(drugList):
    print(index)
    name_in_pmda_list.append(item) #1
    Approved_Japan.append("True") #2
    item = item.strip()
    if item in pmdaCombinationTherapies:
        item = item.replace("A COMBINATION DRUG OF", "").strip(' ')
        newIngredientList = []
        ingredientCuriesList = []
        
        combination_therapy.append("True") #3
        
        ingList = re.split(' , |,|/| \ | AND |; ', item)
        for i in ingList:
            i = i.strip()
            if i not in pmdaSingleTherapies:
                pmdaSingleTherapies.append(i.strip())
                drugList.append(i.strip())
            
            newName = removeCationsAnionsAndBasicTerms(i)
            print("ingredient: ", i , ". New name: ", newName, ".")

            
            newIngredientList.append(newName)
            #curie, label = getCurie(newName)
            ingredientCuriesList.append(" ")#(curie[0])

        ingredient_curies.append(ingredientCuriesList) #4

        newTherapyName = ""
        for i in newIngredientList:
            newTherapyName += i.strip() + "; "
        newTherapyName = newTherapyName[:-2].strip()
        therapyName.append(newTherapyName) #5
        #curie, label = getCurie(newTherapyName)
        curie_ID.append(" ")#(curie[0]) #6
        curie_label.append(" ")#(label[0]) #7

        print("Old name: ", item, ". New name: ", newTherapyName, ".")
        print("ingredient names: ", newIngredientList)


    elif item in pmdaSingleTherapies:
        combination_therapy.append("False")
        newName = removeCationsAnionsAndBasicTerms(item.upper().strip())
        therapyName.append(newName) #3
        try:
            #curie, label = getCurie(newName) #4
            curie_ID.append(" ")#(curie[0]) #5
            curie_label.append(" ")#(label[0]) #6
        except:
            curie_ID.append("Error")
            curie_label.append("Error")
        ingredient_curies.append("NA") #7


print(len(set(pmdaSingleTherapies)), "single-component therapies after splitting")
print(len(set(pmdaSingleTherapies+pmdaCombinationTherapies)), " total therapies after splitting")

print("removing excluded therapies")
pmdaDrugSet = set(pmdaSingleTherapies+pmdaCombinationTherapies).difference(exclusions)
pmdaDrugSet = set(pmdaDrugSet).difference(pd.read_excel("pmda_deduplication.xlsx")['To Remove'])

data = pd.DataFrame({'single_ID':curie_ID, 
                     'Name_PMDA':name_in_pmda_list,
                     'Therapy_Name':therapyName,
                     'ID_Label':curie_label,
                     'Approved_Japan': Approved_Japan, 
                     'Combination_Therapy':combination_therapy, 
                     'Ingredient_IDs':ingredient_curies})

data.to_excel("pmda_list.xlsx")

Ingesting PMDA list
APADAMTASE ALFA; CINAXADAMTASE ALFA
EFGARTIGIMOD ALFA; VORHYALURONIDASE ALFA
DAUNORUBICIN HYDROCHLORIDE; CYTARABINE
FOSLEVODOPA; FOSCARBIDOPA HYDRATE
RIPASUDIL HYDROCHLORIDE HYDRATE; BRIMONIDINE TARTRATE
TIXAGEVIMAB; CILGAVIMAB
RELEBACTAM HYDRATE; IMIPENEM HYDRATE; CILASTATIN SODIUM
NIRMATRELVIR; RITONAVIR
L-LYSINE HYDROCHLORIDE; L-ARGININE HYDROCHLORIDE
ANHYDROUS SODIUM SULFATE; POTASSIUM SULFATE; MAGNESIUM SULFATE HYDRATE
DARATUMUMAB; VORHYALURONIDASE ALFA
INSULIN DEGLUDEC; LIRAGLUTIDE
INSULIN GLARGINE; LIXISENATIDE
ASPIRIN; VONOPRAZAN FUMARATE
BRIMONIDINE TARTRATE; TIMOLOL MALEATE
BRIMONIDINE TARTRATE; BRINZOLAMIDE
DARUNAVIR ETHANOLATE; COBICISTAT; EMTRICITABINE; TENOFOVIR ALAFENAMIDE FUMARATE
DOLUTEGRAVIR SODIUM; LAMIVUDINE
MACROGOL 4000; SODIUM CHLORIDE; SODIUM BICARBONATE; POTASSIUM CHLORIDE
CEFTOLOZANE  SULFATE; TAZOBACTAM SODIUM
SOFOSBUVIR; VELPATASVIR
LEVONORGESTREL; ETHINYLESTRAD IOL
RILPIVIRINE HYDROCHLORIDE; EMTRICITABINE; TENOFOVIR  ALAFENAMIDE FUMARATE

ValueError: All arrays must be of the same length