In [None]:
import pandas as pd
pd.options.mode.chained_assignment = None  #default='warn'
import numpy as np
import difflib as dl
import psycopg2 as pg
import re
import requests
from io import StringIO

def getCurie(name):
    """Look up candidate identifiers for a chemical name.

    Sends *name* to the name-resolution ``lookup`` endpoint, restricted to
    the ``ChemicalEntity`` biolink type and at most 10 results, and parses
    the JSON response into a DataFrame.

    The name is passed through ``params=`` so that ``requests`` URL-encodes
    it; ingredient names routinely contain spaces, ``;``, ``,`` and ``/``,
    which would otherwise corrupt the query string.

    Args:
        name: The ingredient or therapy name to resolve.

    Returns:
        A ``(curie, label)`` tuple holding the ``curie`` and ``label``
        columns of the parsed response.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        AttributeError: If the parsed response has no ``curie``/``label``
            column (presumably the case for an empty result list -- TODO
            confirm against the service).
    """
    response = requests.get(
        'https://name-resolution-sri.renci.org/lookup',
        params={
            'string': name,
            'autocomplete': 'false',
            'offset': 0,
            'limit': 10,
            'biolink_type': 'ChemicalEntity',
        },
        # Never hang the whole batch run on one unresponsive request.
        timeout=60,
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    returned = pd.read_json(StringIO(response.text))
    return returned.curie, returned.label

def getCombinationTherapiesAndSingleTherapiesLists(orangebook, exclusions):
    """Partition the distinct ``Ingredient`` values into combination and single therapies.

    An ingredient string counts as a combination therapy when it contains
    ``";"``, ``" AND "`` or ``"W/"``. Everything else is a single therapy
    and is stored with surrounding whitespace stripped; combination strings
    are kept exactly as they appear.

    Args:
        orangebook: Object exposing an ``Ingredient`` attribute that can be
            iterated (the script passes a DataFrame).
        exclusions: Accepted for interface compatibility; not used here.

    Returns:
        ``(combination_list, single_list)``, each without duplicates and in
        no particular order.
    """
    combination_markers = (";", " AND ", "W/")
    combos = set()
    singles = set()
    for ingredient in set(orangebook.Ingredient):
        if any(marker in ingredient for marker in combination_markers):
            combos.add(ingredient)
        else:
            singles.add(ingredient.strip())
    return list(combos), list(singles)

def getAllStatuses(orangebook, item):
    """Return every ``Type`` value from rows whose ``Ingredient`` equals *item*.

    Rows are selected with a boolean mask rather than with positional
    indices fed into label-based lookup, so the result is correct whatever
    index the frame carries (filtered, re-indexed, non-integer, ...), and
    the comparison runs vectorised instead of in a Python loop.

    Args:
        orangebook: DataFrame with ``Ingredient`` and ``Type`` columns.
        item: Ingredient string to match exactly.

    Returns:
        A list of the matching ``Type`` values in row order; empty when no
        row matches.
    """
    matches = orangebook['Ingredient'] == item
    return orangebook.loc[matches, 'Type'].tolist()

def getMostPermissiveStatus(statusList):
    """Collapse a collection of status codes into a single status.

    Codes are checked in the order ``OTC``, ``RX``, ``DISCN``; the first one
    found in *statusList* decides the result. ``DISCN`` is reported as
    ``"DISCONTINUED"``; if none of the three is present the result is
    ``"UNSURE"``.
    """
    precedence = (
        ("OTC", "OTC"),
        ("RX", "RX"),
        ("DISCN", "DISCONTINUED"),
    )
    for code, reported in precedence:
        if code in statusList:
            return reported
    return "UNSURE"

def isBasicCation(item):
    """Return True when *item* exactly equals one of the listed cation names.

    The comparison is case-sensitive and matches whole tokens only.
    """
    known_cations = (
        'FERROUS',
        'CALCIUM',
        'SODIUM',
        'MAGNESIUM',
        'MANGANESE',
        'POTASSIUM',
        'ALUMINUM',
        'TITANIUM',
        'COPPER',
        'CUPRIC',
        'LYSINE',
    )
    return item in known_cations

def isBasicAnion(item):
    """Return True when *item* exactly equals one of the listed anion/counter-ion names.

    The comparison is case-sensitive and matches whole tokens only.
    """
    known_anions = (
        'CHLORIDE',
        'DIOXIDE',
        'OXIDE',
        'ACETATE',
        'SULFATE',
        'PHOSPHATE',
        'HYDROXIDE',
        'HYDROCHLORIDE',
        'CITRATE',
        'DIACETATE',
        'TRIACETATE',
        'ADIPATE',
        'TARTRATE',
        'BITARTRATE',
        'FUMARATE',
        'HEMIFUMARATE',
        'MALEATE',
        'BROMIDE',
        'MEGLUMINE',
        'BICARBONATE',
        'MESYLATE',
        'DISULFIDE',
        'FLUORIDE',
        'GLYCEROPHOSPHATE',
    )
    return item in known_anions

def isOtherBasicTerm(item):
    """Return True when *item* exactly equals one of the listed qualifier terms.

    The list covers hydration-state and similar descriptors (plus LYSINE and
    ARGININE). The comparison is case-sensitive and matches whole tokens only.
    """
    qualifier_terms = (
        'HYDRATE',
        'DIHYDRATE',
        'MONOHYDRATE',
        'TRIHYDRATE',
        'ANHYDROUS',
        'MONOBASIC',
        'DIBASIC',
        'LYSINE',
        'ARGININE',
        'HEPTAHYDRATE',
    )
    return item in qualifier_terms

def isBasicSaltOrMetalOxide(inString):
    """Return True when every word of *inString* is a basic cation, anion or qualifier.

    The string is stripped and split on whitespace; ``;`` and ``,`` are
    removed from each word before it is classified. A string with no words
    yields True, because there is no word that fails the check.
    """
    def _is_basic(token):
        return isBasicCation(token) or isBasicAnion(token) or isOtherBasicTerm(token)

    cleaned_words = (
        word.replace(';', '').replace(',', '')
        for word in inString.strip().split()
    )
    return all(_is_basic(word) for word in cleaned_words)

def removeCationsAnionsAndBasicTerms(ingredientString):
    """Strip salt/counter-ion/qualifier words from an ingredient name.

    If the whole string is itself a basic salt or metal oxide (per
    ``isBasicSaltOrMetalOxide``) it is returned untouched, since removing
    its words would leave nothing. Otherwise every whitespace-separated
    word that is a basic anion, cation or other basic term is dropped and
    the remaining words are re-joined with single spaces.

    Dropped words are filtered out rather than blanked, so the result has
    no leading, trailing or doubled spaces (e.g. ``"NEOMYCIN SULFATE"``
    becomes ``"NEOMYCIN"``, not ``"NEOMYCIN "``).

    Args:
        ingredientString: Ingredient name as a whitespace-separated string.

    Returns:
        The cleaned name, or *ingredientString* unchanged for a pure basic
        salt / metal oxide.
    """
    if isBasicSaltOrMetalOxide(ingredientString):
        return ingredientString

    kept_words = [
        word
        for word in ingredientString.strip().split()
        if not (isBasicAnion(word) or isBasicCation(word) or isOtherBasicTerm(word))
    ]
    return " ".join(kept_words)
        
# ---------------------------------------------------------------------------
# Build the therapy table from the Orange Book product file.
#
# Steps: read the "~"-delimited products file, partition ingredient strings
# into combination and single therapies, resolve each therapy (and each
# ingredient of a combination) to an identifier through getCurie, and write
# one row per therapy to fda_ob_list.xlsx.
# ---------------------------------------------------------------------------
orangebook = pd.read_csv("products.txt", delimiter="~")
#orangebook.to_excel("orangebooklist.xlsx")
splitExclusions = set(list(pd.read_excel("fda_ob_split_exclusions.xlsx")['name']))
obCombinationTherapies, obSingleTherapies = getCombinationTherapiesAndSingleTherapiesLists(orangebook, splitExclusions)
print(len(set(obCombinationTherapies)), " combination therapeutics.")
print(len(set(obSingleTherapies)), " single-ingredient therapeutics.")
obSingleSet = set(obSingleTherapies)
print("splitting combination therapies (currently ", len(obSingleSet), " items in list)")
exclusions = pd.read_excel('fda_exclusions.xlsx')['name']

# Output columns. Every processed therapy must append exactly one entry to
# each of these lists, otherwise the DataFrame below cannot be built.
Approved_USA = []
combination_therapy = []
therapyName = []
name_in_orange_book = []
available_USA = []
curie_ID = []
curie_label = []

ingredient_curies = []


drugList = list(set(obCombinationTherapies + obSingleTherapies).difference(exclusions))


# Caches of ingredient name -> resolved label / identifier, so each
# ingredient is sent to the lookup service only once.
labelDict = {}
idDict = {}

# NOTE: drugList is deliberately extended while it is being iterated:
# ingredients discovered by splitting a combination therapy are appended and
# then processed as single therapies later in this same loop.
for index, item in enumerate(drugList):
    if item in obCombinationTherapies:
        name_in_orange_book.append(item)#1
        Approved_USA.append("True")#2
        combination_therapy.append("True")#3
        available_USA.append(getMostPermissiveStatus(getAllStatuses(orangebook,item)))#4
        print("item ",index, ": ", item)

        newIngList = []
        ingList = re.split('; | ; | AND | W/ ', item)
        ingredientCuriesList = []

        for i in ingList:
            # Normalise once so the cache key, the membership test and the
            # appended name all agree (the membership test previously used
            # the unstripped name, which could append duplicates).
            i = i.strip()
            if i in idDict:
                ingredientCuriesList.append(idDict[i])
            else:
                curie, label = getCurie(i)
                ingredientCuriesList.append(curie[0])
                idDict[i]=curie[0]
                labelDict[i] = label[0]
            if i not in obSingleTherapies:
                drugList.append(i)
                obSingleTherapies.append(i)
            #print("old name: ", i, "; new name: ", removeCationsAnionsAndBasicTerms(i))
            newIngList.append(removeCationsAnionsAndBasicTerms(i)) #5

        ingredient_curies.append(ingredientCuriesList) #6

        # Build the cleaned combination name BEFORE resolving it; the lookup
        # used to run while newName was still the empty string.
        newName = "; ".join(newIngList)
        curie,label = getCurie(newName)
        curie_ID.append(curie[0]) #7
        curie_label.append(label[0]) #8
        therapyName.append(newName)#9


    elif item in obSingleTherapies:
        itemStatuses = getAllStatuses(orangebook,item)
        name_in_orange_book.append(item)
        therapyName.append(removeCationsAnionsAndBasicTerms(item))
        Approved_USA.append("True")
        combination_therapy.append("False")
        available_USA.append(getMostPermissiveStatus(itemStatuses))
        print("item ", index, ": ", item)
        if item in idDict and item in labelDict:
            curie_ID.append(idDict[item])
            # The cached path must also record the label; omitting it left
            # curie_label shorter than the other columns.
            curie_label.append(labelDict[item])
        else:
            curie,label = getCurie(item)
            curie_ID.append(curie[0])
            curie_label.append(label[0])
            idDict[item] = curie[0]
            labelDict[item] = label[0]

        ingredient_curies.append("NA")

print(len(obSingleTherapies), "single-component therapies after splitting")
print(len(obSingleTherapies + obCombinationTherapies), " total therapies after splitting")

print(len(therapyName), " therapies after exclusions")

data = pd.DataFrame({'Single_ID':curie_ID,
                     'ID_Label':curie_label,
                     'Name_Orange_Book':name_in_orange_book,
                     'Therapy_Name':therapyName,
                     'Approved_USA': Approved_USA,
                     'Combination_Therapy':combination_therapy,
                     'Available_USA':available_USA,
                     'Ingredient_IDs':ingredient_curies})

data.to_excel("fda_ob_list.xlsx")

586  combination therapeutics.
2065  single-ingredient therapeutics.
splitting combination therapies (currently  2065  items in list)
item  0 :  ARGATROBAN
item  1 :  IBUPROFEN
item  2 :  BACITRACIN ZINC
item  3 :  VALPROIC ACID
item  4 :  PERAMPANEL
item  5 :  ACETIC ACID, GLACIAL; HYDROCORTISONE; NEOMYCIN SULFATE
item  6 :  AMINO ACIDS; CALCIUM CHLORIDE; DEXTROSE; MAGNESIUM SULFATE; POTASSIUM CHLORIDE; SODIUM ACETATE; SODIUM GLYCEROPHOSPHATE; SOYBEAN OIL
item  7 :  CYCLOPENTOLATE HYDROCHLORIDE; PHENYLEPHRINE HYDROCHLORIDE
item  8 :  METFORMIN HYDROCHLORIDE; PIOGLITAZONE HYDROCHLORIDE
item  9 :  ABEMACICLIB
item  10 :  BACAMPICILLIN HYDROCHLORIDE
item  11 :  AMIODARONE HYDROCHLORIDE
item  12 :  LOPERAMIDE HYDROCHLORIDE
item  13 :  LEVONORDEFRIN; MEPIVACAINE HYDROCHLORIDE
item  14 :  DEXBROMPHENIRAMINE MALEATE; PSEUDOEPHEDRINE SULFATE
item  15 :  ATROPINE SULFATE; DIFENOXIN HYDROCHLORIDE
item  16 :  BENZTROPINE MESYLATE
item  17 :  CHLOROTHIAZIDE; METHYLDOPA
item  18 :  CARBENICILLIN I