In [2]:
import spacy
import scispacy
from scispacy.linking import EntityLinker
import json
import numpy as np
from drug_utils import DataHandler
from tqdm import tqdm
import re

In [None]:
def remove_text_in_parentheses(drug_data):
    # Regular expression pattern to find text within parentheses
    pattern = r'\s*\([^)]*\)'
    for entry in drug_data:
        # Remove text within parentheses and any leading/trailing whitespace
        entry['name'] = re.sub(pattern, '', entry['name']).strip()
    return drug_data

In [3]:
dh = DataHandler("processed_data2.json")
drugs_data = dh.load_data()

In [6]:
pattern = r'\s*\([^)]*\)'
drug_class_lst = []
for drug in drugs_data:
    drug['alternative_names'] = list(set(drug['alternative_names'] ))
    drug['alternative_names'] = [s for s in drug['alternative_names'] if s != "\n" and any(c.isalpha() for c in s)]
    drug_class_lst.extend(drug['drug_classes'])


In [7]:
drugs_data

[{'name': 'Azulfidine',
  'alternative_names': ['5-aminosalicylates',
   'Azulfidine',
   'sulfasalazine',
   'Sulfazine',
   'Azulfidine EN-tabs',
   'Antirheumatics'],
  'drug_classes': ['5-aminosalicylates', 'Antirheumatics'],
  'uses': ['Arthritis', 'Rheumatoid Arthritis', 'Ulcerative Colitis'],
  'status': 'Prescription only'},
 {'name': 'Abstral',
  'alternative_names': ['Fentora', 'Abstral', 'Subsys', 'Opioids', 'fentanyl'],
  'drug_classes': ['Opioids (narcotic analgesics)'],
  'uses': ['Cancer Pain'],
  'status': 'Discontinued'},
 {'name': 'Aczone',
  'alternative_names': ['dapsone topical', 'Aczone'],
  'drug_classes': ['Topical acne agents'],
  'uses': ['Acne Vulgaris', 'Hypersensitivity'],
  'status': 'Prescription only'},
 {'name': 'Adzynma',
  'alternative_names': ['ADAMTS13, recombinant-krhn', 'Adzynma'],
  'drug_classes': [],
  'uses': ['Headache',
   'Digital Arthropathy-Brachydactyly, Familial',
   'Infusion procedures',
   'Abdominal Pain',
   'Intravenous infusion p

In [11]:

# Regular expression pattern to match any space followed by parentheses and their contents
pattern = r'\s*\([^)]*\)'

# Processing the list
processed_drug_names = [re.sub(pattern, '', name) for name in drug_class_lst if name != "\n" and any(c.isalpha() for c in name)]

# The processed list
processed_drug_names = set(processed_drug_names)
processed_drug_names

{'5-alpha-reductase inhibitors',
 '5-aminosalicylates',
 '5HT3 receptor antagonists',
 'ACE inhibitors with calcium channel blocking agents',
 'ACE inhibitors with thiazides',
 'AMPA receptor antagonists',
 'Adamantane antivirals',
 'Adrenal corticosteroid inhibitors',
 'Adrenergic bronchodilators',
 'Adrenergic uptake inhibitors for ADHD',
 'Agents for hypertensive emergencies',
 'Agents for pulmonary hypertension',
 'Aldosterone receptor antagonists',
 'Alkylating agents',
 'Allergenics',
 'Alpha-adrenoreceptor antagonists',
 'Alpha-glucosidase inhibitors',
 'Amebicides',
 'Aminoglycosides',
 'Aminopenicillins',
 'Amylin analogs',
 'Analgesic combinations',
 'Androgens and anabolic steroids',
 'Angiotensin Converting Enzyme Inhibitors',
 'Angiotensin II inhibitors with calcium channel blockers',
 'Angiotensin II inhibitors with thiazides',
 'Angiotensin receptor blockers',
 'Angiotensin receptor blockers and neprilysin inhibitors',
 'Anorectal preparations',
 'Anorexiants',
 'Antacid

In [13]:
for drug in drugs_data:
    drug["alternative_names"] = [name for name in drug["alternative_names"] if name not in processed_drug_names]
    drug["alternative_names"] = [
        name for name in drug["alternative_names"] 
        if any(c.isalpha() for c in name) and not name.startswith('...')
]
drugs_data

[{'name': 'Azulfidine',
  'alternative_names': ['Azulfidine',
   'sulfasalazine',
   'Sulfazine',
   'Azulfidine EN-tabs'],
  'drug_classes': ['5-aminosalicylates', 'Antirheumatics'],
  'uses': ['Arthritis', 'Rheumatoid Arthritis', 'Ulcerative Colitis'],
  'status': 'Prescription only'},
 {'name': 'Abstral',
  'alternative_names': ['Fentora', 'Abstral', 'Subsys', 'fentanyl'],
  'drug_classes': ['Opioids (narcotic analgesics)'],
  'uses': ['Cancer Pain'],
  'status': 'Discontinued'},
 {'name': 'Aczone',
  'alternative_names': ['dapsone topical', 'Aczone'],
  'drug_classes': ['Topical acne agents'],
  'uses': ['Acne Vulgaris', 'Hypersensitivity'],
  'status': 'Prescription only'},
 {'name': 'Adzynma',
  'alternative_names': ['ADAMTS13, recombinant-krhn', 'Adzynma'],
  'drug_classes': [],
  'uses': ['Headache',
   'Digital Arthropathy-Brachydactyly, Familial',
   'Infusion procedures',
   'Abdominal Pain',
   'Intravenous infusion procedures',
   'Congenital Thrombotic Thrombocytopenic Pu

In [10]:
import requests
from bs4 import BeautifulSoup

# Base URL with a placeholder for the letter
base_url = "https://www.rxlist.com/script/main/alphaidx.asp?p={}_rx-mcon"

# A dictionary to hold the text from each link, organized by letter
drug_links_text = {}

# Loop over every letter in the alphabet
for letter in 'abcdefghijklmnopqrstuvwxyz':
    # Format the URL with the current letter
    url = base_url.format(letter)
    
    # Make the request
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code == 200:
        # Parse the content with BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find all <li> elements
        li_elements = soup.find_all('li')
        
        # Extract the text from each <a> tag within each <li> element
        links_text = [a.get_text() for li in li_elements for a in li.find_all('a')]
        
        # Save the texts in the dictionary under the current letter
        drug_links_text[letter] = links_text
    else:
        # Handle unsuccessful requests
        drug_links_text[letter] = []

# drug_links_text now contains the text of each link for each letter of the alphabet
drug_links_text

{'a': ['A',
  'B',
  'C',
  'D',
  'E',
  'F',
  'G',
  'H',
  'I',
  'J',
  'K',
  'L',
  'M',
  'N',
  'O',
  'P',
  'Q',
  'R',
  'S',
  'T',
  'U',
  'V',
  'W',
  'X',
  'Y',
  'Z',
  'Abacavir',
  'Abacavir and Lamivudine',
  'Abacavir-Dolutegravir-Lamivudine',
  'Abaloparatide',
  'Abatacept',
  'Abciximab',
  'Abemaciclib',
  'Abiraterone',
  'AbobotulinumtoxinA',
  'Abrocitinib',
  'Acai',
  'Acalabrutinib',
  'Acamprosate',
  'Acarbose',
  'Acebutolol',
  'Acetaminophen',
  'Acetaminophen and Aspirin',
  'Acetaminophen Butalbital Caffeine Codeine',
  'Acetaminophen Chlorpheniramine',
  'Acetaminophen Chlorpheniramine Dextromethorphan',
  'Acetaminophen Chlorpheniramine Phenylephrine',
  'Acetaminophen Dextromethorphan Phenylephrine',
  'Acetaminophen Dextromethorphan Pseudoephedrine',
  'Acetaminophen Diphenhydramine Dextromethorphan',
  'Acetaminophen IV',
  'Acetaminophen Pamabrom Pyridoxine',
  'Acetaminophen Phenyltoloxamine',
  'Acetaminophen Rectal',
  'Acetaminophen-As

In [11]:
# List of strings to be filtered out
filter_out_list = [
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 
    'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'Drugs & Medications', 
    'Pill Identification Tool', 'Vitamins, Herbs, & Dietary Supplements', 'Dictionary', 
    'About Us', 'Consumer Contact RxList', 'Terms of Use', 'Privacy Policy', 'Sponsor Policy'
]

# Assuming 'drug_links_text' is your dictionary with the text of each link
drug_links_text = {
    letter: [text for text in text_list if text not in filter_out_list]
    for letter, text_list in drug_links_text.items()
}

# 'filtered_drug_links_text' now contains the filtered lists
drug_links_text

{'a': ['Abacavir',
  'Abacavir and Lamivudine',
  'Abacavir-Dolutegravir-Lamivudine',
  'Abaloparatide',
  'Abatacept',
  'Abciximab',
  'Abemaciclib',
  'Abiraterone',
  'AbobotulinumtoxinA',
  'Abrocitinib',
  'Acai',
  'Acalabrutinib',
  'Acamprosate',
  'Acarbose',
  'Acebutolol',
  'Acetaminophen',
  'Acetaminophen and Aspirin',
  'Acetaminophen Butalbital Caffeine Codeine',
  'Acetaminophen Chlorpheniramine',
  'Acetaminophen Chlorpheniramine Dextromethorphan',
  'Acetaminophen Chlorpheniramine Phenylephrine',
  'Acetaminophen Dextromethorphan Phenylephrine',
  'Acetaminophen Dextromethorphan Pseudoephedrine',
  'Acetaminophen Diphenhydramine Dextromethorphan',
  'Acetaminophen IV',
  'Acetaminophen Pamabrom Pyridoxine',
  'Acetaminophen Phenyltoloxamine',
  'Acetaminophen Rectal',
  'Acetaminophen-Aspirin-Caffeine',
  'Acetaminophen-Caffeine-Dihydrocodeine',
  'Acetaminophen-Ibuprofen',
  'Acetaminophen/Doxylamine/Dextromethorphan/Phenylephrine',
  'Acetaminophen/Pamabrom',
  'A

In [12]:
from rapidfuzz import process, fuzz

In [13]:
# Adjusting the flattening process to include the original string and also to split strings with 'and' and '-'
def lowercase_list(lst):
    """ Lowercases all strings in a list """
    return [item.lower() for item in lst]
# Adjusted function to split based on multiple delimiters
def split_string(s, delimiters):
    split_list = [s]
    for delimiter in delimiters:
        temp_list = []
        for item in split_list:
            temp_list.extend(item.split(delimiter))
        split_list = temp_list
    return split_list

# Flattening the list with the adjusted function
flat_list = []
for key, values in drug_links_text.items():
    for value in values:
        # Splitting strings with '/', 'and', '-'
        split_values = split_string(value, ['/', 'and', '-', ' ', ','])
        # Adding both original and split values to the list
        flat_list.append(value)
        flat_list.extend(split_values)

# Removing duplicates while preserving order
unique_flat_list = []
[unique_flat_list.append(item) for item in flat_list if item not in unique_flat_list]

generic_drug_names_set = set(lowercase_list(unique_flat_list))





In [14]:
generic_drug_names_set

{'',
 'ig',
 'estazolam',
 'hylan g-f 20',
 'ceftolozane',
 'silver sulfadiazine',
 'aducanumab',
 'fibrinogen/thrombin',
 'acetaminophen rectal',
 'ivermectin topical',
 'thiamine',
 'carbamide peroxide otic',
 'ashwag',
 'macrilen',
 'chitosan',
 'pemetrexed',
 'ocriplasmin',
 'pioglitazone',
 'glatiramer injection',
 'ivosidenib',
 'ointment',
 'altovis',
 'hydrocortisone topical',
 'horseweed',
 'teriflunomide',
 'neostigmine-glycopyrrolate',
 'epoetin',
 'betaine/polyhexanide',
 'denileukin',
 'fluoroestradiol',
 'gentamicin ophthalmic',
 'potassium phosphate-sodium acid phosphate',
 'ureaurethane',
 'cycloserine',
 'vitamin a',
 'icatibant',
 'lidocaine topical',
 'tetrastarch',
 'magnesium supplement',
 'cobicistat, elvitegravir, emtricitabine, and tenofovir',
 'mirtazapine',
 'phenylephrine/witch hazel topical',
 'norethindrone acetate/ethinyl estradiol',
 'fentanyl transdermal',
 '(arava)',
 'aripiprazole',
 'germanium',
 'miglustat',
 'piperacillin-tazobactam',
 'burosumab',


In [15]:
from rapidfuzz import process, fuzz
from tqdm import tqdm

def get_generic_names(alternative_names, generic_names_set):
    matched_generic_names = []
    for name in alternative_names:
        # Split the name into words
        words = name.split()
        for word in words:
            # Check for an exact match first
            if word in generic_names_set:
                matched_generic_names.append(word)
            else:
                # Apply fuzzy matching if no exact match is found
                match, score, _ = process.extractOne(word, generic_names_set, scorer=fuzz.WRatio)
                if score > 90:  # Adjust the threshold as needed
                    matched_generic_names.append(match)
    return matched_generic_names

# Iterate over the drugs_data to match and add the generic names
for drug in tqdm(drugs_data, desc="Processing Drugs"):
    drug['generic_names'] = get_generic_names(drug['alternative_names'], generic_drug_names_set)


Processing Drugs: 100%|██████████| 8171/8171 [05:31<00:00, 24.64it/s]


In [16]:
medication_forms = [
    'Oral', 'Topical', 'Injectable', 'Injection', 'Intravenous', 'IV', 'Inhalable', 'Inhaler',
    'Ophthalmic', 'Otic', 'Nasal', 'Transdermal', 'Sublingual', 'Rectal', 'Vaginal',
    'Subcutaneous', 'Buccal', 'Extended-Release', 'ER', 'Sustained-Release', 'SR',
    'Chewable', 'Effervescent', 'Powder', 'Liquid', 'Capsule', 'Gel', 'Cream', 'Suspension','preparation', 'powd', "inhalation", 'solution'
]

# Lowercasing the medication form terms
medication_forms_lower = [form.lower() for form in medication_forms]

# Function to remove lowercased medication forms from drug names
def remove_medication_forms_lower(drug_name, medication_forms_lower):
    drug_name_lower = drug_name.lower()
    for form in medication_forms_lower:
        drug_name_lower = drug_name_lower.replace(form, '').strip()
    return drug_name_lower



In [17]:
# Adjusting the function to also remove empty strings ('') from the list of generic names

def clean_generic_names_and_remove_empty(generic_names, medication_forms_lower):
    cleaned_names = []
    for name in generic_names:
        cleaned_name = name.lower()
        for form in medication_forms_lower:
            cleaned_name = cleaned_name.replace(form, '').strip()
        if cleaned_name:  # Checks if the string is not empty
            cleaned_names.append(cleaned_name)
    return cleaned_names

# Processing the generic_names data with the adjusted function
for drug in drugs_data:
    drug['generic_names'] = clean_generic_names_and_remove_empty(drug['generic_names'], medication_forms_lower)


drugs_data

[{'name': 'Azulfidine',
  'alternative_names': ['Azulfidine',
   'sulfasalazine',
   'Sulfazine',
   'Azulfidine EN-tabs'],
  'drug_classes': ['5-aminosalicylates', 'Antirheumatics'],
  'uses': ['Arthritis', 'Rheumatoid Arthritis', 'Ulcerative Colitis'],
  'status': 'Prescription only',
  'generic_names': ['sulfasalazine']},
 {'name': 'Abstral',
  'alternative_names': ['Fentora', 'Abstral', 'Subsys', 'fentanyl'],
  'drug_classes': ['Opioids (narcotic analgesics)'],
  'uses': ['Cancer Pain'],
  'status': 'Discontinued',
  'generic_names': ['fentanyl']},
 {'name': 'Aczone',
  'alternative_names': ['dapsone topical', 'Aczone'],
  'drug_classes': ['Topical acne agents'],
  'uses': ['Acne Vulgaris', 'Hypersensitivity'],
  'status': 'Prescription only',
  'generic_names': ['dapsone']},
 {'name': 'Adzynma',
  'alternative_names': ['ADAMTS13, recombinant-krhn', 'Adzynma'],
  'drug_classes': [],
  'uses': ['Headache',
   'Digital Arthropathy-Brachydactyly, Familial',
   'Infusion procedures',
 

In [18]:
drugs_data

[{'name': 'Azulfidine',
  'alternative_names': ['Azulfidine',
   'sulfasalazine',
   'Sulfazine',
   'Azulfidine EN-tabs'],
  'drug_classes': ['5-aminosalicylates', 'Antirheumatics'],
  'uses': ['Arthritis', 'Rheumatoid Arthritis', 'Ulcerative Colitis'],
  'status': 'Prescription only',
  'generic_names': ['sulfasalazine']},
 {'name': 'Abstral',
  'alternative_names': ['Fentora', 'Abstral', 'Subsys', 'fentanyl'],
  'drug_classes': ['Opioids (narcotic analgesics)'],
  'uses': ['Cancer Pain'],
  'status': 'Discontinued',
  'generic_names': ['fentanyl']},
 {'name': 'Aczone',
  'alternative_names': ['dapsone topical', 'Aczone'],
  'drug_classes': ['Topical acne agents'],
  'uses': ['Acne Vulgaris', 'Hypersensitivity'],
  'status': 'Prescription only',
  'generic_names': ['dapsone']},
 {'name': 'Adzynma',
  'alternative_names': ['ADAMTS13, recombinant-krhn', 'Adzynma'],
  'drug_classes': [],
  'uses': ['Headache',
   'Digital Arthropathy-Brachydactyly, Familial',
   'Infusion procedures',
 

In [7]:
for drug in drugs_data:
    for idx, name in enumerate(drug['generic_names']):
        if name == 'testostone':
            drug['generic_names'][idx] = 'testosterone'
        if name == 'methyltestostone':
            drug['generic_names'][idx] = 'methyltestosterone'
            

In [8]:
dh.save_data(drugs_data)