In [24]:
import pandas as pd
import numpy as np
from urllib.parse import quote
import requests
import time
import bot

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.options import Options

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs

import pickle


# Get SMILES notation

In [2]:
meds_raw = pd.read_csv('medecines.csv')
meds = meds_raw.iloc[8:,1:]
meds.columns = meds_raw.iloc[7,1:]
human_meds = meds.loc[:,'Category'] == 'Human'
meds = meds[human_meds]
meds = meds.reset_index(drop=True)
meds.head(3)

7,Category,Medicine name,Therapeutic area,International non-proprietary name (INN) / common name,Active substance,Product number,Patient safety,Authorisation status,ATC code,Additional monitoring,...,Vet pharmacotherapeutic group,Date of opinion,Decision date,Revision number,Condition / indication,Species,ATCvet code,First published,Revision date,URL
0,Human,Lenvima,Thyroid Neoplasms,lenvatinib,lenvatinib mesilate,EMEA/H/C/003727,no,Authorised,L01XE,yes,...,,2015-03-26 01:00:00,2023-03-09 01:00:00,18,Lenvima is indicated as monotherapy for the tr...,,,2018-06-21 12:16:00,2023-05-11 15:24:00,https://www.ema.europa.eu/en/medicines/human/E...
1,Human,Febuxostat Krka,Hyperuricemia; Gout,febuxostat,febuxostat,EMEA/H/C/004773,no,Authorised,M04AA03,no,...,,2019-01-31 01:00:00,2023-05-11 00:00:00,4,Febuxostat Krka is indicated for the treatment...,,,2019-03-29 17:03:00,2023-05-11 15:01:00,https://www.ema.europa.eu/en/medicines/human/E...
2,Human,Tasigna,"Leukemia, Myelogenous, Chronic, BCR-ABL Positive",nilotinib,nilotinib,EMEA/H/C/000798,no,Authorised,L01EA03,no,...,,,2022-05-19 00:00:00,42,Tasigna is indicated for the treatment of:adul...,,,2017-11-15 01:09:00,2023-05-11 14:40:00,https://www.ema.europa.eu/en/medicines/human/E...


In [3]:
def pubchem_lookup(active_substance):
    '''
    searches for Canonical Smiles in the PubChem database
    '''
    url_substance = quote(active_substance)
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{url_substance}/property/CanonicalSMILES/TXT"
    response = requests.get(url)
    if response.status_code != 200:
        return None
    else:
        smiles = response.content.decode('utf-8')[:-1]
        return smiles


In [4]:
def fill_smiles_column():
    '''
    Wrapper for the pubchem_lookup function to fill entire column
    with Canonical smile notation.
    makes max 5 requests per second to comply with the API 
    '''

    for idx, active_substance in enumerate(meds.loc[:,'Active substance']):
        try:
            # only lookup first active substance
            substance1 = active_substance.split(',')[0]
            smiles = pubchem_lookup(substance1)
            meds.loc[idx,'smiles'] = smiles
        except:
            meds.loc[idx,'smiles'] = None
        # To comply with pubchem usage policy
        if idx % 5 == 0:
            time.sleep(1)
    

In [5]:
bot.botrun(fill_smiles_column())

In [None]:
def advanced_smiles_lookup(active_substance):
    '''
    If first search doesnt yield results then this option can be used
    takes much more time than first option and opens browsers in the
    background.

    Uses Google Chrome
    '''
    options = Options()
    options.headless = True
    driver = webdriver.Chrome(options=options)
    driver.get('https://pubchem.ncbi.nlm.nih.gov/')


    search = driver.find_element(By.XPATH, '//input[@type="text" and @placeholder=""]')
    search.send_keys(f'{active_substance}')


    search_button = driver.find_element(By.CLASS_NAME, 'main-search-submit')
    search_button.click()
    time.sleep(2)
    first_element = driver.find_element(By.CSS_SELECTOR,'[data-action="result-link"]')
    first_element.click()
    time.sleep(2)
    smiles_div = driver.find_element(By.ID, 'Canonical-SMILES')
    div_text = smiles_div.text
    smiles = div_text.split('\n')[1]
    driver.close()
    return smiles



In [None]:
holdouts = meds[meds.loc[:,'smiles'].isna()]
indicies = list(holdouts.index)
indicies
substances = list(holdouts.loc[:,'Active substance'])
substances
for idx, active_substance in zip(indicies,substances):
    try:
            substance1 = active_substance.split(',')[0]
            smiles = advanced_smiles_lookup(substance1)
            meds.loc[idx,'smiles'] = smiles
    except:
          pass

  options.headless = True


In [None]:
sum(meds.loc[:,'smiles'].isna())/len(meds)


0.25

In [None]:
meds.to_pickle('meds_w_smiles')

# Calculate similarity scores

In [3]:
meds = pd.read_pickle('meds_w_smiles')

In [4]:
def tanimoto_calc(smi1, smi2):
    mol1 = Chem.MolFromSmiles(smi1)
    mol2 = Chem.MolFromSmiles(smi2)
    fp1 = AllChem.GetMorganFingerprintAsBitVect(mol1, 3, nBits=2048)
    fp2 = AllChem.GetMorganFingerprintAsBitVect(mol2, 3, nBits=2048)
    s = round(DataStructs.TanimotoSimilarity(fp1,fp2),3)
    return s

In [5]:
tanimoto_calc(meds.loc[0,'smiles'], meds.loc[3,'smiles'])

0.07

In [6]:
# all unique active substances
unique_active = meds.drop_duplicates(subset=['smiles'])

smiles = unique_active.loc[:,'smiles'].dropna()
smiles = list(smiles)
nr_smiles = len(smiles)
similarity = np.zeros([nr_smiles, nr_smiles])



In [7]:
for i in range(nr_smiles):
    for j in range(nr_smiles-i):
        try:
            sim_score = tanimoto_calc(smiles[i], smiles[j])
            similarity[i,j] = similarity[j,i] = sim_score
        except:
            similarity[i,j] = similarity[j,i] = None



In [10]:
dist_matrix = 1 - similarity

In [12]:
np.save('distance_mat', dist_matrix)

In [26]:
with open("smiles", "wb") as file:   #Pickling
    pickle.dump(smiles, file)

# Other stuff


In [None]:
from collections import Counter

In [None]:
from collections import Counter
sorted(Counter(meds.loc[:,'Human pharmacotherapeutic group']).items(), key=lambda x:x[1], reverse=True)

[('Antineoplastic agents', 252),
 ('Immunosuppressants', 108),
 ('Drugs used in diabetes', 97),
 ('Antivirals for systemic use', 96),
 ('Vaccines, ', 68),
 ('Antithrombotic agents', 66),
 ('Drugs for obstructive airway diseases, ', 44),
 ('Agents acting on the renin-angiotensin system', 43),
 ('Other alimentary tract and metabolism products, ', 42),
 ('Psycholeptics', 40),
 ('Immunostimulants, ', 39),
 ('Antihemorrhagics', 32),
 ('Antiepileptics, ', 30),
 ('Ophthalmologicals', 30),
 ('Antibacterials for systemic use, ', 29),
 (nan, 29),
 ('Drugs for treatment of bone diseases', 27),
 ('Sex hormones and modulators of the genital system, ', 26),
 ('Urologicals', 24),
 ('All other therapeutic products', 23),
 ('Other nervous system drugs', 23),
 ('Anti-Parkinson drugs', 23),
 ('Psychoanaleptics, ', 22),
 ('Diagnostic radiopharmaceuticals', 18),
 ('Calcium homeostasis', 17),
 ('Lipid modifying agents', 15),
 ('Pituitary and hypothalamic hormones and analogues', 14),
 ('Cardiac therapy', 13

In [None]:

no_smiles = meds[meds.loc[:,'smiles'].isna()]
no_smiles.head(3)
