## Read FDA compounds

https://github.com/greenelab/circadian-clock-modulators/issues/1

> The SCREENWELL FDA-approved drug library V2 containing 741 compounds was purchased from Enzo Life Sciences (Hayashi Kasei Co., Ltd.), and the International Drug Collection (IDC) containing 311 compounds was purchased from MicroSource Discovery Systems, Inc. (Namiki Shoji Co., Ltd.).

In [1]:
import logging

import pandas
import requests_cache

from utils.pubchem import query_pubchem_for_drugbank

In [2]:
requests_cache.install_cache(cache_name="requests-cache", allowable_codes=[200, 404])

### Read tamai screen results

In [3]:
screen_df = pandas.read_table("data/tamai-screen/combined-results.tsv")
screen_df.head(2)

Unnamed: 0,compound_name,compound_set,concentration,period_before,period_change,plate,notes
0,Clindamycin·HCl,fda,1 uM,21.5,0.02917,1,
1,Felbamate,fda,1 uM,21.7,0.0625,1,


### Read Screen-Well compounds

The spreadsheet contains chemical structures in `mrv` format (ChemAxon MRV format, [Marvin Document](https://docs.chemaxon.com/display/docs/Marvin_Documents_-_MRV.html)). This format appears to be propreitary / not well supported, so map via other fields instead.

In [4]:
# mrv (ChemAxon MRV format)
xlsx_path = "data/compounds/BML-2843_v. 1.4_Rev. 18Dec19_StructureFile.xlsx"
structure_df = pandas.read_excel(xlsx_path, sheet_name=0)
structure_df.head(2)

Unnamed: 0,Hash,StructureStringLength,StructureStringFormat,StructureString
0,50A647B589F94DEB7D0CBA1EAE819980,1,mrv,JChemExcelFgsAAB+LCAAAAAAABADtvQdgHEmWJSYvbcp7...
1,C725614FF6722BF574DB2EE705636395,1,mrv,JChemExcelogYAAB+LCAAAAAAABADtvQdgHEmWJSYvbcp7...


In [5]:
screenwell_df = pandas.read_excel(xlsx_path, sheet_name=1).add_prefix("screenwell_").convert_dtypes()
# fix messy column names
screenwell_df.columns = screenwell_df.columns.str.strip().str.lower().str.replace(" ", "_")
# remove blank rows
screenwell_df = screenwell_df.query("screenwell_name != 'Blank'")
screenwell_df.head(2)

Unnamed: 0,screenwell_structure,screenwell_smiles,screenwell_iupac,screenwell_plate_location,screenwell_catalog_number,screenwell_name,screenwell_cas,screenwell_mw,screenwell_concentration,screenwell_solvent,screenwell_plate_number,screenwell_plate_description,screenwell_approval_year,screenwell_indication,screenwell_mechanism_of_action,screenwell_notes/toxicity,screenwell_plate_part_number
0,,CCC[C@@H]1CC(N(C1)C)C(=O)N[C@@H](C2[C@@H]([C@H...,"(4R)-N-[(1S,2S)-2-chloro-1-[(3R,4R,6R)-3,4,5-t...",1-A02,A190,Clindamycin · HCl,21462-39-5,461.4,10 mM,DMSO,1,FDA Library II,1970,Antibiotic,Systemic/vaginal clindamycin inhibits protein ...,"Side effects include: diarrhea, pseudomembrano...",2843
1,,C1=CC=C(C=C1)C(COC(=O)N)COC(=O)N,(3-carbamoyloxy-2-phenylpropyl) carbamate,1-A03,DL567,Felbamate,25451-15-4,238.2,10 mM,DMSO,1,FDA Library II,1993,Anticonvulsant,Precise mechanism unknown; It has an effect on...,"Adverse reactions include decreased appetite, ...",2843


In [6]:
# Number of compound names in the screen where exact text match fails
screen_names = sorted(screen_df.query("compound_set == 'fda'").compound_name.unique())
len(set(screen_names) - set(screenwell_df.screenwell_name))

152

In [7]:
%%time
rows = []
for screenwell in screenwell_df.itertuples():
    for by, compound_name in [
            ("name", screenwell.screenwell_iupac),
            ("smiles", screenwell.screenwell_smiles),
            ("name", screenwell.screenwell_name),
        ]:
        row = query_pubchem_for_drugbank(compound_name, by=by)
        if row:
            break
    else:
        logging.warning(f"IUPAC lookup failed for {screenwell.screenwell_name}")
        continue
    row["screenwell_name"] = screenwell.screenwell_name
    rows.append(row)
len(rows)



CPU times: user 4.27 s, sys: 292 ms, total: 4.56 s
Wall time: 37.5 s


764

In [8]:
screenwell_map_df = (
    screenwell_df
    [["screenwell_name", "screenwell_catalog_number", "screenwell_plate_location", "screenwell_smiles", "screenwell_iupac"]]
    .merge(pandas.DataFrame(rows), how="left")
    .convert_dtypes()
)
screenwell_map_df.head(5)

Unnamed: 0,screenwell_name,screenwell_catalog_number,screenwell_plate_location,screenwell_smiles,screenwell_iupac,query_compound,pubchem_cid,drugbank_id
0,Clindamycin · HCl,A190,1-A02,CCC[C@@H]1CC(N(C1)C)C(=O)N[C@@H](C2[C@@H]([C@H...,"(4R)-N-[(1S,2S)-2-chloro-1-[(3R,4R,6R)-3,4,5-t...",CCC[C@@H]1CC(N(C1)C)C(=O)N[C@@H](C2[C@@H]([C@H...,53384906,
1,Felbamate,DL567,1-A03,C1=CC=C(C=C1)C(COC(=O)N)COC(=O)N,(3-carbamoyloxy-2-phenylpropyl) carbamate,(3-carbamoyloxy-2-phenylpropyl) carbamate,3331,DB00949
2,Cyclosporin A,A195,1-A04,CC[C@H]1C(=O)N(CC(=O)N([C@H](C(=O)N[C@H](C(=O)...,"(3S,6S,9S,12R,15S,18S,21S,24S,30S,33S)-30-ethy...",Cyclosporin A,5284373,DB00091
3,Donepezil·HCl,DL568,1-A05,COC1=C(C=C2C(=C1)CC(C2=O)CC3CCN(CC3)CC4=CC=CC=...,"2-[(1-benzylpiperidin-4-yl)methyl]-5,6-dimetho...","2-[(1-benzylpiperidin-4-yl)methyl]-5,6-dimetho...",5741,
4,Lincomycin·HCl monohydrate,A240,1-A06,CCCC1CC(N(C1)C)C(=O)NC(C2C(C(C(C(O2)SC)O)O)O)C...,"(2S,4R)-N-[(1R,2R)-2-hydroxy-1-[(2R,3R,4S,5R,6...","(2S,4R)-N-[(1R,2R)-2-hydroxy-1-[(2R,3R,4S,5R,6...",3000540,DB01627


In [9]:
print(f"""
{(screenwell_map_df.query_compound == screenwell_map_df.screenwell_iupac).sum():,} compounds mapped to pubchem via screenwell_iupac
{(screenwell_map_df.query_compound == screenwell_map_df.screenwell_smiles).sum():,} compounds mapped to pubchem via screenwell_smiles
{(screenwell_map_df.query_compound == screenwell_map_df.screenwell_name).sum():,} compounds mapped to pubchem via screenwell_name
{sum(screenwell_map_df.pubchem_cid.isna()):,} compounds failed to map to pubchem

{sum(screenwell_map_df.drugbank_id.notna()):,} compounds mapped to DrugBank
""".strip())

557 compounds mapped to pubchem via screenwell_iupac
191 compounds mapped to pubchem via screenwell_smiles
33 compounds mapped to pubchem via screenwell_name
8 compounds failed to map to pubchem

406 compounds mapped to DrugBank


In [10]:
# Screen-Well compounds not mapped to PubChem
screenwell_map_df[screenwell_map_df.pubchem_cid.isna()]

Unnamed: 0,screenwell_name,screenwell_catalog_number,screenwell_plate_location,screenwell_smiles,screenwell_iupac,query_compound,pubchem_cid,drugbank_id
61,Ranitidine·HCl,AC766,1-G03,CN/C(=C/[N+](=O)[O-])/NCCSCC1=CC=C(O1)CN(C)C.Cl,(Z)-1-N'-[2-[[5-[(dimethylamino)methyl]furan-2...,,,
88,Cefepime·2HCl Hydrate,DL191,2-A10,[H+].C[N+]1(CCCC1)CC2=C(N3C(C(C3=O)NC(=O)/C(=N...,"7-[[(2Z)-2-(2-amino-1,3-thiazol-4-yl)-2-methox...",,,
342,Terazosin·HCl,DL513,5-C05,COC1=C(C=C2C(=C1)C(=NC(=N2)N3CCN(CC3)C(=O)C4CC...,"[4-(4-amino-6,7-dimethoxyquinazolin-2-yl)piper...",,,
449,Ceftizoxim·Na,AC3053,6-H03,CO/N=C(\C1=CSC(=N1)N)/C(=O)N[C@H]2[C@@H]3N(C2=...,"sodium;(6R,7R)-7-[[(2E)-2-(2-amino-1,3-thiazol...",,,
450,Ceftriaxone·Na,AC3054,6-H04,CN1C(=NC(=O)C(=O)N1)SCC2=C(N3[C@@H]([C@@H](C3=...,"sodium;7-[[(2Z)-2-(2-amino-1,3-thiazol-4-yl)-2...",,,
452,Cefuroxime·Na,AC3056,6-H06,CO/N=C(\C1=CC=CO1)/C(=O)N[C@H]2[C@@H]3N(C2=O)C...,"sodium;(6R,7R)-3-(carbamoyloxymethyl)-7-[[(2E)...",,,
483,Dantrolene . sodium,550072,7-C10,C1C(=O)N=C(N1/N=C/C2=CC=C(O2)C3=CC=C(C=C3)[N+]...,sodium;3-[(E)-[5-(4-nitrophenyl)furan-2-yl]met...,,,
492,Demeclocycline·HCl,AC3083,7-D09,CN(C)[C@H]1[C@@H]2C[C@@H]3[C@@H](C4=C(C=CC(=C4...,"(4S,4aS,5aS,6S,12aR)-7-chloro-4-(dimethylamino...",,,


In [11]:
screenwell_map_df.to_csv("data/compounds/screenwell-to-pubchem.tsv", sep="\t", index=False)