In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
from rdkit import Chem
from rdkit.Chem.rdmolops import GetAdjacencyMatrix
import torch
from torch_geometric.data import Data
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
import os
import torch
from torch_geometric.nn import GCNConv, GATConv
from sklearn.model_selection import train_test_split
from torch_geometric.loader import DataLoader
from sklearn.metrics import r2_score

  _torch_pytree._register_pytree_node(


In [3]:
models = pd.read_csv("cell_models.csv")
targets = pd.read_csv("target.csv",sep=";")

## TODOs
1. Table with drug bank identifier and smiles code (parser of structures.sdf) (drug bank id != GDSC drug id)
2. Merge drug tables (IC50 with SMILES) (pd.merge with targets.csv)
3. For each smile generate PyG Graphs (https://www.blopig.com/blog/2022/02/how-to-turn-a-smiles-string-into-a-molecular-graph-for-pytorch-geometric/)
4. Generate table gene expression, cnv and mutations for all drugs and cell lines (huge)
5. Create mask for each drug-cell line pair used for masknig trainable params in the end-to-end learning step (all_features.zip)
6. Create GCN for drug feature learning (note: pre-training might make sense here!)
7. Create Linear Regression for cell lines features learning
8. Create model for combined learning end-to-end
9. Optional: Huggin face pre-trained drug models

Goal: Improve by combining: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8434731/ + Feature Selector

## 1. Get SMILES

## Drug Id of GDSC2  to PubChem Id via their online table

In [4]:
data = []
drug_table_html = '<table class="sorted-table before filter export_csv_tab_xls paginate_10_x25_50_all zebra-table headers_rotated"><thead><tr><th class="header">drug_id</th><th class="header">drug_name</th><th class="header">synonyms</th><th class="header">pathway_name</th><th class="header">targets</th><th class="header">pubchem</th></tr></thead><tbody><tr class="even  even"><td class="r">1259</td><td class="r">Talazoparib</td><td class="r">BMN-673, BMN 973</td><td class="r">Genome integrity</td><td class="r">PARP1, PARP2</td><td class="r">44819241</td></tr><tr class="odd  odd"><td class="r">1372</td><td class="r">Trametinib</td><td class="r">GSK1120212, Mekinist</td><td class="r">ERK MAPK signaling</td><td class="r">MEK1, MEK2</td><td class="r">11707110</td></tr><tr class="even  even"><td class="r">1559</td><td class="r">Luminespib</td><td class="r">AUY922, VER-52296,NVP-AUY922,  AUY</td><td class="r">Protein stability and degradation</td><td class="r">HSP90</td><td class="r">10096043</td></tr><tr class="odd  odd"><td class="r">1615</td><td class="r">CZC24832</td><td class="r">GTPL6653</td><td class="r">PI3K/MTOR signaling</td><td class="r">PI3Kgamma</td><td class="r">42623951</td></tr><tr class="even  even"><td class="r">1620</td><td class="r">PFI3</td><td class="r">PFI-3, PFI 3, AOB2221</td><td class="r">Chromatin other</td><td class="r">Polybromo 1, SMARCA4, SMARCA2</td><td class="r">78243717</td></tr><tr class="odd  odd"><td class="r">1622</td><td class="r">Wnt-C59</td><td class="r">AOB3540, AK161197</td><td class="r">WNT signaling</td><td class="r">PORCN</td><td class="r">57519544</td></tr><tr class="even  even"><td class="r">1626</td><td class="r">OTX015</td><td class="r">Birabresib, OTX-015, MK-8628</td><td class="r">Chromatin other</td><td class="r">BRD2, BRD3, BRD4</td><td class="r">9936746</td></tr><tr class="odd  odd"><td class="r">1627</td><td class="r">GSK343</td><td class="r">GTPL8240, AOB3680, AK175558</td><td class="r">Chromatin histone methylation</td><td class="r">EZH2</td><td class="r">71268957</td></tr><tr class="even  even"><td class="r">1634</td><td class="r">AGI-6780</td><td class="r">AK162157</td><td class="r">Metabolism</td><td class="r">IDH2 R140Q mutant</td><td class="r">71299339</td></tr><tr class="odd  odd"><td class="r">1854</td><td class="r">MN-64</td><td class="r">-</td><td class="r">WNT signaling</td><td class="r">TNKS1, TNKS2</td><td class="r">2802462</td></tr><tr class="even  even"><td class="r">1909</td><td class="r">Venetoclax</td><td class="r">ABT-199, Veneclexta, GDC-0199</td><td class="r">Apoptosis regulation</td><td class="r">BCL2</td><td class="r">49846579</td></tr><tr class="odd  odd"><td class="r">1084</td><td class="r">Rapamycin</td><td class="r">AY-22989, Sirolimus, WY-090217, Torisel, Rapamune</td><td class="r">PI3K/MTOR signaling</td><td class="r">MTORC1</td><td class="r">5384616</td></tr><tr class="even  even"><td class="r">1131</td><td class="r">PRIMA-1MET</td><td class="r">Prima 1MET, APR-246</td><td class="r">p53 pathway</td><td class="r">TP53 activation</td><td class="r">52918385</td></tr><tr class="odd  odd"><td class="r">1584</td><td class="r">BDOCA000347a</td><td class="r">4-bromo-N-(6-methoxy-1,3-dimethyl-2-oxo-2,3-dihydro-1H-benzo[d]imidazol-5-yl)-2-methylbenzenesulfonamide</td><td class="r">Chromatin histone acetylation</td><td class="r">BRPF1, BRPF2, BRPF3</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1716</td><td class="r">IRAK4_4710</td><td class="r">SN1050852374, IRAK4_4710</td><td class="r">Other, kinases</td><td class="r">IRAK4</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1812</td><td class="r">Bleomycin</td><td class="r">-</td><td class="r">DNA replication</td><td class="r">dsDNA break induction</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1817</td><td class="r">Romidepsin</td><td class="r">Istodax, FK228, Depsipeptide</td><td class="r">Chromatin histone acetylation</td><td class="r">HDAC1, HDAC2, HDAC3, HDAC8</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1834</td><td class="r">743380</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1913</td><td class="r">AGI-5198</td><td class="r">IDH-C35</td><td class="r">Metabolism</td><td class="r">IDH1 (R132H)</td><td class="r">56645356</td></tr><tr class="odd  odd"><td class="r">1915</td><td class="r">AZD3759</td><td class="r">-</td><td class="r">EGFR signaling</td><td class="r">EGFR</td><td class="r">78209992</td></tr><tr class="even  even"><td class="r">1932</td><td class="r">NVP-ADW742</td><td class="r">NVP ADW742, NVPADW742</td><td class="r">IGF1R signaling</td><td class="r">IGF1R</td><td class="r">9825149</td></tr><tr class="odd  odd"><td class="r">1933</td><td class="r">P22077</td><td class="r">P 22077, P-22077</td><td class="r">Protein stability and degradation</td><td class="r">USP7, USP47</td><td class="r">46931953</td></tr><tr class="even  even"><td class="r">1997</td><td class="r">WEHI-539</td><td class="r">WEHI539, WEHI-539</td><td class="r">Apoptosis regulation</td><td class="r">BCL-XL</td><td class="r">71297207</td></tr><tr class="odd  odd"><td class="r">2011</td><td class="r">ICL-SIRT078</td><td class="r">-</td><td class="r">Other</td><td class="r">SIR2</td><td class="r">-</td></tr><tr class="even  even"><td class="r">2038</td><td class="r">UNC0638</td><td class="r">-</td><td class="r">Chromatin histone methylation</td><td class="r">G9A, GLP</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">2040</td><td class="r">Foretinib</td><td class="r">GSK1363089, XL-880, EXEL-2880, GSK089</td><td class="r">RTK signaling</td><td class="r">MET, KDR, TIE2, VEGFR3/FLT4, RON, PDGFR, FGFR1, EGFR</td><td class="r">42642645</td></tr><tr class="even  even"><td class="r">2106</td><td class="r">Uprosertib</td><td class="r">GSK2141795, GSK2141795C, GSK-2141795</td><td class="r">PI3K/MTOR signaling</td><td class="r">AKT1, AKT2, AKT3</td><td class="r">51042438</td></tr><tr class="odd  odd"><td class="r">2154</td><td class="r">GSK-LSD1-2HCl </td><td class="r">-</td><td class="r">Chromatin histone methylation</td><td class="r">LSD1</td><td class="r">-</td></tr><tr class="even  even"><td class="r">2159</td><td class="r">UNC0379</td><td class="r">-</td><td class="r">Chromatin histone methylation</td><td class="r">SETD8</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">2360</td><td class="r">THR-101</td><td class="r">WIMM synthesis</td><td class="r">PI3K/MTOR signaling</td><td class="r">Mutant RAS</td><td class="r">None</td></tr><tr class="even  even"><td class="r">2439</td><td class="r">glutathione</td><td class="r">G6013, sigma</td><td class="r">Other</td><td class="r">anti-oxidant proteins</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1008</td><td class="r">Methotrexate</td><td class="r">Abitrexate, Amethopterin, Rheumatrex, Trexall, Folex</td><td class="r">DNA replication</td><td class="r">Antimetabolite</td><td class="r">126941</td></tr><tr class="even  even"><td class="r">1017</td><td class="r">Olaparib</td><td class="r">AZD2281, KU0059436,  Lynparza</td><td class="r">Genome integrity</td><td class="r">PARP1, PARP2</td><td class="r">23725625</td></tr><tr class="odd  odd"><td class="r">1019</td><td class="r">Bosutinib</td><td class="r">SKI-606, Bosulif</td><td class="r">Other, kinases</td><td class="r">SRC, ABL, TEC</td><td class="r">5328940</td></tr><tr class="even  even"><td class="r">1021</td><td class="r">Axitinib</td><td class="r">AG-13736, Inlyta</td><td class="r">RTK signaling</td><td class="r">PDGFR, KIT, VEGFR</td><td class="r">6450551</td></tr><tr class="odd  odd"><td class="r">1170</td><td class="r">CCT-018159</td><td class="r">CCT018159, CCT 018159</td><td class="r">Protein stability and degradation</td><td class="r">HSP90</td><td class="r">5327091</td></tr><tr class="even  even"><td class="r">1375</td><td class="r">Temozolomide</td><td class="r">Temodar, Temodal, M-39831, SCH 52365</td><td class="r">DNA replication</td><td class="r">DNA alkylating agent</td><td class="r">5394</td></tr><tr class="odd  odd"><td class="r">1502</td><td class="r">Bicalutamide</td><td class="r">ICI-176334, Casodex, Cosudex, ICI 176334</td><td class="r">Hormone-related</td><td class="r">AR</td><td class="r">2375</td></tr><tr class="even  even"><td class="r">1557</td><td class="r">LCL161</td><td class="r">-</td><td class="r">Apoptosis regulation</td><td class="r">XIAP, IAP1, IAP2</td><td class="r">24737642</td></tr><tr class="odd  odd"><td class="r">1594</td><td class="r">OSI-027</td><td class="r">A-1065-5</td><td class="r">PI3K/MTOR signaling</td><td class="r">MTORC1, MTORC2</td><td class="r">44224160</td></tr><tr class="even  even"><td class="r">1799</td><td class="r">Ibrutinib</td><td class="r">PCI-32765, CRA-032765, PCI-32765-00</td><td class="r">Other, kinases</td><td class="r">BTK</td><td class="r">24821094</td></tr><tr class="odd  odd"><td class="r">1873</td><td class="r">Buparlisib</td><td class="r">BKM120, NVP-BKM120</td><td class="r">PI3K/MTOR signaling</td><td class="r">PI3Kalpha, PI3Kdelta, PI3Kbeta, PI3Kgamma</td><td class="r">66577015, 16654980</td></tr><tr class="even  even"><td class="r">1912</td><td class="r">Afuresertib</td><td class="r">GSK2110183, GSK2110183C</td><td class="r">PI3K/MTOR signaling</td><td class="r">AKT1, AKT2, AKT3</td><td class="r">46843057</td></tr><tr class="odd  odd"><td class="r">1050</td><td class="r">ZM447439</td><td class="r">ZM-447439, ZM 447439</td><td class="r">Mitosis</td><td class="r">AURKA, AURKB</td><td class="r">9914412</td></tr><tr class="even  even"><td class="r">1059</td><td class="r">AZD8055</td><td class="r">AZD-8055</td><td class="r">PI3K/MTOR signaling</td><td class="r">MTORC1, MTORC2</td><td class="r">25262965</td></tr><tr class="odd  odd"><td class="r">1083</td><td class="r">Crizotinib</td><td class="r">Xalkori, PF2341066, PF-2341066, PF 2341066</td><td class="r">RTK signaling</td><td class="r">MET, ALK, ROS1</td><td class="r">11626560</td></tr><tr class="even  even"><td class="r">1449</td><td class="r">AZD1208</td><td class="r">SN1028350371</td><td class="r">Other, kinases</td><td class="r">PIM1, PIM2, PIM3</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1580</td><td class="r">GSK2801</td><td class="r">1-(7-(2-methylsulfonyl-phenyl)-4-propoxy-1-aza-bicyclo[4.3.0]nona-2,4,6,8-tetraen-9-yl)-ethanone</td><td class="r">Chromatin other</td><td class="r">-</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1776</td><td class="r">GSK2256098C</td><td class="r">-</td><td class="r">Cytoskeleton</td><td class="r">FAK1</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1783</td><td class="r">LMB_AB1</td><td class="r">-</td><td class="r">Other</td><td class="r">ADRA1A, ADRB1</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1804</td><td class="r">Acetalax</td><td class="r">Oxyphenisatin acetate</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1818</td><td class="r">Vincristine</td><td class="r">-</td><td class="r">-</td><td class="r">-</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1825</td><td class="r">Podophyllotoxin bromide</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1832</td><td class="r">729189</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1919</td><td class="r">Osimertinib</td><td class="r">AZD9291, AZD 9291, AZD-9291, Tagrisso, Mereletinib</td><td class="r">EGFR signaling</td><td class="r">EGFR</td><td class="r">71496458</td></tr><tr class="odd  odd"><td class="r">1936</td><td class="r">Savolitinib</td><td class="r">AZD6094, Volitinib, AZD-6094, AZD 6094</td><td class="r">RTK signaling</td><td class="r">MET</td><td class="r">68289010</td></tr><tr class="even  even"><td class="r">2044</td><td class="r">Pyridostatin</td><td class="r">-</td><td class="r">DNA replication</td><td class="r">G-quadruplex stabiliser</td><td class="r">25227847</td></tr><tr class="odd  odd"><td class="r">2047</td><td class="r">Ulixertinib</td><td class="r">BVD-523, VRT752271</td><td class="r">ERK MAPK signaling</td><td class="r">ERK1, ERK2</td><td class="r">11719003, 58641927</td></tr><tr class="even  even"><td class="r">2048</td><td class="r">Vinorelbine</td><td class="r">vinorelbine tartrate, Navelbine, Exelbine</td><td class="r">Mitosis</td><td class="r">Microtubule destabiliser</td><td class="r">5311497</td></tr><tr class="odd  odd"><td class="r">2112</td><td class="r">VTP-A</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="even  even"><td class="r">2149</td><td class="r">CT7033-2</td><td class="r">-</td><td class="r">Chromatin histone methylation</td><td class="r">KDM4A, KDM4C, KDM4E, KDM3A, KDM6B</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">2499</td><td class="r">N-acetyl cysteine</td><td class="r">NAC</td><td class="r">Metabolism</td><td class="r">Metabolism</td><td class="r">None</td></tr><tr class="even  even"><td class="r">1007</td><td class="r">Docetaxel</td><td class="r">RP-56976, Taxotere</td><td class="r">Mitosis</td><td class="r">Microtubule stabiliser</td><td class="r">148124</td></tr><tr class="odd  odd"><td class="r">1012</td><td class="r">Vorinostat</td><td class="r">Zolinza, SAHA, suberanilohydroxamic acid, suberoylanilide hydroxamic acid, MK-0683</td><td class="r">Chromatin histone acetylation</td><td class="r">HDAC inhibitor Class I, IIa, IIb, IV</td><td class="r">5311</td></tr><tr class="even  even"><td class="r">1016</td><td class="r">Temsirolimus</td><td class="r">CCI-779, Torisel</td><td class="r">PI3K/MTOR signaling</td><td class="r">MTOR</td><td class="r">6918289</td></tr><tr class="odd  odd"><td class="r">1248</td><td class="r">Daporinad</td><td class="r">APO866,  FK866, FK866</td><td class="r">Metabolism</td><td class="r">NAMPT</td><td class="r">6914657</td></tr><tr class="even  even"><td class="r">1249</td><td class="r">BMS-345541</td><td class="r">-</td><td class="r">Other</td><td class="r">IKK-1, IKK-2</td><td class="r">9926054</td></tr><tr class="odd  odd"><td class="r">1268</td><td class="r">XAV939</td><td class="r">NVP-XAV939, XAV-939, XAV 939</td><td class="r">WNT signaling</td><td class="r">TNKS1, TNKS2</td><td class="r">2726824</td></tr><tr class="even  even"><td class="r">1549</td><td class="r">Sapitinib</td><td class="r">AZD8931</td><td class="r">EGFR signaling</td><td class="r">EGFR, ERBB2, ERBB3</td><td class="r">11488320</td></tr><tr class="odd  odd"><td class="r">1561</td><td class="r">Taselisib</td><td class="r">GDC-0032, GDC0032, RG7604</td><td class="r">PI3K/MTOR signaling</td><td class="r">PI3K (beta sparing)</td><td class="r">51001932</td></tr><tr class="even  even"><td class="r">1853</td><td class="r">OF-1</td><td class="r">-</td><td class="r">Chromatin histone acetylation</td><td class="r">BRPF1B, BRPF2</td><td class="r">35397514</td></tr><tr class="odd  odd"><td class="r">1072</td><td class="r">Avagacestat</td><td class="r">BMS-708163, BMS 708163</td><td class="r">Other</td><td class="r">Amyloid beta20, Amyloid beta40</td><td class="r">46883536</td></tr><tr class="even  even"><td class="r">1073</td><td class="r">5-Fluorouracil</td><td class="r">5-FU</td><td class="r">Other</td><td class="r">Antimetabolite (DNA &amp; RNA)</td><td class="r">3385</td></tr><tr class="odd  odd"><td class="r">1177</td><td class="r">Niraparib</td><td class="r">MK-4827, MK4827</td><td class="r">Genome integrity</td><td class="r">PARP1, PARP2</td><td class="r">24958200</td></tr><tr class="even  even"><td class="r">1463</td><td class="r">AZD1332</td><td class="r">SN1061387896</td><td class="r">RTK signaling</td><td class="r">NTRK1, NTRK2, NTRK3</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1733</td><td class="r">ULK1_4989</td><td class="r">SN1049257073, ULK1_4989</td><td class="r">Other, kinases</td><td class="r">ULK1</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1734</td><td class="r">VSP34_8731</td><td class="r">SN1049998399, VSP34_8731</td><td class="r">Other</td><td class="r">VSP34</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1736</td><td class="r">Selumetinib</td><td class="r">SN1103949345, AZD1480</td><td class="r">ERK MAPK signaling</td><td class="r">MEK1, MEK2</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1784</td><td class="r">LMB_AB2</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">GADD34</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1821</td><td class="r">765771</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1828</td><td class="r">720427</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1830</td><td class="r">Gallibiscoquinazole</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1839</td><td class="r">615590</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1843</td><td class="r">LMP744</td><td class="r">MJ-III-65</td><td class="r">DNA replication</td><td class="r">TOP1</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1847</td><td class="r">BEN</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1922</td><td class="r">Cediranib</td><td class="r">AZD2171, AZD 2171, AZD-2171, Recentin</td><td class="r">RTK signaling</td><td class="r">VEGFR, FLT1, FLT2, FLT3, FLT4, KIT, PDGFRB</td><td class="r">9933475</td></tr><tr class="even  even"><td class="r">1930</td><td class="r">Telomerase Inhibitor IX</td><td class="r">MST-312, MST 312, MST312</td><td class="r">Genome integrity</td><td class="r">Telomerase</td><td class="r">10385095</td></tr><tr class="odd  odd"><td class="r">1939</td><td class="r">UMI-77</td><td class="r">UMI 77, UMI-77</td><td class="r">Apoptosis regulation</td><td class="r">MCL1</td><td class="r">992586</td></tr><tr class="even  even"><td class="r">1941</td><td class="r">Sepantronium bromide</td><td class="r">YM155, YM-155, YM 155</td><td class="r">Apoptosis regulation</td><td class="r">BIRC5</td><td class="r">11178236</td></tr><tr class="odd  odd"><td class="r">1996</td><td class="r">MIM1</td><td class="r">MIM 1, MIM-1</td><td class="r">Apoptosis regulation</td><td class="r">MCL1</td><td class="r">16241412</td></tr><tr class="even  even"><td class="r">2010</td><td class="r">HKMTI-1-005</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">2057</td><td class="r">Remodelin</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="even  even"><td class="r">2157</td><td class="r">A-366</td><td class="r">-</td><td class="r">Chromatin histone methylation</td><td class="r">EHMT1, EHMT2</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">2171</td><td class="r">BMS-754807</td><td class="r">BMS-754807</td><td class="r">IGF1R signaling</td><td class="r">IGF1R, IR</td><td class="r">-</td></tr><tr class="even  even"><td class="r">2177</td><td class="r">SGC0946</td><td class="r">-</td><td class="r">Chromatin histone methylation</td><td class="r">DOT1L</td><td class="r">56962337</td></tr><tr class="odd  odd"><td class="r">1003</td><td class="r">Camptothecin</td><td class="r">Camptothecine, (+)-Camptothecin</td><td class="r">DNA replication</td><td class="r">TOP1</td><td class="r">24360</td></tr><tr class="even  even"><td class="r">1004</td><td class="r">Vinblastine</td><td class="r">Velban</td><td class="r">Mitosis</td><td class="r">Microtubule destabiliser</td><td class="r">6710780</td></tr><tr class="odd  odd"><td class="r">1010</td><td class="r">Gefitinib</td><td class="r">ZD-1839, Iressa</td><td class="r">EGFR signaling</td><td class="r">EGFR</td><td class="r">123631</td></tr><tr class="even  even"><td class="r">1023</td><td class="r">GW441756</td><td class="r">GW 441756</td><td class="r">RTK signaling</td><td class="r">NTRK1</td><td class="r">9943465</td></tr><tr class="odd  odd"><td class="r">1025</td><td class="r">SB216763</td><td class="r">SB-216763, SB 216763</td><td class="r">WNT signaling</td><td class="r">GSK3A, GSK3B</td><td class="r">176158</td></tr><tr class="even  even"><td class="r">1029</td><td class="r">Motesanib</td><td class="r">AMG-706, AMG 706, AMG706</td><td class="r">RTK signaling</td><td class="r">VEGFR, RET, KIT, PDGFR</td><td class="r">11667893</td></tr><tr class="odd  odd"><td class="r">1034</td><td class="r">Staurosporine</td><td class="r">-</td><td class="r">RTK signaling</td><td class="r">Broad spectrum kinase inhibitor</td><td class="r">several</td></tr><tr class="even  even"><td class="r">1038</td><td class="r">NU7441</td><td class="r">KU-57788, NU-7432, NU-7741</td><td class="r">Genome integrity</td><td class="r">DNAPK</td><td class="r">11327430</td></tr><tr class="odd  odd"><td class="r">1031</td><td class="r">Elesclomol</td><td class="r">STA-4783</td><td class="r">Protein stability and degradation</td><td class="r">HSP90</td><td class="r">300471</td></tr><tr class="even  even"><td class="r">1096</td><td class="r">Tozasertib</td><td class="r">MK 0457,MK-0457,MK-045, VX-680 VX 680 VX-68</td><td class="r">Mitosis</td><td class="r">AURKA, AURKB, AURKC, others</td><td class="r">5494449</td></tr><tr class="odd  odd"><td class="r">1194</td><td class="r">SB505124</td><td class="r">SB 505124, SB505124</td><td class="r">RTK signaling</td><td class="r">TGFBR1, ACVR1B, ACVR1C</td><td class="r">9858940</td></tr><tr class="even  even"><td class="r">1239</td><td class="r">YK-4-279</td><td class="r">YK 4-279</td><td class="r">Other</td><td class="r">RNA helicase A</td><td class="r">44632017</td></tr><tr class="odd  odd"><td class="r">1910</td><td class="r">ABT737</td><td class="r">-</td><td class="r">Apoptosis regulation</td><td class="r">BCL2, BCL-XL,  BCL-W, BCL-B, BFL1</td><td class="r">11228183</td></tr><tr class="even  even"><td class="r">1911</td><td class="r">Dactinomycin</td><td class="r">Actinomycin D, Cosmegen, Lyovac, Meractinomycin, Ac-De, ActD</td><td class="r">Other</td><td class="r">RNA polymerase</td><td class="r">several</td></tr><tr class="odd  odd"><td class="r">1043</td><td class="r">JNK Inhibitor VIII</td><td class="r">-</td><td class="r">JNK and p38 signaling</td><td class="r">JNK</td><td class="r">11624601</td></tr><tr class="even  even"><td class="r">1058</td><td class="r">Pictilisib</td><td class="r">GDC-0941, GDC0941, RG-7621</td><td class="r">PI3K/MTOR signaling</td><td class="r">PI3K (class 1)</td><td class="r">17755052</td></tr><tr class="odd  odd"><td class="r">1069</td><td class="r">EHT-1864</td><td class="r">EHT 1864</td><td class="r">Cytoskeleton</td><td class="r">RAC1, RAC2, RAC3</td><td class="r">9938202</td></tr><tr class="even  even"><td class="r">1085</td><td class="r">Sorafenib</td><td class="r">Nexavar, 284461-73-0, BAY 43-9006</td><td class="r">Other, kinases</td><td class="r">PDGFR, KIT, VEGFR, RAF</td><td class="r">216239</td></tr><tr class="odd  odd"><td class="r">1089</td><td class="r">Oxaliplatin</td><td class="r">Eloxatin</td><td class="r">DNA replication</td><td class="r">DNA alkylating agent</td><td class="r">5310940</td></tr><tr class="even  even"><td class="r">1093</td><td class="r">GSK1904529A</td><td class="r">GSK-1904529A, GSK 1904529A</td><td class="r">IGF1R signaling</td><td class="r">IGF1R, IR</td><td class="r">25124816</td></tr><tr class="odd  odd"><td class="r">1133</td><td class="r">Serdemetan</td><td class="r">JNJ-26854165</td><td class="r">p53 pathway</td><td class="r">MDM2</td><td class="r">11609586</td></tr><tr class="even  even"><td class="r">1168</td><td class="r">Erlotinib</td><td class="r">Tarceva, RG-1415, CP-358774, OSI-774, Ro-508231, R-1415</td><td class="r">EGFR signaling</td><td class="r">EGFR</td><td class="r">176870</td></tr><tr class="odd  odd"><td class="r">1179</td><td class="r">MK-1775</td><td class="r">AZD1775, Adavosertib</td><td class="r">Cell cycle</td><td class="r">WEE1, PLK1</td><td class="r">24856436</td></tr><tr class="even  even"><td class="r">1190</td><td class="r">Gemcitabine</td><td class="r">Gemzar, LY-188011</td><td class="r">DNA replication</td><td class="r">Pyrimidine antimetabolite</td><td class="r">60750</td></tr><tr class="odd  odd"><td class="r">1378</td><td class="r">Bleomycin (50 uM)</td><td class="r">-</td><td class="r">DNA replication</td><td class="r">dsDNA break induction</td><td class="r">5460769</td></tr><tr class="even  even"><td class="r">1583</td><td class="r">GSK-LSD1</td><td class="r">N-((1R,2S)-2-phenylcyclopropyl)piperidin-4-amine</td><td class="r">Chromatin histone methylation</td><td class="r">KDM1</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1708</td><td class="r">CDK9_5576</td><td class="r">SN1047226266, CDK9_5576</td><td class="r">Cell cycle</td><td class="r">CDK9</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1738</td><td class="r">IGF1R_3801</td><td class="r">SN1051640224, IGF1R_3801</td><td class="r">IGF1R signaling</td><td class="r">IGFR1</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1739</td><td class="r">JAK_8517</td><td class="r">SN1066590414, JAK_8517</td><td class="r">Other, kinases</td><td class="r">JAK1, JAK2</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1777</td><td class="r">GSK2276186C</td><td class="r">-</td><td class="r">Other, kinases</td><td class="r">JAK1, JAK2, JAK3</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1778</td><td class="r">GSK2110183B</td><td class="r">-</td><td class="r">PI3K/MTOR signaling</td><td class="r">AKT1, AKT2, AKT3</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1780</td><td class="r">GSK3337463A</td><td class="r">-</td><td class="r">Other, kinases</td><td class="r">NIK</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1802</td><td class="r">Zoledronate</td><td class="r">Zoledronic acid, Zometa, Reclast</td><td class="r">Other</td><td class="r"> </td><td class="r">-</td></tr><tr class="even  even"><td class="r">1809</td><td class="r">Teniposide</td><td class="r">Vumon</td><td class="r">DNA replication</td><td class="r">-</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1816</td><td class="r">Fulvestrant</td><td class="r">-</td><td class="r">Hormone-related</td><td class="r">ESR</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1820</td><td class="r">123829</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1824</td><td class="r">123138</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1829</td><td class="r">667880</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1831</td><td class="r">L-Oxonoreleagnine</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1833</td><td class="r">741909</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1840</td><td class="r">630600</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1926</td><td class="r">GNE-317</td><td class="r">GNE317, GNE 317</td><td class="r">PI3K/MTOR signaling</td><td class="r">PI3Kalpha</td><td class="r">70676303</td></tr><tr class="odd  odd"><td class="r">1940</td><td class="r">WIKI4</td><td class="r">WIKI-4, WIKI 4</td><td class="r">WNT signaling</td><td class="r">TNKS1, TNKS2</td><td class="r">2984337</td></tr><tr class="even  even"><td class="r">2045</td><td class="r">AMG-319</td><td class="r">AMG319</td><td class="r">PI3K/MTOR signaling</td><td class="r">PI3K (beta sparing)</td><td class="r">68947304</td></tr><tr class="odd  odd"><td class="r">2170</td><td class="r">AT13148</td><td class="r">-</td><td class="r">PI3K/MTOR signaling</td><td class="r">AKT1</td><td class="r">-</td></tr><tr class="even  even"><td class="r">2498</td><td class="r">alpha-lipoic acid</td><td class="r">aLA</td><td class="r">Metabolism</td><td class="r">Metabolism</td><td class="r">None</td></tr><tr class="odd  odd"><td class="r">1022</td><td class="r">AZD7762</td><td class="r">AZD-7762, AZD 7762</td><td class="r">Cell cycle</td><td class="r">CHEK1, CHEK2</td><td class="r">11152667</td></tr><tr class="even  even"><td class="r">2169</td><td class="r">AZD6482</td><td class="r">AZD 6482, AZD-6482, AK-55409</td><td class="r">PI3K/MTOR signaling</td><td class="r">PI3Kbeta</td><td class="r">44137675</td></tr><tr class="odd  odd"><td class="r">2174</td><td class="r">IOX2</td><td class="r">IOX-2, IOX 2, AK176060</td><td class="r">Other</td><td class="r">EGLN1</td><td class="r">54685215</td></tr><tr class="even  even"><td class="r">1033</td><td class="r">Vismodegib</td><td class="r">GDC0449, Erivedge</td><td class="r">Other</td><td class="r">SMO</td><td class="r">24776445</td></tr><tr class="odd  odd"><td class="r">1037</td><td class="r">BX795</td><td class="r">BX-795</td><td class="r">Other, kinases</td><td class="r">TBK1, PDK1 (PDPK1), IKK, AURKB, AURKC</td><td class="r">10077147</td></tr><tr class="even  even"><td class="r">1200</td><td class="r">Fulvestrant</td><td class="r">Faslodex, ICI-182780</td><td class="r">Hormone-related</td><td class="r">ESR</td><td class="r">104741</td></tr><tr class="odd  odd"><td class="r">1243</td><td class="r">Piperlongumine</td><td class="r">Piplartine</td><td class="r">Other</td><td class="r">Induces reactive oxygen species</td><td class="r">637858</td></tr><tr class="even  even"><td class="r">1250</td><td class="r">AZ960</td><td class="r">AZ 960, AZ-960</td><td class="r">Other, kinases</td><td class="r">JAK2, JAK3</td><td class="r">25099184</td></tr><tr class="odd  odd"><td class="r">1560</td><td class="r">Alpelisib</td><td class="r">BYL719, BYL-719, NVP-BYL719</td><td class="r">PI3K/MTOR signaling</td><td class="r">PI3Kalpha</td><td class="r">56649450</td></tr><tr class="even  even"><td class="r">1631</td><td class="r">PRT062607</td><td class="r">P505-15, PRT-2607, BIIB-057</td><td class="r">Other, kinases</td><td class="r">SYK</td><td class="r">44462758</td></tr><tr class="odd  odd"><td class="r">1632</td><td class="r">Ribociclib</td><td class="r">LEE011, NVP-LEE011, LEE011-BBA</td><td class="r">Cell cycle</td><td class="r">CDK4, CDK6</td><td class="r">44631912</td></tr><tr class="even  even"><td class="r">1635</td><td class="r">Picolinici-acid</td><td class="r">Picolinate</td><td class="r">Other</td><td class="r">Inflammatory related</td><td class="r">1018</td></tr><tr class="odd  odd"><td class="r">1046</td><td class="r">Wee1 Inhibitor</td><td class="r">681640, Wee1 Inhibitor</td><td class="r">Cell cycle</td><td class="r">WEE1, CHEK1</td><td class="r">10384072</td></tr><tr class="even  even"><td class="r">1068</td><td class="r">Obatoclax Mesylate</td><td class="r">GX15-070MS, Obatoclax, GX15-070</td><td class="r">Apoptosis regulation</td><td class="r">BCL2, BCL-XL, BCL-W, MCL1</td><td class="r">11404337</td></tr><tr class="odd  odd"><td class="r">1180</td><td class="r">Dinaciclib</td><td class="r">SCH727965, SCH 727965, SCH-727-965, MK-7965</td><td class="r">Cell cycle</td><td class="r">CDK1, CDK2, CDK5, CDK9</td><td class="r">46926350</td></tr><tr class="even  even"><td class="r">1730</td><td class="r">PAK_5339</td><td class="r">SN1046290829, PAK_5339</td><td class="r">Cytoskeleton</td><td class="r">PAK1, PAK2</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1803</td><td class="r">Acetalax</td><td class="r">Oxyphenisatin acetate</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1814</td><td class="r">Nelarabine</td><td class="r">Atriance, Arranon</td><td class="r">DNA replication</td><td class="r">-</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1827</td><td class="r">Dihydrorotenone</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1836</td><td class="r">150412</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">2002</td><td class="r">N30652-18-1</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="even  even"><td class="r">2110</td><td class="r">GSK591</td><td class="r">EPZ015866, GSK3203591</td><td class="r">Chromatin histone methylation</td><td class="r">PMRT5</td><td class="r">117072552</td></tr><tr class="odd  odd"><td class="r">2111</td><td class="r">VE821</td><td class="r">VE 821, VE-821</td><td class="r">Genome integrity</td><td class="r">ATR</td><td class="r">51000408</td></tr><tr class="even  even"><td class="r">2172</td><td class="r">JQ1</td><td class="r">JQ-1, (+)-JQ-1</td><td class="r">Chromatin other</td><td class="r">BRD2, BRD3, BRD4, BRDT</td><td class="r">46907787</td></tr><tr class="odd  odd"><td class="r">2359</td><td class="r">GSK2830371</td><td class="r">None</td><td class="r">Other</td><td class="r">WIP1</td><td class="r">70983932</td></tr><tr class="even  even"><td class="r">2361</td><td class="r">THR-102</td><td class="r">WIMM synthesis</td><td class="r">PI3K/MTOR signaling</td><td class="r">Mutant RAS</td><td class="r">None</td></tr><tr class="odd  odd"><td class="r">1011</td><td class="r">Navitoclax</td><td class="r">ABT-263, ABT263, ABT 263</td><td class="r">Apoptosis regulation</td><td class="r">BCL2, BCL-XL, BCL-W</td><td class="r">24978538</td></tr><tr class="even  even"><td class="r">1014</td><td class="r">Refametinib</td><td class="r">RDEA119, BAY-86-9766, BAY 869766</td><td class="r">ERK MAPK signaling</td><td class="r">MEK1, MEK2</td><td class="r">44182295</td></tr><tr class="odd  odd"><td class="r">1024</td><td class="r">Lestaurtinib</td><td class="r">CEP-701, SP-924, SPM-924, A-154475, KT-555</td><td class="r">Other, kinases</td><td class="r">FLT3, JAK2, NTRK1, NTRK2, NTRK3</td><td class="r">126565</td></tr><tr class="even  even"><td class="r">2175</td><td class="r">CHIR-99021</td><td class="r">CT 99021, CHIR99021, CHIR 99021</td><td class="r">WNT signaling</td><td class="r">GSK3A, GSK3B</td><td class="r">9956119</td></tr><tr class="odd  odd"><td class="r">1237</td><td class="r">EPZ004777</td><td class="r">EPZ-004777</td><td class="r">Chromatin histone methylation</td><td class="r">DOT1L</td><td class="r">56962336</td></tr><tr class="even  even"><td class="r">1373</td><td class="r">Dabrafenib</td><td class="r">GSK2118436, Tafinlar</td><td class="r">ERK MAPK signaling</td><td class="r">BRAF</td><td class="r">44462760</td></tr><tr class="odd  odd"><td class="r">1529</td><td class="r">Pevonedistat</td><td class="r">MLN4924, MLN 4924, MLN-4924</td><td class="r">Other</td><td class="r">NAE</td><td class="r">16720766</td></tr><tr class="even  even"><td class="r">1598</td><td class="r">LGK974</td><td class="r">LGK-974</td><td class="r">WNT signaling</td><td class="r">PORCN</td><td class="r">46926973</td></tr><tr class="odd  odd"><td class="r">1613</td><td class="r">VE-822</td><td class="r">VE 822, VE822, Berzosertib</td><td class="r">Genome integrity</td><td class="r">ATR</td><td class="r">59472121</td></tr><tr class="even  even"><td class="r">1618</td><td class="r">GSK2606414</td><td class="r">AK175551</td><td class="r">Metabolism</td><td class="r">PERK</td><td class="r">53469448</td></tr><tr class="odd  odd"><td class="r">1629</td><td class="r">ML323</td><td class="r">GTPL7898, AOB2313,</td><td class="r">Protein stability and degradation</td><td class="r">USP1, UAF1</td><td class="r">60167849</td></tr><tr class="even  even"><td class="r">1849</td><td class="r">Sabutoclax</td><td class="r">BI-97C1</td><td class="r">Apoptosis regulation</td><td class="r">BCL2,  BCL-XL,  BFL1, MCL1</td><td class="r">46236925</td></tr><tr class="odd  odd"><td class="r">1048</td><td class="r">Mirin</td><td class="r">-</td><td class="r">Genome integrity</td><td class="r">MRE11</td><td class="r">none</td></tr><tr class="even  even"><td class="r">1051</td><td class="r">Alisertib</td><td class="r">MLN8237</td><td class="r">Mitosis</td><td class="r">AURKA</td><td class="r">24771867</td></tr><tr class="odd  odd"><td class="r">1057</td><td class="r">Dactolisib</td><td class="r">NVP-BEZ235, BEZ235</td><td class="r">PI3K/MTOR signaling</td><td class="r">PI3K (class 1), MTORC1, MTORC2</td><td class="r">11977753</td></tr><tr class="even  even"><td class="r">1061</td><td class="r">SB590885</td><td class="r">SB-590885</td><td class="r">ERK MAPK signaling</td><td class="r">BRAF</td><td class="r">11316960</td></tr><tr class="odd  odd"><td class="r">1062</td><td class="r">Selumetinib</td><td class="r">AZD6244, AZD-6244, ARRY-886</td><td class="r">ERK MAPK signaling</td><td class="r">MEK1, MEK2</td><td class="r">10127622</td></tr><tr class="even  even"><td class="r">1091</td><td class="r">BMS-536924</td><td class="r">BMS 536924</td><td class="r">IGF1R signaling</td><td class="r">IGF1R, IR</td><td class="r">10390396</td></tr><tr class="odd  odd"><td class="r">1175</td><td class="r">Rucaparib</td><td class="r">PF-01367338, AG-014699, AG-14447, AG-14699</td><td class="r">Genome integrity</td><td class="r">PARP1, PARP2</td><td class="r">9931953</td></tr><tr class="even  even"><td class="r">1191</td><td class="r">Bortezomib</td><td class="r">PS-341, LDP-341, Velcade</td><td class="r">Protein stability and degradation</td><td class="r">Proteasome</td><td class="r">387447</td></tr><tr class="odd  odd"><td class="r">1586</td><td class="r">BDILV000379a</td><td class="r">4-(4-(1-methyl-1H-pyrazol-4-yl)-1-(2-(1-methyl-1H-pyrazol-4-yl)ethyl)-4,5-dihydro-1H-imidazol-5-yl)benzonitrile</td><td class="r">Chromatin histone acetylation</td><td class="r">BAZ2A, BAZ2B</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1718</td><td class="r">JAK1_8709</td><td class="r">SN1049200060, JAK1_8709</td><td class="r">Other, kinases</td><td class="r">JAK1</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1732</td><td class="r">TAF1_5496</td><td class="r">SN1050393042, TAF1_5496</td><td class="r">Other</td><td class="r">TAF1</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1779</td><td class="r">GSK626616AC</td><td class="r">-</td><td class="r">Other, kinases</td><td class="r">DYRK1A</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1785</td><td class="r">LMB_AB3</td><td class="r">-</td><td class="r">Other</td><td class="r">PPP1R15B</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1806</td><td class="r">Oxaliplatin</td><td class="r">-</td><td class="r">DNA replication</td><td class="r">DNA alkylating agent</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1835</td><td class="r">Elephantin</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1838</td><td class="r">Sinularin</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1844</td><td class="r">776928</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1916</td><td class="r">AZD5363</td><td class="r">-</td><td class="r">Other, kinases</td><td class="r">AKT1, AKT2, AKT3, ROCK2</td><td class="r">25227436, 42602260</td></tr><tr class="odd  odd"><td class="r">1927</td><td class="r">GSK2578215A</td><td class="r">GSK 2578215A,  GSK-2578215A</td><td class="r">Other, kinases</td><td class="r">LRRK2</td><td class="r">68107965</td></tr><tr class="even  even"><td class="r">2000</td><td class="r">N27922-53-1</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">2046</td><td class="r">MK-8776</td><td class="r">SCH900776</td><td class="r">Cell cycle</td><td class="r">CHEK1, CHEK2, CDK2</td><td class="r">16224745</td></tr><tr class="even  even"><td class="r">2113</td><td class="r">VTP-B</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">2145</td><td class="r">PBD-288</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="even  even"><td class="r">2148</td><td class="r">POMHEX</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">2173</td><td class="r">PFI-1</td><td class="r">-</td><td class="r">Chromatin other</td><td class="r">BRD4</td><td class="r">71271629</td></tr><tr class="even  even"><td class="r">1013</td><td class="r">Nilotinib</td><td class="r">Tasigna, AMN 107</td><td class="r">ABL signaling</td><td class="r">ABL</td><td class="r">644241</td></tr><tr class="odd  odd"><td class="r">1032</td><td class="r">Afatinib</td><td class="r">BIBW2992, Tovok, Gilotrif</td><td class="r">EGFR signaling</td><td class="r">EGFR, ERBB2</td><td class="r">10184653</td></tr><tr class="even  even"><td class="r">1036</td><td class="r">PLX-4720</td><td class="r">PLX4720, PLX 4720</td><td class="r">ERK MAPK signaling</td><td class="r">BRAF</td><td class="r">24180719</td></tr><tr class="odd  odd"><td class="r">1149</td><td class="r">TW 37</td><td class="r">TW37, TW-37</td><td class="r">Apoptosis regulation</td><td class="r">BCL2, BCL-XL, MCL1</td><td class="r">11455910</td></tr><tr class="even  even"><td class="r">1009</td><td class="r">Tretinoin</td><td class="r">ATRA, Vesanoid, Renova, Atralin, Tretin-X, Avita</td><td class="r">Other</td><td class="r">Retinoic acid</td><td class="r">444795</td></tr><tr class="odd  odd"><td class="r">1039</td><td class="r">SL0101</td><td class="r">SL-0101, SL 0101-1</td><td class="r">Other, kinases</td><td class="r">RSK, AURKB, PIM1, PIM3</td><td class="r">10459196</td></tr><tr class="even  even"><td class="r">1199</td><td class="r">Tamoxifen</td><td class="r">Nolvadex, Soltamox, Zynoplex, ICI-46474, Kessar</td><td class="r">Hormone-related</td><td class="r">ESR1</td><td class="r">2733526</td></tr><tr class="odd  odd"><td class="r">1512</td><td class="r">Cyclophosphamide</td><td class="r">Cytoxan, Neosar, Cyclophosphamid, Procytox, Cyclophosphane</td><td class="r">DNA replication</td><td class="r">Alkylating agent</td><td class="r">2907</td></tr><tr class="even  even"><td class="r">1558</td><td class="r">Lapatinib</td><td class="r">Tykerb, Tyverb</td><td class="r">EGFR signaling</td><td class="r">EGFR, ERBB2</td><td class="r">208908</td></tr><tr class="odd  odd"><td class="r">1617</td><td class="r">AZD5582</td><td class="r">GTPL7710, AOB5560</td><td class="r">Apoptosis regulation</td><td class="r">XIAP, cIAP</td><td class="r">49847690</td></tr><tr class="even  even"><td class="r">1624</td><td class="r">I-BET-762</td><td class="r">GSK525762A</td><td class="r">Chromatin other</td><td class="r">BRD2, BRD3, BRD4</td><td class="r">46943432</td></tr><tr class="odd  odd"><td class="r">1852</td><td class="r">LY2109761</td><td class="r">5XE</td><td class="r">Other</td><td class="r">TGFB1</td><td class="r">11655119</td></tr><tr class="even  even"><td class="r">1862</td><td class="r">MG-132</td><td class="r">LLL cpd, MG 132, MG132</td><td class="r">Protein stability and degradation</td><td class="r">Proteasome, CAPN1</td><td class="r">462382</td></tr><tr class="odd  odd"><td class="r">1047</td><td class="r">Nutlin-3a (-)</td><td class="r">-</td><td class="r">p53 pathway</td><td class="r">MDM2</td><td class="r">11433190</td></tr><tr class="even  even"><td class="r">1067</td><td class="r">CCT007093</td><td class="r">-</td><td class="r">Cell cycle</td><td class="r">PPM1D</td><td class="r">2314623</td></tr><tr class="odd  odd"><td class="r">1582</td><td class="r">SGC-CBP30</td><td class="r">8-(3-chloro-4-methoxy-phenethyl)-4-(3,5-dimethyl-isoxazol-4-yl)-9-(2-(morpholin-4-yl)-propyl)-7,9-diaza-bicyclo[4.3.0]nona-1(6),2,4,7-tetraene</td><td class="r">Chromatin histone acetylation</td><td class="r">EP300, CBP</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1585</td><td class="r">BDF00022089a</td><td class="r">-</td><td class="r">Chromatin histone acetylation</td><td class="r">BRPF1, BRPF2, BRPF3</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1706</td><td class="r">AZD5153</td><td class="r">SN1045566955, AZD5153</td><td class="r">Chromatin other</td><td class="r">BRD4</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1712</td><td class="r">Eg5_9814</td><td class="r">SN1047613775, Eg5_9814</td><td class="r">Other</td><td class="r">KSP11</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1713</td><td class="r">ERK_2440</td><td class="r">SN1051032892, ERK_2440</td><td class="r">ERK MAPK signaling</td><td class="r">ERK1,ERK2</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1720</td><td class="r">AZD5991</td><td class="r">SN1049446612, AZD5991</td><td class="r">Apoptosis regulation</td><td class="r">MCL1</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1808</td><td class="r">Topotecan</td><td class="r">-</td><td class="r">DNA replication</td><td class="r">TOP1</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1810</td><td class="r">Mitoxantrone</td><td class="r">-</td><td class="r">DNA replication</td><td class="r">TOP2</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1811</td><td class="r">Dactinomycin</td><td class="r">Actinomycin D</td><td class="r">Other</td><td class="r">RNA polymerase</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1815</td><td class="r">Dacarbazine</td><td class="r">DTIC</td><td class="r">Other</td><td class="r">CP11A</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1866</td><td class="r">BDP-00009066</td><td class="r">-</td><td class="r">Cytoskeleton</td><td class="r">MRCKB_HUMAN</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1918</td><td class="r">AZD8186</td><td class="r">AZD 8186, AZD-8186</td><td class="r">PI3K/MTOR signaling</td><td class="r">PI3Kalpha, PI3Kbeta</td><td class="r">52913813</td></tr><tr class="odd  odd"><td class="r">1924</td><td class="r">Ipatasertib</td><td class="r">GDC0068, GDC 0068, GDC-0068</td><td class="r">PI3K/MTOR signaling</td><td class="r">AKT1, AKT, AKT3</td><td class="r">24788740</td></tr><tr class="even  even"><td class="r">1925</td><td class="r">GDC0810</td><td class="r">GDC-0810, GDC 0810</td><td class="r">Hormone-related</td><td class="r">ESR1, ESR2</td><td class="r">56941241</td></tr><tr class="odd  odd"><td class="r">1998</td><td class="r">BPD-00008900</td><td class="r">-</td><td class="r">Other</td><td class="r">-</td><td class="r">-</td></tr><tr class="even  even"><td class="r">2037</td><td class="r">GSK343</td><td class="r">-</td><td class="r">Chromatin histone methylation</td><td class="r">EZH2</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">2039</td><td class="r">AGK2</td><td class="r">-</td><td class="r">Chromatin histone acetylation</td><td class="r">BAZ2A, BAZ2B</td><td class="r">-</td></tr><tr class="even  even"><td class="r">2043</td><td class="r">BIBR-1532</td><td class="r">-</td><td class="r">Genome integrity</td><td class="r">TERT</td><td class="r">9927531</td></tr><tr class="odd  odd"><td class="r">2055</td><td class="r">Mycophenolic acid</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="even  even"><td class="r">2109</td><td class="r">AZ6102</td><td class="r">-</td><td class="r">WNT signaling</td><td class="r">TNKS1, TNKS2</td><td class="r">91663328</td></tr><tr class="odd  odd"><td class="r">2158</td><td class="r">CPI-637</td><td class="r">-</td><td class="r">Chromatin histone methylation</td><td class="r">EP300</td><td class="r">-</td></tr><tr class="even  even"><td class="r">2438</td><td class="r">ascorbate (vitamin C)</td><td class="r">back-up solution from YWKim</td><td class="r">Other</td><td class="r">anti-oxidant proteins</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1005</td><td class="r">Cisplatin</td><td class="r">cis-Diammineplatinum(II) dichloride, Platinol, CIS-DDP</td><td class="r">DNA replication</td><td class="r">DNA crosslinker</td><td class="r">84691</td></tr><tr class="even  even"><td class="r">1006</td><td class="r">Cytarabine</td><td class="r">Ara-Cytidine, Arabinosyl Cytosine, U-19920</td><td class="r">Other</td><td class="r">Antimetabolite</td><td class="r">6253</td></tr><tr class="odd  odd"><td class="r">1018</td><td class="r">Veliparib</td><td class="r">ABT-888, ABT888, ABT 888</td><td class="r">Genome integrity</td><td class="r">PARP1, PARP2</td><td class="r">11960529</td></tr><tr class="even  even"><td class="r">1020</td><td class="r">Lenalidomide</td><td class="r">CDC-501, CC-5013, Revlimid</td><td class="r">Protein stability and degradation</td><td class="r">CRBN</td><td class="r">216326</td></tr><tr class="odd  odd"><td class="r">1026</td><td class="r">Tanespimycin</td><td class="r">17-AAG, BMS-722782</td><td class="r">Protein stability and degradation</td><td class="r">HSP90</td><td class="r">6505803</td></tr><tr class="even  even"><td class="r">1192</td><td class="r">GSK269962A</td><td class="r">GSK 269962A</td><td class="r">Cytoskeleton</td><td class="r">ROCK1, ROCK2</td><td class="r">16095342</td></tr><tr class="odd  odd"><td class="r">1494</td><td class="r">SN-38</td><td class="r">7-Ethyl-10-Hydroxy-Camptothecin, SN 38</td><td class="r">DNA replication</td><td class="r">TOP1</td><td class="r">104842</td></tr><tr class="even  even"><td class="r">1507</td><td class="r">Ruxolitinib</td><td class="r">INCB-18424, Ruxolitinib Phosphate, Jakafi</td><td class="r">Other, kinases</td><td class="r">JAK1, JAK2</td><td class="r">25126798</td></tr><tr class="odd  odd"><td class="r">1510</td><td class="r">Linsitinib</td><td class="r">OSI-906, ASP-7487</td><td class="r">IGF1R signaling</td><td class="r">IGF1R</td><td class="r">11640390</td></tr><tr class="even  even"><td class="r">1511</td><td class="r">Epirubicin</td><td class="r">Ellence, Farmorubicin, IMI-28</td><td class="r">DNA replication</td><td class="r">Anthracycline</td><td class="r">41867</td></tr><tr class="odd  odd"><td class="r">1553</td><td class="r">Uprosertib</td><td class="r">GSK2141795, GSK2141795C, GSK-2141795</td><td class="r">PI3K/MTOR signaling</td><td class="r">AKT1, AKT2, AKT3</td><td class="r">51042438</td></tr><tr class="even  even"><td class="r">1563</td><td class="r">EPZ5676</td><td class="r">EPZ-5676, Pinometostat</td><td class="r">Chromatin histone methylation</td><td class="r">DOT1L</td><td class="r">57345410</td></tr><tr class="odd  odd"><td class="r">1564</td><td class="r">SCH772984</td><td class="r">CS-1421</td><td class="r">ERK MAPK signaling</td><td class="r">ERK1, ERK2</td><td class="r">24866313</td></tr><tr class="even  even"><td class="r">1576</td><td class="r">IWP-2</td><td class="r">Wnt Inhibitor IWP-2</td><td class="r">WNT signaling</td><td class="r">PORCN</td><td class="r">2155128</td></tr><tr class="odd  odd"><td class="r">1578</td><td class="r">Leflunomide</td><td class="r">Sulol, SU-101, HWA-486, Arava</td><td class="r">DNA replication</td><td class="r">Pyrimidine synthesis inhibitor</td><td class="r">3899</td></tr><tr class="even  even"><td class="r">1593</td><td class="r">Entinostat</td><td class="r">MS-275</td><td class="r">Chromatin histone acetylation</td><td class="r">HDAC1, HDAC3</td><td class="r">4261</td></tr><tr class="odd  odd"><td class="r">1614</td><td class="r">WZ4003</td><td class="r">WZ 4003</td><td class="r">Other, kinases</td><td class="r">NUAK1, NUAK2</td><td class="r">72200024</td></tr><tr class="even  even"><td class="r">1621</td><td class="r">PCI-34051</td><td class="r">AK298746</td><td class="r">Chromatin histone acetylation</td><td class="r">HDAC8, HDAC6, HDAC1</td><td class="r">24753719</td></tr><tr class="odd  odd"><td class="r">1625</td><td class="r">RVX-208</td><td class="r">1044870-39-4, Apabetalone</td><td class="r">Chromatin other</td><td class="r">BRD4</td><td class="r">24871506</td></tr><tr class="even  even"><td class="r">1630</td><td class="r">Entospletinib</td><td class="r">GS-9973</td><td class="r">Other, kinases</td><td class="r">SYK</td><td class="r">59473233</td></tr><tr class="odd  odd"><td class="r">1786</td><td class="r">AZD4547</td><td class="r">1035270-39-3</td><td class="r">RTK signaling</td><td class="r">FGFR1, FGFR2, FGFR3</td><td class="r">51039095</td></tr><tr class="even  even"><td class="r">1855</td><td class="r">KRAS (G12C) Inhibitor-12</td><td class="r">GTPL8020</td><td class="r">ERK MAPK signaling</td><td class="r">KRAS (G12C)</td><td class="r">73555129</td></tr><tr class="odd  odd"><td class="r">1908</td><td class="r">Ulixertinib</td><td class="r">BVD-523, VRT752271</td><td class="r">ERK MAPK signaling</td><td class="r">ERK1, ERK2</td><td class="r">11719003, 58641927</td></tr><tr class="even  even"><td class="r">1042</td><td class="r">Doramapimod</td><td class="r">BIRB-796, BIRB 796</td><td class="r">JNK and p38 signaling</td><td class="r">p38, JNK2</td><td class="r">156422</td></tr><tr class="odd  odd"><td class="r">1049</td><td class="r">PD173074</td><td class="r">PD-173074, PD 173074</td><td class="r">RTK signaling</td><td class="r">FGFR1, FGFR2, FGFR3</td><td class="r">1401</td></tr><tr class="even  even"><td class="r">1052</td><td class="r">RO-3306</td><td class="r">-</td><td class="r">Cell cycle</td><td class="r">CDK1</td><td class="r">44450571</td></tr><tr class="odd  odd"><td class="r">1053</td><td class="r">MK-2206</td><td class="r">MK 2206, MK2206</td><td class="r">PI3K/MTOR signaling</td><td class="r">AKT1, AKT2</td><td class="r">46930998</td></tr><tr class="even  even"><td class="r">1054</td><td class="r">Palbociclib</td><td class="r">PD0332991, PD-0332991, PF-00080665-73</td><td class="r">Cell cycle</td><td class="r">CDK4, CDK6</td><td class="r">5330286</td></tr><tr class="odd  odd"><td class="r">1060</td><td class="r">PD0325901</td><td class="r">PD-0325901, PD 0325901</td><td class="r">ERK MAPK signaling</td><td class="r">MEK1, MEK2</td><td class="r">9826528</td></tr><tr class="even  even"><td class="r">1079</td><td class="r">Dasatinib</td><td class="r">BMS-354825-03, BMS-354825, Sprycel</td><td class="r">Other, kinases</td><td class="r">ABL, SRC, Ephrins, PDGFR, KIT</td><td class="r">3062316</td></tr><tr class="odd  odd"><td class="r">1080</td><td class="r">Paclitaxel</td><td class="r">BMS-181339-01, Taxol, Onxol, Paxene, Praxel, Abraxane</td><td class="r">Mitosis</td><td class="r">Microtubule stabiliser</td><td class="r">36314</td></tr><tr class="even  even"><td class="r">1086</td><td class="r">BI-2536</td><td class="r">-</td><td class="r">Cell cycle</td><td class="r">PLK1, PLK2, PLK3</td><td class="r">11364421</td></tr><tr class="odd  odd"><td class="r">1088</td><td class="r">Irinotecan</td><td class="r">Camptosar, (+)-Irinotecan, Irinotecanum, irinotecan hydrochloride</td><td class="r">DNA replication</td><td class="r">TOP1</td><td class="r">60838</td></tr><tr class="even  even"><td class="r">1401</td><td class="r">AZD5438</td><td class="r">SN1057898678</td><td class="r">Cell cycle</td><td class="r">CDK2</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1428</td><td class="r">IAP_5620</td><td class="r">SN1043546339</td><td class="r">Other</td><td class="r">IAP</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1441</td><td class="r">AZD2014</td><td class="r">SN1103949359</td><td class="r">PI3K/MTOR signaling</td><td class="r">mTORC1, mTORC2</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1581</td><td class="r">Bromosporine</td><td class="r">ethyl N-[6-(3-methanesulfonamido-4-methylphenyl)-3-methyl-[1,2,4]triazolo[4,3-b]pyridazin-8-yl]carbamate</td><td class="r">Chromatin other</td><td class="r">CECR2, BRD2, BRD4, BRD9</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1709</td><td class="r">CDK9_5038</td><td class="r">SN1047483750, CDK9_5038</td><td class="r">Cell cycle</td><td class="r">CDK9</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1714</td><td class="r">ERK_6604</td><td class="r">SN1047587618, ERK_6604</td><td class="r">ERK MAPK signaling</td><td class="r">ERK1,ERK2</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1782</td><td class="r">GSK2830371A</td><td class="r">-</td><td class="r">Other</td><td class="r">PPM1D</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1807</td><td class="r">Carmustine</td><td class="r">-</td><td class="r">-</td><td class="r">-</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1813</td><td class="r">Fludarabine</td><td class="r">Fludara</td><td class="r">DNA replication</td><td class="r">Antimetabolite</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1819</td><td class="r">Docetaxel</td><td class="r">RP-56976, Taxotere</td><td class="r">Mitosis</td><td class="r">Microtubule stabiliser</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1826</td><td class="r">50869</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">1845</td><td class="r">Schweinfurthin A</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="even  even"><td class="r">1917</td><td class="r">AZD6738</td><td class="r">AZD 6738, AZD-6738</td><td class="r">Genome integrity</td><td class="r">ATR</td><td class="r">54761306</td></tr><tr class="odd  odd"><td class="r">1928</td><td class="r">I-BRD9</td><td class="r">GSK602, GSK 602, GSK-602</td><td class="r">Chromatin other</td><td class="r">BRD9</td><td class="r">91668541</td></tr><tr class="even  even"><td class="r">1931</td><td class="r">MIRA-1</td><td class="r">MIRA 1, MIRA1, NSC19630, NSC-19630, NSC 19630</td><td class="r">p53 pathway</td><td class="r">TP53</td><td class="r">227681</td></tr><tr class="odd  odd"><td class="r">1999</td><td class="r">N25720-51-A1</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="even  even"><td class="r">2003</td><td class="r">N29087-69-1</td><td class="r">-</td><td class="r">Unclassified</td><td class="r">-</td><td class="r">-</td></tr><tr class="odd  odd"><td class="r">2096</td><td class="r">VX-11e</td><td class="r">VX11e, VX11e</td><td class="r">ERK MAPK signaling</td><td class="r">ERK2</td><td class="r">11634725</td></tr><tr class="even  even"><td class="r">2107</td><td class="r">LJI308</td><td class="r">-</td><td class="r">PI3K/MTOR signaling</td><td class="r">RSK2, RSK1, RSK3</td><td class="r">118704762</td></tr><tr class="odd  odd"><td class="r">2156</td><td class="r">5-azacytidine</td><td class="r">-</td><td class="r">Other</td><td class="r">DNA methyltransferases</td><td class="r">-</td></tr><tr class="even  even"><td class="r">2362</td><td class="r">THR-103</td><td class="r">WIMM synthesis</td><td class="r">PI3K/MTOR signaling</td><td class="r">Mutant RAS</td><td class="r">None</td></tr><tr class="odd  odd"><td class="r">1030</td><td class="r">KU-55933</td><td class="r">KU55933</td><td class="r">Genome integrity</td><td class="r">ATM</td><td class="r">5278396</td></tr><tr class="even  even"><td class="r">1129</td><td class="r">PF-4708671</td><td class="r">PF 4708671, PF4708671</td><td class="r">PI3K/MTOR signaling</td><td class="r">S6K1</td><td class="r">51371303</td></tr></tbody></table>'
soup = BeautifulSoup(drug_table_html)
table = soup.find('table')
table_body = table.find('tbody')

rows = table_body.find_all('tr')
for row in rows:
    cols = row.find_all('td')
    cols = [ele.text.strip() for ele in cols]
    data.append([cols[0], cols[-1]])
drug_to_pubchem_df = pd.DataFrame(data = data, columns = ["DRUG_ID", "PUB_CHEM_ID"])
drug_to_pubchem_df["DRUG_ID"] = drug_to_pubchem_df["DRUG_ID"].astype("int")

In [5]:
drug_to_pubchem_df = pd.read_csv("gdsc_drugs.csv", sep = ",").rename(columns={"Drug Id" : "DRUG_ID"," PubCHEM": "PUB_CHEM_ID" })
drug_to_pubchem_df.loc[drug_to_pubchem_df["DRUG_ID"] == 1803, "PUB_CHEM_ID"] = 8269
targets_drug_pubchem_df = pd.merge(targets, drug_to_pubchem_df, on = "DRUG_ID", how = "left")

## Some Drugs dont have PubChem Ids (None, none, several (wtf?), -) -> Filter only those which have a valid Id

In [6]:
def get_only_valid_pubchem_ids(targets_drug_pubchem_df):
    def num_conv(id):
        try:
            return int(id)
        except:
            return None
    id_nums = list(map(num_conv, targets_drug_pubchem_df["PUB_CHEM_ID"].unique()))
    id_nums = list(filter(lambda id: id != None, id_nums))
    return id_nums
",".join(map(str, get_only_valid_pubchem_ids(targets_drug_pubchem_df))) 

'24360,6710780,84691,6253,148124,126941,444795,123631,24978538,5311,644241,44182295,6918289,23725625,11960529,5328940,216326,6450551,11152667,9943465,126565,176158,6505803,11667893,5278396,300471,10184653,24776445,24180719,10077147,11327430,10459196,156422,11624601,10384072,11433190,1401,9914412,24771867,44450571,46930998,5330286,11977753,17755052,25262965,9826528,11316960,10127622,2314623,11404337,9938202,46883536,3385,3062316,36314,11626560,5384616,216239,11364421,60838,5310940,10390396,25124816,5494449,51371303,52918385,11609586,11455910,176870,5327091,9931953,24958200,24856436,46926350,60750,387447,16095342,9858940,2733526,104741,56962336,44632017,637858,6914657,9926054,25099184,44819241,2726824,11707110,44462760,5394,5460769,104842,2375,25126798,11640390,41867,2907,16720766,11488320,51042438,24737642,208908,10096043,56649450,51001932,57345410,24866313,2155128,3899,4261,44224160,46926973,59472121,72200024,42623951,49847690,53469448,78243717,24753719,57519544,46943432,24871506,99367

## Transferred Pubchem Ids to SMILES via https://pubchem.ncbi.nlm.nih.gov/idexchange/idexchange.cgi

In [7]:
pubchem_to_smiles_df = pd.read_csv("pubchem_to_smiles.csv", sep = "\t", header = None)
pubchem_to_smiles_df = pubchem_to_smiles_df.rename(columns={0:"PUB_CHEM_ID", 1:"SMILES"})

In [8]:
def int_filter(entry):
    try:
        int(entry)
        return True
    except:
        return False
        
targets_drug_valid_pubchem_df = targets_drug_pubchem_df[targets_drug_pubchem_df["PUB_CHEM_ID"].apply(int_filter)]
targets_drug_valid_pubchem_df = targets_drug_valid_pubchem_df.copy()
targets_drug_valid_pubchem_df["PUB_CHEM_ID"] = targets_drug_valid_pubchem_df["PUB_CHEM_ID"].astype(np.int64)

In [9]:
pubchem_to_smiles_df = pubchem_to_smiles_df[~pubchem_to_smiles_df["SMILES"].isna()]

In [10]:
targets_drug_valid_smiles_df = pd.merge(targets_drug_valid_pubchem_df, pubchem_to_smiles_df, on = "PUB_CHEM_ID", how = "inner")

In [11]:
targets_drug_valid_smiles_df.loc[targets_drug_valid_smiles_df["DRUG_ID"] == 1803, "SMILES"].values[0]

'CC(=O)OC1=CC=C(C=C1)C2(C3=CC=CC=C3NC2=O)C4=CC=C(C=C4)OC(=O)C'

## 2. SMILES to PyG graphs

In [124]:
def one_hot_encoding(x, permitted_list):
    """
    TODO: POINT OF IMPROVEMENT
    Maps input elements x which are not in the permitted list to the last element
    of the permitted list.
    """

    if x not in permitted_list:
        x = permitted_list[-1]

    binary_encoding = [int(boolean_value) for boolean_value in list(map(lambda s: x == s, permitted_list))]

    return binary_encoding

def get_atom_features(atom, 
                  hydrogens_implicit = True):
    """
    Takes an RDKit atom object as input and gives a 1d-numpy array of atom features as output.
    """    
    # compute atom features

    electronegativity_table = {
        'H': 2.20, 'Li': 0.98, 'Be': 1.57, 'B': 2.04, 'C': 2.55, 'N': 3.04, 'O': 3.44, 
        'F': 3.98, 'Na': 0.93, 'Mg': 1.31, 'Al': 1.61, 'Si': 1.90, 'P': 2.19, 'S': 2.58, 
        'Cl': 3.16, 'K': 0.82, 'Ca': 1.00, 'Br': 2.96, 'I': 2.66
    }
    
    valence_electrons_table = {
        'H': 1, 'He': 2, 'Li': 1, 'Be': 2, 'B': 3, 'C': 4, 'N': 5, 'O': 6, 'F': 7, 'Ne': 8,
        'Na': 1, 'Mg': 2, 'Al': 3, 'Si': 4, 'P': 5, 'S': 6, 'Cl': 7, 'Ar': 8, 
        'K': 1, 'Ca': 2, 'Br': 7, 'I': 7
    }
    
    atom = mol.GetAtomWithIdx(0)
    atom_number = atom.GetAtomicNum()
    symbol = atom.GetSymbol()
    atom_mass = atom.GetMass()
    electronegativity = electronegativity_table[symbol]
    valence_electrons = valence_electrons_table[symbol]
    atom_type_enc = [atom_number, atom_mass, electronegativity, valence_electrons]
    
    n_heavy_neighbors_enc = [int(atom.GetDegree())] 
    formal_charge_enc = [int(atom.GetFormalCharge())]
    hybridisation_type_enc = one_hot_encoding(str(atom.GetHybridization()), ["S", "SP", "SP2", "SP3", "SP3D", "SP3D2", "OTHER"])
    
    is_in_a_ring_enc = [int(atom.IsInRing())]
    is_aromatic_enc = [int(atom.GetIsAromatic())]

    atom_feature_vector = atom_type_enc + n_heavy_neighbors_enc + formal_charge_enc + hybridisation_type_enc + is_in_a_ring_enc + is_aromatic_enc
    
    if hydrogens_implicit == True:
        n_hydrogens_enc = [int(atom.GetTotalNumHs())]
        atom_feature_vector += n_hydrogens_enc

    return np.array(atom_feature_vector)

def get_bond_features(bond, 
                  use_stereochemistry = True):
    """
    Takes an RDKit bond object as input and gives a 1d-numpy array of bond features as output.
    """

    permitted_list_of_bond_types = [Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE, Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC]

    bond_type_enc = one_hot_encoding(bond.GetBondType(), permitted_list_of_bond_types)
    
    bond_is_conj_enc = [int(bond.GetIsConjugated())]
    
    bond_is_in_ring_enc = [int(bond.IsInRing())]
    
    bond_feature_vector = bond_type_enc + bond_is_conj_enc + bond_is_in_ring_enc
    
    if use_stereochemistry == True:
        stereo_type_enc = one_hot_encoding(str(bond.GetStereo()), ["STEREOZ", "STEREOE", "STEREOANY", "STEREONONE"])
        bond_feature_vector += stereo_type_enc

    return np.array(bond_feature_vector)

def create_pytorch_geometric_graph_data_list_from_smiles(x_smiles):
    """
    Inputs:
    
    x_smiles = [smiles_1, smiles_2, ....] ... a list of SMILES strings
    
    Outputs:
    
    data_list = [G_1, G_2, ...] ... a list of torch_geometric.data.Data objects which represent labeled molecular graphs that can readily be used for machine learning
    
    """
    
    data_list = []
    
    for i, smiles in enumerate(x_smiles):
        
        # convert SMILES to RDKit mol object
        mol = Chem.MolFromSmiles(smiles)

        # get feature dimensions
        n_nodes = mol.GetNumAtoms()
        n_edges = 2*mol.GetNumBonds()
        unrelated_smiles = "O=O"
        unrelated_mol = Chem.MolFromSmiles(unrelated_smiles)
        n_node_features = len(get_atom_features(unrelated_mol.GetAtomWithIdx(0)))
        n_edge_features = len(get_bond_features(unrelated_mol.GetBondBetweenAtoms(0,1)))

        # construct node feature matrix X of shape (n_nodes, n_node_features)
        X = np.zeros((n_nodes, n_node_features))

        for atom in mol.GetAtoms():
            X[atom.GetIdx(), :] = get_atom_features(atom)
            
        X = torch.tensor(X, dtype = torch.float)
        
        # construct edge index array E of shape (2, n_edges)
        (rows, cols) = np.nonzero(GetAdjacencyMatrix(mol))
        torch_rows = torch.from_numpy(rows.astype(np.int64)).to(torch.long)
        torch_cols = torch.from_numpy(cols.astype(np.int64)).to(torch.long)
        E = torch.stack([torch_rows, torch_cols], dim = 0)
        
        # construct edge feature array EF of shape (n_edges, n_edge_features)
        EF = np.zeros((n_edges, n_edge_features))
        
        for (k, (i,j)) in enumerate(zip(rows, cols)):
            
            EF[k] = get_bond_features(mol.GetBondBetweenAtoms(int(i),int(j)))
        
        EF = torch.tensor(EF, dtype = torch.float)
        
        
        # construct Pytorch Geometric data object and append to data list
        data_list.append(Data(x = X, edge_index = E, edge_attr = EF))

    return data_list

In [127]:
create_pytorch_geometric_graph_data_list_from_smiles([unique_smiles[0]])

[Data(x=[26, 16], edge_index=[2, 60], edge_attr=[60, 10])]

In [128]:
unique_smiles = targets_drug_valid_smiles_df["SMILES"].unique().tolist()
unique_smiles_graphs = create_pytorch_geometric_graph_data_list_from_smiles(unique_smiles)
smile_to_smiles_df = pd.DataFrame(columns = ["SMILES", "SMILES_GRAPH"], data = zip(unique_smiles, unique_smiles_graphs))
targets_drug_valid_smiles_graphs_df = pd.merge(targets_drug_valid_smiles_df, smile_to_smiles_df, on = "SMILES")

In [129]:
targets_drug_valid_smiles_graphs_df

Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,...,Name,Synonyms,Targets,Target pathway,PUB_CHEM_ID,Datasets,number of cell lines,Screening site,SMILES,SMILES_GRAPH
0,GDSC2,401,18945558,683667,PFSK-1,SIDM01132,MB,1003,Camptothecin,TOP1,...,Camptothecin,"Camptothecine, (+)-Camptothecin",TOP1,DNA replication,24360,GDSC2,968,SANGER,CC[C@@]1(C2=C(COC1=O)C(=O)N3CC4=CC5=CC=CC=C5N=...,"[(x, [tensor([ 6.0000, 12.0110, 2.5500, 4.00..."
1,GDSC2,401,18945796,684052,A673,SIDM00848,UNCLASSIFIED,1003,Camptothecin,TOP1,...,Camptothecin,"Camptothecine, (+)-Camptothecin",TOP1,DNA replication,24360,GDSC2,968,SANGER,CC[C@@]1(C2=C(COC1=O)C(=O)N3CC4=CC5=CC=CC=C5N=...,"[(x, [tensor([ 6.0000, 12.0110, 2.5500, 4.00..."
2,GDSC2,401,18946078,684057,ES5,SIDM00263,UNCLASSIFIED,1003,Camptothecin,TOP1,...,Camptothecin,"Camptothecine, (+)-Camptothecin",TOP1,DNA replication,24360,GDSC2,968,SANGER,CC[C@@]1(C2=C(COC1=O)C(=O)N3CC4=CC5=CC=CC=C5N=...,"[(x, [tensor([ 6.0000, 12.0110, 2.5500, 4.00..."
3,GDSC2,401,18946335,684059,ES7,SIDM00269,UNCLASSIFIED,1003,Camptothecin,TOP1,...,Camptothecin,"Camptothecine, (+)-Camptothecin",TOP1,DNA replication,24360,GDSC2,968,SANGER,CC[C@@]1(C2=C(COC1=O)C(=O)N3CC4=CC5=CC=CC=C5N=...,"[(x, [tensor([ 6.0000, 12.0110, 2.5500, 4.00..."
4,GDSC2,401,18946617,684062,EW-11,SIDM00203,UNCLASSIFIED,1003,Camptothecin,TOP1,...,Camptothecin,"Camptothecine, (+)-Camptothecin",TOP1,DNA replication,24360,GDSC2,968,SANGER,CC[C@@]1(C2=C(COC1=O)C(=O)N3CC4=CC5=CC=CC=C5N=...,"[(x, [tensor([ 6.0000, 12.0110, 2.5500, 4.00..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201695,GDSC2,401,19187483,1659928,SNU-175,SIDM00216,COREAD,2359,GSK2830371,WIP1,...,GSK2830371,,WIP1,Other,70983932,GDSC2,731,SANGER,CC1=C(C=C(C=N1)Cl)NCC2=CC=C(S2)C(=O)N[C@@H](CC...,"[(x, [tensor([ 6.0000, 12.0110, 2.5500, 4.00..."
201696,GDSC2,401,19187936,1660034,SNU-407,SIDM00214,COREAD,2359,GSK2830371,WIP1,...,GSK2830371,,WIP1,Other,70983932,GDSC2,731,SANGER,CC1=C(C=C(C=N1)Cl)NCC2=CC=C(S2)C(=O)N[C@@H](CC...,"[(x, [tensor([ 6.0000, 12.0110, 2.5500, 4.00..."
201697,GDSC2,401,19188194,1660035,SNU-61,SIDM00194,COREAD,2359,GSK2830371,WIP1,...,GSK2830371,,WIP1,Other,70983932,GDSC2,731,SANGER,CC1=C(C=C(C=N1)Cl)NCC2=CC=C(S2)C(=O)N[C@@H](CC...,"[(x, [tensor([ 6.0000, 12.0110, 2.5500, 4.00..."
201698,GDSC2,401,19188734,1674021,SNU-C5,SIDM00498,COREAD,2359,GSK2830371,WIP1,...,GSK2830371,,WIP1,Other,70983932,GDSC2,731,SANGER,CC1=C(C=C(C=N1)Cl)NCC2=CC=C(S2)C(=O)N[C@@H](CC...,"[(x, [tensor([ 6.0000, 12.0110, 2.5500, 4.00..."


## Create gene, mutation, and copy number df

In [130]:
def get_transformed_merged_drug_model_df(models, targets):
    columns_with_most_val_filled = models.columns[models.isnull().sum(axis=0)/ models.shape[0] <= 0.1]
    filtered_columns = columns_with_most_val_filled
    models = models[filtered_columns].fillna("Unkown")
    columns_besides_id = list(filter(lambda x: x != "model_id", filtered_columns))
    models = models.rename(columns={"model_id":"SANGER_MODEL_ID"}) ##rename for join
    limited_drug_target = targets[targets["DRUG_ID"] == 1803] ##filter specific drug #1862
    limited_drug_target_filtered = limited_drug_target.loc[:, ["CELL_LINE_NAME", "SANGER_MODEL_ID", "MIN_CONC", "LN_IC50"]]
    model_drug_information = pd.merge(limited_drug_target_filtered, models, how="inner", on="SANGER_MODEL_ID") ## merge drug and model information

    model_drug_information["model_idx"] = model_drug_information.index ##store index for later joins to retrieve edge index
    return model_drug_information

In [131]:
merged_drug_model_df = get_transformed_merged_drug_model_df(models,  targets)

In [132]:
copy_numbers = pd.read_csv("copy_number_variation.csv")
copy_numbers_df = pd.pivot_table(copy_numbers, values='total_copy_number', index=['model_id'], columns=['symbol'], aggfunc="sum", fill_value=0.0)
copy_numbers_df = copy_numbers_df.reset_index()

In [133]:
mutations = pd.read_csv("mutations.csv")
mutations["one_hot"] = np.ones(mutations.shape[0])
## TODO possible improvement, e.g., VAF
mutations_df = pd.pivot_table(mutations, values="one_hot", index=['model_id'],
                       columns=['gene_symbol'], aggfunc="sum", fill_value=0.0).reset_index()

In [134]:
rna_seq_df = pd.read_csv("gene_expression.csv",index_col=0, header = None,low_memory=False)

def get_transformed_df_in_model_gene_value_format(rna_seq_df):
    rna_seq_df_transposed = rna_seq_df.transpose().iloc[0:, :]
    new_header = rna_seq_df_transposed.iloc[0, :] 
    rna_seq_df_transposed = rna_seq_df_transposed.iloc[1:, :] 
    rna_seq_df_transposed.columns = new_header
    rna_seq_df_transposed = rna_seq_df_transposed.iloc[:, [0, *list(range(5, new_header.shape[0]))]]
    rna_seq_df_transposed = rna_seq_df_transposed.rename(columns={rna_seq_df_transposed.columns[0]: "SANGER_MODEL_ID"})
    rna_seq_df_transposed_stacked = rna_seq_df_transposed.set_index('SANGER_MODEL_ID').stack().reset_index(name='ExpressionValue')
    rna_seq_df_transposed_stacked = rna_seq_df_transposed_stacked.rename(columns={1:'symbol'})
    return rna_seq_df_transposed_stacked

transformed_seq_data = get_transformed_df_in_model_gene_value_format(rna_seq_df)
transformed_seq_data["ExpressionValue"] = transformed_seq_data["ExpressionValue"].astype(np.float32)
expression_df = pd.pivot_table(transformed_seq_data, values='ExpressionValue', index=['SANGER_MODEL_ID'],
                       columns=['symbol'], aggfunc="sum", fill_value=0.0)
expression_df = expression_df.reset_index().rename(columns = {"SANGER_MODEL_ID": "model_id"})

In [135]:
copy_numbers_df.dtypes

symbol
model_id     object
ABCB1       float64
ABI1        float64
ABL1        float64
ABL2        float64
             ...   
ZNF780A     float64
ZNF93       float64
ZNRF3       float64
ZRSR2       float64
ZXDB        float64
Length: 764, dtype: object

In [136]:
def rename_col(df, type:str):
    feature_col = list(filter(lambda col: col != "model_id", df.columns))
    rename_col = {col: f"{col}_{type}" for col in feature_col}
    df = df.rename(columns=rename_col)
    return df
expression_df = rename_col(expression_df, "exp")
copy_numbers_df = rename_col(copy_numbers_df, "cnv") 
mutations_df = rename_col(mutations_df, "mut") 

In [137]:
features_df = pd.merge(pd.merge(expression_df, copy_numbers_df, on = "model_id"), mutations_df, on = "model_id")

In [138]:
all_columns = list(filter(lambda col: col != "model_id", features_df.columns))

## 3. Create Masks

In [139]:
files = os.listdir("all_features")
filename_to_features = dict()
for file in files:
    if "csv" not in file:
        continue
    file_df = pd.read_csv(os.path.join("all_features", file))
    filename_to_features[file] = file_df.values.squeeze()

In [140]:
filename_to_mask = dict()

In [141]:
for file in tqdm(filename_to_features):
    if file in filename_to_mask:
        continue
    mask = np.zeros(len(all_columns), dtype = np.bool_)
    for i, col in enumerate(all_columns):
        file_cols = filename_to_features[file]
        file_cols = list(map(lambda file_col: file_col.replace(".1", ""), file_cols)) ## For some weird reason feature names changed slightly
        mask[i] = col in file_cols
    assert len(file_cols) == mask.sum(), file
    filename_to_mask[file] = mask

  0%|          | 0/295 [00:00<?, ?it/s]

In [142]:
drug_id_to_mask = {key[4:8]: filename_to_mask[key] for key in filename_to_mask}

## 4. Overall dataset with SMILES PyGGRaphs and Cell model features

In [143]:
targets_drug_valid_smiles_graphs_red_df = targets_drug_valid_smiles_graphs_df.loc[:, ["SANGER_MODEL_ID", "LN_IC50", "SMILES_GRAPH", "DRUG_ID"]]
targets_drug_valid_smiles_graphs_red_df = targets_drug_valid_smiles_graphs_red_df.rename(columns = {"SANGER_MODEL_ID": "model_id"})
targets_drug_valid_smiles_graphs_red_features_df = pd.merge(targets_drug_valid_smiles_graphs_red_df, features_df, on = "model_id", suffixes = ("", "_r"))
targets_drug_valid_smiles_graphs_red_features_df.head()

Unnamed: 0,model_id,LN_IC50,SMILES_GRAPH,DRUG_ID,A1BG_exp,A1BG-AS1_exp,A1CF_exp,A2M_exp,A2M-AS1_exp,A2ML1_exp,...,ZNF429_mut,ZNF521_mut,ZNF626_mut,ZNF680_mut,ZNF721_mut,ZNF814_mut,ZNF93_mut,ZNRF3_mut,ZRSR2_mut,ZXDB_mut
0,SIDM01132,-1.462148,"[(x, [tensor([ 6.0000, 12.0110, 2.5500, 4.00...",1003,0.98,14.04,0.0,0.98,0.0,0.03,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,SIDM00848,-4.869447,"[(x, [tensor([ 6.0000, 12.0110, 2.5500, 4.00...",1003,0.84,5.57,0.0,5.15,0.32,11.98,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,SIDM00263,-3.360684,"[(x, [tensor([ 6.0000, 12.0110, 2.5500, 4.00...",1003,4.0,15.64,0.0,26.620001,0.61,70.669998,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,SIDM00269,-5.045014,"[(x, [tensor([ 6.0000, 12.0110, 2.5500, 4.00...",1003,2.44,17.25,0.0,14.56,0.08,0.37,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,SIDM00203,-3.74162,"[(x, [tensor([ 6.0000, 12.0110, 2.5500, 4.00...",1003,1.88,11.06,0.0,0.08,0.2,7.67,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [144]:
targets_drug_valid_smiles_graphs_red_features_df

Unnamed: 0,model_id,LN_IC50,SMILES_GRAPH,DRUG_ID,A1BG_exp,A1BG-AS1_exp,A1CF_exp,A2M_exp,A2M-AS1_exp,A2ML1_exp,...,ZNF429_mut,ZNF521_mut,ZNF626_mut,ZNF680_mut,ZNF721_mut,ZNF814_mut,ZNF93_mut,ZNRF3_mut,ZRSR2_mut,ZXDB_mut
0,SIDM01132,-1.462148,"[(x, [tensor([ 6.0000, 12.0110, 2.5500, 4.00...",1003,0.98,14.04,0.00,0.980000,0.00,0.030000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,SIDM00848,-4.869447,"[(x, [tensor([ 6.0000, 12.0110, 2.5500, 4.00...",1003,0.84,5.57,0.00,5.150000,0.32,11.980000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,SIDM00263,-3.360684,"[(x, [tensor([ 6.0000, 12.0110, 2.5500, 4.00...",1003,4.00,15.64,0.00,26.620001,0.61,70.669998,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,SIDM00269,-5.045014,"[(x, [tensor([ 6.0000, 12.0110, 2.5500, 4.00...",1003,2.44,17.25,0.00,14.560000,0.08,0.370000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,SIDM00203,-3.741620,"[(x, [tensor([ 6.0000, 12.0110, 2.5500, 4.00...",1003,1.88,11.06,0.00,0.080000,0.20,7.670000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196308,SIDM00216,5.423870,"[(x, [tensor([ 6.0000, 12.0110, 2.5500, 4.00...",2359,0.26,0.46,0.23,0.000000,1.01,0.030000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
196309,SIDM00214,5.042005,"[(x, [tensor([ 6.0000, 12.0110, 2.5500, 4.00...",2359,0.35,0.03,0.00,0.000000,0.80,3.400000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
196310,SIDM00194,6.130028,"[(x, [tensor([ 6.0000, 12.0110, 2.5500, 4.00...",2359,0.00,0.05,3.30,0.160000,0.16,1.610000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
196311,SIDM00498,6.151265,"[(x, [tensor([ 6.0000, 12.0110, 2.5500, 4.00...",2359,0.03,0.14,0.03,0.000000,0.00,0.070000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Create graphs of feature-selected drug-specifc cell models 

In [145]:
acetalax_test = False ##boolean for testing my approach

In [146]:
## Create batch graphs and one batch represents all combinations for one drug

In [147]:
from sklearn.preprocessing import MinMaxScaler

batch_graphs = []

class Batch:
    def __init__(self, id, drug_graph, cell_drug_graph,  train_labels, test_labels):
        self.id = id
        self.drug_graph = drug_graph
        self.cell_drug_graph = cell_drug_graph
        self.train_labels = train_labels
        self.test_labels = test_labels
        
    def __repr__(self):
        return f"""
        self.id: {self.id}
        self.drug_graph: {self.drug_graph.__repr__()}\n
        self.cell_drug_graph: {self.cell_drug_graph.__repr__()}\n
        self.train_labels: {self.train_labels.shape}\n
        self.test_labels: {self.test_labels.shape}\n
        """

def add_super_node_to_graph(graph):
    graph= graph.cpu()
    num_nodes, num_features = graph.x.shape
    super_node = torch.zeros((1, num_features), dtype = graph.x.dtype)
    graph_node_sources = torch.arange(num_nodes).unsqueeze(0)
    super_node_targets = (torch.ones((num_nodes)) * (num_nodes)).unsqueeze(0)
    super_edge_index = torch.cat((graph_node_sources, super_node_targets), dim = 0)
    new_edge_index = torch.cat((graph.edge_index, super_edge_index), dim  = 1)
    new_features = torch.cat((graph.x, super_node), dim = 0)
    new_edge_attr = torch.cat((graph.edge_attr, torch.zeros((super_edge_index.shape[-1], graph.edge_attr.shape[-1]), dtype = graph.edge_attr.dtype)), dim  = 0)
    graph.x = new_features
    graph.edge_index = new_edge_index.type(torch.long)
    graph.edge_attr = new_edge_attr
    graph.super_node_mask = torch.zeros(graph.x.shape[0],dtype =torch.bool)
    graph.super_node_mask[-1] = 1

def get_edges(group_features, id, drug_id_to_mask):
    all_drug_specific_cell_model_features = torch.from_numpy(group_features).type(torch.float32)
    selected_feature_index_np = np.nonzero(drug_id_to_mask[str(id)])[0] # Num_Sel_features
    selected_feature_index = torch.from_numpy(selected_feature_index_np).type(torch.long) # Num_Sel_features
    selected_feature_index_exp = selected_feature_index.unsqueeze(0).expand(group_features.shape[0], selected_feature_index.shape[-1]) # Num_CM x Num_Sel_features
    source_edge_index = selected_feature_index_exp.flatten() # (Num_CM x Num_Sel_features) x 1
    
    target_edge_index = torch.arange(group_features.shape[0]).unsqueeze(1).expand(group_features.shape[0], selected_feature_index_exp.shape[-1])
    target_edge_index = target_edge_index.flatten()
    edge_index = torch.cat([source_edge_index.unsqueeze(0), target_edge_index.unsqueeze(0)], dim = 0)
    edge_attr = all_drug_specific_cell_model_features[:, selected_feature_index_np].flatten()
    return (edge_index, edge_attr)

for id, group in targets_drug_valid_smiles_graphs_red_features_df.groupby("DRUG_ID"):
    if acetalax_test and id != 1803:
        continue
    print(id)
    group_features = group.iloc[:, 4:].values
    node_features = torch.ones(group_features.shape[-1]) # Num_CMfeatures

    #TODO: We have to use the same train test splits as in feature selector -> Label leak!
    train_df, test_df = train_test_split(group, test_size = .1) 
    train_features, test_features = train_df.iloc[:, 4:].values, test_df.iloc[:, 4:].values
    
    train_edge_index, train_edge_attr = get_edges(train_features, id, drug_id_to_mask)
    test_edge_index, test_edge_attr = get_edges(test_features, id, drug_id_to_mask)
    
    cell_model_graph = Data(x = node_features, train_edge_attr = train_edge_attr, train_edge_index= train_edge_index, test_edge_index = test_edge_index, test_edge_attr = test_edge_attr)
    drug_graph = group.iloc[0, :]["SMILES_GRAPH"]
    train_labels = torch.from_numpy(train_df["LN_IC50"].values).type(torch.float32)
    test_labels = torch.from_numpy(test_df["LN_IC50"].values).type(torch.float32)
    add_super_node_to_graph(drug_graph)
    batch = Batch(id, drug_graph, cell_model_graph, train_labels, test_labels)
    batch_graphs.append(batch)

1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1029
1030
1031
1032
1033
1036
1037
1038
1039
1042
1043
1046
1047
1049
1050
1051
1053
1054
1057
1058
1059
1060
1062
1067
1068
1069
1072
1073
1079
1080
1083
1085
1086
1088
1089
1093
1096
1129
1131
1133
1149
1168
1175
1177
1179
1180
1190
1191
1192
1194
1199
1200
1237
1239
1243
1248
1249
1250
1372
1373
1375
1378
1494
1502
1507
1510
1511
1512
1529
1549
1553
1557
1558
1560
1561
1563
1564
1576
1578
1593
1598
1613
1614
1615
1617
1618
1620
1621
1622
1624
1626
1627
1629
1630
1631
1632
1634
1635
1786
1799
1803
1849
1852
1853
1854
1855
1862
1909
1910
1912
1913
1915
1917
1918
1919
1922
1924
1925
1926
1927
1928
1930
1931
1932
1933
1936
1939
1940
1941
1997
2040
2043
2044
2045
2046
2048
2096
2106
2107
2110
2111
2169
2172
2173
2174
2175
2177
2359


In [148]:
from sklearn.linear_model import LinearRegression

def evaluate_lr_results():
    lr_results = np.zeros((len(batch_graphs), 2))

    for i, batch in enumerate(batch_graphs):
        num_cell_model_features = drug_id_to_mask[str(batch.id)].sum()
        train_features = batch.cell_drug_graph.train_edge_attr.view(-1, num_cell_model_features)
        test_features = batch.cell_drug_graph.test_edge_attr.view(-1, num_cell_model_features)
        train_labels = batch.train_labels
        test_labels = batch.test_labels
    
        lr = LinearRegression()
        lr.fit(train_features, train_labels)
        pred_test = lr.predict(test_features)
        pred_train = lr.predict(train_features)
        r2_train, r2_test = r2_score(train_labels, pred_train), r2_score(test_labels, pred_test)
        lr_results[i, 0] = r2_train	
        lr_results[i, 1] = r2_test
    return lr_results
lr_res = evaluate_lr_results()
lr_res_df = pd.DataFrame(lr_res)

In [150]:
lr_res_df.describe()

Unnamed: 0,0,1
count,171.0,171.0
mean,0.943787,0.782192
std,0.059853,0.244394
min,0.439207,-0.904345
25%,0.938062,0.752507
50%,0.961491,0.83421
75%,0.972919,0.913223
max,1.0,0.9882


In [32]:
# ## TODO: We have to use the same train test splits as in feature selector
# ## Train test split rather over different cell models (i.e., each drug at least seen once?) -> if we dont know we have to check the paper
# train_graphs, test_graphs = train_test_split(batch_graphs) if not acetalax_test else ([], [])
# len(train_graphs), len(test_graphs)

In [151]:
if acetalax_test:
    test_edge_index = batch_graphs[0].cell_drug_graph.edge_index[:, -(436)*200:]
    test_edge_attr = batch_graphs[0].cell_drug_graph.edge_attr[-(436)*200:]
    
    train_edge_index = batch_graphs[0].cell_drug_graph.edge_index[:, :-(436)*200]
    train_edge_attr = batch_graphs[0].cell_drug_graph.edge_attr[:-(436)*200]
    
    train_graph = Batch(batch_graphs[0].drug_graph.clone(),
                        batch_graphs[0].cell_drug_graph.clone(),
                        torch.clone(batch_graphs[0].labels))
    test_graph = Batch(batch_graphs[0].drug_graph.clone(),
                        batch_graphs[0].cell_drug_graph.clone(),
                        torch.clone(batch_graphs[0].labels))
    
    train_graph.cell_drug_graph.edge_index = train_edge_index
    train_graph.cell_drug_graph.edge_attr = train_edge_attr
    train_graph.labels = train_graph.labels[:-200]
    
    test_graph.cell_drug_graph.edge_index = test_edge_index
    test_graph.cell_drug_graph.edge_attr = test_edge_attr
    test_graph.cell_drug_graph.edge_index[1] = test_graph.cell_drug_graph.edge_index[1] -501
    test_graph.labels = test_graph.labels[-200:]
    
    train_graphs = [train_graph]
    test_graphs = [test_graph]

In [152]:
import math
# This function is copied from https://github.com/gordicaleksa/pytorch-GAT/blob/main/The%20Annotated%20GAT%20(PPI).ipynb
## Thanks to Aleksa Gordic for the implementation!
def explicit_broadcast(this, other):
    # Append singleton dimensions until this.dim() == other.dim()
    for _ in range(this.dim(), other.dim()):
        this = this.unsqueeze(-1)
    return this.expand_as(other)
    
ACTIVATION_FUNS = {
    "RELU": torch.nn.ReLU,
    "LEAKY_RELU": torch.nn.LeakyReLU,
    "ELU":  torch.nn.ELU
}
class GNN(torch.nn.Module):
    def __init__(self, hidden_dim, dropout, heads, act):
        super(GNN, self).__init__()
        hidden_dim = int(hidden_dim) ## TODO add further params
        self.conv_0 = GATConv(-1, hidden_dim, heads=heads, concat=True, add_self_loops=True, dropout=dropout)
        self.conv_1 = GATConv(hidden_dim*heads, hidden_dim, heads = heads, concat=False, add_self_loops= True, dropout=dropout)
        
        self.dropout = torch.nn.Dropout(p = dropout)
        self.activate = ACTIVATION_FUNS[act]()
    def forward(self, x, edge_index, edge_attr):
        ##TODO check if dropout in input might make sense
        x = self.conv_0(x, edge_index, edge_attr)
        x = self.activate(x)
        x = self.dropout(x)
        x = self.conv_1(x, edge_index, edge_attr)
        return x
        
class CustomGNN(torch.nn.Module):
    def __init__(self, num_cell_model_features, hidden_dim_cell_models, hidden_dim_drug, dropout, heads, act, device):
        super(CustomGNN, self).__init__()
        self.lin = torch.nn.Linear(num_cell_model_features, hidden_dim_cell_models)
        
        self.gnn_drug = GNN(hidden_dim_drug, dropout, heads, act) # (Atoms + 1) x H_d
        self.lin_end = torch.nn.Linear(hidden_dim_cell_models + hidden_dim_drug, 1)
        self.device = device

    def forward(self, drug_graph, cell_drug_graph):
        ## Drug-sepcific CellModel Information
        cell_node_features =  self.lin.weight.transpose(1,0).clone() * cell_drug_graph.x.unsqueeze(1).clone()
        cell_node_features_lift = torch.index_select(cell_node_features.clone(), 0, cell_drug_graph.edge_index[0].clone()) # E_cd x H_c
        
        drug_specific_cell_features = cell_node_features_lift.clone() * cell_drug_graph.edge_attr.unsqueeze(1).clone() # E_cd x H_c
        drug_specific_cell_features_aggregated = torch.zeros((cell_drug_graph.edge_index[1].max() + 1, drug_specific_cell_features.shape[1]), device = self.device, dtype=drug_specific_cell_features.dtype) #CxH_c
        broadcasted_target_index = explicit_broadcast(cell_drug_graph.edge_index[1], drug_specific_cell_features)
        drug_specific_cell_features_aggregated.scatter_reduce_(0, broadcasted_target_index.clone(), drug_specific_cell_features.clone(), "sum") # C x H_c
        drug_specific_cell_features_aggregated = drug_specific_cell_features_aggregated + self.lin.bias
        
        ## Drug-sepcfic information
        drug_node_features = self.gnn_drug(drug_graph.x.clone(), drug_graph.edge_index.clone(), drug_graph.edge_attr.clone()) # (Atoms + 1) x H_d
        drug_super_node_features = drug_node_features[-1, :] # 1 x H_d
        drug_super_node_features_exp = drug_super_node_features.unsqueeze(0).expand(drug_specific_cell_features_aggregated.shape[0], drug_super_node_features.shape[-1]) # CxH_d

        ## Combine drug and cell information
        drug_cell_features = torch.cat([drug_super_node_features_exp, drug_specific_cell_features_aggregated], 1) # C x (H_d + H_c)
        drug_cell_features = torch.nn.functional.relu(drug_cell_features)
        out = self.lin_end(drug_cell_features) # C x 1
        
        return out # drug_specific_cell_features_aggregated
        

In [170]:
device = torch.device("cuda:0")
heads = 2
dropout = 0.1
model = CustomGNN(38_975, 512, 64, dropout, heads, "ELU", device).to(device) #312, 512, .4, 4
loss_fun = torch.nn.MSELoss()
lr = 1e-4 #1e-4
optim = torch.optim.Adam(params= model.parameters(), lr = lr) #, weight_decay=1e-5

In [184]:
epochs = 200  #800
for epoch in tqdm(range(epochs)):
    acc_loss = 0
    model.train()
    for batch in batch_graphs:
        batch.cell_drug_graph.edge_index = batch.cell_drug_graph.train_edge_index
        batch.cell_drug_graph.edge_attr = batch.cell_drug_graph.train_edge_attr
        out = model(batch.drug_graph.to(device), batch.cell_drug_graph.to(device))
        loss = loss_fun(out, batch.train_labels.unsqueeze(1).to(device))
        optim.zero_grad()
        loss.backward()
        acc_loss += loss.item()
        optim.step()
        # break
    print(f"{acc_loss/len(batch_graphs):.2f}")

  0%|          | 0/200 [00:00<?, ?it/s]

0.67
0.69
0.67
0.66
0.71
0.73
0.67
0.65
0.67
0.65
0.62
0.60
0.60
0.70
0.68
0.82
0.70
0.66
0.65
0.64
0.70
0.65
0.66
0.65
0.65
0.65
0.68
0.65
0.62
0.61
0.63
0.63
0.62
0.66
0.64
0.61
0.62
0.61
0.57
0.64
0.69
0.70
0.71
0.67
0.66
0.61
0.59
0.60
0.64
0.66
0.57
0.58
0.65
0.62
0.67
0.62
0.56
0.58
0.59
0.67
0.66
0.70
0.63
0.62
0.73
0.72
0.63
0.56
0.56
0.65
0.65
0.65
0.61
0.61
0.61
0.57
0.57
0.57
0.55
0.59
0.60
0.62
0.60
0.61
0.59
0.60
0.57
0.60
0.64
0.59
0.57
0.57
0.55
0.60
0.61
0.57
0.51
0.55
0.65
0.61
0.60
0.62
0.60
0.57
0.54
0.59
0.62
0.63
0.62
0.59
0.58
0.58
0.53
0.54
0.54
0.55
0.61
0.56
0.55
0.56
0.57
0.62
0.65
0.62
0.59


KeyboardInterrupt: 

In [185]:
def evaluate(batch, use_train_edges = False):
    with torch.inference_mode():
        model.eval()
        
        if use_train_edges:
            print("Use Train edges")
            batch.cell_drug_graph.edge_index = batch.cell_drug_graph.train_edge_index
            batch.cell_drug_graph.edge_attr = batch.cell_drug_graph.train_edge_attr
            batch.labels = batch.train_labels
        if not use_train_edges:
            print("Use Test edges")
            batch.cell_drug_graph.edge_index = batch.cell_drug_graph.test_edge_index
            batch.cell_drug_graph.edge_attr = batch.cell_drug_graph.test_edge_attr
            batch.labels = batch.test_labels
            
        out = model(batch.drug_graph.to(device), batch.cell_drug_graph.to(device))
        r2 = r2_score(batch.labels, out.squeeze().cpu())
    return r2
    
if acetalax_test:
    print(evaluate(train_graph))
    print(evaluate(test_graph))

In [186]:
num_train_instances = 0
for batch in batch_graphs:
    max_train_idx = batch.cell_drug_graph.train_edge_index[1, :].max()
    num_train_instances += max_train_idx
num_train_instances

tensor(176431, device='cuda:0')

In [187]:
ids = []
r2s = []
for batch in batch_graphs:
    r2 = evaluate(batch, False)
    ids.append(batch.id)
    r2s.append(r2)
    print(r2)

Use Test edges
0.6016890253069137
Use Test edges
0.8285669371037201
Use Test edges
0.6158692468284103
Use Test edges
0.9412408300635664
Use Test edges
0.6348940625366772
Use Test edges
0.7215185195173673
Use Test edges
0.49533084613130185
Use Test edges
0.6522992556353004
Use Test edges
0.6143441665110734
Use Test edges
0.7325962249936453
Use Test edges
0.8966740837608608
Use Test edges
0.5994266451804824
Use Test edges
0.7364467114126593
Use Test edges
0.7503043207955296
Use Test edges
0.664351723216921
Use Test edges
0.7389551589170131
Use Test edges
0.4503127399718677
Use Test edges
0.8354514250936601
Use Test edges
0.5747474441511878
Use Test edges
0.6341509563997137
Use Test edges
0.7305905229250997
Use Test edges
0.37547306609086395
Use Test edges
0.788058487042244
Use Test edges
0.5724143342200176
Use Test edges
0.6825477911141518
Use Test edges
0.8724219647353203
Use Test edges
0.674392519205703
Use Test edges
0.624289061911222
Use Test edges
0.7550597404491607
Use Test edges
0

In [188]:
## TODO multi label evaluation in sepaarte notebook

In [189]:
data = {
  "Drug_ID": ids,
  "R2": r2s
}

results_df_new = pd.DataFrame(data=data)
results_df_new.describe()

Unnamed: 0,Drug_ID,R2
count,171.0,171.0
mean,1456.625731,0.485331
std,398.755137,0.333397
min,1003.0,-1.870381
25%,1058.5,0.388888
50%,1494.0,0.566468
75%,1854.5,0.664814
max,2359.0,0.941241


In [162]:
results_df_new.describe()

Unnamed: 0,Drug_ID,R2
count,171.0,171.0
mean,1456.625731,0.472283
std,398.755137,0.272487
min,1003.0,-1.312067
25%,1058.5,0.362395
50%,1494.0,0.520782
75%,1854.5,0.643588
max,2359.0,0.92217


In [None]:
# results_df_new.to_csv("r2_results_drugs_gat.csv")

In [None]:
for batch in batch_graphs:
    print(evaluate(batch, True))

In [None]:
for graph in test_graphs:
    print(evaluate(graph))

In [None]:
raise Exception("Only old code below this cell.")

In [None]:
list(map(lambda batch: batch.drug_graph.x.shape[0] - 1, batch_graphs))

In [None]:
list(map(lambda batch: batch.drug_graph.edge_index.shape[1], batch_graphs))

In [None]:
batch_graphs[0].drug_graph

In [None]:
batch_graphs[1].drug_graph

## Old code

In [None]:

train_smiles_graphs_features_df, test_smiles_graphs_features_df = train_test_split(targets_drug_valid_smiles_graphs_red_features_df)

In [None]:
((train_smiles_graphs_features_df["DRUG_ID"] == 1803).sum(),
(test_smiles_graphs_features_df["DRUG_ID"] == 1803).sum())

In [None]:
acetalax_train_mask = train_smiles_graphs_features_df["DRUG_ID"] == 1803
acetalax_test_mask = test_smiles_graphs_features_df["DRUG_ID"] == 1803

In [None]:
train_smiles_graphs_features_df.shape, test_smiles_graphs_features_df.shape

In [None]:
y_train = train_smiles_graphs_features_df.pop("LN_IC50").values
y_test = test_smiles_graphs_features_df.pop("LN_IC50").values

In [None]:
train_graphs = train_smiles_graphs_features_df.pop("SMILES_GRAPH").values
test_graphs = test_smiles_graphs_features_df.pop("SMILES_GRAPH").values

In [None]:
# for graph in train_graphs:
#     add_super_node_to_graph(graph)
# for graph in test_graphs:
#     add_super_node_to_graph(graph)

In [None]:
train_graphs[:2]

In [None]:
next(iter(DataLoader(train_graphs[:2], batch_size=len(train_graphs[:2])))).edge_index

In [None]:
train_drug_ids = train_smiles_graphs_features_df.pop("DRUG_ID").values
test_drug_ids = test_smiles_graphs_features_df.pop("DRUG_ID").values

In [None]:
train_smiles_graphs_features_df.pop("model_id")
test_smiles_graphs_features_df.pop("model_id")

In [None]:
train_features = train_smiles_graphs_features_df.values
test_features = test_smiles_graphs_features_df.values

In [None]:
train_features.shape

In [None]:
train_drug_masks = np.zeros((train_drug_ids.shape[0], list(drug_id_to_mask.values())[0].shape[0]), dtype = np.bool_)
for i, drug_id in tqdm(enumerate(train_drug_ids)):
    train_drug_masks[i] = drug_id_to_mask[str(drug_id)]

In [47]:
test_drug_masks = np.zeros((test_drug_ids.shape[0], list(drug_id_to_mask.values())[0].shape[0]), dtype = np.bool_)
for i, drug_id in tqdm(enumerate(test_drug_ids)):
    test_drug_masks[i] = drug_id_to_mask[str(drug_id)]

0it [00:00, ?it/s]

## 5. Model definition

In [48]:
train_drug_masks[acetalax_train_mask].sum(axis = 0).shape

(38975,)

In [49]:
(features_df == 0).sum().sum() / (features_df.shape[0]*features_df.shape[1]) ## The graph is pretty sparse i have a nice idea to save memory -> HetGNN

0.4189280432779676

In [50]:
ACTIVATION_FUNS = {
    "RELU": torch.nn.ReLU,
    "LEAKY_RELU": torch.nn.LeakyReLU,
    "ELU":  torch.nn.ELU
}


class GNN(torch.nn.Module):
    def __init__(self, hidden_dim, dropout, act):
        super(GNN, self).__init__()
        hidden_dim = int(hidden_dim) ## TODO add further params
        self.conv_0 = GCNConv(-1, hidden_dim)
        self.conv_1 = GCNConv(hidden_dim, hidden_dim)
        
        self.dropout = torch.nn.Dropout(p = dropout)
        self.activate = ACTIVATION_FUNS[act]()
    def forward(self, x, edge_index):
        ##TODO check if dropout in input might make sense
        x = self.conv_0(x, edge_index)
        x = self.activate(x)
        x = self.dropout(x)
        x = self.conv_1(x, edge_index)
        return x

## TODO: POtentially instead of making the LR trainable use a fixed pre-trained LR
class LR(torch.nn.Module):
    def __init__(self):
        super(LR, self).__init__()
        self.lin = torch.nn.Linear(38975, 1)

    def forward(self, x, mask):
        masked_weight = self.lin.weight * mask   
        return (x * masked_weight).sum(-1) + self.lin.bias #torch.nn.functional.linear(x, masked_weight, self.lin.bias)

class Ensemble(torch.nn.Module):
    def __init__(self, hidden_dim, dropout, act = "RELU"):
        self.lr_model = LR()
        self.gnn = GNN(hidden_dim, dropout, act)
        self.lin = Linear(1+hidden_dim, 1)
    
    def forward(self, x_models, mask, drug_graphs):
        x_models = self.lr_model(x_models, mask)
        
        x_drugs = self.gnn(x_drugs, edge_index_drugs).mean(0)
        x_combined = torch.cat((x_models, x_drugs), dim = -1)
        x_combined = self.lin(x_combined)
        return x_combined        

In [55]:
train_graphs[acetalax_train_mask]

array([Data(x=[30, 79], edge_index=[2, 66], edge_attr=[66, 10]),
       Data(x=[30, 79], edge_index=[2, 66], edge_attr=[66, 10]),
       Data(x=[30, 79], edge_index=[2, 66], edge_attr=[66, 10]),
       Data(x=[30, 79], edge_index=[2, 66], edge_attr=[66, 10]),
       Data(x=[30, 79], edge_index=[2, 66], edge_attr=[66, 10]),
       Data(x=[30, 79], edge_index=[2, 66], edge_attr=[66, 10]),
       Data(x=[30, 79], edge_index=[2, 66], edge_attr=[66, 10]),
       Data(x=[30, 79], edge_index=[2, 66], edge_attr=[66, 10]),
       Data(x=[30, 79], edge_index=[2, 66], edge_attr=[66, 10]),
       Data(x=[30, 79], edge_index=[2, 66], edge_attr=[66, 10]),
       Data(x=[30, 79], edge_index=[2, 66], edge_attr=[66, 10]),
       Data(x=[30, 79], edge_index=[2, 66], edge_attr=[66, 10]),
       Data(x=[30, 79], edge_index=[2, 66], edge_attr=[66, 10]),
       Data(x=[30, 79], edge_index=[2, 66], edge_attr=[66, 10]),
       Data(x=[30, 79], edge_index=[2, 66], edge_attr=[66, 10]),
       Data(x=[30, 79], e

In [68]:
train_features[acetalax_train_mask].shape

(505, 38975)

In [51]:
## TODO just test basic LR with masks to make sure everything is correct till now

model = LR()
optim = torch.optim.Adam(params=model.parameters(), lr = 1e-2, weight_decay=1e-5)
loss_fun = torch.nn.MSELoss()
for epoch in range(1000):
    model.train()
    out = model(torch.from_numpy(train_features[acetalax_train_mask]).type(torch.float), torch.from_numpy(train_drug_masks[acetalax_train_mask]))
    loss = loss_fun(out, torch.from_numpy(y_train[acetalax_train_mask]).type(torch.float))
    print(loss.item())
    optim.zero_grad()
    loss.backward()
    optim.step()

1094.8582763671875
175502.40625
11636.173828125
38981.4609375
99821.03125
69628.5234375
15568.5068359375
1550.15234375
28667.35546875
50661.80859375
40897.1875
14984.453125
258.9750671386719
7245.84912109375
22682.212890625
27314.751953125
17260.125
4105.26123046875
247.90655517578125
6862.2705078125
14419.677734375
14180.9013671875
7048.40234375
783.123779296875
1001.8217163085938
5791.9931640625
8801.5048828125
6677.40380859375
2123.106201171875
7.524038791656494
1832.9837646484375
4599.81689453125
4783.626953125
2342.4013671875
199.05003356933594
490.087158203125
2255.294189453125
3005.726806640625
1859.7249755859375
322.5599060058594
131.68885803222656
1148.35400390625
1809.4844970703125
1253.3265380859375
267.4592590332031
52.398258209228516
657.1261596679688
1095.26318359375
763.1075439453125
153.4313507080078
42.92595672607422
433.7669677734375
672.6166381835938
421.70513916015625
59.5186882019043
55.267852783203125
316.8634338378906
406.7918395996094
201.20547485351562
10.78996

In [52]:
from sklearn.metrics import r2_score

with torch.inference_mode():
    model.eval()
    out = model(torch.from_numpy(test_features[acetalax_test_mask]).type(torch.float), torch.from_numpy(test_drug_masks[acetalax_test_mask]))
    # out = torch.diagonal(out, 0)
    score = r2_score(y_test[acetalax_test_mask], out.cpu().numpy())
print(score)

0.7642579948187986


## Drugs where we might need web scraping techniques

In [53]:
 ##TODO selenium script on PUbchem to webscrape the smiles?
targets_drug_pubchem_df[targets_drug_pubchem_df["PUB_CHEM_ID"].isin(["None", "none", "-"])].loc[:,["DRUG_NAME_x", "SYNONYMS"]].drop_duplicates()

KeyError: "None of [Index(['DRUG_NAME_x', 'SYNONYMS'], dtype='object')] are in the [columns]"