In [1]:
import warnings
import pandas as pd
import numpy as np
import utils as ut
from DeepPurpose import utils, dataset
warnings.filterwarnings("ignore")

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Preprocessing: L1000 Chemical Perturbation dataset

In [56]:
print('Preprocess L1000 Chemical Perturbation dataset...')
def read_gmt(file_path):
    gene_sets = {}
    with open(file_path, 'r') as file:
        for line in file:
            parts = line.strip().split('\t')
            gene_set_name = parts[0]
            # description = parts[1]
            genes = parts[2:]
            gene_sets[gene_set_name] = genes
            # {
            #     # "description": description,
            #     "genes": genes
            # }
    return gene_sets

PATH = '/lustre/home/debnathk/gramseq/'

gmt_file_path = PATH + "data/l1000/l1000_cp.gmt"
print('Printing the dataset...')
gene_sets = read_gmt(gmt_file_path)

df_l1000 = pd.DataFrame({'pert_name': gene_sets.keys(), 'genes': gene_sets.values()})
genes_df = pd.DataFrame(df_l1000['genes'].to_list(), columns=[f'gene_{i+1}' for i in range(df_l1000['genes'].str.len().max())])
df_l1000 = pd.concat([df_l1000.drop(columns=['genes']), genes_df], axis=1)

# Filter instances with concentrations 10 uM
df_l1000 = df_l1000.loc[df_l1000['pert_name'].str.contains('10uM')]

# Save as csv
df_l1000.to_csv(PATH + 'data/l1000/l1000_cp.csv', index=False)
print(f'No of instances in L1000 RNA-Seq dataset: {len(df_l1000)}')
df_l1000.head()

Preprocess L1000 Chemical Perturbation dataset...
Printing the dataset...
No of instances in L1000 RNA-Seq dataset: 440842


Unnamed: 0,pert_name,gene_1,gene_2,gene_3,gene_4,gene_5,gene_6,gene_7,gene_8,gene_9,...,gene_241,gene_242,gene_243,gene_244,gene_245,gene_246,gene_247,gene_248,gene_249,gene_250
0,ABY001_A375_XH_A13_afatinib_10uM up,RRAGC,VPS8,KCNJ2,DUSP5,BEX1,LIMS1,RAB13,COLEC12,ARMCX2,...,GEM,BEX3,VEGFA,NUPR1,TSPAN6,CA12,TMEM158,CHI3L1,CDC20,HMOX1
1,ABY001_A375_XH_A13_afatinib_10uM down,PCNA,S100A7,DNMT1,S100P,S100A9,PUF60,TMEM45A,GOLT1B,PRSS23,...,AKAP12,SCEL,CTSC,EAF2,MORF4L1,KCNK3,MYB,MAF,LTF,MFNG
2,ABY001_A375_XH_A14_erlotinib_10uM up,CDKN2C,TIMP3,COLGALT2,AMPD3,TGFB1,SERPINB3,MMP7,PIGR,WDR61,...,RHOBTB3,TMSB15A,RPS4Y1,JCHAIN,PLAT,FAM129A,ASNS,PCNA,TNFSF10,CHAC1
3,ABY001_A375_XH_A14_erlotinib_10uM down,PCP4,SPINK1,STEAP1,HOXC6,ITGB1BP1,MRPS16,XIST,UCHL1,FABP4,...,PEG3,WBP1L,SCG5,ATP5F1E,CCL19,EGLN1,MAST4,ATP6V1H,GPX2,EBP
4,ABY001_A375_XH_A15_neratinib_10uM up,ITGAE,PRPF4,UBE2S,OGT,RRAGC,ASNA1,TNFRSF1A,FGL2,GOLGA8A,...,HSPA1A,CHI3L1,KLK11,MSMO1,ADM,HSPA8,TMEM158,KRT14,HMGCS1,HMOX1


## Preprocessing: BindingDB dataset

In [3]:
print('Processing BindingDB dataset...')
X_names, X_smiles, X_targets, y = dataset.process_BindingDB(path= PATH + 'data/BindingDB/BindingDB_All_202407.tsv', y='Kd', binary = False, \
					convert_to_log = True, threshold = 30)

df_bindingdb = pd.DataFrame({'name': X_names, 'smiles': X_smiles, 'target sequence': X_targets, 'affinity': y})
df_bindingdb.to_csv(PATH + 'data/BindingDB/preprocessed/bindingdb.csv', index=False)

Processing BindingDB dataset...
Loading Dataset from path...
Beginning Processing...
There are 91751 drug target pairs.
Default set to logspace (nM -> p) for easier regression


In [4]:
print('Dataset summary: BindingDB dataset (Preprocessed)')
print(f'No of unique drugs: {len(set(X_smiles))}')
print(f'No of unique target: {len(set(X_targets))}')
print(f'No of total interactions: {len(X_smiles)}')

Dataset summary: BindingDB dataset (Preprocessed)
No of unique drugs: 22381
No of unique target: 1860
No of total interactions: 91751


## Create RNA-Seq data: BindingDB dataset

In [21]:
# Extract up genes
df_up = df_l1000[df_l1000['pert_name'].str.contains('up')]
df_up.head()

Unnamed: 0,pert_name,gene_1,gene_2,gene_3,gene_4,gene_5,gene_6,gene_7,gene_8,gene_9,...,gene_241,gene_242,gene_243,gene_244,gene_245,gene_246,gene_247,gene_248,gene_249,gene_250
0,ABY001_A375_XH_A13_afatinib_10uM up,RRAGC,VPS8,KCNJ2,DUSP5,BEX1,LIMS1,RAB13,COLEC12,ARMCX2,...,GEM,BEX3,VEGFA,NUPR1,TSPAN6,CA12,TMEM158,CHI3L1,CDC20,HMOX1
2,ABY001_A375_XH_A14_erlotinib_10uM up,CDKN2C,TIMP3,COLGALT2,AMPD3,TGFB1,SERPINB3,MMP7,PIGR,WDR61,...,RHOBTB3,TMSB15A,RPS4Y1,JCHAIN,PLAT,FAM129A,ASNS,PCNA,TNFSF10,CHAC1
4,ABY001_A375_XH_A15_neratinib_10uM up,ITGAE,PRPF4,UBE2S,OGT,RRAGC,ASNA1,TNFRSF1A,FGL2,GOLGA8A,...,HSPA1A,CHI3L1,KLK11,MSMO1,ADM,HSPA8,TMEM158,KRT14,HMGCS1,HMOX1
6,ABY001_A375_XH_A16_lapatinib_10uM up,SLC1A3,TPPP,SELENOP,ADGRL3,SERPINA4,MAP2K5,MMP26,NLRP2,RHOA,...,NNT,CRIP1,TGM2,IL32,MYRF,SPRY1,SLC11A2,ALDH1A1,FGFR3,CASP10
8,ABY001_A375_XH_B13_afatinib_2.5uM up,RP2,S100A11,CHST11,MARCKSL1,WAC,MPO,ITGB1,NNMT,CACNA1A,...,GDF15,CEBPB,CRISP3,MMP7,NFIL3,CASP10,AQP3,ASNS,CALML5,FBXO11


In [22]:
# Clean the drug names in the replicates - up
df_up['pert_name'] = df_up['pert_name'].apply(ut.extract_drug_name)
df_up.head()

Unnamed: 0,pert_name,gene_1,gene_2,gene_3,gene_4,gene_5,gene_6,gene_7,gene_8,gene_9,...,gene_241,gene_242,gene_243,gene_244,gene_245,gene_246,gene_247,gene_248,gene_249,gene_250
0,afatinib,RRAGC,VPS8,KCNJ2,DUSP5,BEX1,LIMS1,RAB13,COLEC12,ARMCX2,...,GEM,BEX3,VEGFA,NUPR1,TSPAN6,CA12,TMEM158,CHI3L1,CDC20,HMOX1
2,erlotinib,CDKN2C,TIMP3,COLGALT2,AMPD3,TGFB1,SERPINB3,MMP7,PIGR,WDR61,...,RHOBTB3,TMSB15A,RPS4Y1,JCHAIN,PLAT,FAM129A,ASNS,PCNA,TNFSF10,CHAC1
4,neratinib,ITGAE,PRPF4,UBE2S,OGT,RRAGC,ASNA1,TNFRSF1A,FGL2,GOLGA8A,...,HSPA1A,CHI3L1,KLK11,MSMO1,ADM,HSPA8,TMEM158,KRT14,HMGCS1,HMOX1
6,lapatinib,SLC1A3,TPPP,SELENOP,ADGRL3,SERPINA4,MAP2K5,MMP26,NLRP2,RHOA,...,NNT,CRIP1,TGM2,IL32,MYRF,SPRY1,SLC11A2,ALDH1A1,FGFR3,CASP10
8,afatinib,RP2,S100A11,CHST11,MARCKSL1,WAC,MPO,ITGB1,NNMT,CACNA1A,...,GDF15,CEBPB,CRISP3,MMP7,NFIL3,CASP10,AQP3,ASNS,CALML5,FBXO11


In [31]:
print(f'No of unique perturbagens in L1000 dataset: {len(df_up["pert_name"].unique())}')

No of unique perturbagens in L1000 dataset: 33587


In [54]:
# Filter bindingdb drugs present in l1000 data
selected_names = []
for name in X_names:
    for pert in df_up['pert_name']: 
        if pert in name:
            selected_names.append(pert)

print(selected_names)
           

# df_bindingdb2 = df_bindingdb[df_bindingdb['Name'].apply(lambda x: any(substring in x for substring in set(df_up['0'])))]
# df_bindingdb2.head()


KeyboardInterrupt: 

In [None]:
# One-hot encoding of drug SMILES
# S = pd.Series(X_drugs.unique()).apply(utils.smiles2onehot)
# S_dict = dict(zip(X_drugs.unique(), S))
# df_drugs = [S_dict[i] for i in X_drugs]
# one_hot_drugs = np.array(df_drugs)
# print(f'One-hot encoding of drug: {one_hot_drugs.shape}')

In [23]:
# Extract down genes
df_down = df_l1000[df_l1000['pert_name'].str.contains('down')]
# print(df_l1000_down.head())

# Clean the drug names in the replicates - down
df_down['pert_name'] = df_down['pert_name'].apply(ut.extract_drug_name)
df_down.head()

Unnamed: 0,pert_name,gene_1,gene_2,gene_3,gene_4,gene_5,gene_6,gene_7,gene_8,gene_9,...,gene_241,gene_242,gene_243,gene_244,gene_245,gene_246,gene_247,gene_248,gene_249,gene_250
1,afatinib,PCNA,S100A7,DNMT1,S100P,S100A9,PUF60,TMEM45A,GOLT1B,PRSS23,...,AKAP12,SCEL,CTSC,EAF2,MORF4L1,KCNK3,MYB,MAF,LTF,MFNG
3,erlotinib,PCP4,SPINK1,STEAP1,HOXC6,ITGB1BP1,MRPS16,XIST,UCHL1,FABP4,...,PEG3,WBP1L,SCG5,ATP5F1E,CCL19,EGLN1,MAST4,ATP6V1H,GPX2,EBP
5,neratinib,RPS4Y1,KRT18,THBS1,IGFBP3,PRSS23,ALDH1A3,CLU,PTCH1,CSGALNACT1,...,HIST1H2AC,PPM1H,ENPP2,ACP6,PEBP1,ATP1B3,COL14A1,SMAD3,RAB11FIP1,MANSC1
7,lapatinib,PUF60,HSPA1A,SPINK1,C3orf14,CYP1B1,MMP12,RRS1,KRT23,TSPYL5,...,MINPP1,PSMB10,STK10,PPP3CA,GAREM1,TTC19,EEF1D,PITRM1,CCN3,IL1R1
9,afatinib,MT1E,MAMLD1,SFRP1,GNAS,PCNA,TMEM176B,HEATR1,ROBO1,SERPINE2,...,FKBP1B,PPL,VCAM1,POLE2,PLOD2,CKS2,BATF3,PKIA,RAB27A,PLSCR4


In [9]:
# df_down.to_csv('../data/df_down.csv', index=False)

In [13]:
# drug_l1000_list = []
# for smiles, drug in smiles_cmap.items():
#     for drug_l1000 in df_down['0']:
#         if drug.lower() == drug_l1000.lower():
#             if drug not in drug_l1000_list:
#                 drug_l1000_list.append(drug)

# print(drug_l1000_list)

In [24]:
landmark_genes = pd.read_csv(PATH + 'data/landmark_genes.csv', header=None)

In [27]:
data_reg_list = []
for drug in df_up['pert_name'].unique()[0]:
    drug_count = 0
    df_reg = landmark_genes
    df_reg['up'] = [0] * 978
    df_reg['down'] = [0] * 978
    for drug_name in df_down['pert_name']:
        if drug_name == drug:
            drug_count += 1
    filtered_up = df_up[df_up['pert_name'] == drug]
    filtered_down = df_down[df_down['pert_name'] == drug]
    array_up = filtered_up.iloc[:, 1:].values
    array_up = array_up.flatten()
    array_down = filtered_down.iloc[:, 1:].values
    array_down = array_down.flatten()
    for item in array_up:
        df_reg.loc[df_reg[0] == item, 'up'] += 1
    for item in array_down:
        df_reg.loc[df_reg[0] == item, 'down'] += 1
    df_reg = df_reg.iloc[:, 1:] / drug_count
    df_reg = df_reg.values
    data_reg_list.append(df_reg)

data = np.stack(data_reg_list)
print(data.shape)

(8, 978, 2)


In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from DeepPurpose import utils, dataset
from DeepPurpose import DTI as models
import warnings
import pandas as pd
import numpy as np
import utils
from predictor_gvae_rnaseq_rnn import DLEPS
import tensorflow as tf
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
import argparse
import matplotlib.pyplot as plt
from prettytable import PrettyTable
import h5py
import logging
import time
warnings.filterwarnings("ignore")

PATH  = '/lustre/home/debnathk/gramseq/'

# Preprocess bindingdb dataset
X_names, X_smiles, X_targets, y = dataset.process_BindingDB(path= PATH + 'data/BindingDB/BindingDB_All_202407.tsv', y='Kd', binary = False, \
					convert_to_log = True, threshold = 30)
df_bindingdb = pd.DataFrame({'name': X_names, 'smiles': X_smiles, 'target sequence': X_targets, 'affinity': y})
df_bindingdb.to_csv(PATH + 'data/BindingDB/preprocessed/bindingdb.csv', index=False)

# print(df_bindingdb.head())
print('Dataset summary: BindingDB dataset (Preprocessed)')
print(f'No of unique drugs: {len(set(X_smiles))}')
print(f'No of unique targets: {len(set(X_targets))}')
print(f'No of total interactions: {len(X_smiles)}')

# Convert drugs to series object
X_smiles = pd.Series(X_smiles)

# One-hot encoding of drug SMILES
S = pd.Series(X_smiles.unique()).apply(utils.smiles2onehot)
S_dict = dict(zip(X_smiles.unique(), S))
df_drugs = [S_dict[i] for i in X_smiles]
one_hot_drugs = np.array(df_drugs)
print(f'One-hot encoding of drug: {one_hot_drugs.shape}')

# Convert proteins to series object
X_targets = pd.Series(X_targets)

# One-hot encoding of proteins
AA = pd.Series(X_targets.unique()).apply(utils.protein2onehot)
AA_dict = dict(zip(X_targets.unique(), AA))
df_proteins = [AA_dict[i] for i in X_targets]
one_hot_proteins = np.array(df_proteins)
print(f'One-hot encoding of protein: {one_hot_proteins.shape}')

print(f'No of Labels: {y.shape}')

print("-----------------------Training - GVAE + RNN----------------------------")


2024-09-04 10:52:51.363646: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-09-04 10:53:18.459293: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-09-04 10:53:18.459347: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2024-09-04 10:53:20.140598: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-04 10:53:58.691084: W tensorflow/stream_executor/platform/de

Loading Dataset from path...
Beginning Processing...
There are 91751 drug target pairs.
Default set to logspace (nM -> p) for easier regression
Dataset summary: BindingDB dataset (Preprocessed)
No of unique drugs: 22381
No of unique targets: 1860
No of total interactions: 91751


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7fe610993940>>
Traceback (most recent call last):
  File "/lustre/home/debnathk/.local/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


One-hot encoding of drug: (91751, 277, 76)
One-hot encoding of protein: (91751, 26, 1000)
No of Labels: (91751,)
-----------------------Training - GVAE + RNN----------------------------


In [1]:
import pickle
import json

PATH = '/home/debnathk/gramseq/'

# Load l1000 data
with open(PATH + 'data/l1000/l1000_vectors.pkl', 'rb') as file:
    data = pickle.load(file)
file.close()

# data.shape
# Load l1000 perturbagens dictionary file
with open(PATH + 'data/l1000/l1000_pert_dict.txt', 'r') as file:
    dict_l1000 = json.load(file)
    
dict_l1000


JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)