In [1]:
import warnings
import pandas as pd
import numpy as np
import utils as ut
from DeepPurpose import utils, dataset
warnings.filterwarnings("ignore")

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Preprocessing: BindingDB dataset

In [2]:
print('Processing BindingDB dataset...')
PATH = '/lustre/home/debnathk/gramseq/'
X_names, X_smiles, X_targets, y = dataset.process_BindingDB(path= PATH + 'data/BindingDB/BindingDB_All_202407.tsv', y='Kd', binary = False, \
					convert_to_log = True, threshold = 30)

df_bindingdb = pd.DataFrame({'name': X_names, 'smiles': X_smiles, 'target sequence': X_targets, 'affinity': y})
df_bindingdb.to_csv(PATH + 'data/BindingDB/preprocessed/bindingdb.csv', index=False)

Processing BindingDB dataset...
Loading Dataset from path...
Beginning Processing...
There are 91751 drug target pairs.
Default set to logspace (nM -> p) for easier regression


In [4]:
print('Dataset summary: BindingDB dataset (Preprocessed)')
print(f'No of unique drugs: {len(set(X_smiles))}')
print(f'No of unique target: {len(set(X_targets))}')
print(f'No of total interactions: {len(X_smiles)}')

Dataset summary: BindingDB dataset (Preprocessed)
No of unique drugs: 22381
No of unique target: 1860
No of total interactions: 91751


Preprocess the downloaded l1000_cp file here

In [12]:
df_l1000 = pd.read_csv('../data/l1000_cp_10uM_all.csv')
df_l1000.head()

Unnamed: 0,0,2,3,4,5,6,7,8,9,10,...,242,243,244,245,246,247,248,249,250,251
0,ABY001_A375_XH_A13_afatinib_10uM up,RRAGC,VPS8,KCNJ2,DUSP5,BEX1,LIMS1,RAB13,COLEC12,ARMCX2,...,GEM,BEX3,VEGFA,NUPR1,TSPAN6,CA12,TMEM158,CHI3L1,CDC20,HMOX1
1,ABY001_A375_XH_A13_afatinib_10uM down,PCNA,S100A7,DNMT1,S100P,S100A9,PUF60,TMEM45A,GOLT1B,PRSS23,...,AKAP12,SCEL,CTSC,EAF2,MORF4L1,KCNK3,MYB,MAF,LTF,MFNG
2,ABY001_A375_XH_A14_erlotinib_10uM up,CDKN2C,TIMP3,COLGALT2,AMPD3,TGFB1,SERPINB3,MMP7,PIGR,WDR61,...,RHOBTB3,TMSB15A,RPS4Y1,JCHAIN,PLAT,FAM129A,ASNS,PCNA,TNFSF10,CHAC1
3,ABY001_A375_XH_A14_erlotinib_10uM down,PCP4,SPINK1,STEAP1,HOXC6,ITGB1BP1,MRPS16,XIST,UCHL1,FABP4,...,PEG3,WBP1L,SCG5,ATP5F1E,CCL19,EGLN1,MAST4,ATP6V1H,GPX2,EBP
4,ABY001_A375_XH_A15_neratinib_10uM up,ITGAE,PRPF4,UBE2S,OGT,RRAGC,ASNA1,TNFRSF1A,FGL2,GOLGA8A,...,HSPA1A,CHI3L1,KLK11,MSMO1,ADM,HSPA8,TMEM158,KRT14,HMGCS1,HMOX1


In [13]:

# Extract up genes
df_up = df_l1000[df_l1000['0'].str.contains('up')]
df_up.head()

Unnamed: 0,0,2,3,4,5,6,7,8,9,10,...,242,243,244,245,246,247,248,249,250,251
0,ABY001_A375_XH_A13_afatinib_10uM up,RRAGC,VPS8,KCNJ2,DUSP5,BEX1,LIMS1,RAB13,COLEC12,ARMCX2,...,GEM,BEX3,VEGFA,NUPR1,TSPAN6,CA12,TMEM158,CHI3L1,CDC20,HMOX1
2,ABY001_A375_XH_A14_erlotinib_10uM up,CDKN2C,TIMP3,COLGALT2,AMPD3,TGFB1,SERPINB3,MMP7,PIGR,WDR61,...,RHOBTB3,TMSB15A,RPS4Y1,JCHAIN,PLAT,FAM129A,ASNS,PCNA,TNFSF10,CHAC1
4,ABY001_A375_XH_A15_neratinib_10uM up,ITGAE,PRPF4,UBE2S,OGT,RRAGC,ASNA1,TNFRSF1A,FGL2,GOLGA8A,...,HSPA1A,CHI3L1,KLK11,MSMO1,ADM,HSPA8,TMEM158,KRT14,HMGCS1,HMOX1
6,ABY001_A375_XH_A16_lapatinib_10uM up,SLC1A3,TPPP,SELENOP,ADGRL3,SERPINA4,MAP2K5,MMP26,NLRP2,RHOA,...,NNT,CRIP1,TGM2,IL32,MYRF,SPRY1,SLC11A2,ALDH1A1,FGFR3,CASP10
8,ABY001_A375_XH_D13_pazopanib_10uM up,TMEM185B,COL9A3,DDX42,UPK1B,SCG2,LARP6,PRKCQ,SCN3A,CD74,...,KDM5D,NEBL,XBP1,ALDH1A1,CIAO3,GOLT1B,ALDH3A1,AKR1B10,ZFP36,RPS4Y1


In [14]:
# Clean the drug names in the replicates - up
df_up['0'] = df_up['0'].apply(ut.extract_drug_name)
df_up.head()

Unnamed: 0,0,2,3,4,5,6,7,8,9,10,...,242,243,244,245,246,247,248,249,250,251
0,afatinib,RRAGC,VPS8,KCNJ2,DUSP5,BEX1,LIMS1,RAB13,COLEC12,ARMCX2,...,GEM,BEX3,VEGFA,NUPR1,TSPAN6,CA12,TMEM158,CHI3L1,CDC20,HMOX1
2,erlotinib,CDKN2C,TIMP3,COLGALT2,AMPD3,TGFB1,SERPINB3,MMP7,PIGR,WDR61,...,RHOBTB3,TMSB15A,RPS4Y1,JCHAIN,PLAT,FAM129A,ASNS,PCNA,TNFSF10,CHAC1
4,neratinib,ITGAE,PRPF4,UBE2S,OGT,RRAGC,ASNA1,TNFRSF1A,FGL2,GOLGA8A,...,HSPA1A,CHI3L1,KLK11,MSMO1,ADM,HSPA8,TMEM158,KRT14,HMGCS1,HMOX1
6,lapatinib,SLC1A3,TPPP,SELENOP,ADGRL3,SERPINA4,MAP2K5,MMP26,NLRP2,RHOA,...,NNT,CRIP1,TGM2,IL32,MYRF,SPRY1,SLC11A2,ALDH1A1,FGFR3,CASP10
8,pazopanib,TMEM185B,COL9A3,DDX42,UPK1B,SCG2,LARP6,PRKCQ,SCN3A,CD74,...,KDM5D,NEBL,XBP1,ALDH1A1,CIAO3,GOLT1B,ALDH3A1,AKR1B10,ZFP36,RPS4Y1


In [15]:
# Filter bindingdb drugs present in l1000 data
selected_names = df_up['0'][df_up['0'].apply(lambda substring: any(substring in name for name in df_bindingdb['Name'].unique()))]

df_bindingdb2 = df_bindingdb[df_bindingdb['Name'].apply(lambda x: any(substring in x for substring in set(df_up['0'])))]
df_bindingdb2.head()


KeyboardInterrupt: 

In [None]:
# One-hot encoding of drug SMILES
# S = pd.Series(X_drugs.unique()).apply(utils.smiles2onehot)
# S_dict = dict(zip(X_drugs.unique(), S))
# df_drugs = [S_dict[i] for i in X_drugs]
# one_hot_drugs = np.array(df_drugs)
# print(f'One-hot encoding of drug: {one_hot_drugs.shape}')

In [11]:
    # Extract down genes
df_down = df_l1000[df_l1000['0'].str.contains('down')]
# print(df_l1000_down.head())

# Clean the drug names in the replicates - down
df_down['0'] = df_down['0'].apply(ut.extract_drug_name)
df_down.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_down['0'] = df_down['0'].apply(ut.extract_drug_name)


Unnamed: 0,0,2,3,4,5,6,7,8,9,10,...,242,243,244,245,246,247,248,249,250,251
1,afatinib,PCNA,S100A7,DNMT1,S100P,S100A9,PUF60,TMEM45A,GOLT1B,PRSS23,...,AKAP12,SCEL,CTSC,EAF2,MORF4L1,KCNK3,MYB,MAF,LTF,MFNG
3,erlotinib,PCP4,SPINK1,STEAP1,HOXC6,ITGB1BP1,MRPS16,XIST,UCHL1,FABP4,...,PEG3,WBP1L,SCG5,ATP5F1E,CCL19,EGLN1,MAST4,ATP6V1H,GPX2,EBP
5,neratinib,RPS4Y1,KRT18,THBS1,IGFBP3,PRSS23,ALDH1A3,CLU,PTCH1,CSGALNACT1,...,HIST1H2AC,PPM1H,ENPP2,ACP6,PEBP1,ATP1B3,COL14A1,SMAD3,RAB11FIP1,MANSC1
7,lapatinib,PUF60,HSPA1A,SPINK1,C3orf14,CYP1B1,MMP12,RRS1,KRT23,TSPYL5,...,MINPP1,PSMB10,STK10,PPP3CA,GAREM1,TTC19,EEF1D,PITRM1,CCN3,IL1R1
9,pazopanib,MRPS16,PIN1,XIST,LSM5,CXCL14,CDC20,F13A1,TTC39A,PRDX2,...,PPP1R2,NCF1C,MTCL1,RNF19B,MAOA,CRISPLD2,OSBPL8,SNX10,BBX,PPFIA1


In [9]:
# df_down.to_csv('../data/df_down.csv', index=False)

In [10]:
# drug_l1000_list = []
# for smiles, drug in smiles_cmap.items():
#     for drug_l1000 in df_down['0']:
#         if drug.lower() == drug_l1000.lower():
#             if drug not in drug_l1000_list:
#                 drug_l1000_list.append(drug)

# print(drug_l1000_list)

['L-theanine', 'L-citrulline', 'BRD-A18795974']


In [12]:
landmark_genes = pd.read_csv('../data/landmark_genes.csv', header=None)

In [13]:
data_reg_list = []
for drug in selected_names:
    drug_count = 0
    df_reg = landmark_genes
    df_reg['up'] = [0] * 978
    df_reg['down'] = [0] * 978
    for drug_name in df_down['0']:
        if drug_name == drug:
            drug_count += 1
    filtered_up = df_up[df_up['0'] == drug]
    filtered_down = df_down[df_down['0'] == drug]
    array_up = filtered_up.iloc[:, 1:].values
    array_up = array_up.flatten()
    array_down = filtered_down.iloc[:, 1:].values
    array_down = array_down.flatten()
    for item in array_up:
        df_reg.loc[df_reg[0] == item, 'up'] += 1
    for item in array_down:
        df_reg.loc[df_reg[0] == item, 'down'] += 1
    df_reg = df_reg.iloc[:, 1:] / drug_count
    df_reg = df_reg.values
    data_reg_list.append(df_reg)

data = np.stack(data_reg_list)
print(data.shape)