In [10]:
from __future__ import print_function
import random, pickle
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch.utils.data as data
from scipy.stats import pearsonr
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

# Dependencies

In [17]:
# Precomputed 1024-bit drug fingerprints. The CSV has no header row and its
# first column is used as the row label that get_drug_fp_batch matches SMILES
# strings against (presumably the SMILES itself -- TODO confirm against the file).
fn = '/localscratch2/han/AnchorDrug/data/drug_fingerprints-1024.csv'
fp_table = pd.read_csv(fn, header=None, index_col=0)
# Split the table into its row labels and a plain NumPy matrix of values.
fp_name, fp_map = fp_table.index, fp_table.to_numpy()

def get_morgan_fingerprint(mol, radius, nBits, FCFP=False):
    """Compute a Morgan fingerprint for one SMILES string as a 0/1 array.

    Args:
        mol: SMILES string, parsed with ``Chem.MolFromSmiles``.
        radius: Morgan radius forwarded to RDKit.
        nBits: Bit-vector size forwarded to RDKit.
        FCFP: Forwarded to RDKit as ``useFeatures``.

    Returns:
        1-D ``uint8`` NumPy array with one 0/1 entry per character of the
        fingerprint's bit string.

    Raises:
        ValueError: If RDKit cannot parse ``mol`` (``MolFromSmiles`` returned
            ``None``).
    """
    m = Chem.MolFromSmiles(mol)
    if m is None:
        # Fail with the offending input instead of an opaque RDKit argument error.
        raise ValueError('Could not parse SMILES: {!r}'.format(mol))
    fp = AllChem.GetMorganFingerprintAsBitVect(m, radius=radius, nBits=nBits, useFeatures=FCFP)
    fp_bits = fp.ToBitString()
    # The bit string holds only '0'/'1' characters; view its ASCII bytes and
    # shift by ord('0') to get numeric 0/1. np.frombuffer replaces the
    # deprecated binary mode of np.fromstring.
    finger_print = np.frombuffer(fp_bits.encode('ascii'), dtype='u1') - ord('0')
    return finger_print

def get_drug_fp_batch(smile):
    """Look up (or compute) a fingerprint for every SMILES in ``smile``.

    Each SMILES is matched against the module-level ``fp_name`` labels; on a
    hit the corresponding ``fp_map`` row is used. On a miss the SMILES is
    printed and a fingerprint is computed with
    ``get_morgan_fingerprint(s, 3, 1024, FCFP=False)``.

    Args:
        smile: Iterable of SMILES strings.

    Returns:
        NumPy array built from the collected fingerprints, in input order
        (an empty array for empty input).
    """
    fp_features = []
    for s in smile:
        # Explicit "no match" test instead of a bare ``except:``, so that
        # unrelated failures (and KeyboardInterrupt) are no longer swallowed.
        matches = np.flatnonzero(fp_name == s)
        if matches.size:
            # First match wins, as with ``np.where(...)[0][0]``.
            fp_features.append(fp_map[matches[0]])
        else:
            # Report the table miss, then compute the fingerprint on the fly.
            print(s)
            fp_features.append(get_morgan_fingerprint(s, 3, 1024, FCFP=False))
    fp_features = np.array(fp_features)
    return fp_features

def get_drug_rdkfp_batch(smile):
    """Compute an RDKit fingerprint for every SMILES in ``smile``.

    Args:
        smile: Iterable of SMILES strings; progress is shown with ``tqdm``.

    Returns:
        NumPy array built from the list of ``Chem.RDKFingerprint`` results, in
        input order (the notebook's recorded output shows shape ``(n, 2048)``).

    Raises:
        ValueError: If RDKit cannot parse one of the SMILES strings.
    """
    rdkfp = []
    for s in tqdm(smile):
        ref = Chem.MolFromSmiles(s)
        if ref is None:
            # Name the offending input instead of failing inside RDKit.
            raise ValueError('Could not parse SMILES: {!r}'.format(s))
        rdkfp.append(Chem.RDKFingerprint(ref))
    rdkfp = np.array(rdkfp)
    return rdkfp

# Load data

In [12]:
# Cell lines to keep from the expression table.
cell_list = ['ASC', 'NPC', 'HCC515', 'HT29', 'A375', 'HA1E', 'VCAP', 'A549', 'PC3', 'MCF7']
# Read the full table, then keep only rows whose cell line is in cell_list.
df_data = pd.read_csv('/localscratch2/han/AnchorDrug/data/level5_beta_trt_cp_24h_10uM.csv')
in_selected_cells = df_data['cell_iname'].isin(cell_list)
df_data = df_data.loc[in_selected_cells]
# Every column from the fifth onward is treated as a gene/label column
# (assumes the first four columns are metadata -- TODO confirm).
gene_list = list(df_data.columns[4:])
# Filled per cell line by the dataset-building loop.
data_dict = dict()

In [16]:
# For each cell line, store fingerprints and gene-level labels in data_dict.
for cell in cell_list:
    print(cell)
    is_cell = df_data['cell_iname'] == cell
    # Collapse rows sharing a SMILES to the per-column median of the numeric
    # columns, then turn SMILES back into a regular column.
    df_finetune = (
        df_data.loc[is_cell]
        .groupby(by='SMILES')
        .median(numeric_only=True)
        .reset_index()
    )
    drug_smiles = df_finetune['SMILES'].tolist()
    drug_ecfps = get_drug_fp_batch(drug_smiles)
    drug_rdkfps = get_drug_rdkfp_batch(drug_smiles)
    labels = df_finetune.loc[:, gene_list].to_numpy()
    print(drug_ecfps.shape, drug_rdkfps.shape, labels.shape)
    data_dict[cell] = dict(ecfp=drug_ecfps, rdkfp=drug_rdkfps, label=labels)

ASC


100%|██████████| 1021/1021 [00:01<00:00, 687.92it/s]


(1021, 1024) (1021, 2048) (1021, 978)
NPC


100%|██████████| 1234/1234 [00:01<00:00, 1064.92it/s]


(1234, 1024) (1234, 2048) (1234, 978)
HCC515


100%|██████████| 1377/1377 [00:01<00:00, 996.53it/s] 


(1377, 1024) (1377, 2048) (1377, 978)
HT29


100%|██████████| 1424/1424 [00:01<00:00, 989.96it/s] 


(1424, 1024) (1424, 2048) (1424, 978)
A375


100%|██████████| 2035/2035 [00:01<00:00, 1017.52it/s]


(2035, 1024) (2035, 2048) (2035, 978)
HA1E


100%|██████████| 2132/2132 [00:02<00:00, 974.07it/s] 


(2132, 1024) (2132, 2048) (2132, 978)
VCAP


100%|██████████| 2181/2181 [00:02<00:00, 999.82it/s] 


(2181, 1024) (2181, 2048) (2181, 978)
A549


100%|██████████| 2573/2573 [00:02<00:00, 942.73it/s] 


(2573, 1024) (2573, 2048) (2573, 978)
PC3


100%|██████████| 3060/3060 [00:03<00:00, 951.95it/s] 


(3060, 1024) (3060, 2048) (3060, 978)
MCF7


100%|██████████| 3254/3254 [00:03<00:00, 981.22it/s] 


(3254, 1024) (3254, 2048) (3254, 978)


In [None]:
# Unpack the stored arrays for each cell line.
for cell in cell_list:
    # Named ``cell_data`` rather than ``data`` so the ``torch.utils.data as data``
    # import alias is not overwritten.
    cell_data = data_dict[cell]
    # The original line ended in ``data[]``, which is a SyntaxError; index the
    # three keys written when data_dict was populated.
    ecfp, rdkfp, label = cell_data['ecfp'], cell_data['rdkfp'], cell_data['label']