## Introduction

In this notebook, I merge all previously processed data sources into a unified dataframe for modeling. These sources include:
- ECBD compound metadata and MoA annotations
- Aggregated Cell Painting morphological features
- Molecular features (Morgan fingerprints or MolCLR embeddings)

The goal is to create a multimodal dataset where each row corresponds to a unique compound with its morphological, chemical, and label information.

In [2]:
import os
import glob
from datetime import date

import numpy as np
import pandas as pd

In [3]:
# Directory that holds the dated CSV files: the inputs below are globbed from
# it and the merged outputs are written to it. Output file names are built by
# plain string concatenation (save_path + date + suffix), so the trailing "/"
# is required.
save_path = "result/"

In [4]:
# Load the ECBD table that carries the fingerprint (fp_*) columns.
file_ecbd_fp_type = '/*[0-9]_' + 'ecbd_fp.csv'
files_ecbd_fp = glob.glob(save_path + file_ecbd_fp_type)

# Without this guard an empty match list makes max() fail with an opaque
# "empty sequence" ValueError; report the pattern that found nothing instead.
if not files_ecbd_fp:
    raise FileNotFoundError(
        f"No ECBD fingerprint file matches {save_path + file_ecbd_fp_type!r}"
    )

# Several dated files may match: take the one with the greatest
# os.path.getctime value.
max_file_ecbd_fp = max(files_ecbd_fp, key=os.path.getctime)

# load file
df_ecbd_fp = pd.read_csv(max_file_ecbd_fp)
df_ecbd_fp

Unnamed: 0,Metadata_EOS,Metadata_pdid,Metadata_name,Metadata_smiles,Metadata_inchi,Metadata_inchikey,probe,experimental probe,calculated probe,available,...,fp_246,fp_247,fp_248,fp_249,fp_250,fp_251,fp_252,fp_253,fp_254,fp_255
0,EOS101163,PD000002,I-BRD9,CCn1cc(-c2cccc(C(F)(F)F)c2)c2sc(C(=N)NC3CCS(=O...,InChI=1S/C22H22F3N3O3S2/c1-2-28-12-17(13-4-3-5...,WRUWGLUCNBMGPS-UHFFFAOYSA-N,1,1,1,1,...,1,1,1,0,0,1,0,0,0,0
1,EOS101593,PD000003,UNC0638,COc1cc2c(NC3CCN(C(C)C)CC3)nc(C3CCCCC3)nc2cc1OC...,InChI=1S/C30H47N5O2/c1-22(2)35-17-12-24(13-18-...,QOECJCJVIMVJGX-UHFFFAOYSA-N,1,1,1,1,...,0,0,0,1,0,1,0,0,0,1
2,EOS101154,PD000005,UNC1215,O=C(c1ccc(C(=O)N2CCC(N3CCCC3)CC2)c(Nc2ccccc2)c...,InChI=1S/C32H43N5O2/c38-31(36-20-12-27(13-21-3...,PQOOIERVZAXHBP-UHFFFAOYSA-N,1,1,1,1,...,0,0,0,0,0,1,1,0,1,0
3,EOS101601,PD000007,IOX1,O=C(O)c1ccc(O)c2ncccc12,InChI=1S/C10H7NO3/c12-8-4-3-7(10(13)14)6-2-1-5...,JGRPKOGHYBAVMW-UHFFFAOYSA-N,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,EOS101116,PD000008,IOX2,O=C(O)CNC(=O)c1c(O)c2ccccc2n(Cc2ccccc2)c1=O,InChI=1S/C19H16N2O5/c22-15(23)10-20-18(25)16-1...,CAOSCCRYLYQBES-UHFFFAOYSA-N,1,1,0,1,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2459,EOS102392,PD164271,PD164271,N#CCc1ccccn1,"InChI=1S/C7H6N2/c8-5-4-7-3-1-2-6-9-7/h1-3,6H,4H2",UKVQBONVSSLJBB-UHFFFAOYSA-N,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2460,EOS102394,PD164272,ALPHA-HEXACHLOROCYCLOHEXANE,Cl[C@H]1[C@H](Cl)[C@H](Cl)[C@@H](Cl)[C@H](Cl)[...,InChI=1S/C6H6Cl6/c7-1-2(8)4(10)6(12)5(11)3(1)9...,JLYXXMFPNIAWKQ-LKPKBOIGSA-N,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2461,EOS102396,PD164274,PD164274,O=C(O)/C=C1/CCCc2ccccc2[C@@H]1O,InChI=1S/C13H14O3/c14-12(15)8-10-6-3-5-9-4-1-2...,UADPGHINQMWEAG-CHOZFAJLSA-N,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2462,EOS102400,PD164275,PD164275,CC1CCC2C(=O)OC(=O)C2C1,InChI=1S/C9H12O3/c1-5-2-3-6-7(4-5)9(11)12-8(6)...,FKBMTBAXDISZGN-UHFFFAOYSA-N,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0


In [None]:
# Load the ECBD table that carries the MolCLR embedding columns.
file_ecbd_molcrl_type = '/*[0-9]_' + 'ecbd_molcrl.csv'
files_ecbd_molcrl = glob.glob(save_path + file_ecbd_molcrl_type)

# Without this guard an empty match list makes max() fail with an opaque
# "empty sequence" ValueError; report the pattern that found nothing instead.
if not files_ecbd_molcrl:
    raise FileNotFoundError(
        f"No ECBD MolCLR file matches {save_path + file_ecbd_molcrl_type!r}"
    )

# Several dated files may match: take the one with the greatest
# os.path.getctime value.
max_file_ecbd_molcrl = max(files_ecbd_molcrl, key=os.path.getctime)

# load file
df_ecbd_molcrl = pd.read_csv(max_file_ecbd_molcrl)
df_ecbd_molcrl

Unnamed: 0,Metadata_EOS,Metadata_pdid,Metadata_name,Metadata_smiles,Metadata_inchi,Metadata_inchikey,probe,experimental probe,calculated probe,available,...,fp_502,fp_503,fp_504,fp_505,fp_506,fp_507,fp_508,fp_509,fp_510,fp_511
0,EOS101163,PD000002,I-BRD9,CCn1cc(-c2cccc(C(F)(F)F)c2)c2sc(C(=N)NC3CCS(=O...,InChI=1S/C22H22F3N3O3S2/c1-2-28-12-17(13-4-3-5...,WRUWGLUCNBMGPS-UHFFFAOYSA-N,1,1,1,1,...,-0.009652,-0.009014,-0.004872,0.001793,0.006755,-0.015224,-0.003066,0.003297,0.004927,0.005235
1,EOS101593,PD000003,UNC0638,COc1cc2c(NC3CCN(C(C)C)CC3)nc(C3CCCCC3)nc2cc1OC...,InChI=1S/C30H47N5O2/c1-22(2)35-17-12-24(13-18-...,QOECJCJVIMVJGX-UHFFFAOYSA-N,1,1,1,1,...,-0.004147,-0.011597,0.000007,-0.000260,0.006644,-0.014582,-0.005006,0.003152,0.009103,0.003386
2,EOS101154,PD000005,UNC1215,O=C(c1ccc(C(=O)N2CCC(N3CCCC3)CC2)c(Nc2ccccc2)c...,InChI=1S/C32H43N5O2/c38-31(36-20-12-27(13-21-3...,PQOOIERVZAXHBP-UHFFFAOYSA-N,1,1,1,1,...,-0.005562,-0.010420,-0.001604,0.000005,0.006492,-0.015744,-0.004839,0.005303,0.006258,0.003480
3,EOS101601,PD000007,IOX1,O=C(O)c1ccc(O)c2ncccc12,InChI=1S/C10H7NO3/c12-8-4-3-7(10(13)14)6-2-1-5...,JGRPKOGHYBAVMW-UHFFFAOYSA-N,1,1,0,1,...,-0.005991,-0.010519,0.000367,-0.004945,0.009628,-0.012861,-0.008997,0.003438,0.011160,0.006716
4,EOS101116,PD000008,IOX2,O=C(O)CNC(=O)c1c(O)c2ccccc2n(Cc2ccccc2)c1=O,InChI=1S/C19H16N2O5/c22-15(23)10-20-18(25)16-1...,CAOSCCRYLYQBES-UHFFFAOYSA-N,1,1,0,1,...,-0.008961,-0.007427,-0.003365,-0.002786,0.007068,-0.014306,-0.006787,0.005757,0.005799,0.006099
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2459,EOS102392,PD164271,PD164271,N#CCc1ccccn1,"InChI=1S/C7H6N2/c8-5-4-7-3-1-2-6-9-7/h1-3,6H,4H2",UKVQBONVSSLJBB-UHFFFAOYSA-N,0,0,0,1,...,-0.004179,-0.003555,-0.012376,0.000566,0.004126,-0.015840,0.002919,0.007134,0.004434,0.008699
2460,EOS102394,PD164272,ALPHA-HEXACHLOROCYCLOHEXANE,Cl[C@H]1[C@H](Cl)[C@H](Cl)[C@@H](Cl)[C@H](Cl)[...,InChI=1S/C6H6Cl6/c7-1-2(8)4(10)6(12)5(11)3(1)9...,JLYXXMFPNIAWKQ-LKPKBOIGSA-N,0,0,0,1,...,0.003403,-0.016317,0.010066,0.004196,0.000373,-0.019137,-0.002833,-0.007359,0.013387,-0.003956
2461,EOS102396,PD164274,PD164274,O=C(O)/C=C1/CCCc2ccccc2[C@@H]1O,InChI=1S/C13H14O3/c14-12(15)8-10-6-3-5-9-4-1-2...,UADPGHINQMWEAG-CHOZFAJLSA-N,0,0,0,1,...,-0.004886,-0.007972,-0.001957,-0.001061,0.006149,-0.013120,-0.006037,0.004062,0.007926,0.005134
2462,EOS102400,PD164275,PD164275,CC1CCC2C(=O)OC(=O)C2C1,InChI=1S/C9H12O3/c1-5-2-3-6-7(4-5)9(11)12-8(6)...,FKBMTBAXDISZGN-UHFFFAOYSA-N,0,0,0,1,...,-0.006399,-0.005893,-0.001018,0.002218,0.004813,-0.016106,-0.002260,0.003595,0.005777,0.004184


We prefix each column with its group (`binary_`, `chemical_`, or `Metadata_`) so that the different kinds of columns are easy to tell apart.

In [7]:
# Yes/no flag columns of the ECBD table.
binary_flags = [
    'probe',                # is it a chemical probe?
    'experimental probe',   # probe validated experimentally?
    'calculated probe',     # probe predicted theoretically?
    'available',            # is it available?
    'approved drug',        # is it an approved drug?
    'P&D approved',         # Probe & Drug approved?
    'covalent binder',      # does the compound bind its target covalently?
    'biased GPCR ligand',   # is it a biased GPCR ligand?
    'inorganic',            # does it contain inorganic atoms?
    'structural alert',     # does it contain dangerous or unstable chemical fragments?
    'PAINS Family A',       # member of PAINS? (frequent false positives)
    'PAINS Family B',       #
    'PAINS Family C',       #
    'Aggregator',           # prone to aggregation?
    'Obsolete',             # obsolete? (no longer used in research)
    'Nuisance',             # regarded as a nuisance? (interfering or undesirable)
]

# Numeric physico-chemical descriptor columns.
chemical_features = [
    'mw',     # molecular weight
    'hba',    # H-bond acceptors
    'hbd',    # H-bond donors
    'rb',     # rotatable bonds
    'rc',     # ring count?
    'arc',    # aromatic ring count?
    'logp',   # lipophilicity
    'tpsa',   # topological polar surface area
    'fcsp3',  # fraction of sp3-hybridized carbon atoms
    'ncc',    # meaning unclear, maybe number of carbon chains
    'lrs',    # possibly related to size/length
    'qed',    # drug-likeness score (0..1)
]

# Identifier / descriptive columns.
metadata = [
    'EOS',          # unique ECBD compound ID
    'pdid',         # compound ID
    'name',         # name of the drug or compound
    'smiles',       # SMILES string
    'inchi',        # InChI string
    'inchikey',     # hashed InChI key
    'cas',          # CAS number, a reference chemical identifier (semicolon-separated string array)
    'synonyms',     # alternative names of the compound (semicolon-separated string array)
    'no. targets',  # number of targets (target_name); e.g. a compound with pdid = PD000002 linked to BRD4, BRD9, CECR2 has no. targets = 3
]


def _prefixed(columns, prefix):
    """Map every name in `columns` to the same name with `prefix` put in front."""
    return {name: prefix + name for name in columns}


binary_dict = _prefixed(binary_flags, 'binary_')
chemical_dict = _prefixed(chemical_features, 'chemical_')
metadata_dict = _prefixed(metadata, 'Metadata_')

# Fold the three mappings into one; rename() skips keys that are not columns
# of the frame, so names that already carry a prefix are left as they are.
ecbd_rename_map = {}
for mapping in (binary_dict, chemical_dict, metadata_dict):
    ecbd_rename_map.update(mapping)

df_ecbd_fp = df_ecbd_fp.rename(columns=ecbd_rename_map)
df_ecbd_fp

Unnamed: 0,Metadata_EOS,Metadata_pdid,Metadata_name,Metadata_smiles,Metadata_inchi,Metadata_inchikey,binary_probe,binary_experimental probe,binary_calculated probe,binary_available,...,fp_246,fp_247,fp_248,fp_249,fp_250,fp_251,fp_252,fp_253,fp_254,fp_255
0,EOS101163,PD000002,I-BRD9,CCn1cc(-c2cccc(C(F)(F)F)c2)c2sc(C(=N)NC3CCS(=O...,InChI=1S/C22H22F3N3O3S2/c1-2-28-12-17(13-4-3-5...,WRUWGLUCNBMGPS-UHFFFAOYSA-N,1,1,1,1,...,1,1,1,0,0,1,0,0,0,0
1,EOS101593,PD000003,UNC0638,COc1cc2c(NC3CCN(C(C)C)CC3)nc(C3CCCCC3)nc2cc1OC...,InChI=1S/C30H47N5O2/c1-22(2)35-17-12-24(13-18-...,QOECJCJVIMVJGX-UHFFFAOYSA-N,1,1,1,1,...,0,0,0,1,0,1,0,0,0,1
2,EOS101154,PD000005,UNC1215,O=C(c1ccc(C(=O)N2CCC(N3CCCC3)CC2)c(Nc2ccccc2)c...,InChI=1S/C32H43N5O2/c38-31(36-20-12-27(13-21-3...,PQOOIERVZAXHBP-UHFFFAOYSA-N,1,1,1,1,...,0,0,0,0,0,1,1,0,1,0
3,EOS101601,PD000007,IOX1,O=C(O)c1ccc(O)c2ncccc12,InChI=1S/C10H7NO3/c12-8-4-3-7(10(13)14)6-2-1-5...,JGRPKOGHYBAVMW-UHFFFAOYSA-N,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,EOS101116,PD000008,IOX2,O=C(O)CNC(=O)c1c(O)c2ccccc2n(Cc2ccccc2)c1=O,InChI=1S/C19H16N2O5/c22-15(23)10-20-18(25)16-1...,CAOSCCRYLYQBES-UHFFFAOYSA-N,1,1,0,1,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2459,EOS102392,PD164271,PD164271,N#CCc1ccccn1,"InChI=1S/C7H6N2/c8-5-4-7-3-1-2-6-9-7/h1-3,6H,4H2",UKVQBONVSSLJBB-UHFFFAOYSA-N,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2460,EOS102394,PD164272,ALPHA-HEXACHLOROCYCLOHEXANE,Cl[C@H]1[C@H](Cl)[C@H](Cl)[C@@H](Cl)[C@H](Cl)[...,InChI=1S/C6H6Cl6/c7-1-2(8)4(10)6(12)5(11)3(1)9...,JLYXXMFPNIAWKQ-LKPKBOIGSA-N,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2461,EOS102396,PD164274,PD164274,O=C(O)/C=C1/CCCc2ccccc2[C@@H]1O,InChI=1S/C13H14O3/c14-12(15)8-10-6-3-5-9-4-1-2...,UADPGHINQMWEAG-CHOZFAJLSA-N,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2462,EOS102400,PD164275,PD164275,CC1CCC2C(=O)OC(=O)C2C1,InChI=1S/C9H12O3/c1-5-2-3-6-7(4-5)9(11)12-8(6)...,FKBMTBAXDISZGN-UHFFFAOYSA-N,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0


Now we do the same for the MolCLR dataset:

In [8]:
# Reuse the binary / chemical / metadata prefix mappings for the MolCLR table.
molcrl_rename_map = {}
for mapping in (binary_dict, chemical_dict, metadata_dict):
    molcrl_rename_map.update(mapping)

df_ecbd_molcrl = df_ecbd_molcrl.rename(columns=molcrl_rename_map)
df_ecbd_molcrl

Unnamed: 0,Metadata_EOS,Metadata_pdid,Metadata_name,Metadata_smiles,Metadata_inchi,Metadata_inchikey,binary_probe,binary_experimental probe,binary_calculated probe,binary_available,...,fp_502,fp_503,fp_504,fp_505,fp_506,fp_507,fp_508,fp_509,fp_510,fp_511
0,EOS101163,PD000002,I-BRD9,CCn1cc(-c2cccc(C(F)(F)F)c2)c2sc(C(=N)NC3CCS(=O...,InChI=1S/C22H22F3N3O3S2/c1-2-28-12-17(13-4-3-5...,WRUWGLUCNBMGPS-UHFFFAOYSA-N,1,1,1,1,...,-0.009652,-0.009014,-0.004872,0.001793,0.006755,-0.015224,-0.003066,0.003297,0.004927,0.005235
1,EOS101593,PD000003,UNC0638,COc1cc2c(NC3CCN(C(C)C)CC3)nc(C3CCCCC3)nc2cc1OC...,InChI=1S/C30H47N5O2/c1-22(2)35-17-12-24(13-18-...,QOECJCJVIMVJGX-UHFFFAOYSA-N,1,1,1,1,...,-0.004147,-0.011597,0.000007,-0.000260,0.006644,-0.014582,-0.005006,0.003152,0.009103,0.003386
2,EOS101154,PD000005,UNC1215,O=C(c1ccc(C(=O)N2CCC(N3CCCC3)CC2)c(Nc2ccccc2)c...,InChI=1S/C32H43N5O2/c38-31(36-20-12-27(13-21-3...,PQOOIERVZAXHBP-UHFFFAOYSA-N,1,1,1,1,...,-0.005562,-0.010420,-0.001604,0.000005,0.006492,-0.015744,-0.004839,0.005303,0.006258,0.003480
3,EOS101601,PD000007,IOX1,O=C(O)c1ccc(O)c2ncccc12,InChI=1S/C10H7NO3/c12-8-4-3-7(10(13)14)6-2-1-5...,JGRPKOGHYBAVMW-UHFFFAOYSA-N,1,1,0,1,...,-0.005991,-0.010519,0.000367,-0.004945,0.009628,-0.012861,-0.008997,0.003438,0.011160,0.006716
4,EOS101116,PD000008,IOX2,O=C(O)CNC(=O)c1c(O)c2ccccc2n(Cc2ccccc2)c1=O,InChI=1S/C19H16N2O5/c22-15(23)10-20-18(25)16-1...,CAOSCCRYLYQBES-UHFFFAOYSA-N,1,1,0,1,...,-0.008961,-0.007427,-0.003365,-0.002786,0.007068,-0.014306,-0.006787,0.005757,0.005799,0.006099
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2459,EOS102392,PD164271,PD164271,N#CCc1ccccn1,"InChI=1S/C7H6N2/c8-5-4-7-3-1-2-6-9-7/h1-3,6H,4H2",UKVQBONVSSLJBB-UHFFFAOYSA-N,0,0,0,1,...,-0.004179,-0.003555,-0.012376,0.000566,0.004126,-0.015840,0.002919,0.007134,0.004434,0.008699
2460,EOS102394,PD164272,ALPHA-HEXACHLOROCYCLOHEXANE,Cl[C@H]1[C@H](Cl)[C@H](Cl)[C@@H](Cl)[C@H](Cl)[...,InChI=1S/C6H6Cl6/c7-1-2(8)4(10)6(12)5(11)3(1)9...,JLYXXMFPNIAWKQ-LKPKBOIGSA-N,0,0,0,1,...,0.003403,-0.016317,0.010066,0.004196,0.000373,-0.019137,-0.002833,-0.007359,0.013387,-0.003956
2461,EOS102396,PD164274,PD164274,O=C(O)/C=C1/CCCc2ccccc2[C@@H]1O,InChI=1S/C13H14O3/c14-12(15)8-10-6-3-5-9-4-1-2...,UADPGHINQMWEAG-CHOZFAJLSA-N,0,0,0,1,...,-0.004886,-0.007972,-0.001957,-0.001061,0.006149,-0.013120,-0.006037,0.004062,0.007926,0.005134
2462,EOS102400,PD164275,PD164275,CC1CCC2C(=O)OC(=O)C2C1,InChI=1S/C9H12O3/c1-5-2-3-6-7(4-5)9(11)12-8(6)...,FKBMTBAXDISZGN-UHFFFAOYSA-N,0,0,0,1,...,-0.006399,-0.005893,-0.001018,0.002218,0.004813,-0.016106,-0.002260,0.003595,0.005777,0.004184


Now we read the cell lines dataset:

In [None]:
# Load the aggregated cell-lines feature table.
file_cell_lines_common_type = '/*[0-9]_' + "cell_lines_common_agg.csv"
files_cell_lines_common = glob.glob(save_path + file_cell_lines_common_type)

# Without this guard an empty match list makes max() fail with an opaque
# "empty sequence" ValueError; report the pattern that found nothing instead.
if not files_cell_lines_common:
    raise FileNotFoundError(
        f"No cell-lines file matches {save_path + file_cell_lines_common_type!r}"
    )

# Several dated files may match: take the one with the greatest
# os.path.getctime value.
max_file_cell_lines_common = max(files_cell_lines_common, key=os.path.getctime)

# load file
df_cell_lines_common = pd.read_csv(max_file_cell_lines_common)
df_cell_lines_common

Unnamed: 0,Metadata_EOS,Nuc_AreaShape_Zernike_6_6,Nuc_Texture_Correlation_Mito_5_01_256,Cyto_Intensity_LowerQuartileIntensity_ER,Nuc_Neighbors_PercentTouching_1,Nuc_Neighbors_AngleBetweenNeighbors_1,Cyto_AreaShape_Zernike_9_7,Cells_AreaShape_Zernike_6_0,Cyto_Texture_InfoMeas1_DNA_3_03_256,Cyto_AreaShape_Zernike_4_4,...,Nuc_RadialDistribution_FracAtD_DNA_1of4,Cells_AreaShape_Zernike_9_5,Cells_Correlation_K_DNA_Mito,Nuc_AreaShape_Zernike_6_2,Cells_Texture_Correlation_DNA_10_03_256,Nuc_Texture_Correlation_Mito_3_03_256,Cyto_Granularity_12_ER,Cyto_Texture_Correlation_AGP_10_03_256,Cells_Neighbors_AngleBetweenNeighbors_Adjacent,Cyto_Texture_Correlation_DNA_5_02_256
0,EOS100001,-13.076729,-0.075131,-3.806660,-11.244612,-2.471498,-0.520458,-0.763770,5.002433,3.463625,...,5.114269,-0.202799,-1.305777,-2.421799,9.417494,-4.411698,-0.412077,9.985626,-1.417590,-6.019839
1,EOS100002,-6.121051,-0.096092,-3.332405,-4.885769,-1.823292,-1.991138,0.818538,2.771862,2.118787,...,1.603251,-1.252296,-1.453120,-0.405294,4.477472,-1.807204,0.532861,6.742989,-2.624253,-3.545882
2,EOS100003,-0.841283,3.043289,-1.163089,-4.470534,-3.757814,-1.945677,-0.630209,3.989770,-0.408144,...,-11.672445,-0.077826,1.765033,-0.454813,10.056370,0.695379,-0.194231,3.052824,-4.247954,-0.013115
3,EOS100005,-7.777577,0.781603,-2.461855,-8.290206,-2.337916,-2.168737,1.314309,3.665932,2.241780,...,2.963389,-0.685500,0.051649,-2.929235,1.061884,-0.341990,-0.020339,6.061573,-2.414724,-4.316167
4,EOS100009,-1.874141,0.362520,-0.928342,-4.953204,-1.865503,-2.037885,0.552125,2.449416,-1.847507,...,0.223875,-2.224329,0.323483,-0.899075,3.105818,-1.214778,-1.820063,3.994057,-2.520183,-1.534358
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1113,EOS102439,-7.151318,2.822496,-0.193622,-4.181278,-1.689024,0.134334,0.265747,4.198375,0.858913,...,2.537504,-0.059932,2.612518,-2.918447,6.519919,4.065786,-0.792470,4.075449,-1.127720,-3.828830
1114,EOS102449,1.827569,0.133349,3.225831,0.231775,0.632112,0.454872,0.010330,2.707114,1.763995,...,-1.356474,0.246812,2.262251,1.110342,-0.426098,1.040523,1.432792,-1.081176,0.199339,0.609960
1115,EOS102459,-2.426490,1.695767,-1.543415,-4.204215,-0.030562,1.608140,-0.143577,2.335076,1.309221,...,1.823651,0.103712,0.193915,-0.180487,4.767596,1.654780,0.180921,3.885591,0.098539,-1.789195
1116,Nocodazole,-5.582580,0.604795,-0.678350,-16.500159,-3.462935,-3.183394,-0.078664,4.281409,1.673380,...,4.722664,-0.645975,-0.791788,-3.153556,5.807658,-2.134323,-0.335592,6.881238,-2.678616,-6.452264


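Let's check the merge result. An outer merge with `indicator=True` shows how many rows matched in both tables and how many came from only one of them: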

In [38]:
# Diagnostic only: outer-join the two tables and let the `_merge` indicator
# column report, per row, which side(s) supplied the Metadata_EOS key.
merge_check = df_cell_lines_common.merge(
    df_ecbd_fp,
    how="outer",
    on="Metadata_EOS",
    indicator=True,
)
merge_check['_merge'].value_counts()

right_only    1348
both          1116
left_only        2
Name: _merge, dtype: int64

In [39]:
# Keep only the rows whose Metadata_EOS key occurs in both tables.
df_cell_lines_ecdb = pd.merge(
    df_cell_lines_common,
    df_ecbd_fp,
    how='inner',
    on="Metadata_EOS",
)
df_cell_lines_ecdb

Unnamed: 0,Metadata_EOS,Nuc_AreaShape_Zernike_6_6,Nuc_Texture_Correlation_Mito_5_01_256,Cyto_Intensity_LowerQuartileIntensity_ER,Nuc_Neighbors_PercentTouching_1,Nuc_Neighbors_AngleBetweenNeighbors_1,Cyto_AreaShape_Zernike_9_7,Cells_AreaShape_Zernike_6_0,Cyto_Texture_InfoMeas1_DNA_3_03_256,Cyto_AreaShape_Zernike_4_4,...,fp_246,fp_247,fp_248,fp_249,fp_250,fp_251,fp_252,fp_253,fp_254,fp_255
0,EOS100001,-13.076729,-0.075131,-3.806660,-11.244612,-2.471498,-0.520458,-0.763770,5.002433,3.463625,...,0,1,0,1,0,1,0,0,0,0
1,EOS100002,-6.121051,-0.096092,-3.332405,-4.885769,-1.823292,-1.991138,0.818538,2.771862,2.118787,...,0,0,0,0,1,1,0,1,0,0
2,EOS100003,-0.841283,3.043289,-1.163089,-4.470534,-3.757814,-1.945677,-0.630209,3.989770,-0.408144,...,0,0,0,0,0,1,0,0,0,1
3,EOS100005,-7.777577,0.781603,-2.461855,-8.290206,-2.337916,-2.168737,1.314309,3.665932,2.241780,...,0,0,0,0,0,1,0,0,0,1
4,EOS100009,-1.874141,0.362520,-0.928342,-4.953204,-1.865503,-2.037885,0.552125,2.449416,-1.847507,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1111,EOS102437,-1.677523,-3.705068,6.522804,-3.407188,-0.347425,-2.928558,0.803725,-0.807068,1.334239,...,1,0,0,1,0,0,0,0,0,0
1112,EOS102438,0.434756,0.630809,-2.412240,-0.779999,-0.405782,-0.345892,0.094349,-0.575895,0.576504,...,0,1,0,0,0,0,0,0,0,1
1113,EOS102439,-7.151318,2.822496,-0.193622,-4.181278,-1.689024,0.134334,0.265747,4.198375,0.858913,...,0,0,0,0,1,0,0,1,0,0
1114,EOS102449,1.827569,0.133349,3.225831,0.231775,0.632112,0.454872,0.010330,2.707114,1.763995,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Every column of the cell-lines table except the first one is a feature
# column; give each of them a "morphology_" prefix.
morph_cols = list(df_cell_lines_common.columns[1:])
morph_dict = {}
for col in morph_cols:
    morph_dict[col] = f'morphology_{col}'

df_cell_lines_ecdb = df_cell_lines_ecdb.rename(columns=morph_dict)


def _columns_starting_with(frame, prefix):
    """Return the column names of `frame` that begin with `prefix`, in frame order."""
    return [col for col in frame.columns if col.startswith(prefix)]


# Bucket the columns by name prefix.
metadata_cols = _columns_starting_with(df_cell_lines_ecdb, 'Metadata_')
binary_cols = _columns_starting_with(df_cell_lines_ecdb, 'binary_')
chemical_cols = _columns_starting_with(df_cell_lines_ecdb, 'chemical_')
moa_cols = _columns_starting_with(df_cell_lines_ecdb, 'moa_')
drug_status_cols = _columns_starting_with(df_cell_lines_ecdb, 'drug_status_')
fingerprints_cols = _columns_starting_with(df_cell_lines_ecdb, 'fp_')

# Only these four buckets are moved to the front; chemical_, fp_ and
# morphology_ columns stay in `other_cols`, in their current relative order.
used_cols = set(metadata_cols) | set(binary_cols) | set(moa_cols) | set(drug_status_cols)
other_cols = [col for col in df_cell_lines_ecdb.columns if col not in used_cols]

# new order of columns
new_order = [*metadata_cols, *binary_cols, *moa_cols, *drug_status_cols, *other_cols]

df_cell_lines_ecdb = df_cell_lines_ecdb[new_order]

# Date-stamped output file next to the inputs.
filename = f"{save_path}{date.today()}_merged.csv"
df_cell_lines_ecdb.to_csv(filename, index=False)

df_cell_lines_ecdb

Unnamed: 0,Metadata_EOS,Metadata_pdid,Metadata_name,Metadata_smiles,Metadata_inchi,Metadata_inchikey,Metadata_no. targets,Metadata_cas,Metadata_synonyms,binary_probe,...,fp_246,fp_247,fp_248,fp_249,fp_250,fp_251,fp_252,fp_253,fp_254,fp_255
0,EOS100001,PD003310,vincristine,CC[C@]1(O)C[C@@H]2CN(CCc3c([nH]c4ccccc34)[C@@]...,InChI=1S/C46H56N4O10/c1-7-42(55)22-28-23-45(40...,OGWKCGZFUXNPDA-CFWMRBGOSA-N,4,132142-73-5;57-22-7,Leurocristine;NSC-67574;22-Oxovincaleukoblasti...,0,...,0,1,0,1,0,1,0,0,0,0
1,EOS100002,PD002995,finasteride,CC(C)(C)NC(=O)[C@H]1CC[C@H]2[C@@H]3CC[C@H]4NC(...,"InChI=1S/C23H36N2O2/c1-21(2,3)25-20(27)17-8-7-...",DBEPLOCGEIEOCV-WSBQPABSSA-N,18,98319-26-7;140375-21-9,CHEMBL710;MK-906;finasteride;Proscar;FINASTERI...,1,...,0,0,0,0,1,1,0,1,0,0
2,EOS100003,PD012798,Dubermatinib,CN1CCN(Cc2ccc(Nc3ncc(Cl)c(Nc4ccccc4S(=O)(=O)N(...,"InChI=1S/C24H30ClN7O2S/c1-30(2)35(33,34)22-7-5...",YUAALFPUEOYPNX-UHFFFAOYSA-N,3,1341200-45-0,TP0903;TP-0903;Dubermatinib(TP-0903);Dubermati...,0,...,0,0,0,0,0,1,0,0,0,1
3,EOS100005,PD002999,ezetimibe,O=C1[C@H](CC[C@H](O)c2ccc(F)cc2)[C@@H](c2ccc(O...,InChI=1S/C24H21F2NO3/c25-17-5-1-15(2-6-17)22(2...,OLNTVTPDXPETLC-XPWALMASSA-N,18,163222-33-1,ezetimibe;SCH 58235;Vytorin-ezetimibe;Ezetimib...,0,...,0,0,0,0,0,1,0,0,0,1
4,EOS100009,PD051680,Netazepide,CNc1cccc(NC(=O)N[C@@H]2N=C(c3ccccn3)c3ccccc3N(...,"InChI=1S/C28H30N6O3/c1-28(2,3)23(35)17-34-22-1...",YDZYKNJZCVIKPP-VWLOTQADSA-N,2,155488-25-8,YM-220;Netazepide;YF 476;YF-476;Sograzepide,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1111,EOS102437,PD095835,DIPHYLLIN,COc1cc2c(O)c3c(c(-c4ccc5c(c4)OCO5)c2cc1OC)C(=O...,InChI=1S/C21H16O7/c1-24-15-6-11-12(7-16(15)25-...,VMEJANRODATDOF-UHFFFAOYSA-N,11,22055-22-7,DIPHYLLIN,0,...,1,0,0,1,0,0,0,0,0,0
1112,EOS102438,PD085956,PD085956,Cc1[nH]c2ccccc2c1CN1CCN(c2ccccn2)CC1,InChI=1S/C19H22N4/c1-15-17(16-6-2-3-7-18(16)21...,ZAXKSLQZANTESC-UHFFFAOYSA-N,6,,,0,...,0,1,0,0,0,0,0,0,0,1
1113,EOS102439,PD086412,ESI-05,Cc1ccc(S(=O)(=O)c2c(C)cc(C)cc2C)cc1,"InChI=1S/C16H18O2S/c1-11-5-7-15(8-6-11)19(17,1...",CGPHOZWFSFNOEQ-UHFFFAOYSA-N,1,5184-64-5,NSC 116966;ESI-05;ESI 05,0,...,0,0,0,0,1,0,0,1,0,0
1114,EOS102449,PD009614,APRACLONIDINE,Nc1cc(Cl)c(NC2=NCCN2)c(Cl)c1,InChI=1S/C9H10Cl2N4/c10-6-3-5(12)4-7(11)8(6)15...,IEJXVRYNEISIKR-UHFFFAOYSA-N,12,66711-21-5,APRACLONIDINE;Pharmaprojects No. 1758,0,...,0,0,0,0,0,0,0,0,0,0


Now we have the final merged dataset with 1,116 compounds!

Now we can do the same thing for the MolCLR dataset:

In [None]:
# Inner-join the cell-lines features with the MolCLR table on the EOS key.
df_cell_lines_ecdb_molcrl = pd.merge(
    df_cell_lines_common,
    df_ecbd_molcrl,
    how='inner',
    on="Metadata_EOS",
)

# Every column of the cell-lines table except the first one gets a
# "morphology_" prefix.
morph_cols = list(df_cell_lines_common.columns[1:])
morph_dict = dict((col, f'morphology_{col}') for col in morph_cols)

df_cell_lines_ecdb_molcrl = df_cell_lines_ecdb_molcrl.rename(columns=morph_dict)

# Bucket the merged columns by name prefix (frame order is kept inside each bucket).
merged_molcrl_columns = list(df_cell_lines_ecdb_molcrl.columns)
prefix_groups = {
    prefix: [col for col in merged_molcrl_columns if col.startswith(prefix)]
    for prefix in ('Metadata_', 'binary_', 'chemical_', 'moa_', 'drug_status_', 'fp_')
}
metadata_cols = prefix_groups['Metadata_']
binary_cols = prefix_groups['binary_']
chemical_cols = prefix_groups['chemical_']
moa_cols = prefix_groups['moa_']
drug_status_cols = prefix_groups['drug_status_']
fingerprints_cols = prefix_groups['fp_']

# Only these four buckets are moved to the front; everything else (chemical_,
# fp_ and morphology_ columns included) keeps its current relative order.
used_cols = set(metadata_cols) | set(binary_cols) | set(moa_cols) | set(drug_status_cols)
other_cols = [col for col in merged_molcrl_columns if col not in used_cols]

# new order of columns
new_order = [*metadata_cols, *binary_cols, *moa_cols, *drug_status_cols, *other_cols]

df_cell_lines_ecdb_molcrl = df_cell_lines_ecdb_molcrl[new_order]

# Date-stamped output file next to the inputs.
filename = f"{save_path}{date.today()}_merged_molcrl.csv"
df_cell_lines_ecdb_molcrl.to_csv(filename, index=False)

df_cell_lines_ecdb_molcrl

We also want to try different approaches to aggregating the cell lines. Let's build the same merged files for each aggregation variant, for both the fingerprint and the MolCLR datasets, so we can experiment with them later:

In [11]:
def load_merge_save(cell_lines_type: str, save_file_suffix: str, df_ecdb, save_dir=None):
    """Merge the latest cell-lines CSV of one kind with an ECBD table and save it.

    The function looks in the data directory for files named
    ``*[0-9]_<cell_lines_type>``, reads the one with the greatest
    ``os.path.getctime`` value, prefixes its feature columns with
    ``morphology_``, inner-joins it with ``df_ecdb`` on ``Metadata_EOS``,
    moves the ``Metadata_`` / ``binary_`` / ``moa_`` / ``drug_status_``
    columns to the front and writes the result to
    ``<today's date><save_file_suffix>`` in the same directory.

    Args:
        cell_lines_type: Tail of the cell-lines file name, e.g.
            ``"cell_lines_exp_mean.csv"``.
        save_file_suffix: Tail of the output file name; today's date is put
            in front of it.
        df_ecdb: DataFrame with a ``Metadata_EOS`` column to join against.
        save_dir: Directory to read from and write to. When omitted, the
            notebook-level ``save_path`` is used.

    Returns:
        None. The merged table is only written to disk.

    Raises:
        FileNotFoundError: If no file in the directory matches the pattern.
    """
    if save_dir is None:
        save_dir = save_path

    # os.path.join works whether or not the directory ends with a separator,
    # for both the input pattern and the output file name.
    pattern = os.path.join(save_dir, '*[0-9]_' + cell_lines_type)
    candidates = glob.glob(pattern)
    if not candidates:
        # max() on an empty list would only say "empty sequence".
        raise FileNotFoundError(f"No cell-lines file matches {pattern!r}")

    # gets latest file
    latest_file = max(candidates, key=os.path.getctime)
    df_morph = pd.read_csv(latest_file)

    # Prefix the feature columns BEFORE merging: a feature name that also
    # exists in df_ecdb would otherwise be suffixed (_x/_y) by the merge and
    # then be missed by the rename. The join key is excluded by name rather
    # than by position.
    morph_dict = {
        col: f'morphology_{col}'
        for col in df_morph.columns
        if col != 'Metadata_EOS'
    }
    df_merged = df_morph.rename(columns=morph_dict).merge(
        df_ecdb, on="Metadata_EOS", how='inner'
    )

    # Identifier/label groups go first (each in frame order); every other
    # column keeps its current relative order behind them.
    leading_prefixes = ('Metadata_', 'binary_', 'moa_', 'drug_status_')
    leading_cols = [
        col
        for prefix in leading_prefixes
        for col in df_merged.columns
        if col.startswith(prefix)
    ]
    leading_set = set(leading_cols)
    other_cols = [col for col in df_merged.columns if col not in leading_set]
    df_merged = df_merged[leading_cols + other_cols]

    filename = os.path.join(save_dir, str(date.today()) + save_file_suffix)
    df_merged.to_csv(filename, index=False)
  
# Write one merged file per (molecular-feature table, aggregation variant)
# pair: first all variants for the fingerprint table, then all for MolCLR.
aggregation_variants = [
    "mean",
    "geometric_mean",
    "closest_geometric_mean",
    "agm",
    "closest_agm",
]
feature_tables = [
    ("_merged", df_ecbd_fp),
    ("_merged_molcrl", df_ecbd_molcrl),
]

for output_prefix, df_features in feature_tables:
    for variant in aggregation_variants:
        load_merge_save(
            f"cell_lines_exp_{variant}.csv",
            f"{output_prefix}_exp_{variant}.csv",
            df_features,
        )

## Summary
- Successfully merged morphological profiles, molecular features, and MoA labels by compound ID.
- Performed alignment checks to ensure consistent sample indexing.
- Output: a complete multimodal dataframe ready for training and evaluation in MoA classification tasks.