# To find and reliably match all the cell barcodes

In [1]:

import pandas as pd
import pickle as pkl
import scanpy as sc
import anndata as ad


### Cell Embeddings

In [2]:
cell_embeddings = pd.read_csv("../ZafrensData/latent_embeddings_zel024.csv")

In [3]:
cell_embeddings.shape

(18736, 2001)

In [4]:
adata = sc.read_h5ad("../ZafrensData/zel024/rnaseq/4073_4074_pseudobulk.h5ad")



In [5]:
adata.obs['sample']

0            4073_1_-1_-1_-1_-1_False
1            4073_2_-1_-1_-1_-1_False
2            4073_3_-1_-1_-1_-1_False
3            4073_4_-1_-1_-1_-1_False
4            4073_5_-1_-1_-1_-1_False
                     ...             
18731    4074_-1_270_422_2_1630_False
18732    4074_-1_270_422_2_1940_False
18733    4074_-1_270_422_2_1941_False
18734    4074_-1_270_422_2_1943_False
18735    4074_-1_270_422_2_1945_False
Name: sample, Length: 18736, dtype: category
Categories (18736, object): ['4073_1_-1_-1_-1_-1_False', '4073_2_-1_-1_-1_-1_False', '4073_3_-1_-1_-1_-1_False', '4073_4_-1_-1_-1_-1_False', ..., '4074_-1_270_422_2_1940_False', '4074_-1_270_422_2_1941_False', '4074_-1_270_422_2_1943_False', '4074_-1_270_422_2_1945_False']

In [7]:
adata.shape

(18736, 29084)

### SMILES embedding

In [8]:

with open("../ZafrensData/embedded_smiles/embedded_smiles_0.pkl", "rb") as file:
    smiles_embedding_0 = pkl.load(file)

with open("../ZafrensData/embedded_smiles/embedded_smiles_1.pkl", "rb") as file:
    smiles_embedding_1 = pkl.load(file)

with open("../ZafrensData/embedded_smiles/embedded_smiles_2.pkl", "rb") as file:
    smiles_embedding_2 = pkl.load(file)
    

In [9]:

df_combined = pd.concat([smiles_embedding_0, smiles_embedding_1, smiles_embedding_2], axis=0).reset_index(drop=True)


In [10]:

smiles = pd.read_csv("../ZafrensData/smiles.csv")


In [12]:

smiles


Unnamed: 0,control_rx_id,bb1_id,bb2_id,bb3_id,bb4_id,SMILES
0,1,-1,-1,-1,-1,O=C1C=C(C(NCC2=CC3=C(C=C(CNCC4CCC4)N3)C=C2)=O)...
1,2,-1,-1,-1,-1,O=C(NCC1=C[N]2C=C(CNCC3CCCCC3)C=CC2=N1)C4=CC(=...
2,3,-1,-1,-1,-1,CC1CCN(Cc2nc(C(NC(CC3)CCN3c3c(cc[n]4C)c4nc(Cl)...
3,4,-1,-1,-1,-1,CC1CCN(Cc2nc(C(NC(CC3)CCN3c3c(cc[n]4S(c5ccc(C)...
4,5,-1,-1,-1,-1,CC1CCN(Cc2nc(C(NC(CC3)CCN3c3c(cc[n]4S(c5ccccc5...
...,...,...,...,...,...,...
14116,-1,215,195,2,1462,CC(=O)NC[C@H]1CCCN(c2ccnc(-c3cnn(CC(=O)N(C)C)c...
14117,-1,222,195,2,1462,CC(=O)NCC1CCN(c2ccnc(-c3cnn(CC(=O)N(C)C)c3)n2)CC1
14118,-1,238,195,2,1462,CC(=O)NC[C@@H]1CCN(c2ccnc(-c3cnn(CC(=O)N(C)C)c...
14119,-1,269,195,2,1462,CC(=O)NC[C@H]1CN(c2ccnc(-c3cnn(CC(=O)N(C)C)c3)...


In [42]:
# Columns to concatenate
columns_to_concat = ['control_rx_id', 'bb1_id', 'bb2_id', 'bb3_id', 'bb4_id']

# Concatenate columns with an underscore separator
smiles['sample'] = smiles[columns_to_concat].astype(str).agg('_'.join, axis=1)


In [17]:

smiles


Unnamed: 0,control_rx_id,bb1_id,bb2_id,bb3_id,bb4_id,SMILES,sample
0,1,-1,-1,-1,-1,O=C1C=C(C(NCC2=CC3=C(C=C(CNCC4CCC4)N3)C=C2)=O)...,1_-1_-1_-1_-1
1,2,-1,-1,-1,-1,O=C(NCC1=C[N]2C=C(CNCC3CCCCC3)C=CC2=N1)C4=CC(=...,2_-1_-1_-1_-1
2,3,-1,-1,-1,-1,CC1CCN(Cc2nc(C(NC(CC3)CCN3c3c(cc[n]4C)c4nc(Cl)...,3_-1_-1_-1_-1
3,4,-1,-1,-1,-1,CC1CCN(Cc2nc(C(NC(CC3)CCN3c3c(cc[n]4S(c5ccc(C)...,4_-1_-1_-1_-1
4,5,-1,-1,-1,-1,CC1CCN(Cc2nc(C(NC(CC3)CCN3c3c(cc[n]4S(c5ccccc5...,5_-1_-1_-1_-1
...,...,...,...,...,...,...,...
14116,-1,215,195,2,1462,CC(=O)NC[C@H]1CCCN(c2ccnc(-c3cnn(CC(=O)N(C)C)c...,-1_215_195_2_1462
14117,-1,222,195,2,1462,CC(=O)NCC1CCN(c2ccnc(-c3cnn(CC(=O)N(C)C)c3)n2)CC1,-1_222_195_2_1462
14118,-1,238,195,2,1462,CC(=O)NC[C@@H]1CCN(c2ccnc(-c3cnn(CC(=O)N(C)C)c...,-1_238_195_2_1462
14119,-1,269,195,2,1462,CC(=O)NC[C@H]1CN(c2ccnc(-c3cnn(CC(=O)N(C)C)c3)...,-1_269_195_2_1462


In [15]:

adata.obs['common'] = adata.obs['sample'].str.split('_', n=1).str[1].str.rsplit('_', n=1).str[0]


In [18]:

adata.obs['common']


0            1_-1_-1_-1_-1
1            2_-1_-1_-1_-1
2            3_-1_-1_-1_-1
3            4_-1_-1_-1_-1
4            5_-1_-1_-1_-1
               ...        
18731    -1_270_422_2_1630
18732    -1_270_422_2_1940
18733    -1_270_422_2_1941
18734    -1_270_422_2_1943
18735    -1_270_422_2_1945
Name: common, Length: 18736, dtype: object

In [19]:

intersection_set = set(smiles['sample']) & set(adata.obs['common'])


In [21]:

len(intersection_set)


12093

### Images

In [22]:

with open("../ZafrensData/final_dataset/images_34K.pkl", "rb") as file:
    images_34K = pkl.load(file)


In [23]:

with open("../ZafrensData/final_dataset/images_metadata_34K.pkl", "rb") as file:
    images_metadata_34K = pkl.load(file)
    

In [24]:
images_metadata_34K

Unnamed: 0,index,hdf5_dim_0_index,physical_well_id,control_rx_id,bb1_id,bb2_id,bb3_id,bb4_id,censored,sample
0,0,0,50,-1,213,417,1,1387,False,50_-1_213_417_1_1387_False
2,2,2,60,-1,207,123,2,1411,False,60_-1_207_123_2_1411_False
3,3,3,62,-1,207,352,2,1554,False,62_-1_207_352_2_1554_False
4,4,4,305,-1,215,420,1,1489,False,305_-1_215_420_1_1489_False
5,5,5,306,-1,202,352,1,440,False,306_-1_202_352_1_440_False
...,...,...,...,...,...,...,...,...,...,...
54405,11269,11269,49317,-1,705,869,1,1,False,49317_-1_705_869_1_1_False
54407,11271,11271,49334,-1,651,869,1,1,False,49334_-1_651_869_1_1_False
54409,11273,11273,49342,-1,706,1069,1,1,False,49342_-1_706_1069_1_1_False
54411,11275,11275,49343,-1,687,290,1,1,False,49343_-1_687_290_1_1_False


In [26]:
list(intersection_set)

['-1_213_418_2_1557',
 '-1_222_123_2_1491',
 '-1_222_418_2_1365',
 '-1_215_417_2_1551',
 '-1_198_420_2_1365',
 '-1_208_195_1_87',
 '-1_207_422_2_1941',
 '-1_208_420_2_1604',
 '-1_213_422_1_1382',
 '-1_213_422_1_1383',
 '-1_215_422_1_1592',
 '-1_222_195_2_1415',
 '-1_205_352_1_1557',
 '-1_202_195_2_1586',
 '-1_270_418_1_1381',
 '-1_213_422_2_1489',
 '-1_215_422_2_1418',
 '-1_269_123_2_1601',
 '-1_205_123_1_1533',
 '-1_270_420_1_414',
 '-1_207_420_2_82',
 '-1_270_420_2_1484',
 '-1_198_420_2_1404',
 '-1_202_420_2_1383',
 '-1_205_195_1_1499',
 '-1_207_123_1_1490',
 '-1_238_195_1_440',
 '-1_205_418_2_440',
 '-1_207_420_1_1604',
 '-1_269_195_1_1604',
 '-1_213_418_2_1601',
 '-1_269_195_1_1557',
 '-1_238_418_2_414',
 '-1_208_195_2_1537',
 '-1_208_195_1_1483',
 '-1_207_420_1_1467',
 '-1_238_352_2_1416',
 '-1_270_420_2_82',
 '-1_205_418_1_1559',
 '-1_222_422_1_1552',
 '-1_201_420_1_1526',
 '-1_213_418_1_1380',
 '-1_208_352_1_1552',
 '-1_215_123_1_1537',
 '-1_222_352_1_82',
 '-1_215_420_1_1413',


In [34]:

images_metadata_34K['common'] = images_metadata_34K['sample'].str.split('_', n=1).str[1].str.rsplit('_', n=1).str[0]


In [40]:
image_indices = list(images_metadata_34K[images_metadata_34K['common'].isin(list(intersection_set))].index)

In [48]:

images_metadata_34K


Unnamed: 0,index,hdf5_dim_0_index,physical_well_id,control_rx_id,bb1_id,bb2_id,bb3_id,bb4_id,censored,sample,common
0,0,0,50,-1,213,417,1,1387,False,50_-1_213_417_1_1387_False,-1_213_417_1_1387
2,2,2,60,-1,207,123,2,1411,False,60_-1_207_123_2_1411_False,-1_207_123_2_1411
3,3,3,62,-1,207,352,2,1554,False,62_-1_207_352_2_1554_False,-1_207_352_2_1554
4,4,4,305,-1,215,420,1,1489,False,305_-1_215_420_1_1489_False,-1_215_420_1_1489
5,5,5,306,-1,202,352,1,440,False,306_-1_202_352_1_440_False,-1_202_352_1_440
...,...,...,...,...,...,...,...,...,...,...,...
54405,11269,11269,49317,-1,705,869,1,1,False,49317_-1_705_869_1_1_False,-1_705_869_1_1
54407,11271,11271,49334,-1,651,869,1,1,False,49334_-1_651_869_1_1_False,-1_651_869_1_1
54409,11273,11273,49342,-1,706,1069,1,1,False,49342_-1_706_1069_1_1_False,-1_706_1069_1_1
54411,11275,11275,49343,-1,687,290,1,1,False,49343_-1_687_290_1_1_False,-1_687_290_1_1


In [47]:
len(image_indices)

18220

In [45]:

selected_images_zel024 = images_34K[image_indices, :, :]


In [46]:
selected_images_zel024

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0.