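# Find and reliably match all the cell barcodes

This notebook joins the RNA-seq pseudobulk metadata, the SMILES table, and the image metadata on a shared `common` key (`control_rx_id`, `bb1_id`, `bb2_id`, `bb3_id`, `bb4_id` joined with underscores) and saves the matched metadata for training.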

In [1]:

import pandas as pd
import pickle as pkl
import scanpy as sc
import anndata as ad
import numpy as np


In [2]:

adata = sc.read_h5ad("../ZafrensData/zel024/rnaseq/4073_4074_pseudobulk.h5ad")

# Build the join key from 'sample' by dropping its first '_'-separated field and
# its last one (e.g. "4073_1_-1_-1_-1_-1_False" -> "1_-1_-1_-1_-1").
sample_without_prefix = adata.obs['sample'].str.split('_', n=1).str[1]
compound_key = sample_without_prefix.str.rsplit('_', n=1).str[0]

# reset_index() moves the previous obs index into a column. The rename targets
# 'level_0' -- presumably because obs already carries an 'index' column, in which
# case pandas falls back to that name (TODO confirm against the h5ad file).
adata.obs = (
    adata.obs.assign(common=compound_key)
    .reset_index()
    .rename(columns={'level_0': 'cell_index'})
)




### SMILES table and join key

In [3]:

smiles = pd.read_csv("../ZafrensData/smiles.csv")

# Join key: the five id columns rendered as strings and glued together with
# underscores (e.g. "-1_215_195_2_1462").
columns_to_concat = ['control_rx_id', 'bb1_id', 'bb2_id', 'bb3_id', 'bb4_id']
id_strings = smiles[columns_to_concat].astype(str)
smiles['common'] = id_strings.agg('_'.join, axis=1)

# Keep each row's original position as a nullable-integer 'smiles_index' column.
smiles = (
    smiles.reset_index()
    .rename(columns={'index': 'smiles_index'})
    .astype({'smiles_index': 'Int64'})
)


In [4]:

# Attach the SMILES record (where one exists) to every RNA sample via the shared
# 'common' key; samples without a match keep their row with missing SMILES columns.
RNA_metadata = adata.obs
smiles_joined = RNA_metadata.merge(smiles, on='common', how='left')

# A left merge adds rows whenever a key occurs more than once in `smiles`.
# Restoring the original index below is only valid when the result still has
# exactly one row per RNA sample, so fail with an explicit message instead of
# pandas' generic length-mismatch error.
if len(smiles_joined) != len(RNA_metadata):
    raise ValueError(
        f"Merging on 'common' changed the row count from {len(RNA_metadata)} to "
        f"{len(smiles_joined)}; check `smiles` for duplicated 'common' keys."
    )

merged_df = smiles_joined.set_index(RNA_metadata.index)


### Images

In [5]:
# Load the pickled image-metadata table.
# NOTE(review): the variable is called "34K" but the file name says "20K" and the
# table displayed below has ~20K rows -- consider renaming once all users are known.
# pickle can execute arbitrary code on load; only open files from a trusted source.
with open("../ZafrensData/final_dataset/images_metadata_zel024_125_20K.pkl", "rb") as metadata_fh:
    images_metadata_34K = pkl.load(metadata_fh)

In [6]:

# Derive the join key from 'sample' by removing the leading well id and the trailing
# censored flag (e.g. "50_-1_213_417_1_1387_False" -> "-1_213_417_1_1387").
sample_without_well = images_metadata_34K['sample'].str.split('_', n=1).str[1]
images_metadata_34K['common'] = sample_without_well.str.rsplit('_', n=1).str[0]


In [7]:
# Inspect the image metadata after adding the 'common' join key.
images_metadata_34K

Unnamed: 0,hdf5_dim_0_index,physical_well_id,control_rx_id,bb1_id,bb2_id,bb3_id,bb4_id,censored,sample,common
0,0,50,-1,213,417,1,1387,False,50_-1_213_417_1_1387_False,-1_213_417_1_1387
1,2,60,-1,207,123,2,1411,False,60_-1_207_123_2_1411_False,-1_207_123_2_1411
2,3,62,-1,207,352,2,1554,False,62_-1_207_352_2_1554_False,-1_207_352_2_1554
3,4,305,-1,215,420,1,1489,False,305_-1_215_420_1_1489_False,-1_215_420_1_1489
4,5,306,-1,202,352,1,440,False,306_-1_202_352_1_440_False,-1_202_352_1_440
...,...,...,...,...,...,...,...,...,...,...
20381,21894,49257,-1,222,418,2,1544,False,49257_-1_222_418_2_1544_False,-1_222_418_2_1544
20382,21898,49260,-1,201,195,1,1418,False,49260_-1_201_195_1_1418_False,-1_201_195_1_1418
20383,21900,49269,-1,205,420,2,1497,False,49269_-1_205_420_2_1497_False,-1_205_420_2_1497
20384,21902,49270,-1,269,422,1,1613,False,49270_-1_269_422_1_1613_False,-1_269_422_1_1613


In [8]:
# Keep only the image rows whose 'common' key also appears in the RNA/SMILES table,
# then expose each row's pre-filter index label as a nullable-integer 'image_index'.
# NOTE(review): 'image_index' is the row label in the loaded metadata frame and
# differs from 'hdf5_dim_0_index' (see the displayed tables) -- confirm which of the
# two downstream image indexing expects.
has_rna_match = images_metadata_34K['common'].isin(merged_df['common'])
images_metadata_34K = (
    images_metadata_34K[has_rna_match]
    .reset_index()
    .rename(columns={'index': 'image_index'})
    .astype({'image_index': 'Int64'})
)



In [9]:
# Inspect the filtered image metadata, now carrying the 'image_index' column.
images_metadata_34K

Unnamed: 0,image_index,hdf5_dim_0_index,physical_well_id,control_rx_id,bb1_id,bb2_id,bb3_id,bb4_id,censored,sample,common
0,0,0,50,-1,213,417,1,1387,False,50_-1_213_417_1_1387_False,-1_213_417_1_1387
1,1,2,60,-1,207,123,2,1411,False,60_-1_207_123_2_1411_False,-1_207_123_2_1411
2,2,3,62,-1,207,352,2,1554,False,62_-1_207_352_2_1554_False,-1_207_352_2_1554
3,3,4,305,-1,215,420,1,1489,False,305_-1_215_420_1_1489_False,-1_215_420_1_1489
4,4,5,306,-1,202,352,1,440,False,306_-1_202_352_1_440_False,-1_202_352_1_440
...,...,...,...,...,...,...,...,...,...,...,...
18283,20380,21892,49253,2,-1,-1,-1,-1,False,49253_2_-1_-1_-1_-1_False,2_-1_-1_-1_-1
18284,20382,21898,49260,-1,201,195,1,1418,False,49260_-1_201_195_1_1418_False,-1_201_195_1_1418
18285,20383,21900,49269,-1,205,420,2,1497,False,49269_-1_205_420_2_1497_False,-1_205_420_2_1497
18286,20384,21902,49270,-1,269,422,1,1613,False,49270_-1_269_422_1_1613_False,-1_269_422_1_1613


In [10]:
# Inspect the SMILES table with its 'smiles_index' column and 'common' join key.
smiles

Unnamed: 0,smiles_index,control_rx_id,bb1_id,bb2_id,bb3_id,bb4_id,SMILES,common
0,0,1,-1,-1,-1,-1,O=C1C=C(C(NCC2=CC3=C(C=C(CNCC4CCC4)N3)C=C2)=O)...,1_-1_-1_-1_-1
1,1,2,-1,-1,-1,-1,O=C(NCC1=C[N]2C=C(CNCC3CCCCC3)C=CC2=N1)C4=CC(=...,2_-1_-1_-1_-1
2,2,3,-1,-1,-1,-1,CC1CCN(Cc2nc(C(NC(CC3)CCN3c3c(cc[n]4C)c4nc(Cl)...,3_-1_-1_-1_-1
3,3,4,-1,-1,-1,-1,CC1CCN(Cc2nc(C(NC(CC3)CCN3c3c(cc[n]4S(c5ccc(C)...,4_-1_-1_-1_-1
4,4,5,-1,-1,-1,-1,CC1CCN(Cc2nc(C(NC(CC3)CCN3c3c(cc[n]4S(c5ccccc5...,5_-1_-1_-1_-1
...,...,...,...,...,...,...,...,...
14116,14116,-1,215,195,2,1462,CC(=O)NC[C@H]1CCCN(c2ccnc(-c3cnn(CC(=O)N(C)C)c...,-1_215_195_2_1462
14117,14117,-1,222,195,2,1462,CC(=O)NCC1CCN(c2ccnc(-c3cnn(CC(=O)N(C)C)c3)n2)CC1,-1_222_195_2_1462
14118,14118,-1,238,195,2,1462,CC(=O)NC[C@@H]1CCN(c2ccnc(-c3cnn(CC(=O)N(C)C)c...,-1_238_195_2_1462
14119,14119,-1,269,195,2,1462,CC(=O)NC[C@H]1CN(c2ccnc(-c3cnn(CC(=O)N(C)C)c3)...,-1_269_195_2_1462


## Final merge

In [11]:

# Left-join the image metadata onto the RNA+SMILES table by 'common'. One sample can
# match several images, so the result can have more rows than `merged_df`; that is why
# the original index of `merged_df` is not re-applied here.
final_metadata = pd.merge(merged_df, images_metadata_34K, how='left', on='common')


In [12]:

# Inspect the merged table; a sample appears once per image row that shares its key.
final_metadata


Unnamed: 0,cell_index,index,sample_x,device_id,control_rx_id_x,bb1_id_x,bb2_id_x,bb3_id_x,bb4_id_x,censored_x,...,image_index,hdf5_dim_0_index,physical_well_id,control_rx_id,bb1_id,bb2_id,bb3_id,bb4_id,censored_y,sample_y
0,0,0,4073_1_-1_-1_-1_-1_False,4073,1,-1,-1,-1,-1,False,...,27,31.0,1210.0,1.0,-1.0,-1.0,-1.0,-1.0,False,1210_1_-1_-1_-1_-1_False
1,0,0,4073_1_-1_-1_-1_-1_False,4073,1,-1,-1,-1,-1,False,...,36,44.0,2077.0,1.0,-1.0,-1.0,-1.0,-1.0,False,2077_1_-1_-1_-1_-1_False
2,0,0,4073_1_-1_-1_-1_-1_False,4073,1,-1,-1,-1,-1,False,...,91,116.0,2930.0,1.0,-1.0,-1.0,-1.0,-1.0,False,2930_1_-1_-1_-1_-1_False
3,0,0,4073_1_-1_-1_-1_-1_False,4073,1,-1,-1,-1,-1,False,...,110,141.0,3080.0,1.0,-1.0,-1.0,-1.0,-1.0,False,3080_1_-1_-1_-1_-1_False
4,0,0,4073_1_-1_-1_-1_-1_False,4073,1,-1,-1,-1,-1,False,...,124,159.0,3148.0,1.0,-1.0,-1.0,-1.0,-1.0,False,3148_1_-1_-1_-1_-1_False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35070,18732,35725,4074_-1_270_422_2_1940_False,4074,-1,270,422,2,1940,False,...,12299,6861.0,20093.0,-1.0,270.0,422.0,2.0,1940.0,False,20093_-1_270_422_2_1940_False
35071,18733,35729,4074_-1_270_422_2_1941_False,4074,-1,270,422,2,1941,False,...,19656,19934.0,47042.0,-1.0,270.0,422.0,2.0,1941.0,False,47042_-1_270_422_2_1941_False
35072,18734,35733,4074_-1_270_422_2_1943_False,4074,-1,270,422,2,1943,False,...,5960,8079.0,42350.0,-1.0,270.0,422.0,2.0,1943.0,False,42350_-1_270_422_2_1943_False
35073,18735,35736,4074_-1_270_422_2_1945_False,4074,-1,270,422,2,1945,False,...,4500,5587.0,29059.0,-1.0,270.0,422.0,2.0,1945.0,False,29059_-1_270_422_2_1945_False


In [13]:

# Persist the matched RNA / SMILES / image metadata table for the training pipeline.
with open("../data_for_training/matched_metadata_125_zel024.pkl", "wb") as out_fh:
    pkl.dump(final_metadata, out_fh)


In [26]:

# NOTE(review): neither `images_34K` nor `image_indices` is defined in any cell shown
# in this notebook, and this cell's execution count (26) is out of sequence, so it
# raises NameError on a fresh Restart & Run All. Presumably `images_34K` is the image
# array and `image_indices` selects entries along its first axis -- TODO: restore the
# cells that define both names.
selected_images_zel024 = images_34K[image_indices, :, :]


## Saving the embeddings matrix

In [None]:


# Load the three pickled embedding parts in order.
# pickle can execute arbitrary code on load; only open files from a trusted source.
embedding_paths = (
    "../ZafrensData/embedded_smiles/embedded_smiles_0.pkl",
    "../ZafrensData/embedded_smiles/embedded_smiles_1.pkl",
    "../ZafrensData/embedded_smiles/embedded_smiles_2.pkl",
)
embedding_parts = []
for embedding_path in embedding_paths:
    with open(embedding_path, "rb") as part_fh:
        embedding_parts.append(pkl.load(part_fh))
smiles_embedding_0, smiles_embedding_1, smiles_embedding_2 = embedding_parts

# Stack the parts row-wise with a fresh 0..n-1 index, convert to a NumPy array and save.
# Presumably row i corresponds to smiles_index i in the SMILES table -- TODO confirm.
df_combined = pd.concat(embedding_parts, axis=0).reset_index(drop=True)
smiles_embeddings = np.array(df_combined)
with open("../data_for_training/smiles_embedding_matrix.pkl", "wb") as matrix_fh:
    pkl.dump(smiles_embeddings, matrix_fh)
    