# Dino features for the supervised baseline experiment

In [1]:
import os
import pandas as pd
import numpy as np

Function for extracting metadata and CellProfiler feature column indices:

In [2]:
def get_feature_cols(df: pd.DataFrame, 
                     features_type: str="dino") -> list() :
    """Splits columns of input dataframe into columns contining metadata and columns containing dino features
    :param df: input data frame
    :param features_type:
        "standard" for anything do not start with emb_
        "dino": for dino features names as start with emb_
    :return : feature_columns , info_columns
    """

    feature_cols = [ c for c in df.columns if ( c.startswith("emb_") | c.startswith("INCHIKEY" )) ]
 
    info_cols =  list( set(df.columns).difference(set(feature_cols)) )
    return feature_cols, info_cols

Function for impute Nan values

In [3]:
def impute_features( data, feature_cols, outlier_threshold=1000):
    
    for feat in feature_cols:
        impute_idx = data[feat].isin([np.inf, -np.inf, np.nan])
        if impute_idx.sum()>0:
            print('Nan or inf feature %s for %i samples'%(feat, impute_idx.sum()))
        if outlier_threshold:
            impute_idx = impute_idx | (data[feat].abs()>outlier_threshold)
        if impute_idx.sum()>0:
            data.loc[ impute_idx, feat] = data.loc[ impute_idx==False, feat].median()
            print('Imputing to median %s for %i samples'%(feat, impute_idx.sum()))
    return data

Load Dino features:

In [4]:
dino_df = pd.read_csv('/SSL_data/supervisedCNNs/Bray_bioactives/embeddings/DINO/well_features.csv')

Check how the data looks like

In [5]:
dino_df.head()

Unnamed: 0,INCHIKEY,Plate,Well,broad_sample,Object_Count,mmoles_per_liter,Plate_Map_Name,index,solvent,emb_0000,...,emb_0370,emb_0371,emb_0372,emb_0374,emb_0376,emb_0377,emb_0378,emb_0379,emb_0380,emb_0381
0,RIHXBZGHHXCGPP-MZKRTTBSSA-N,26795,A10,BRD-K53839527-001-01-4,684,5.124438,H-CBLD-004-4,9,DMSO,-0.032752,...,0.93246,0.887492,-0.33747,0.787882,0.873322,-0.200632,0.160022,0.079105,0.497942,1.460706
1,RIHXBZGHHXCGPP-MZKRTTBSSA-N,26679,A10,BRD-K53839527-001-01-4,499,5.124438,H-CBLD-004-4,20808,DMSO,-0.112787,...,0.902729,1.962275,0.467594,0.215321,0.154898,-0.988313,0.958924,-0.667498,0.476173,-0.022082
2,RIHXBZGHHXCGPP-MZKRTTBSSA-N,26680,A10,BRD-K53839527-001-01-4,570,5.124438,H-CBLD-004-4,76419,DMSO,1.300692,...,0.429968,0.801803,-0.458164,-0.684393,0.74548,-0.320938,-0.930174,-1.180052,0.321221,-0.329831
3,RIHXBZGHHXCGPP-MZKRTTBSSA-N,26794,A10,BRD-K53839527-001-01-4,580,5.124438,H-CBLD-004-4,121514,DMSO,-0.392056,...,-0.232391,-0.585265,-0.500658,-0.975645,1.927277,-1.467134,-0.342321,0.240753,1.190493,-0.667392
4,RIHXBZGHHXCGPP-SXWKCWPCSA-N,26795,A12,BRD-K71719006-001-01-6,736,5.225829,H-CBLD-004-4,11,DMSO,-0.569492,...,0.47329,-0.462824,0.762132,-1.412534,-0.02843,0.147524,-0.386727,-0.550231,-0.547129,0.666332


In [6]:
dino_df.shape

(46868, 280)

In [7]:
for col in dino_df.columns:
    print(col)

INCHIKEY
Plate
Well
broad_sample
Object_Count
mmoles_per_liter
Plate_Map_Name
index
solvent
emb_0000
emb_0001
emb_0003
emb_0005
emb_0007
emb_0008
emb_0010
emb_0011
emb_0012
emb_0013
emb_0014
emb_0015
emb_0016
emb_0017
emb_0018
emb_0019
emb_0022
emb_0023
emb_0025
emb_0026
emb_0027
emb_0030
emb_0032
emb_0033
emb_0035
emb_0036
emb_0037
emb_0042
emb_0045
emb_0046
emb_0047
emb_0048
emb_0050
emb_0053
emb_0054
emb_0055
emb_0056
emb_0061
emb_0062
emb_0064
emb_0065
emb_0066
emb_0067
emb_0068
emb_0069
emb_0070
emb_0073
emb_0074
emb_0076
emb_0077
emb_0079
emb_0080
emb_0081
emb_0082
emb_0084
emb_0085
emb_0086
emb_0087
emb_0089
emb_0091
emb_0092
emb_0093
emb_0094
emb_0096
emb_0097
emb_0098
emb_0100
emb_0101
emb_0102
emb_0103
emb_0104
emb_0105
emb_0106
emb_0107
emb_0108
emb_0109
emb_0110
emb_0111
emb_0112
emb_0113
emb_0115
emb_0117
emb_0119
emb_0121
emb_0122
emb_0123
emb_0124
emb_0125
emb_0126
emb_0128
emb_0130
emb_0131
emb_0132
emb_0134
emb_0135
emb_0136
emb_0137
emb_0140
emb_0141
emb_0142
emb_0144

Get metadata and feature column indices:

In [8]:
feature_cols, meta_cols = get_feature_cols(dino_df)

**Remove whole-image features which were found to decrease performance:**

In [9]:
print(f"Number of dino features: {len(feature_cols)}")

Number of dino features: 272


In [10]:
print(f"Number of meta columns: {len(meta_cols)}")

Number of meta columns: 8


In [11]:
print(meta_cols)

['Well', 'index', 'solvent', 'mmoles_per_liter', 'Object_Count', 'Plate_Map_Name', 'broad_sample', 'Plate']


In [12]:
print(feature_cols)

['INCHIKEY', 'emb_0000', 'emb_0001', 'emb_0003', 'emb_0005', 'emb_0007', 'emb_0008', 'emb_0010', 'emb_0011', 'emb_0012', 'emb_0013', 'emb_0014', 'emb_0015', 'emb_0016', 'emb_0017', 'emb_0018', 'emb_0019', 'emb_0022', 'emb_0023', 'emb_0025', 'emb_0026', 'emb_0027', 'emb_0030', 'emb_0032', 'emb_0033', 'emb_0035', 'emb_0036', 'emb_0037', 'emb_0042', 'emb_0045', 'emb_0046', 'emb_0047', 'emb_0048', 'emb_0050', 'emb_0053', 'emb_0054', 'emb_0055', 'emb_0056', 'emb_0061', 'emb_0062', 'emb_0064', 'emb_0065', 'emb_0066', 'emb_0067', 'emb_0068', 'emb_0069', 'emb_0070', 'emb_0073', 'emb_0074', 'emb_0076', 'emb_0077', 'emb_0079', 'emb_0080', 'emb_0081', 'emb_0082', 'emb_0084', 'emb_0085', 'emb_0086', 'emb_0087', 'emb_0089', 'emb_0091', 'emb_0092', 'emb_0093', 'emb_0094', 'emb_0096', 'emb_0097', 'emb_0098', 'emb_0100', 'emb_0101', 'emb_0102', 'emb_0103', 'emb_0104', 'emb_0105', 'emb_0106', 'emb_0107', 'emb_0108', 'emb_0109', 'emb_0110', 'emb_0111', 'emb_0112', 'emb_0113', 'emb_0115', 'emb_0117', 'em

## Datasplits
The datasplits from the Hofmarcher et al study are in the following directory:

In [13]:
datasplit_dir = '/SSL_data/supervisedCNNs/Bray_bioactives/datasplits_Hofmarcher'

There were 3 versions of train/val/test splits:

In [14]:
sorted(os.listdir(datasplit_dir))

['datasplit1-test.csv',
 'datasplit1-train.csv',
 'datasplit1-val.csv',
 'datasplit2-test.csv',
 'datasplit2-train.csv',
 'datasplit2-val.csv',
 'datasplit3-test.csv',
 'datasplit3-train.csv',
 'datasplit3-val.csv']

To obtain train splits, load e.g. `datasplit1-train.csv` and use `INCHIKEY` column to subset the feature and activity data:

In [15]:
train_df_3 = pd.read_csv(os.path.join(datasplit_dir, 'datasplit3-train.csv'))

In [16]:
train_df_3[['INCHIKEY']].head()

Unnamed: 0,INCHIKEY
0,CAJIGINSTLKQMM-UHFFFAOYSA-N
1,CAJIGINSTLKQMM-UHFFFAOYSA-N
2,CAJIGINSTLKQMM-UHFFFAOYSA-N
3,CAJIGINSTLKQMM-UHFFFAOYSA-N
4,CAJIGINSTLKQMM-UHFFFAOYSA-N


In [17]:
train_df_3.shape

(198737, 17)

In [18]:
test_df_3 = pd.read_csv(os.path.join(datasplit_dir, 'datasplit3-test.csv'))
test_df_3.shape

(56687, 17)

In [19]:
val_df_3 = pd.read_csv(os.path.join(datasplit_dir, 'datasplit3-val.csv'))
val_df_3.shape

(28610, 17)

check the labels 

In [20]:
bioactivity_labels_dir = '/SSL_data/supervisedCNNs/Bray_bioactives/ChEMBL/Label_Matrix_Hofmarcher'

In [21]:
compound_index_df = pd.read_csv(os.path.join(bioactivity_labels_dir, 'row-compound-index.csv'))

In [22]:
compound_index_df.head()

Unnamed: 0,INDEX,INCHIKEY
0,0,IENZQIKPVFGBNW-UHFFFAOYSA-N
1,1,GSDSWSVVBLHKDQ-UHFFFAOYSA-N
2,2,CGIGDMFJXJATDK-UHFFFAOYSA-N
3,3,DSXXEELGXBCYNQ-UHFFFAOYSA-N
4,4,MYSWGUAQZAJSOK-UHFFFAOYSA-N


In [23]:
print(f"Number of compound indexes: {len(compound_index_df)}")

Number of compound indexes: 10574


Now we will merge compound with featuers

In [24]:
dino_df = dino_df[feature_cols]
compound_index_featuers = pd.merge(compound_index_df, dino_df, on='INCHIKEY')
print(compound_index_featuers.head())

   INDEX                     INCHIKEY  emb_0000  emb_0001  emb_0003  emb_0005  \
0      0  IENZQIKPVFGBNW-UHFFFAOYSA-N  0.334818 -0.342595 -0.773422  2.529346   
1      0  IENZQIKPVFGBNW-UHFFFAOYSA-N -0.524029  2.206661  3.621344  2.206456   
2      0  IENZQIKPVFGBNW-UHFFFAOYSA-N  0.812762 -0.155273  0.270054  0.153479   
3      0  IENZQIKPVFGBNW-UHFFFAOYSA-N -0.370551  0.804067 -0.948769 -0.254127   
4      0  IENZQIKPVFGBNW-UHFFFAOYSA-N -1.240808  0.592322 -0.556329  1.012893   

   emb_0007  emb_0008  emb_0010  emb_0011  ...  emb_0370  emb_0371  emb_0372  \
0  0.299519 -0.010472  0.108848  0.662685  ...  0.388371 -0.716367  0.948132   
1  1.265477 -3.243098 -2.430532  0.203687  ...  1.184454 -0.097744  0.291923   
2 -1.498994 -0.514010 -0.331458 -1.430916  ...  0.314736  0.588206 -0.605676   
3 -0.949441  0.609493  0.651699  0.913270  ...  1.068782  0.500040  0.525435   
4  0.863586 -1.683191 -4.079464 -3.931553  ...  2.162899 -1.896001  3.889791   

   emb_0374  emb_0376  emb_0377 

In [25]:
print(len(compound_index_featuers["INDEX"].unique()))

10574


In [26]:
assay_index_df = pd.read_csv(os.path.join(bioactivity_labels_dir, 'column-assay-index.csv'))

In [27]:
assay_index_df.head()

Unnamed: 0,ASSAY_ID
0,600885
1,688422
2,688493
3,688810
4,688812


In [28]:
print(f"Number of assay indexes: {len(assay_index_df)}")

Number of assay indexes: 209


In [29]:
from scipy.io import mmread
the_matrix = mmread("/SSL_data/supervisedCNNs/Bray_bioactives/ChEMBL/Label_Matrix_Hofmarcher/label-matrix.mtx")
print(the_matrix.shape)

(10574, 209)


In [30]:
print(the_matrix)

  (6537, 0)	1
  (6542, 0)	1
  (6543, 0)	1
  (6552, 0)	-1
  (6689, 0)	-1
  (6708, 0)	1
  (6821, 0)	1
  (6829, 0)	1
  (8169, 0)	1
  (8170, 0)	1
  (8174, 0)	1
  (8175, 0)	1
  (8176, 0)	1
  (8177, 0)	1
  (8180, 0)	1
  (8181, 0)	-1
  (8183, 0)	1
  (8184, 0)	1
  (8185, 0)	1
  (8187, 0)	1
  (8189, 0)	1
  (8190, 0)	1
  (8191, 0)	1
  (8192, 0)	1
  (8193, 0)	1
  :	:
  (9651, 208)	-1
  (9662, 208)	-1
  (9684, 208)	-1
  (9712, 208)	-1
  (9720, 208)	-1
  (9765, 208)	-1
  (9780, 208)	-1
  (9782, 208)	-1
  (9787, 208)	-1
  (9790, 208)	-1
  (9805, 208)	-1
  (9835, 208)	-1
  (9854, 208)	-1
  (9861, 208)	-1
  (9884, 208)	-1
  (9919, 208)	-1
  (9924, 208)	-1
  (9941, 208)	-1
  (9971, 208)	-1
  (10013, 208)	-1
  (10070, 208)	-1
  (10079, 208)	-1
  (10162, 208)	-1
  (10196, 208)	-1
  (10489, 208)	-1


In [31]:
labels = the_matrix.toarray()
print(labels.shape)

(10574, 209)


In [32]:
print(compound_index_featuers.shape)

(46868, 273)


In [33]:
compound_index_featuers_train_3 = compound_index_featuers[compound_index_featuers['INCHIKEY'].isin(train_df_3['INCHIKEY'])]
print(compound_index_featuers_train_3.shape)

(32793, 273)


In [34]:
compound_index_featuers_test_3 = compound_index_featuers[compound_index_featuers['INCHIKEY'].isin(test_df_3['INCHIKEY'])]
print(compound_index_featuers_test_3.shape)

(9361, 273)


In [35]:
compound_index_featuers_val_3 = compound_index_featuers[compound_index_featuers['INCHIKEY'].isin(val_df_3['INCHIKEY'])]
print(compound_index_featuers_val_3.shape)

(4714, 273)


In [36]:
data_train_3 = compound_index_featuers_train_3.to_numpy()
print(data_train_3.shape)

(32793, 273)


In [37]:
data_test_3 = compound_index_featuers_test_3.to_numpy()
print(data_test_3.shape)

(9361, 273)


In [38]:
data_val_3 = compound_index_featuers_val_3.to_numpy()
print(data_val_3.shape)

(4714, 273)


In [39]:
print(labels.shape)

(10574, 209)


In [40]:
complete_labels_train_3 =[]
for row in data_train_3:
        complete_labels_train_3.append(labels[row[0]])
complete_labels_train_3 = np.array(complete_labels_train_3)
print(complete_labels_train_3.shape)

(32793, 209)


In [41]:
complete_labels_test_3 =[]
for row in data_test_3:
        complete_labels_test_3.append(labels[row[0]])
complete_labels_test_3 = np.array(complete_labels_test_3)
print(complete_labels_test_3.shape)

(9361, 209)


In [42]:
complete_labels_val_3 =[]
for row in data_val_3:
        complete_labels_val_3.append(labels[row[0]])
complete_labels_val_3 = np.array(complete_labels_val_3)
print(complete_labels_val_3.shape)

(4714, 209)


In [43]:
print(data_train_3)

[[0 'IENZQIKPVFGBNW-UHFFFAOYSA-N' 0.3348176061876053 ...
  1.0471962618309707 -0.1003683394503087 1.3074855022835752]
 [0 'IENZQIKPVFGBNW-UHFFFAOYSA-N' -0.5240294909048041 ...
  -0.9170041223014592 -0.6950384489143155 -1.618507320326431]
 [0 'IENZQIKPVFGBNW-UHFFFAOYSA-N' 0.8127623744852834 ...
  1.4707974751583373 0.2969140476712674 -0.8340753285553378]
 ...
 [10571 'YAJYINBQFXCAPI-PFPZSTESSA-N' 0.8057629597040371 ...
  1.5862600589357088 -0.8147514997414632 0.2079127237112941]
 [10571 'YAJYINBQFXCAPI-PFPZSTESSA-N' 0.8806734499512878 ...
  -0.1072417268111464 0.5434782873494887 -0.5320157980528339]
 [10571 'YAJYINBQFXCAPI-PFPZSTESSA-N' -0.3982987344568561 ...
  -0.4036100953459722 -0.03478793118381 0.4213423429644478]]


In [44]:
final_featuers_train_3 = data_train_3[:,2:]
print(final_featuers_train_3.shape)

(32793, 271)


In [45]:
final_featuers_test_3 = data_test_3[:,2:]
print(final_featuers_test_3.shape)

(9361, 271)


In [46]:
final_featuers_val_3 = data_val_3[:,2:]
print(final_featuers_val_3.shape)

(4714, 271)


In [47]:
np.save('/SSL_data/supervisedCNNs/Bray_bioactives/processed_data/dino/data_split_3/features_train3.npy', final_featuers_train_3)
np.save('/SSL_data/supervisedCNNs/Bray_bioactives/processed_data/dino/data_split_3/all_labels_train3.npy', complete_labels_train_3)


np.save('/SSL_data/supervisedCNNs/Bray_bioactives/processed_data/dino/data_split_3/features_test3.npy', final_featuers_test_3)
np.save('/SSL_data/supervisedCNNs/Bray_bioactives/processed_data/dino/data_split_3/all_labels_test3.npy', complete_labels_test_3)

np.save('/SSL_data/supervisedCNNs/Bray_bioactives/processed_data/dino/data_split_3/features_val3.npy', final_featuers_val_3)
np.save('/SSL_data/supervisedCNNs/Bray_bioactives/processed_data/dino/data_split_3/all_labels_val3.npy', complete_labels_val_3)


l = np.load('/SSL_data/supervisedCNNs/Bray_bioactives/processed_data/dino/data_split_3/all_labels_val3.npy')
f = np.load('/SSL_data/supervisedCNNs/Bray_bioactives/processed_data/dino/data_split_3/features_val3.npy', allow_pickle=True)

print(l == complete_labels_val_3)
print(f == final_featuers_val_3)

[[ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]
 ...
 [ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]]
[[ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]
 ...
 [ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]]
