# Molecule Feature & Distance Matrices

Import all the necessary packages:

In [1]:
import numpy as np
import pandas as pd
import pyrfume
from ast import literal_eval
from pyrfume.features import smiles_to_mordred, smiles_to_morgan
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial.distance import pdist, squareform

Load the identifiers.csv and molecules.csv as DataFrames:

In [2]:
identifiers = pd.read_csv('identifiers.csv')
molecules = pd.read_csv('molecules.csv')

Gather all SMILES of molecules as a list:

In [3]:
smiles = molecules['IsomericSMILES'].tolist()

Obtain all mordred features from SMILES:

In [4]:
mordred_features = smiles_to_mordred(smiles)

  0%|          | 0/20 [00:00<?, ?it/s]


Computing Mordred features...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 31.88it/s]


There are 20 molecules and 1826 features


In [5]:
mordred_features.head()

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
C1=CC=C(C=C1)C=O,5.656854,5.42766,0,0,10.424292,2.135779,4.271558,10.424292,1.303037,2.969338,...,8.298291,35.247635,106.041865,7.574419,64,7,34.0,36.0,2.611111,2.0
CCCC(=O)O,3.754314,4.057055,1,0,6.155367,1.902113,3.804226,6.155367,1.025895,2.5951,...,7.131699,29.439488,88.052429,6.289459,32,3,20.0,18.0,3.611111,1.583333
CCC(=O)O,3.047207,3.305183,1,0,5.226252,1.847759,3.695518,5.226252,1.04525,2.408576,...,6.834109,27.25413,74.036779,6.730616,18,2,16.0,14.0,3.361111,1.333333
CC(C)C(=O)O,3.932653,4.244375,1,0,6.0,2.0,4.0,6.0,1.0,2.610845,...,7.626083,30.69869,88.052429,6.289459,29,4,22.0,21.0,4.222222,1.444444
CCC(C)CC(=O)O,5.277917,5.655215,1,0,8.565187,2.042079,4.084158,8.565187,1.070648,2.899769,...,7.8842,34.080836,116.08373,5.804186,71,6,30.0,29.0,4.722222,2.0


Merge CIDs with DataFrame of features:

In [6]:
features = pd.merge(molecules, mordred_features, left_on='IsomericSMILES', right_index=True)
features = features.drop(['MolecularWeight', 'IsomericSMILES', 'IUPACName', 'name'], axis=1)
features.head()

Unnamed: 0,CID,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,240,5.656854,5.42766,0,0,10.424292,2.135779,4.271558,10.424292,1.303037,...,8.298291,35.247635,106.041865,7.574419,64,7,34.0,36.0,2.611111,2.0
1,264,3.754314,4.057055,1,0,6.155367,1.902113,3.804226,6.155367,1.025895,...,7.131699,29.439488,88.052429,6.289459,32,3,20.0,18.0,3.611111,1.583333
2,1032,3.047207,3.305183,1,0,5.226252,1.847759,3.695518,5.226252,1.04525,...,6.834109,27.25413,74.036779,6.730616,18,2,16.0,14.0,3.361111,1.333333
3,6590,3.932653,4.244375,1,0,6.0,2.0,4.0,6.0,1.0,...,7.626083,30.69869,88.052429,6.289459,29,4,22.0,21.0,4.222222,1.444444
4,7755,5.277917,5.655215,1,0,8.565187,2.042079,4.084158,8.565187,1.070648,...,7.8842,34.080836,116.08373,5.804186,71,6,30.0,29.0,4.722222,2.0


Make values in CIDs column in identifiers be list type and iterate through each one. If a stimulus contains multiple CIDs, then the feature vectors for the CIDs will averaged and a new row for this mixture stimulus is added to the features matrix:

In [7]:
identifiers['CIDs'] = identifiers['CIDs'].apply(literal_eval)

In [8]:
temp = pd.DataFrame()

for cids in identifiers['CIDs']:
    if len(cids) == 2:
        cid1 = pd.DataFrame(features.loc[features['CID'] == cids[0]]).drop('CID', axis=1)
        cid2 = pd.DataFrame(features.loc[features['CID'] == cids[1]]).drop('CID', axis=1)
        combined = pd.concat([cid1, cid2]).mean()
        combined['CID'] = cids
        temp = pd.concat([temp, combined], axis=1)

temp = temp.T.reset_index(drop=True)

Create a new DataFrame copy of features, with the CID column as a list of CIDs, then concatenate the temporary DataFrame of averaged features from mixtures:

In [9]:
all_features = features.copy()
all_features['CID'] = all_features[['CID']].apply(lambda x: [x['CID']], axis=1)

In [10]:
all_features = pd.concat([all_features, temp]).reset_index(drop=True)
all_features.head()

Unnamed: 0,CID,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,[240],5.656854,5.42766,0,0,10.424292,2.135779,4.271558,10.424292,1.303037,...,8.298291,35.247635,106.041865,7.574419,64,7,34.0,36.0,2.611111,2.0
1,[264],3.754314,4.057055,1,0,6.155367,1.902113,3.804226,6.155367,1.025895,...,7.131699,29.439488,88.052429,6.289459,32,3,20.0,18.0,3.611111,1.583333
2,[1032],3.047207,3.305183,1,0,5.226252,1.847759,3.695518,5.226252,1.04525,...,6.834109,27.25413,74.036779,6.730616,18,2,16.0,14.0,3.361111,1.333333
3,[6590],3.932653,4.244375,1,0,6.0,2.0,4.0,6.0,1.0,...,7.626083,30.69869,88.052429,6.289459,29,4,22.0,21.0,4.222222,1.444444
4,[7755],5.277917,5.655215,1,0,8.565187,2.042079,4.084158,8.565187,1.070648,...,7.8842,34.080836,116.08373,5.804186,71,6,30.0,29.0,4.722222,2.0


Reset the type of the CID columns for features and identifiers to strings so that they can be easily joined:

In [11]:
all_features['CID'] = all_features[['CID']].apply(lambda x: str(x['CID']), axis=1)
identifiers['CIDs'] = identifiers[['CIDs']].apply(lambda x: str(x['CIDs']), axis=1)

Bring the stimuli identifiers into DataFrame and set the stimulus as the index:

In [12]:
stimuli_features = pd.merge(all_features, identifiers, left_on='CID', right_on='CIDs')
stimuli_features = stimuli_features.drop(['CID', 'CIDs', 'conc'], axis=1)
stimuli_features = stimuli_features.set_index('stimulus')

Normalize all columns of the features matrix, dropping columns that contain NaNs:

In [13]:
scaler = MinMaxScaler()
stimuli_features[stimuli_features.columns] = scaler.fit_transform(stimuli_features[stimuli_features.columns])

  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


In [14]:
stimuli_features = stimuli_features.dropna(axis='columns')
stimuli_features

Unnamed: 0_level_0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
stimulus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
70,0.309023,0.313771,0.0,0.0,0.372372,0.407554,0.407554,0.372372,1.000000,0.459400,...,0.451076,0.243894,0.246024,0.922029,0.120735,0.333333,0.310345,0.318841,0.000000,0.328767
99,0.309023,0.313771,0.0,0.0,0.372372,0.407554,0.407554,0.372372,1.000000,0.459400,...,0.451076,0.243894,0.246024,0.922029,0.120735,0.333333,0.310345,0.318841,0.000000,0.328767
107,0.083732,0.111151,1.0,0.0,0.066559,0.076912,0.076912,0.066559,0.270594,0.152808,...,0.091680,0.066679,0.107739,0.425863,0.036745,0.066667,0.068966,0.057971,0.355556,0.123288
119,0.083732,0.111151,1.0,0.0,0.066559,0.076912,0.076912,0.066559,0.270594,0.152808,...,0.091680,0.066679,0.107739,0.425863,0.036745,0.066667,0.068966,0.057971,0.355556,0.123288
6,0.000000,0.000000,1.0,0.0,0.000000,0.000000,0.000000,0.000000,0.321536,0.000000,...,0.000000,0.000000,0.000000,0.596209,0.000000,0.000000,0.000000,0.000000,0.266667,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,0.237455,0.317897,1.0,0.0,0.170176,0.372196,0.372196,0.170176,0.231895,0.357397,...,0.410679,0.209068,0.269346,0.277062,0.097113,0.300000,0.241379,0.231884,0.830864,0.215753
136,0.449541,0.501067,0.5,0.0,0.326544,0.713103,0.713103,0.326544,0.388688,0.576518,...,0.737983,0.356059,0.400245,0.128705,0.173228,0.466667,0.517241,0.550725,0.772840,0.263699
141,0.449541,0.501067,0.5,0.0,0.326544,0.713103,0.713103,0.326544,0.388688,0.576518,...,0.737983,0.356059,0.400245,0.128705,0.173228,0.466667,0.517241,0.550725,0.772840,0.263699
136,0.449541,0.501067,0.5,0.0,0.326544,0.713103,0.713103,0.326544,0.388688,0.576518,...,0.737983,0.356059,0.400245,0.128705,0.173228,0.466667,0.517241,0.550725,0.772840,0.263699


Calculate the euclidean distance between all pairs of stimuli:

In [15]:
distance_matrix = pdist(stimuli_features, 'euclidean')

Create functions to easily lookup feature distance (euclidian distance) between two stimuli:

In [16]:
def get_pdist_index(stim1, stim2, features):
    i = features.index.get_loc(stim1)
    j = features.index.get_loc(stim2)
    m = features.shape[0]
    if i > j:
        index = m * j + i - ((j + 2) * (j + 1)) // 2
    else:    
        index = m * i + j - ((i + 2) * (i + 1)) // 2
    return index

def feature_distance_from_stimuli(stim1, stim2, features, distance_matrix):
    i = get_pdist_index(stim1, stim2, features)
    return distance_matrix[i]

Test out a pair of stimuli:

In [23]:
dist = feature_distance_from_stimuli(70, 6, stimuli_features, distance_matrix)
print(dist)

14.710942117834673


## Manual validation of pdist

In [18]:
def calculate_pdist_manual(stim1, stim2):
    dist_ex = np.linalg.norm(stimuli_features.loc[stim1].values - stimuli_features.loc[stim2].values)
    print(dist_ex)

In [24]:
calculate_pdist_manual(70, 6)

14.710942117834673
