# Clean Ravia2020 dataset

In [1]:
import pyrfume
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

def get_intensity(cid):
    values = tableS1[(tableS1['CID']==cid) & ((tableS1['Experiment']=='Exp1') | (tableS1['Experiment']=='Exp2')) ]['Intensity1'].values
    print(len(values))
    return values[0]

In [3]:
pyrfume.load_manifest('ravia_2020')

{'source': {'doi': '10.1038/s41586-020-2891-7',
  'title': 'A measure of smell enables the creation of olfactory metamers',
  'authors': 'A Ravia, K Snitz, D Honigstein, M Finkel, R Zirler, O Perl, L Secundo, C Laudamiel, D Harel, N Sobel',
  'tags': 'human;odorCharacter;mixtures;academic'},
 'raw': {'tableS1.csv': 'Supplementary Table 1',
  'tableS2.csv': 'Supplementary Table 2',
  'tableS3.csv': 'Supplementary Table 2'},
 'processed': {'molecules.csv': 'Information about odorant molecules used',
  'stimuli.csv': 'Maps stimulus ID to molecules/mixtures and experimental conditions used',
  'behavior_1.csv': 'Intensity of mixtures used',
  'behavior_2.csv': 'Similarity rating test results',
  'behavior_3.csv': 'Discrimination test results'},
 'code': {'main.py': 'Processing workflow'}}

## Preparing Stimuli

In [4]:
# Load goodcents datasets from pyrfume
molecules = pyrfume.load_data('ravia_2020/molecules.csv', remote=True)
behavior_similarity = pyrfume.load_data('ravia_2020/behavior_2.csv', remote=True).reset_index()
behavior_discrimination = pyrfume.load_data('ravia_2020/behavior_3.csv', remote=True).reset_index()
behavior_intensity = pyrfume.load_data('ravia_2020/behavior_1.csv', remote=True).reset_index()
stimuli = pyrfume.load_data('ravia_2020/stimuli.csv', remote=True).reset_index()
# behavior_similarity=behavior_similarity.reset_index()
# behavior_similarity['Stimulus 1'].unique()
# behavior=behavior.rename(columns={"Stimulus": "CID"})
# molecules.index.name = 'Stimulus'
# molecules = molecules.reset_index()

In [5]:
tableS1 = pyrfume.load_data('ravia_2020/tableS1.csv', remote=True)
tableS2 = pyrfume.load_data('ravia_2020/tableS2.csv', remote=True)
tableS3 = pyrfume.load_data('ravia_2020/tableS3.csv', remote=True)

In [6]:
behavior_similarity['Stimulus 1'] = behavior_similarity['Stimulus 1'].map(stimuli.set_index('Stimulus')['CID'])
behavior_similarity['Stimulus 2'] = behavior_similarity['Stimulus 2'].map(stimuli.set_index('Stimulus')['CID'])

In [7]:
molecules=molecules.reset_index()


In [8]:
# molecules.to_csv('./curated_datasets/ravia_molecules.csv', index=False)

In [9]:
from rdkit import Chem
def canonical_smiles(smiles):
    return Chem.MolToSmiles(Chem.MolFromSmiles(smiles), isomericSmiles = True)

def remove_stereo(smiles):
    smiles = smiles.replace('@','')
    smiles = smiles.replace('/','')
    smiles = smiles.replace('\\','')
    return canonical_smiles(smiles)

molecules['nonStereoSMILES'] = molecules['IsomericSMILES'].apply(lambda x: remove_stereo(x))

In [10]:
molecules

Unnamed: 0,CID,MolecularWeight,IsomericSMILES,IUPACName,name,nonStereoSMILES
0,126,122.12,C1=CC(=CC=C1C=O)O,4-hydroxybenzaldehyde,4-hydroxybenzaldehyde,O=Cc1ccc(O)cc1
1,126,122.12,C1=CC(=CC=C1C=O)O,4-hydroxybenzaldehyde,4-hydroxybenzaldehyde,O=Cc1ccc(O)cc1
2,176,60.05,CC(=O)O,acetic acid,acetic acid,CC(=O)O
3,176,60.05,CC(=O)O,acetic acid,acetic acid,CC(=O)O
4,180,58.08,CC(=O)C,propan-2-one,acetone,CC(C)=O
...,...,...,...,...,...,...
345,5365049,156.22,CC/C=C\CCOC(=O)CC,[(Z)-hex-3-enyl] propanoate,33467-74-2,CCC=CCCOC(=O)CC
346,5366074,190.28,C/C=C/C(=O)C1=C(C=CCC1(C)C)C,"(E)-1-(2,6,6-trimethylcyclohexa-1,3-dien-1-yl)...",damascenone,CC=CC(=O)C1=C(C)C=CCC1(C)C
347,5366074,190.28,C/C=C/C(=O)C1=C(C=CCC1(C)C)C,"(E)-1-(2,6,6-trimethylcyclohexa-1,3-dien-1-yl)...",damascenone,CC=CC(=O)C1=C(C)C=CCC1(C)C
348,5367698,218.29,CC/C=C\CCOC(=O)CC1=CC=CC=C1,[(Z)-hex-3-enyl] 2-phenylacetate,42436-07-7,CCC=CCCOC(=O)Cc1ccccc1


In [11]:
# molecules.to_csv('./curated_datasets/ravia_molecules.csv', index=False)

In [12]:
cid_to_IsomericSMILESs_mapping = molecules.groupby('CID')['IsomericSMILES'].apply(lambda x: ''.join(x)).to_dict()
behavior_similarity=behavior_similarity.rename(columns={"Stimulus 1": "CID Stimulus 1", "Stimulus 2": "CID Stimulus 2"})
# Replace Stimulus 1 and Stimulus 2 with concatenated IsomericSMILES values
# for col in ['CID Stimulus 1', 'CID Stimulus 2']:
behavior_similarity['Stimulus 1-IsomericSMILES'] = behavior_similarity['CID Stimulus 1'].apply(lambda cids: ''.join(cid_to_IsomericSMILESs_mapping[int(cid)] for cid in cids.split(';')))
behavior_similarity['Stimulus 2-IsomericSMILES'] = behavior_similarity['CID Stimulus 2'].apply(lambda cids: ''.join(cid_to_IsomericSMILESs_mapping[int(cid)] for cid in cids.split(';')))
behavior_similarity['Stimulus 1-IsomericSMILES_sep'] = behavior_similarity['CID Stimulus 1'].apply(lambda cids: ','.join(cid_to_IsomericSMILESs_mapping[int(cid)] for cid in cids.split(';')))
behavior_similarity['Stimulus 2-IsomericSMILES_sep'] = behavior_similarity['CID Stimulus 2'].apply(lambda cids: ','.join(cid_to_IsomericSMILESs_mapping[int(cid)] for cid in cids.split(';')))

In [13]:
cid_to_nonStereoSMILES_mapping = molecules.groupby('CID')['nonStereoSMILES'].apply(lambda x: ''.join(x)).to_dict()
# behavior_similarity=behavior_similarity.rename(columns={"Stimulus 1": "CID Stimulus 1", "Stimulus 2": "CID Stimulus 2"})
# Replace Stimulus 1 and Stimulus 2 with concatenated IsomericSMILES values
# for col in ['CID Stimulus 1', 'CID Stimulus 2']:
behavior_similarity['Stimulus 1-nonStereoSMILES'] = behavior_similarity['CID Stimulus 1'].apply(lambda cids: ''.join(cid_to_nonStereoSMILES_mapping[int(cid)] for cid in cids.split(';')))
behavior_similarity['Stimulus 2-nonStereoSMILES'] = behavior_similarity['CID Stimulus 2'].apply(lambda cids: ''.join(cid_to_nonStereoSMILES_mapping[int(cid)] for cid in cids.split(';')))
behavior_similarity['Stimulus 1-nonStereoSMILES_sep'] = behavior_similarity['CID Stimulus 1'].apply(lambda cids: ','.join(cid_to_nonStereoSMILES_mapping[int(cid)] for cid in cids.split(';')))
behavior_similarity['Stimulus 2-nonStereoSMILES_sep'] = behavior_similarity['CID Stimulus 2'].apply(lambda cids: ','.join(cid_to_nonStereoSMILES_mapping[int(cid)] for cid in cids.split(';')))

In [14]:
#add a column which contains intensity for each cid in the mixture and this intensity is read from tableS1 which contains intesnsity per each CID separately
behavior_similarity['Intensity 1'] = behavior_similarity['CID Stimulus 1'].apply(lambda cids: ';'.join(str(get_intensity(str(cid))) for cid in cids.split(';')))
behavior_similarity['Intensity 2'] = behavior_similarity['CID Stimulus 2'].apply(lambda cids: ';'.join(str(get_intensity(str(cid))) for cid in cids.split(';')))

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


In [15]:
#get intensity from tablS1 with CID
#check this for all CID in behavior_similarity


In [16]:
behavior_similarity.to_csv('../data/embeddings/molformer/curated_ravia2020_behavior_similarity_intensity.csv', index=False)