# Preprocess Olfaction data
Jake Albrecht
Mar 2024

In [17]:
import pubchempy as pcp
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
import pandas as pd
import time

In [2]:
import synapseclient
import synapseutils
# Log in to Synapse; the resulting client `syn` is passed to the download call below.
# No credentials are given here, so this presumably relies on locally
# configured/cached Synapse credentials — TODO confirm for fresh environments.
syn = synapseclient.login()

Welcome, Jake Albrecht!



Download files and read in the train and test sets

In [3]:
# Sync the Synapse entity 'syn53470621' into the current working directory.
# The workbooks and CID.csv read in the following cells are presumably part of
# this download — verify against the Synapse project contents.
files = synapseutils.syncFromSynapse(syn,'syn53470621',path='.')

In [4]:
# Read both sheets of the test workbook in one call; passing a list of sheet
# names makes read_excel return a dict keyed by sheet name.
test_sheets = pd.read_excel(
    'Test set.xlsx',
    sheet_name=['Mixture definition', 'Test set'],
)
test_mix = test_sheets['Mixture definition']
test_target = test_sheets['Test set']


In [5]:
# Read both sheets of the training workbook in one call; passing a list of
# sheet names makes read_excel return a dict keyed by sheet name.
train_sheets = pd.read_excel(
    'Training Data.xls',
    sheet_name=['Mixture definition', 'Training Data'],
)
train_mix = train_sheets['Mixture definition']
train_target = train_sheets['Training Data']

In [6]:
# CID.csv is read without a header row, so its values end up in column 0;
# later cells access them as unique_cids[0].
unique_cids = pd.read_csv('CID.csv',header=None)

In [7]:
# Replace mistyped CIDs with the intended identifiers.
cid_typo_fixes = {
    81035281168: 81035,
    81295281168: 81295,
    11002307: 2307,
}
unique_cids = unique_cids.replace(cid_typo_fixes)

In [15]:
# Push zeros over in training data
def push_non_zero_left(row):
    """Move the non-zero entries of a row to the front, keeping their order.

    Parameters
    ----------
    row : pd.Series
        One row of values; entries equal to 0 are treated as empty slots.

    Returns
    -------
    pd.Series
        A Series with the same index and length as ``row`` in which the
        non-zero values come first and the remaining positions are filled
        with 0.
    """
    keep_mask = row != 0
    shifted = row[keep_mask].tolist()
    pad_count = len(row) - len(shifted)
    shifted.extend([0] * pad_count)
    return pd.Series(shifted, index=row.index)

# Left-pack every row, then replace mistyped CIDs with the intended identifiers.
train_mix = (
    train_mix
    .apply(push_non_zero_left, axis=1)
    .replace({81035281168: 81035, 81295281168: 81295, 11002307: 2307})
)

In [9]:
# Write the cleaned mixture definitions and the untouched targets to a new
# workbook, one sheet per frame, without the DataFrame index.
training_sheets = {
    'Mixture definition': train_mix,
    'Training Data': train_target,
}
with pd.ExcelWriter('Training_Data.xlsx') as writer:
    for sheet_name, frame in training_sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

Look at the unique CIDs in the training and test sets and confirm that every one of them appears in `CID.csv`; the only value expected to be missing from that list is the sentinel value `0`.

In [16]:
# Collect every value found in the CID* columns of the test and training
# mixture tables, then show the ones that are absent from CID.csv.
cid_columns = [
    frame.filter(regex='^CID').melt()['value'].dropna().astype('int64')
    for frame in (test_mix, train_mix)
]
traintest_uniques = set(cid_columns[0]) | set(cid_columns[1])
traintest_uniques - set(unique_cids[0])

{0}

## Use `rdkit` and `mordred` to generate fingerprints and descriptors, respectively

Moriwaki H, Tian Y-S, Kawashita N, Takagi T (2018) Mordred: a molecular descriptor calculator. Journal of Cheminformatics 10:4. doi: 10.1186/s13321-018-0258-y

RDKit: Open-source cheminformatics. https://www.rdkit.org

In [11]:
from mordred import Calculator, descriptors
# Mordred calculator built from the `descriptors` module with ignore_3D=True,
# i.e. 3D descriptors are skipped.
calc = Calculator(descriptors, ignore_3D=True)

In [12]:
# Morgan fingerprint generator: radius 2, fingerprint size 1024
# (one column per bit in the fingerprint table built below).
mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2,fpSize=1024)

In [19]:
def get_mol(cid, max_retries=5, retry_wait=10):
    """Look up a PubChem compound by CID and build an RDKit molecule from it.

    Parameters
    ----------
    cid : int
        PubChem compound identifier.
    max_retries : int, optional
        Number of additional attempts made after a ``pcp.PubChemHTTPError``
        before giving up.
    retry_wait : float, optional
        Seconds to sleep between attempts.

    Returns
    -------
    The result of ``Chem.MolFromSmiles`` on the compound's isomeric SMILES,
    or ``None`` when PubChem rejects the request with ``pcp.BadRequestError``
    (a "not found" message is printed in that case).

    Raises
    ------
    pcp.PubChemHTTPError
        If the request still fails after ``max_retries`` retries.
    """
    # A bounded loop replaces the original unbounded recursion, so a
    # persistent HTTP failure surfaces as an error instead of retrying forever.
    for attempt in range(max_retries + 1):
        try:
            compound = pcp.Compound.from_cid(cid)
            return Chem.MolFromSmiles(compound.isomeric_smiles)
        except pcp.BadRequestError:
            # Must be handled before the more general PubChemHTTPError clause.
            # Return None explicitly: the original fell through to `return mol`
            # with `mol` unbound and raised UnboundLocalError.
            print(f'{cid} not found')
            return None
        except pcp.PubChemHTTPError:
            if attempt >= max_retries:
                raise
            time.sleep(retry_wait)
    return None

# Build a CID -> molecule lookup, skipping CIDs for which get_mol
# returned nothing usable.
mols = {}
for cid in unique_cids[0]:
    fetched = get_mol(cid)
    if not fetched:
        continue
    mols[cid] = fetched
       
    



In [20]:
# One fingerprint array per CID, laid out as one row per CID, then saved to CSV.
fingerprints = {
    cid: mfpgen.GetFingerprintAsNumPy(mol)
    for cid, mol in mols.items()
}
morganfp_df = pd.DataFrame.from_dict(fingerprints, orient='index')
morganfp_df.to_csv('Morgan_Fingerprint.csv')

In [21]:
# Preview the fingerprint table (rows are CIDs, columns are fingerprint bits);
# pandas truncates the display for large frames.
morganfp_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
176,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
177,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
180,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
240,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
261,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2307,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
19789253,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25137858,0,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81035,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Compute Mordred descriptors for every fetched molecule (one row per molecule).
mordred_df = calc.pandas(mols.values())
# Label the rows with the CIDs; mols.values() and mols.keys() iterate in the
# same order, so rows and CIDs line up.
mordred_df.index = mols.keys()

100%|██████████| 171/171 [00:07<00:00, 23.26it/s]


In [23]:
# Cast every descriptor to float, drop descriptor columns that are NaN for
# every molecule, and save the rest to CSV.
mordred_numeric = mordred_df.astype('float')
mordred_numeric = mordred_numeric.dropna(axis=1, how='all')
mordred_numeric.to_csv('Mordred_Descriptors.csv')