In [2]:
import pandas as pd
from collections import defaultdict
from rdkit.Chem import MolFromSmiles, MolToSmiles

In [3]:
# Input tables: the mixture definitions and the combined gs-lf table.
MIXTURE_DEFINITIONS_CSV = "mixture_smi_definitions_clean.csv"
GSLF_COMBINED_CSV = "../gs-lf/gs-lf_combined.csv"

mix_df = pd.read_csv(MIXTURE_DEFINITIONS_CSV)
gslf_df = pd.read_csv(GSLF_COMBINED_CSV)

In [4]:
def transform_dataframe(df):
    """Invert a mixture table into one row per distinct SMILES string.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'Dataset'`` and ``'Mixture Label'``.
        Every column whose name starts with ``'smi_'`` is read as a SMILES
        component of that row; missing values (NaN/None) are skipped.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``'SMILES'`` (each distinct string, in order of first
        appearance scanning row by row, then column by column) and
        ``'Sources'`` (comma-joined ``"<Dataset>_<Mixture Label>"`` tags of
        every row in which that string appears).

    Notes
    -----
    SMILES strings are compared as raw text; no canonicalisation happens
    here, so two spellings of the same molecule yield two output rows.
    """
    # The set of SMILES columns does not change between rows, so resolve it
    # once up front. The isinstance guard lets frames with non-string column
    # labels (e.g. a stray integer column) through instead of raising
    # AttributeError on `.startswith`.
    smiles_columns = [
        col for col in df.columns
        if isinstance(col, str) and col.startswith('smi_')
    ]

    # Maps each SMILES string to the list of mixtures it occurs in; dict
    # insertion order preserves first-seen order for the output.
    smiles_dict = defaultdict(list)

    for _, row in df.iterrows():
        source = f"{row['Dataset']}_{row['Mixture Label']}"
        for col in smiles_columns:
            smiles = row[col]
            if pd.notna(smiles):  # unused component slots are NaN
                smiles_dict[smiles].append(source)

    return pd.DataFrame({
        'SMILES': list(smiles_dict.keys()),
        'Sources': [','.join(sources) for sources in smiles_dict.values()],
    })

In [6]:
def _canonical_smiles(smiles):
    """Return RDKit's canonical SMILES for `smiles`.

    Raises
    ------
    ValueError
        If RDKit cannot parse `smiles`. MolFromSmiles reports a parse failure
        by returning None, which would otherwise surface as an opaque
        argument error inside MolToSmiles.
    """
    mol = MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
    return MolToSmiles(mol)


mix_smi_df = transform_dataframe(mix_df)

# Put both tables on the same canonical SMILES key so that different textual
# spellings of one molecule can be joined. Series.map transforms the single
# column directly instead of building a row object per record.
mix_smi_df['canonical_smiles'] = mix_smi_df['SMILES'].map(_canonical_smiles)
gslf_df['canonical_smiles'] = gslf_df['IsomericSMILES'].map(_canonical_smiles)

# Left join: every mixture molecule is kept; molecules with no counterpart in
# gslf_df get NaN in all gslf_df-derived columns.
merged_df = pd.merge(mix_smi_df, gslf_df, on='canonical_smiles', how='left')

In [15]:
# Rows whose 'descriptors' value is missing after the left join.
missing_descriptors = merged_df['descriptors'].isna()
merged_df.loc[missing_descriptors]

Unnamed: 0,SMILES,Sources,canonical_smiles,IsomericSMILES,descriptors,alcoholic,aldehydic,alliaceous,almond,amber,...,tropical,vanilla,vegetable,vetiver,violet,warm,waxy,weedy,winey,woody
11,CC(=O)O[C@@H]1C[C@@H]2CC[C@]1(C2(C)C)C,"Snitz 1_2,Snitz 1_4,Snitz 1_5,Snitz 1_6,Snitz ...",CC(=O)O[C@@H]1C[C@@H]2CC[C@@]1(C)C2(C)C,,,,,,,,...,,,,,,,,,,
13,CCCCNCCCC,"Snitz 1_2,Snitz 1_4,Snitz 1_5,Snitz 1_6,Snitz ...",CCCCNCCCC,,,,,,,,...,,,,,,,,,,
28,CC(=CCC[C@](C)(C=C)O)C,"Snitz 1_4,Snitz 1_5,Snitz 1_6,Snitz 1_7,Snitz ...",C=C[C@](C)(O)CCC=C(C)C,,,,,,,,...,,,,,,,,,,
36,CC1=CC(=C(C=C1Cl)C(C)C)O,"Snitz 1_5,Snitz 1_6,Snitz 1_17,Snitz 1_18,Snit...",Cc1cc(O)c(C(C)C)cc1Cl,,,,,,,,...,,,,,,,,,,
42,CCC/C=C/C=O,"Snitz 1_5,Snitz 1_6,Snitz 1_21,Snitz 1_23,Snit...",CCC/C=C/C=O,,,,,,,,...,,,,,,,,,,
44,C#CC1=CC=CC=C1,"Snitz 1_5,Snitz 1_6,Snitz 1_13,Snitz 1_21,Snit...",C#Cc1ccccc1,,,,,,,,...,,,,,,,,,,
50,CCCCC/C=C/C=C/C=O,"Snitz 1_8,Snitz 1_9,Snitz 1_10,Snitz 1_17,Snit...",CCCCC/C=C/C=C/C=O,,,,,,,,...,,,,,,,,,,
52,C/C/1=C\CCC(=C)[C@H]2CC([C@@H]2CC1)(C)C,"Snitz 1_9,Snitz 1_10,Snitz 1_22,Snitz 1_23,Sni...",C=C1CC/C=C(\C)CC[C@@H]2[C@@H]1CC2(C)C,,,,,,,,...,,,,,,,,,,
53,CC/C=C/1\C2=CC=CC=C2C(=O)O1,"Snitz 1_9,Snitz 1_10,Snitz 1_21,Snitz 1_23,Sni...",CC/C=C1/OC(=O)c2ccccc21,,,,,,,,...,,,,,,,,,,
75,C[C@@H]1CC[C@H]([C@@H](C1)O)C(C)C,"Snitz 1_14,Snitz 1_16,Snitz 1_17,Snitz 1_18,Sn...",CC(C)[C@@H]1CC[C@@H](C)C[C@H]1O,,,,,,,,...,,,,,,,,,,


In [17]:
# Display the per-SMILES table (SMILES, Sources, canonical_smiles) for inspection.
mix_smi_df

Unnamed: 0,SMILES,Sources,canonical_smiles
0,CCOC(=O)C1C(O1)(C)C2=CC=CC=C2,"Snitz 1_1,Snitz 1_9,Snitz 1_10,Snitz 1_21,Snit...",CCOC(=O)C1OC1(C)c1ccccc1
1,CCCC(=O)O,"Snitz 1_1,Snitz 1_9,Snitz 1_10,Snitz 1_21,Snit...",CCCC(=O)O
2,CC1=CC=C(C=C1)O,"Snitz 1_1,Snitz 1_10,Snitz 1_18,Snitz 1_28,Sni...",Cc1ccc(O)cc1
3,CC1=CC=C(C=C1)OC(=O)C(C)C,"Snitz 1_1,Snitz 1_9,Snitz 1_10,Snitz 1_21,Snit...",Cc1ccc(OC(=O)C(C)C)cc1
4,CC1=CC=C(C=C1)OC,"Snitz 1_1,Snitz 1_10,Snitz 1_12,Snitz 1_18,Sni...",COc1ccc(C)cc1
...,...,...,...
198,CC=O,"Bushdid_15,Bushdid_38,Bushdid_39,Bushdid_40,Bu...",CC=O
199,CC(=O)OCCC1=CC=CC=C1,"Bushdid_23,Bushdid_36,Bushdid_54,Bushdid_61,Bu...",CC(=O)OCCc1ccccc1
200,CC1=CCC(C=C1)C(C)C,"Bushdid_25,Bushdid_28,Bushdid_47,Bushdid_48,Bu...",CC1=CCC(C(C)C)C=C1
201,CCC(C)S,"Bushdid_25,Bushdid_29,Bushdid_31,Bushdid_32,Bu...",CCC(C)S
