In [1]:
import pyrfume
import pandas as pd
import os
import sys
import io
from ast import literal_eval

#pd.set_option('display.max_rows', 1000)
#pd.set_option('display.max_colwidth', 1000)

In [2]:
def capture_function_output(data_name):
    """Capture and parse what ``pyrfume.show_files`` prints for an archive.

    ``sys.stdout`` is temporarily swapped for an in-memory buffer while
    ``pyrfume.show_files`` runs, and the captured text is parsed as a Python
    literal with ``ast.literal_eval``.

    Parameters
    ----------
    data_name : str
        Archive name handed to ``pyrfume.show_files``.

    Returns
    -------
    object or None
        The parsed literal (the caller in this notebook uses it as a dict
        keyed by file name), or ``None`` when the captured text is not a
        valid literal. In that case the raw text is printed for inspection.

    Notes
    -----
    Any exception raised by ``pyrfume.show_files`` propagates to the caller,
    but only after ``sys.stdout`` has been restored.
    """
    original_stdout = sys.stdout
    buffer = io.StringIO()
    sys.stdout = buffer
    try:
        # pyrfume printout shows files available for pulling
        pyrfume.show_files(f"{data_name}")
    finally:
        # Always restore stdout; otherwise a failed call would silence every
        # later print in the notebook.
        sys.stdout = original_stdout

    output = buffer.getvalue()

    # Convert the string output to a Python object. Only the errors that
    # literal_eval raises for malformed input are handled, so interrupts and
    # unrelated failures are no longer swallowed.
    try:
        return literal_eval(output)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        print("Failed to convert output to dictionary. Raw output:")
        print(output)
        return None
    
def pull_pyrfume_data(data_name):
    """Load every file listed for a pyrfume archive.

    The file listing comes from ``capture_function_output``; each listed key
    is loaded with ``pyrfume.load_data(f'{data_name}/{key}', remote=True)``.

    Parameters
    ----------
    data_name : str
        Name of the pyrfume archive.

    Returns
    -------
    dict
        Maps each listed file name to whatever ``pyrfume.load_data`` returns
        for it.

    Raises
    ------
    ValueError
        If the file listing could not be obtained as a dict (for example when
        ``capture_function_output`` returned ``None``). Previously this
        surfaced as an ``AttributeError`` on ``None.keys()``.
    """
    file_dict = capture_function_output(data_name)

    if not isinstance(file_dict, dict):
        raise ValueError(
            f"Could not determine the files available for '{data_name}': "
            f"expected a dict of file names, got {type(file_dict).__name__}."
        )

    return {
        key: pyrfume.load_data(f'{data_name}/{key}', remote=True)
        for key in file_dict
    }

def _split_descriptors(value):
    """Split one ';'-separated descriptor string into clean tokens.

    Non-string values (e.g. NaN/None for a missing cell) yield an empty list.
    Tokens are stripped of surrounding whitespace and empty tokens dropped.
    """
    if not isinstance(value, str):
        return []
    return [token.strip() for token in value.split(';') if token.strip()]


def inflate_behavior(target_columns, behavior_df):
    """One-hot encode ';'-separated descriptor strings into 0/1 columns.

    Parameters
    ----------
    target_columns : list of str
        Column(s) of ``behavior_df`` holding descriptor text. With more than
        one column, the non-missing, non-blank values of each row are first
        joined with ';' into a new ``'Filtered Descriptors'`` column, which
        is kept in the result. With exactly one column, that column is used
        directly.
    behavior_df : pandas.DataFrame
        Input frame; it is copied, never modified.

    Returns
    -------
    inflated_behavior : pandas.DataFrame
        Copy of the input plus one integer 0/1 column per unique descriptor,
        with all columns ordered by sorted column name.
    unique_descriptors : list of str
        Sorted list of the descriptor names that were turned into columns.

    Notes
    -----
    Rows whose descriptor cell is missing or blank get 0 in every descriptor
    column (a missing cell used to raise ``TypeError``, and a blank one
    produced a spurious ``''`` descriptor). Whitespace around tokens is
    stripped, so ``"a; b"`` yields ``a`` and ``b``.

    NOTE(review): if a descriptor has the same name as an existing column the
    result will contain duplicate column names -- not handled here.
    """
    inflated_behavior = behavior_df.copy()

    if len(target_columns) > 1:
        # Build the joined column row by row from plain tuples.
        inflated_behavior['Filtered Descriptors'] = [
            ';'.join(str(val).strip() for val in row if pd.notna(val) and str(val).strip())
            for row in inflated_behavior[target_columns].itertuples(index=False, name=None)
        ]
        descriptor_source = inflated_behavior['Filtered Descriptors']
    else:
        descriptor_source = inflated_behavior[target_columns[0]]

    # Tokenise once per row; sets give cheap membership tests below. No
    # scratch column is written, so an existing 'temp' column is untouched.
    tokens_per_row = [set(_split_descriptors(value)) for value in descriptor_source]

    unique_descriptors = sorted(set().union(*tokens_per_row))

    # One 0/1 indicator column per descriptor, aligned to the frame's index.
    descriptor_columns = pd.DataFrame(
        {
            descriptor: [int(descriptor in tokens) for tokens in tokens_per_row]
            for descriptor in unique_descriptors
        },
        index=inflated_behavior.index,
    )

    inflated_behavior = pd.concat([inflated_behavior, descriptor_columns], axis=1)
    inflated_behavior = inflated_behavior[sorted(inflated_behavior.columns)]

    return inflated_behavior, unique_descriptors

def merge_pyrfume_data(data_dict, data_name, pyrfume_files):
    """Join a dataset's behavior, stimuli and molecules tables into one frame.

    Parameters
    ----------
    data_dict : dict
        Loaded tables keyed by file name (as produced by
        ``pull_pyrfume_data``).
    data_name : str
        Key of the dataset inside ``pyrfume_files``.
    pyrfume_files : dict
        Per-dataset settings with the keys ``'behavior'``, ``'molecules'``,
        ``'stimuli'``, ``'stimuli_CID_col'`` and ``'inflate'``.

    Returns
    -------
    merged_df : pandas.DataFrame
        The merged table.
    label_columns : list
        The descriptor names returned by ``inflate_behavior`` when
        ``'inflate'`` is set; otherwise every column of the behavior table
        after ``reset_index()``.
        NOTE(review): in the non-inflate case this includes the former index
        column (e.g. 'Stimulus') -- confirm that is intended.
    """
    settings = pyrfume_files[data_name]

    # All three tables are copied and have their index moved into a column.
    behavior_df = data_dict[settings['behavior']].copy().reset_index()

    inflate_columns = settings['inflate']
    if inflate_columns != False:
        behavior_df, label_columns = inflate_behavior(inflate_columns, behavior_df)
    else:
        label_columns = behavior_df.columns.to_list()

    molecules_df = data_dict[settings['molecules']].copy().reset_index()
    stimuli_df = data_dict[settings['stimuli']].copy().reset_index()

    cid_column = settings['stimuli_CID_col']
    if cid_column != False:
        # Two-step join: behavior -> stimuli on "Stimulus", then -> molecules
        # through the configured CID column.
        # NOTE(review): how='right' keeps every stimuli row, including ones
        # with no behavior data -- confirm this is the intended join.
        with_stimuli = behavior_df.merge(stimuli_df, on="Stimulus", how='right')
        merged_df = with_stimuli.merge(
            molecules_df, left_on=cid_column, right_on="CID", how="left"
        )
    else:
        # No CID column configured: "Stimulus" is matched directly to "CID".
        merged_df = behavior_df.merge(
            molecules_df, left_on="Stimulus", right_on="CID", how="left"
        )

    return merged_df, label_columns


   

In [3]:
# Per-dataset settings consumed by merge_pyrfume_data:
#   behavior / molecules / stimuli : file names of the three tables to join
#   stimuli_CID_col : stimuli column matched against the molecules "CID";
#                     False joins behavior "Stimulus" to "CID" directly
#   inflate         : descriptor column(s) to expand via inflate_behavior,
#                     or False to use the behavior table as-is
pyrfume_files = {
    "arctander_1960": {
        "behavior": "behavior_1.csv",
        "molecules": "molecules.csv",
        "stimuli": "stimuli.csv",
        "stimuli_CID_col": "new_CID",
        "inflate": False,
    },
    "aromadb": {
        "behavior": "behavior.csv",
        "molecules": "molecules.csv",
        "stimuli": "stimuli.csv",
        "stimuli_CID_col": "CID",
        "inflate": ["Filtered Descriptors"],
    },
    "flavornet": {
        "behavior": "behavior.csv",
        "molecules": "molecules.csv",
        "stimuli": "stimuli.csv",
        "stimuli_CID_col": "CID",
        "inflate": ["Descriptors"],
    },
    "ifra_2019": {
        "behavior": "behavior.csv",
        "molecules": "molecules.csv",
        "stimuli": "stimuli.csv",
        "stimuli_CID_col": "CID",
        "inflate": ["Descriptor 1", "Descriptor 2", "Descriptor 3"],
    },
    "sigma_2014": {
        "behavior": "behavior.csv",
        "molecules": "molecules.csv",
        "stimuli": "stimuli.csv",
        "stimuli_CID_col": False,
        "inflate": False,
    },
    "keller_2016": {
        "behavior": "behavior.csv",
        "molecules": "molecules.csv",
        "stimuli": "stimuli.csv",
        "stimuli_CID_col": "CID",
        "inflate": False,
    },
    "abraham_2012": {
        "behavior": "behavior.csv",
        "molecules": "molecules.csv",
        "stimuli": "stimuli.csv",
        "stimuli_CID_col": "CID",
        "inflate": False,
    },
    "mayhew_2022": {
        "behavior": "behavior_1.csv",
        "molecules": "molecules.csv",
        "stimuli": "stimuli.csv",
        "stimuli_CID_col": "CID",
        "inflate": ["odor.class"],
    },
    "leffingwell": {
        "behavior": "behavior.csv",
        "molecules": "molecules.csv",
        "stimuli": "stimuli.csv",
        "stimuli_CID_col": "CID",
        "inflate": False,
    },
}

In [5]:
# Pull, merge and save every configured dataset, recording the label columns
# that merge_pyrfume_data reports for each one.
label_dict = {}

for data_name in pyrfume_files:
    print(data_name)

    data_dict = pull_pyrfume_data(data_name)
    merged_df, label_dict[data_name] = merge_pyrfume_data(data_dict, data_name, pyrfume_files)

    # Each dataset gets its own folder (relative to the working directory)
    # holding the merged, not-yet-cleaned table.
    os.makedirs(f"{data_name}", exist_ok=True)
    merged_df.to_csv(f"{data_name}/{data_name}_unclean.csv", index=False)

arctander_1960
aromadb
flavornet
ifra_2019
sigma_2014
keller_2016
abraham_2012
mayhew_2022
leffingwell


In [None]:
# Show the label columns collected for each dataset by the loop above.
label_dict

{'arctander_1960': ['Stimulus',
  'acid',
  'aldehydic',
  'almond',
  'ambre',
  'animal',
  'anisic',
  'apple',
  'apricot',
  'aromatic',
  'balsamic',
  'banana',
  'berry',
  'brandy',
  'buttery',
  'camphoraceous',
  'caramelic',
  'citrus',
  'coco',
  'coconut',
  'creamy',
  'earthy',
  'ethereal',
  'fatty',
  'floral',
  'fruity',
  'gassy',
  'geranium',
  'grape',
  'green',
  'hay',
  'herbal',
  'honey',
  'hyacinth',
  'jasmin',
  'juicy',
  'leafy',
  'leather',
  'lilac',
  'lily',
  'medicinal',
  'metallic',
  'mimosa',
  'minty',
  'mossy',
  'mushroom',
  'musky',
  'musty',
  'narcissus',
  'nutty',
  'oily',
  'orange',
  'orange-blossom',
  'orris',
  'peach',
  'pear',
  'pepper',
  'phenolic',
  'pine',
  'pineapple',
  'plum',
  'powdery',
  'rooty',
  'rose',
  'sandalwood',
  'smoky',
  'sour',
  'spicy',
  'sulfuraceous',
  'tarry',
  'tea',
  'tobacco',
  'vanilla',
  'vanillin',
  'violet',
  'waxy',
  'winey',
  'woody'],
 'aromadb': ['acetic',
  'ac