In [10]:
import pyrfume
import pandas as pd
import os
import sys
import io
from ast import literal_eval

#pd.set_option('display.max_rows', 1000)
#pd.set_option('display.max_colwidth', 1000)

In [11]:
def capture_function_output(data_name):
    """Capture what ``pyrfume.show_files`` prints for a dataset and parse it.

    ``pyrfume.show_files`` reports the files available for pulling by
    printing them, so its stdout is captured and the text is parsed as a
    Python literal with ``ast.literal_eval``.

    Parameters
    ----------
    data_name : str
        Name of the pyrfume dataset whose file listing is requested.

    Returns
    -------
    object or None
        The parsed literal (callers in this notebook use it as a dict keyed
        by file name), or ``None`` when the captured text is not a valid
        Python literal. In that case the raw text is printed for inspection.

    Notes
    -----
    Exceptions raised by ``pyrfume.show_files`` itself propagate to the
    caller; ``sys.stdout`` is restored in every case.
    """
    from contextlib import redirect_stdout

    buffer = io.StringIO()

    # redirect_stdout puts the original stream back even if show_files raises,
    # so a failed call cannot leave the notebook with a silenced stdout.
    with redirect_stdout(buffer):
        pyrfume.show_files(f"{data_name}")

    output = buffer.getvalue()

    # Convert the captured text to a Python object. Only the errors that
    # literal_eval raises for malformed input are treated as "unparseable".
    try:
        return literal_eval(output)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        print("Failed to convert output to dictionary. Raw output:")
        print(output)
        return None
    
def pull_pyrfume_data(data_name):
    """Load every file listed for a pyrfume dataset.

    The file listing comes from ``capture_function_output``; each listed key
    is then loaded with ``pyrfume.load_data(..., remote=True)``.

    Parameters
    ----------
    data_name : str
        Name of the pyrfume dataset.

    Returns
    -------
    dict
        Maps each listed file name to whatever ``pyrfume.load_data`` returns
        for ``"<data_name>/<file name>"``.

    Raises
    ------
    ValueError
        If the file listing could not be obtained as a dict (for example when
        ``capture_function_output`` returned ``None``).
    """
    file_listing = capture_function_output(data_name)

    # Fail with a clear message instead of an AttributeError on `.keys()`
    # when the listing could not be parsed.
    if not isinstance(file_listing, dict):
        raise ValueError(
            f"Could not determine the files available for dataset '{data_name}'."
        )

    return {
        key: pyrfume.load_data(f'{data_name}/{key}', remote=True)
        for key in file_listing
    }

def _split_descriptors(value):
    """Split a ';'-separated descriptor string into a set of non-blank tokens."""
    if not isinstance(value, str):
        # Missing cells (NaN/None) and non-text values carry no descriptors.
        return set()
    # Tokens are kept verbatim (no stripping); only blank tokens are dropped.
    return {token for token in value.split(';') if token.strip()}


def inflate_behavior(target_columns, behavior_df):
    """Expand ';'-separated descriptor columns into 0/1 indicator columns.

    Parameters
    ----------
    target_columns : list of str
        Columns of ``behavior_df`` holding descriptors. With more than one
        column, the non-missing, non-blank cell values of each row are
        stripped and joined with ';' into a 'Filtered Descriptors' column,
        which is then split. With a single column, that column is split
        directly.
    behavior_df : pandas.DataFrame
        Input frame; it is copied, not modified.

    Returns
    -------
    inflated_behavior : pandas.DataFrame
        The input columns minus ``target_columns``, plus one integer 0/1
        column per unique descriptor, with all columns sorted by name. In the
        multi-column case the joined 'Filtered Descriptors' column is kept.
    unique_descriptors : list of str
        Sorted list of the descriptor names found.

    Notes
    -----
    Missing cells are treated as "no descriptors" and blank tokens (e.g. from
    ``"a;;b"`` or an all-missing row) do not produce a column.
    """
    inflated_behavior = behavior_df.copy()

    if len(target_columns) > 1:
        inflated_behavior['Filtered Descriptors'] = inflated_behavior[target_columns].apply(
            lambda row: ';'.join(
                str(val).strip() for val in row if pd.notna(val) and str(val).strip()
            ),
            axis=1,
        )
        descriptor_source = inflated_behavior['Filtered Descriptors']
    else:
        descriptor_source = inflated_behavior[target_columns[0]]

    # One set of descriptors per row, aligned positionally with the frame.
    # Kept as a local list so no scratch column can collide with real data.
    descriptor_sets = [_split_descriptors(value) for value in descriptor_source]

    unique_descriptors = sorted(set().union(*descriptor_sets))

    # Build all indicator columns at once; the explicit index keeps the
    # concat aligned even when no descriptors were found.
    descriptor_columns = pd.DataFrame(
        {
            descriptor: [int(descriptor in row_set) for row_set in descriptor_sets]
            for descriptor in unique_descriptors
        },
        index=inflated_behavior.index,
    )

    inflated_behavior = pd.concat([inflated_behavior, descriptor_columns], axis=1)
    inflated_behavior = inflated_behavior.drop(columns=target_columns)
    inflated_behavior = inflated_behavior[sorted(inflated_behavior.columns)]

    return inflated_behavior, unique_descriptors

def merge_pyrfume_data(data_dict, data_name, pyrfume_files, undesirable_columns):
    """Join a dataset's behavior table with its stimuli and molecules tables.

    Parameters
    ----------
    data_dict : dict
        Loaded tables keyed by file name.
    data_name : str
        Key of the dataset inside ``pyrfume_files``.
    pyrfume_files : dict
        Per-dataset settings; the entry for ``data_name`` supplies the
        'behavior', 'molecules' and 'stimuli' file names, the 'inflate'
        column list (or False) and 'stimuli_CID_col' (or False).
    undesirable_columns : list
        Names removed from the returned label list when present.

    Returns
    -------
    merged_df : pandas.DataFrame
        Behavior rows joined to molecules, either directly on
        Stimulus == CID or via the stimuli table.
    label_columns : list
        Label column names with the undesirable ones removed.
    """
    settings = pyrfume_files[data_name]

    behavior_df = data_dict[settings['behavior']].copy().reset_index()
    inflate_spec = settings['inflate']
    if inflate_spec == False:
        # No descriptor expansion requested: every behavior column is a candidate label.
        label_columns = behavior_df.columns.to_list()
    else:
        behavior_df, label_columns = inflate_behavior(inflate_spec, behavior_df)

    # Remove one occurrence of each undesirable name, if it is present.
    for unwanted in undesirable_columns:
        try:
            label_columns.remove(unwanted)
        except ValueError:
            continue

    molecules_df = data_dict[settings['molecules']].copy().reset_index()
    stimuli_df = data_dict[settings['stimuli']].copy().reset_index()

    stimuli_CID_col = settings['stimuli_CID_col']
    if stimuli_CID_col == False:
        # The Stimulus value is matched directly against the molecule CID.
        direct = behavior_df.merge(molecules_df, left_on="Stimulus", right_on="CID", how="left")
        return direct, label_columns

    # Otherwise go through the stimuli table, keeping every stimuli row.
    with_stimuli = behavior_df.merge(stimuli_df, on="Stimulus", how='right')
    merged_df = with_stimuli.merge(molecules_df, left_on=stimuli_CID_col, right_on="CID", how="left")
    return merged_df, label_columns


   

In [12]:
# One row per dataset:
# (name, behavior file, stimuli CID column or False, columns to inflate or False, task)
# Every dataset uses "molecules.csv" and "stimuli.csv" for the other two tables.
_DATASET_SPECS = [
    ("arctander_1960", "behavior_1.csv", "new_CID", False, "multilabel"),
    ("aromadb", "behavior.csv", "CID", ["Filtered Descriptors"], None),
    ("flavornet", "behavior.csv", "CID", ["Descriptors"], "multilabel"),
    ("ifra_2019", "behavior.csv", "CID", ["Descriptor 1", "Descriptor 2", "Descriptor 3"], "multilabel"),
    ("sigma_2014", "behavior.csv", False, False, "multilabel"),
    ("keller_2016", "behavior.csv", "CID", False, "regression"),
    ("abraham_2012", "behavior.csv", "CID", False, "regression"),
    ("mayhew_2022", "behavior_1.csv", "CID", ["odor.class"], "binary"),
    ("leffingwell", "behavior.csv", False, False, "multilabel"),
]

# Settings consumed by merge_pyrfume_data and the export loop, keyed by dataset name.
pyrfume_files = {
    name: {
        "behavior": behavior_file,
        "molecules": "molecules.csv",
        "stimuli": "stimuli.csv",
        "stimuli_CID_col": cid_column,
        "inflate": inflate_columns,
        "task": task,
    }
    for name, behavior_file, cid_column, inflate_columns, task in _DATASET_SPECS
}

In [13]:
# Per-dataset bookkeeping: path of the exported CSV, its label columns and the task.
file_dict = {}

# Column names that must never be reported as label columns.
undesirable_columns = ["Stimulus", "Descriptor 1", "Descriptor 2", "Descriptor 3", "Filtered Descriptors", "descriptors", "Dilution", "Replicate"]

for data_name, dataset_settings in pyrfume_files.items():
    unclean_path = f"{data_name}/{data_name}_unclean.csv"
    file_dict[data_name] = {"unclean": unclean_path}
    print(data_name)

    # Download every file of the dataset, then join behavior/stimuli/molecules.
    data_dict = pull_pyrfume_data(data_name)
    merged_df, label_columns = merge_pyrfume_data(data_dict, data_name, pyrfume_files, undesirable_columns)

    file_dict[data_name]["label_columns"] = label_columns
    file_dict[data_name]["task"] = dataset_settings['task']

    # Each dataset is written into a directory named after it.
    os.makedirs(f"{data_name}", exist_ok=True)
    merged_df.to_csv(unclean_path, index=False)

arctander_1960
aromadb
flavornet
ifra_2019
sigma_2014
keller_2016
abraham_2012
mayhew_2022
leffingwell


In [14]:
# The GS-LF dataset is read from a local, already-merged CSV and is registered
# under the same keys as the other datasets.
gslf_path = "gs-lf/curated_GS_LF_merged_4983.csv"
gslf_df = pd.read_csv(gslf_path)

# Every column from the third onward is recorded as a label column
# (presumably the first two hold identifiers/structure -- confirm against the CSV).
gslf_label_columns = gslf_df.columns[2:].to_list()

file_dict['gs-lf'] = {
    "unclean": gslf_path,
    "label_columns": gslf_label_columns,
    'task': 'multilabel',
}


In [17]:
# One row per dataset, one column per bookkeeping field.
file_df = pd.DataFrame(file_dict).transpose()

# Store each label list as its string representation so it survives the CSV round trip as text.
file_df['label_columns'] = file_df['label_columns'].map(str)

file_df.to_csv("file_cleaning_features.csv", index_label="dataset")

In [18]:
# Show the per-dataset summary (export path, label columns, task) as the cell output.
file_df

Unnamed: 0,unclean,label_columns,task
arctander_1960,arctander_1960/arctander_1960_unclean.csv,"['acid', 'aldehydic', 'almond', 'ambre', 'anim...",multilabel
aromadb,aromadb/aromadb_unclean.csv,"['acetic', 'acid', 'alcoholic', 'almond', 'amm...",
flavornet,flavornet/flavornet_unclean.csv,"['acid', 'alcohol', 'alkaline', 'alkane', 'alm...",multilabel
ifra_2019,ifra_2019/ifra_2019_unclean.csv,"['Acidic', 'Aldehydic', 'Almond', 'Amber', 'An...",multilabel
sigma_2014,sigma_2014/sigma_2014_unclean.csv,"['potato', 'marigold', 'anise', 'herba-', 'car...",multilabel
keller_2016,keller_2016/keller_2016_unclean.csv,"['Descriptor', '1', '2', '3', '4', '5', '6', '...",regression
abraham_2012,abraham_2012/abraham_2012_unclean.csv,"['Log (1/ODT)', 'E', 'S', 'A', 'B', 'L', 'V', ...",regression
mayhew_2022,mayhew_2022/mayhew_2022_unclean.csv,"['Odor', 'Odorless']",binary
leffingwell,leffingwell/leffingwell_unclean.csv,"['alcoholic', 'aldehydic', 'alliaceous', 'almo...",multilabel
gs-lf,gs-lf/curated_GS_LF_merged_4983.csv,"['alcoholic', 'aldehydic', 'alliaceous', 'almo...",multilabel
