In [13]:
import pandas as pd
import os

def combine_csvs(input_directory, output_file):
    """
    Concatenate every CSV file in a directory into one CSV file.

    Files are read in sorted filename order so that the row index of the
    combined file is reproducible between runs (``os.listdir`` gives no
    ordering guarantee). Files that fail to parse are reported and skipped.

    Args:
        input_directory (str): The path to the directory containing the CSV files.
        output_file (str): The path to the output CSV file. If it lives inside
            ``input_directory`` it is never read back in as an input.

    Returns:
        None. Progress and errors are reported with ``print``; nothing is
        written when the directory is missing or no CSV could be read.
    """
    try:
        file_names = sorted(os.listdir(input_directory))
    except FileNotFoundError:
        print(f"Error: The directory '{input_directory}' was not found.")
        return

    # Resolved once so a previous run's output is not folded into itself.
    output_abspath = os.path.abspath(output_file)

    all_dataframes = []
    csv_seen = 0

    for file in file_names:
        if not file.endswith('.csv'):
            continue

        file_path = os.path.join(input_directory, file)
        if os.path.abspath(file_path) == output_abspath:
            continue

        csv_seen += 1
        print(f"Reading {file_path}...")

        try:
            all_dataframes.append(pd.read_csv(file_path))
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole merge.
            print(f"Error reading {file_path}: {e}")

    if not all_dataframes:
        if csv_seen:
            # Files existed but none parsed; say so instead of "not found".
            print("No CSV files could be read from the directory. The output file will not be created.")
        else:
            print("No CSV files found in the directory. The output file will not be created.")
        return

    # ignore_index=True gives the combined frame a fresh 0..n-1 index.
    combined_df = pd.concat(all_dataframes, ignore_index=True)

    # index=False keeps the DataFrame index out of the written file.
    combined_df.to_csv(output_file, index=False)
    print(f"\nSuccessfully combined {len(all_dataframes)} files into {output_file}")

# Build the path with os.path.join: the old literal relied on the invalid
# escape sequence "\s" and only worked as a Windows path.
combine_csvs(os.path.join('results', 'single_solv_decoderFalse'), 'single_solvent_learn.csv')

Reading results\single_solv_decoderFalse\run_20250825_122122_b75e4fbe8a0843dd9ec3c2793f29a5af.csv...
Reading results\single_solv_decoderFalse\run_20250825_122335_22bb712db5294dcda298f2e36b778019.csv...
Reading results\single_solv_decoderFalse\run_20250825_122555_fc7d456582e44d729d787e382c67dc62.csv...
Reading results\single_solv_decoderFalse\run_20250825_122654_57258a3ebe354b31ac83ef77df98291a.csv...
Reading results\single_solv_decoderFalse\run_20250825_122715_6a5b4bb21dd142518727a6f7e56575ee.csv...
Reading results\single_solv_decoderFalse\run_20250825_122918_3075841cb96c415699f071f21144fea0.csv...
Reading results\single_solv_decoderFalse\run_20250825_122954_c9df4697272c4c8a81f09b266dda34f7.csv...
Reading results\single_solv_decoderFalse\run_20250825_122957_52f3a5403d7a4982a381e7154ae4bb4c.csv...
Reading results\single_solv_decoderFalse\run_20250825_123022_1ce5465747c24278a84b85df698894ef.csv...
Reading results\single_solv_decoderFalse\run_20250825_123033_6ecf733bd0fc4663bc380ec6cc4948

In [2]:
import pandas as pd
# Single-solvent dataset, with the SoDaDE model learning.
# Rows are ordered by ascending avg_mse, so the lowest-error runs come first.
df_SS_YE = pd.read_csv('single_solvent_learn.csv').sort_values(
    by='avg_mse',
    ascending=True,
)

print(df_SS_YE.head(n=5))

           timestamp                              pretrained_model_path  \
143  20250825_132409  val_loss0.1074_DPR_0.1_MP_0.3_DM_64_TL_5_heads...   
58   20250825_124944  val_loss0.1074_DPR_0.1_MP_0.3_DM_64_TL_5_heads...   
166  20250825_133308  val_loss0.1074_DPR_0.1_MP_0.3_DM_64_TL_5_heads...   
123  20250825_131710  val_loss0.1074_DPR_0.1_MP_0.3_DM_64_TL_5_heads...   
142  20250825_132359  val_loss0.1074_DPR_0.1_MP_0.3_DM_64_TL_5_heads...   

         spange_path  nn_size  dropout_fp  dropout_nn  epochs  hidden_factor  \
143  spange_melt.csv       64        0.05        0.05      50              3   
58   spange_melt.csv       32        0.05        0.05      40              5   
166  spange_melt.csv       64        0.05        0.05      40              5   
123  spange_melt.csv       64        0.05        0.05      50              5   
142  spange_melt.csv       64        0.05        0.05      50              5   

     val_percent   avg_mse                                    mse_pe

In [3]:
import pandas as pd
# Single-solvent dataset, with NO SoDaDE model learning.
# Rows are ordered by ascending avg_mse, so the lowest-error runs come first.
df_SS_NO = pd.read_csv('single_solvent_no_learn.csv').sort_values(
    by='avg_mse',
    ascending=True,
)

print(df_SS_NO.head(n=5))

           timestamp                              pretrained_model_path  \
606  20250824_193823  val_loss0.1074_DPR_0.1_MP_0.3_DM_64_TL_5_heads...   
491  20250824_192204  val_loss0.1074_DPR_0.1_MP_0.3_DM_64_TL_5_heads...   
573  20250824_193413  val_loss0.1074_DPR_0.1_MP_0.3_DM_64_TL_5_heads...   
329  20250824_185335  val_loss0.1074_DPR_0.1_MP_0.3_DM_64_TL_5_heads...   
298  20250824_184710  val_loss0.1074_DPR_0.1_MP_0.3_DM_64_TL_5_heads...   

         spange_path  nn_size  dropout_fp  dropout_nn  epochs  hidden_factor  \
606  spange_melt.csv      128         0.3        0.05      50              3   
491  spange_melt.csv      128         0.3        0.05      40              5   
573  spange_melt.csv      128         0.3        0.05      50              4   
329  spange_melt.csv       64         0.1        0.05      50              3   
298  spange_melt.csv       64         0.1        0.05      50              4   

     val_percent   avg_mse                                    mse_pe

In [3]:
import pandas as pd
# Full-data dataset, with the SoDaDE model learning.
# Rows are ordered by ascending avg_mse; the 15 lowest-error runs are shown.
df_FY_YE = pd.read_csv('collated_results_csvs/full_yield_learn.csv').sort_values(
    by='avg_mse',
    ascending=True,
)

print(df_FY_YE.head(15))

           timestamp                              pretrained_model_path  \
315  20250824_234319  val_loss0.1074_DPR_0.1_MP_0.3_DM_64_TL_5_heads...   
317  20250824_234326  val_loss0.1074_DPR_0.1_MP_0.3_DM_64_TL_5_heads...   
86   20250824_225040  val_loss0.1074_DPR_0.1_MP_0.3_DM_64_TL_5_heads...   
272  20250824_233756  val_loss0.1074_DPR_0.1_MP_0.3_DM_64_TL_5_heads...   
345  20250824_234719  val_loss0.1074_DPR_0.1_MP_0.3_DM_64_TL_5_heads...   
338  20250824_234617  val_loss0.1074_DPR_0.1_MP_0.3_DM_64_TL_5_heads...   
295  20250824_234140  val_loss0.1074_DPR_0.1_MP_0.3_DM_64_TL_5_heads...   
136  20250824_230548  val_loss0.1074_DPR_0.1_MP_0.3_DM_64_TL_5_heads...   
267  20250824_233706  val_loss0.1074_DPR_0.1_MP_0.3_DM_64_TL_5_heads...   
494  20250825_001719  val_loss0.1074_DPR_0.1_MP_0.3_DM_64_TL_5_heads...   
227  20250824_232818  val_loss0.1074_DPR_0.1_MP_0.3_DM_64_TL_5_heads...   
358  20250824_234925  val_loss0.1074_DPR_0.1_MP_0.3_DM_64_TL_5_heads...   
81   20250824_224718  val

In [5]:
import pandas as pd
# Full-data dataset, with NO SoDaDE model learning.
# Rows are ordered by ascending avg_mse, so the lowest-error runs come first.
df_FY_NO = pd.read_csv('full_yield_no_learn.csv').sort_values(
    by='avg_mse',
    ascending=True,
)

print(df_FY_NO.head(n=5))

           timestamp                              pretrained_model_path  \
533  20250824_215636  val_loss0.1074_DPR_0.1_MP_0.3_DM_64_TL_5_heads...   
541  20250824_215943  val_loss0.1074_DPR_0.1_MP_0.3_DM_64_TL_5_heads...   
596  20250824_221016  val_loss0.1074_DPR_0.1_MP_0.3_DM_64_TL_5_heads...   
631  20250824_221538  val_loss0.1074_DPR_0.1_MP_0.3_DM_64_TL_5_heads...   
551  20250824_220203  val_loss0.1074_DPR_0.1_MP_0.3_DM_64_TL_5_heads...   

         spange_path  nn_size  dropout_fp  dropout_nn  epochs  hidden_factor  \
533  spange_melt.csv      128         0.1        0.05      50              5   
541  spange_melt.csv      128         0.1        0.05      50              5   
596  spange_melt.csv       64         0.1        0.05      50              5   
631  spange_melt.csv      128         0.3        0.05      50              4   
551  spange_melt.csv       64         0.1        0.05      50              4   

     val_percent   avg_mse                                    mse_pe

In [6]:

import json
import ast


def calc_diff_avg(df, index, solvent='Acetonitrile.Acetic Acid'):
    """
    Return a row's ``avg_mse`` minus one solvent's share of that average.

    The row's ``mse_per_solvent`` cell is decoded into a mapping of
    name -> MSE. The result is ``avg_mse - mse[solvent] / len(mapping)``,
    which is also printed.

    NOTE(review): this subtracts the solvent's contribution to the mean; it
    does not re-average over the remaining ``n - 1`` entries. Confirm that
    this is the intended quantity.

    Args:
        df (pandas.DataFrame): Frame with ``avg_mse`` and ``mse_per_solvent`` columns.
        index: Row label passed to ``df.loc``.
        solvent (str): Key of the entry to discount. Defaults to the
            previously hard-coded ``'Acetonitrile.Acetic Acid'``.

    Returns:
        float: The adjusted value.

    Raises:
        KeyError: If the row/column is missing or ``solvent`` is not a key of
            the decoded mapping.
    """
    avg_mse = float(df.loc[index, 'avg_mse'])

    # The cell is a JSON string that wraps a Python-literal dict, hence the
    # two decoding steps. A cell that is already plain JSON is accepted too.
    per_solvent = json.loads(df.loc[index, 'mse_per_solvent'])
    if isinstance(per_solvent, str):
        per_solvent = ast.literal_eval(per_solvent)

    adjusted = avg_mse - (float(per_solvent[solvent]) / len(per_solvent))
    print(adjusted)
    return adjusted

# Rows 143 and 58 of the single-solvent "learn" frame.
mse_143 = calc_diff_avg(df_SS_YE, 143)
mse_58 = calc_diff_avg(df_SS_YE, 58)

# Rows 606 and 491 of the single-solvent "no learn" frame.
# (A stray bare `df_SS_NO` expression that had no effect was removed here.)
mse_606 = calc_diff_avg(df_SS_NO, 606)
mse_491 = calc_diff_avg(df_SS_NO, 491)



0.004177287189738853
0.00412311031890116
0.0036609199338262684
0.0038734416858419688


In [7]:

import json
import ast


def save_best_results(df, index, dict_name):
    """
    Write one row's per-item MSE mapping to ``<dict_name>.json``.

    The row's ``mse_per_solvent`` cell is decoded into a dict and dumped as
    indented JSON. Only that column is read.

    Args:
        df (pandas.DataFrame): Frame with an ``mse_per_solvent`` column.
        index: Row label passed to ``df.loc``.
        dict_name (str): Output path without the ``.json`` extension.

    Returns:
        None.
    """
    # The cell is a JSON string that wraps a Python-literal dict, hence the
    # two decoding steps. A cell that is already plain JSON is accepted too.
    per_item = json.loads(df.loc[index, 'mse_per_solvent'])
    if isinstance(per_item, str):
        per_item = ast.literal_eval(per_item)

    file_path = dict_name + '.json'
    with open(file_path, 'w', encoding='utf-8') as json_file:
        json.dump(per_item, json_file, indent=4)


# Export the per-item MSE mapping of one chosen row from each results frame.
save_best_results(
    df=df_SS_YE,
    index=143,
    dict_name='single_solvent_SoDaDE_finetuning_MSE_per_solvent',
)
save_best_results(
    df=df_SS_NO,
    index=606,
    dict_name='single_solvent_SoDaDE_without_finetuning_MSE_per_solvent',
)
save_best_results(
    df=df_FY_YE,
    index=315,
    dict_name='full_data_SoDaDE_finetuning_MSE_per_ramp',
)
save_best_results(
    df=df_FY_NO,
    index=533,
    dict_name='full_data_SoDaDE_without_finetuning_MSE_per_ramp',
)
