In [57]:
import json
import pandas as pd
import copy
import numpy as np


In [58]:
# Prediction results loaded from JSON; later cells index this dict by SMILES string.
with open('predict_properties/test_predictions.json', 'r') as res_file:
        results_dict = json.load(res_file)

# Normalisation statistics; later cells read stats[prop]['mean'] and stats[prop]['std'] from it.
with open('data/normalisation_stats.json', 'r') as norm_file:
        norm_dict = json.load(norm_file)

# Reference values for the test molecules, kept as a DataFrame for row-wise iteration below.
base_values = pd.read_csv('data/test_values.csv')

In [68]:
def avg_predictions(dictionary):
    """
    Collapse all prediction lists of each property into one mean value.

    Args:
        dictionary (dict): Nested mapping of the form
                           {smiles: {property: {index: [predictions]}}}.

    Returns:
        dict: {smiles: {property: mean}}, where the mean pools the predictions
              of every index together. A property without any prediction is
              reported as 0.
    """
    averaged = {}

    for molecule, per_property in dictionary.items():
        molecule_means = {}
        averaged[molecule] = molecule_means

        for name, by_index in per_property.items():
            running_sum = 0
            n_seen = 0

            # Pool the predictions of every index for this property.
            for preds in by_index.values():
                running_sum += sum(preds)
                n_seen += len(preds)

            # Fall back to 0 when nothing was predicted, so we never divide by zero.
            molecule_means[name] = running_sum / n_seen if n_seen > 0 else 0

    return averaged
    


In [55]:
# Build {smiles: {column_name: value}} from the reference-value DataFrame.
true_values_dict = {}

# itertuples(index=False) yields one plain tuple per row without the DataFrame
# index, so position 0 of each tuple is the first CSV column.
# Note: the loop variables (row, smiles, prop_name, value) stay defined in the
# notebook namespace after this cell has run.
for row in base_values.itertuples(index=False):
    # The SMILES string is read from position 1, i.e. the SECOND column of the CSV.
    # presumably position 0 holds a saved index column — verify against the CSV header
    smiles = row[1]
    
    # Create the per-molecule dictionary the first time this SMILES is seen
    if smiles not in true_values_dict:
        true_values_dict[smiles] = {}
        
    # Pair every column name from position 1 onwards with the matching row value.
    # NOTE(review): the slice starts at position 1, the same column `smiles` was
    # read from, so that column is stored as an entry as well — confirm whether
    # starting at position 2 was intended.
    for prop_name, value in zip(base_values.columns[1:], row[1:]):
        # Store the value under its column name
        true_values_dict[smiles][prop_name] = value


In [56]:
def unnormalize_dict(data_dict, stats):
    """
    Map z-scored values of a two-level dictionary back to their original scale.

    Args:
        data_dict (dict): Normalized values as {smiles: {property: value}}.
                          The input is not modified.
        stats (dict): Per-property statistics as
                      {property: {'mean': value, 'std': value}}.

    Returns:
        dict: A deep copy of ``data_dict`` in which every value whose property
              has statistics is replaced by ``value * std + mean``. Values of
              properties without statistics are kept as they are, and a
              warning is printed for each of them.
    """
    restored = copy.deepcopy(data_dict)

    for per_molecule in restored.values():
        # Snapshot the keys; entries are overwritten in place below.
        for prop in list(per_molecule):
            z_score = per_molecule[prop]
            try:
                mean, std = stats[prop]['mean'], stats[prop]['std']
            except KeyError:
                print(f"Warning: Statistics not found for property '{prop}'. Skipping unnormalization for this property.")
                # The copied value is already in place, nothing to change.
                continue

            # Inverse z-score: x = z * std + mean
            per_molecule[prop] = (z_score * std) + mean

    return restored

In [59]:
# Average the loaded predictions here so this cell does not depend on a
# `predictions_averaged` variable left over from an earlier kernel session
# (it is not assigned anywhere else, so a fresh run would raise NameError).
predictions_averaged = avg_predictions(results_dict)

# Map both the averaged predictions and the reference values back to the
# original scale using the same normalisation statistics.
prediction_avg_unnorm = unnormalize_dict(predictions_averaged, norm_dict)
true_values_dict_unorm = unnormalize_dict(true_values_dict, norm_dict)

# Spot-check one molecule/property and list the molecules that have reference values.
print(prediction_avg_unnorm['CCCC#N']['ET30'])
print(true_values_dict_unorm.keys())

41.1042669770486
dict_keys(['Fc1c(F)c(F)c(F)c(F)c1F', 'CCCCOCCCC', 'CC1COC(=O)O1', 'CCCC#N', 'NC=O'])


In [61]:
import pandas as pd

def mse_dataframe_with_avgs(pred_dict, true_dict):
    """
    Tabulate squared prediction errors together with row and column averages.

    Args:
        pred_dict (dict): Predictions as {smiles: {property: value}}.
        true_dict (dict): Reference values in the same layout; it must contain
                          every (smiles, property) pair present in pred_dict.

    Returns:
        pandas.DataFrame: One row per SMILES and one column per property holding
        ``(prediction - truth) ** 2``, plus an "Average_per_solvent" column
        (row means) and an "Average_per_property" row (column means).
    """
    # Squared error for every (molecule, property) pair found in the predictions.
    squared_errors = {
        smiles: {
            prop: (pred_value - true_dict[smiles][prop]) ** 2
            for prop, pred_value in pred_props.items()
        }
        for smiles, pred_props in pred_dict.items()
    }
    table = pd.DataFrame.from_dict(squared_errors, orient='index')

    # Row-wise mean: average error per solvent.
    table["Average_per_solvent"] = table.mean(axis=1)

    # Column-wise mean: average error per property (computed after the column
    # above was added, so it covers that column too).
    column_means = table.mean(axis=0)

    # The corner cell is the mean of the column means.
    column_means["Average_per_solvent"] = column_means.mean()

    # Attach the averages as the final row.
    table.loc["Average_per_property"] = column_means

    return table

# Squared-error table of the unnormalized averaged predictions against the
# unnormalized reference values, printed as plain text.
df_mse = mse_dataframe_with_avgs(prediction_avg_unnorm, true_values_dict_unorm)
print(df_mse)


                             ET30       delta     alpha        SA  \
CCCC#N                   1.948071    0.206529  0.000544  0.000233   
CC1COC(=O)O1            57.116130   71.389598  0.000619  0.007849   
Fc1c(F)c(F)c(F)c(F)c1F   2.828682    0.080920  0.000864  0.000096   
NC=O                    45.390906  173.220189  0.012045  0.063860   
CCCCOCCCC                4.641354    0.967948  0.000217  0.000105   
Average_per_property    22.385029   49.173037  0.002858  0.014429   

                           N_mol_cm3      beta        SB        fn       SdP  \
CCCC#N                  1.154671e-07  0.002269  0.000312  0.000012  0.000032   
CC1COC(=O)O1            1.611648e-06  0.001517  0.010645  0.000265  0.104825   
Fc1c(F)c(F)c(F)c(F)c1F  1.982280e-06  0.003618  0.003100  0.007477  0.016309   
NC=O                    8.203328e-05  0.059024  0.069810  0.000383  0.000516   
CCCCOCCCC               1.193025e-06  0.046849  0.010296  0.000001  0.016412   
Average_per_property    1.738714e-05

In [70]:
# Second set of prediction results, loaded from the template results file.
with open('predict_properties/template_results.json', 'r') as template_file:
        temp_dict = json.load(template_file)
# Same pipeline as for the first result set: average, unnormalize, tabulate errors.
temp_avg = avg_predictions(temp_dict)

temp_unnorm = unnormalize_dict(temp_avg, norm_dict)
# Compared against the same unnormalized reference values as df_mse.
df_mse_temp = mse_dataframe_with_avgs(temp_unnorm, true_values_dict_unorm)
print(df_mse_temp)

                             ET30       delta     alpha        SA  \
CCCC#N                   1.833297    0.231071  0.000546  0.000234   
CC1COC(=O)O1            58.325678   72.462646  0.000543  0.007967   
Fc1c(F)c(F)c(F)c(F)c1F   2.965540    0.094869  0.000540  0.000051   
NC=O                    43.324599  177.425504  0.005476  0.054523   
CCCCOCCCC                4.817030    1.025124  0.000194  0.000087   
Average_per_property    22.253229   50.247843  0.001460  0.012572   

                           N_mol_cm3      beta        SB        fn       SdP  \
CCCC#N                  2.017002e-07  0.002263  0.000347  0.000011  0.000073   
CC1COC(=O)O1            3.182043e-06  0.000103  0.004633  0.000308  0.123673   
Fc1c(F)c(F)c(F)c(F)c1F  8.497811e-07  0.003982  0.002409  0.007107  0.025870   
NC=O                    8.734540e-05  0.042954  0.043525  0.000552  0.002419   
CCCCOCCCC               1.351340e-06  0.044822  0.009980  0.000002  0.017944   
Average_per_property    1.858605e-05

In [71]:
import json
import matplotlib.pyplot as plt
import numpy as np
import os

def _summarise_predictions_by_n(property_data):
    """Return (n_values, means, std_devs, mins, maxs) for one property, ordered by numeric n."""
    n_values, means, std_devs, mins, maxs = [], [], [], [], []

    # Sort on the numeric value of the key but look the entry up with the
    # original key, so integer keys and strings such as "01" work as well as "1".
    for key in sorted(property_data, key=int):
        values = property_data[key]
        if len(values) == 0:
            # Nothing was predicted at this position; min()/max() would raise.
            continue
        n_values.append(int(key))
        means.append(np.mean(values))
        std_devs.append(np.std(values))
        mins.append(min(values))
        maxs.append(max(values))

    return n_values, means, std_devs, mins, maxs


def _plot_prediction_series(n_values, means, std_devs, mins, maxs, label, color, fmt):
    """Draw the mean ± std error bars and the min/max band of one dataset on the current figure."""
    plt.errorbar(n_values, means, yerr=std_devs, fmt=fmt,
                 capsize=5, capthick=2, label=f'{label} Mean ± Std Dev',
                 color=color, alpha=0.8)
    plt.fill_between(n_values, mins, maxs, alpha=0.15,
                     label=f'{label} Min/Max Range', color=color)


def compare_predictions_by_n(dict_1, dict_2, ground_truth=None,
                           label1='Dataset 1', label2='Dataset 2', output_dir='plots'):
    """
    Create one comparison plot per property for two sets of predictions,
    with optional ground truth values drawn as horizontal lines.

    For every property found in either dataset, the mean ± standard deviation
    and the min/max range of the predictions are plotted against the position n,
    and the figure is saved as ``<output_dir>/<property>_comparison.png``.

    Args:
        dict_1 (dict): First dataset, already loaded, in the form
                       {property: {n: [values]}}. ``n`` may be an int or a
                       string that converts to an int.
        dict_2 (dict): Second dataset in the same form.
        ground_truth (dict): Optional mapping of property names to their
                             ground truth values.
        label1 (str): Label for the first dataset (default: 'Dataset 1')
        label2 (str): Label for the second dataset (default: 'Dataset 2')
        output_dir (str): Directory to save the plots (default: 'plots');
                          created if it does not exist.

    Returns:
        None. Plots are written to disk and progress/warnings are printed.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Each dataset with its legend label, colour and marker style.
    datasets = (
        (dict_1, label1, 'blue', '-o'),
        (dict_2, label2, 'red', '-s'),
    )

    # Union of the property names, in a stable order so repeated runs print
    # and save in the same sequence.
    all_properties = sorted(set(dict_1.keys()) | set(dict_2.keys()), key=str)

    for property_name in all_properties:
        fig = plt.figure(figsize=(12, 8))

        # n values seen in either dataset; drives the x ticks and the truth line.
        all_n_values = []

        for data, label, color, fmt in datasets:
            if property_name not in data:
                continue
            n_values, means, std_devs, mins, maxs = _summarise_predictions_by_n(data[property_name])
            all_n_values.extend(n_values)
            _plot_prediction_series(n_values, means, std_devs, mins, maxs, label, color, fmt)

        unique_n_values = sorted(set(all_n_values))
        has_truth = bool(ground_truth) and property_name in ground_truth

        # Plot ground truth as a horizontal line spanning the plotted n range
        if has_truth and unique_n_values:
            truth = ground_truth[property_name]
            x_min, x_max = unique_n_values[0], unique_n_values[-1]
            # Pad the line slightly; use a fixed pad when there is a single n.
            x_padding = (x_max - x_min) * 0.05 if x_max > x_min else 0.5
            plt.plot([x_min - x_padding, x_max + x_padding], [truth, truth],
                     '--', color='green', linewidth=2,
                     label=f'Ground Truth ({truth:.3f})', alpha=0.8)

        # Report a property that exists in only one dataset, naming the
        # dataset by its label rather than dumping its whole content.
        if property_name not in dict_1:
            print(f"Warning: Property '{property_name}' not found in {label1}")
        elif property_name not in dict_2:
            print(f"Warning: Property '{property_name}' not found in {label2}")

        # Ground truth was supplied but does not cover this property
        if ground_truth and not has_truth:
            print(f"Warning: Ground truth value for '{property_name}' not provided")

        # Customize plot
        title = f'Prediction Comparison for {property_name} by Position n\n{label1} vs {label2}'
        if has_truth:
            title += f' (Ground Truth: {ground_truth[property_name]:.3f})'
        plt.title(title)
        plt.xlabel('Prediction Position (n)')
        plt.ylabel('Prediction Value')

        # Show a tick for every n value present in either dataset
        if unique_n_values:
            plt.xticks(unique_n_values)
        plt.grid(True, linestyle='--', alpha=0.6)
        plt.legend()

        # Save plot, then close the figure so figures do not accumulate in memory
        plot_filename = os.path.join(output_dir, f'{property_name}_comparison.png')
        plt.savefig(plot_filename, bbox_inches='tight', dpi=300)
        plt.close(fig)

        print(f'Saved comparison plot for {property_name} to {plot_filename}')

# Example usage (both prediction arguments are already-loaded dicts of the
# form {property: {n: [values]}}; the function does not read files itself):
# ground_truth_values = {
#     'property1': 2.5,
#     'property2': 1.8,
#     'property3': 3.2
# }
#
# compare_predictions_by_n(first_predictions, second_predictions,
#                         ground_truth=ground_truth_values,
#                         label1='Original Model', label2='Fine-tuned Model')

In [82]:
import copy

def unnormalize_dict_with_lists(data_dict, stats):
    """
    Unnormalizes the values in a three-level dictionary using z-score statistics.
    Handles single values and lists of values.

    Args:
        data_dict (dict): The nested dictionary with normalized values.
                          Format: {smiles: {property: {index: value or [values]}}}
                          The input is not modified.
        stats (dict): The dictionary containing mean and std for each property.
                      Format: {property: {'mean': value, 'std': value}}

    Returns:
        dict: A new dictionary with unnormalized values. Properties without
              statistics keep their original (normalized) values and a warning
              is printed for each of them.
    """
    unnormalized_data = copy.deepcopy(data_dict)

    for properties in unnormalized_data.values():
        for prop, indices in properties.items():
            # Look the statistics up once per property rather than once per index.
            try:
                mean = stats[prop]['mean']
                std = stats[prop]['std']
            except KeyError:
                print(f"Warning: Statistics not found for property '{prop}'. Skipping.")
                # The deep copy already holds the original values; leave them as is.
                continue

            for index, values in indices.items():
                if isinstance(values, list):
                    # Reverse z-score for every element: x = (z * std) + mean
                    indices[index] = [(item * std) + mean for item in values]
                else:
                    # Reverse z-score for a single value
                    indices[index] = (values * std) + mean

    return unnormalized_data

In [None]:
# Map both prediction sets back to the original scale, keeping the per-index lists.
temp_unorm = unnormalize_dict_with_lists(temp_dict, norm_dict)
results_unnorm = unnormalize_dict_with_lists(results_dict, norm_dict)

# Plot the UNNORMALIZED predictions: the ground truth passed here is already
# unnormalized, so plotting the raw normalized dicts would mix two scales.
compare_predictions_by_n(temp_unorm['CCCC#N'], results_unnorm['CCCC#N'],
                      ground_truth=true_values_dict_unorm['CCCC#N'],
                      label1='template predictions', label2='scratch predictions')

Saved comparison plot for fn to plots\fn_comparison.png
Saved comparison plot for alpha to plots\alpha_comparison.png
Saved comparison plot for delta to plots\delta_comparison.png
Saved comparison plot for SP to plots\SP_comparison.png
Saved comparison plot for SB to plots\SB_comparison.png
Saved comparison plot for n to plots\n_comparison.png
Saved comparison plot for N_mol_cm3 to plots\N_mol_cm3_comparison.png
Saved comparison plot for SA to plots\SA_comparison.png
Saved comparison plot for ET30 to plots\ET30_comparison.png
Saved comparison plot for pi_star to plots\pi_star_comparison.png
Saved comparison plot for beta to plots\beta_comparison.png
Saved comparison plot for SdP to plots\SdP_comparison.png
