# Functions for Jupyter Notebooks
This notebook collects functions that may be used to analyse the results of catalyst evaluations or of sets of such evaluations.

In [1]:
import glob
import math
import os
import re
import pandas as pd
import matplotlib.pyplot as plt
import mpl_toolkits.axisartist as axisartist
from matplotlib.pyplot import *
from matplotlib import font_manager
from matplotlib.font_manager import FontProperties
from matplotlib.lines import Line2D
from mpl_toolkits.axes_grid1 import host_subplot


 Define functions:

In [11]:
# Parameters that are constant, at least within a molecular modeling protocol.
# They are, however, dependent on the method used to compute energies.
# So, we use these under the assumption that the method is consistent with the
# method assumed here, i.e., the AF protocol defined in
# '../../data/sp_PBEPBE-GD3MBJ_DZ_singlet', but if a file provides values for
# any such parameter we prefer the value declared by the file.
# Basically the assumption is that these values are the same as those
# used by '../../src/fitness_function.sh'
#
# Factor used below to scale energy differences into '*_kcalmol' values
# (presumably all G_* energies are in Hartree -- TODO confirm).
hartree_to_kcalmol=627.49467516
# Default free energies of reference species; values read from an SDF file
# take precedence over these where the file declares them.
G_Propene=-117.738020895
G_Ethene=-78.4716324751
G_HoveydaProd=-502.204109499
G_SIMes=-924.27249048
G_PCy3=-1046.11440274
G_HG_RuCl2_SIMes=-2402.13728523
G_HI_precursor=-2523.95659027
# NOTE(review): DG_referenceProductionBarrier is assigned twice in this cell
# (here and two lines below). The second assignment wins, so the value set
# here is never used. Confirm which value is intended, or whether one of the
# two lines was meant to define a different constant.
DG_referenceProductionBarrier=0.0222529298
DDG_reference_HGII=0.0222312922
DG_referenceProductionBarrier=0.0222714248
DG_referencePrecursorStabilityHII=-0.0052201621
DG_referencePrecursorStabilityHI=0.0117775312
# Executing this cell overwrites any value previously assigned to the names
# above in the running session, hence the warning.
print('WARNING: resetting values of constants to default. Make sure this is really what you want!')


def is_float(string: object) -> bool:
    """
    Checks if a value can be converted to a float.

    Args:
        string (object): The value to evaluate. Typically a string, but any
        object is accepted.

    Returns:
        bool: True if 'float(string)' succeeds, False otherwise. None and
        values of types that float() does not accept yield False.
    """
    if string is None:
        return False
    try:
        float(string)
    except (TypeError, ValueError, OverflowError):
        # ValueError: text that is not a number.
        # TypeError: objects float() does not accept (e.g., lists, dicts).
        # OverflowError: integers too large to be represented as a float.
        return False
    return True


def extract_properties_from_sdf(pathname):
    """
    Reads the property/fields of an SDF file produced by the catalyst_evaluator,
    that is, adhering to the filename convention, i.e., the filename is 'name-N_out.sdf',
    where:
     * 'name' is any string that does not contain '-' (the file name must
       contain exactly one '-'),
     * 'N' identifies the repetition/attempt (stored as a string, not validated).

    Besides the properties found in the file, the result contains:
     * 'NAME' and 'REPETITION', taken from the file name;
     * 'DeltaG_X_kcalmol', 'DeltaG_Z_kcalmol' and 'DeltaG_synthesis_kcalmol',
       but only if the file declares all of 'freeEnergyX', 'freeEnergyZ',
       'freeEnergyD', 'freeEnergyE' and 'freeEnergyL'. Otherwise a warning is
       printed and the derived values are omitted.
    The reference values 'G_Ethene', 'G_Propene', 'G_PCy3', 'G_HI_precursor'
    and 'hartree_to_kcalmol' are taken from the file when present; otherwise a
    warning is printed and the module-level default is used.

    Args:
        pathname (string): the pathname to the file to read. Must adhere to the
        convention (see above).

    Returns:
        dict: dictionary with each property/field as entry named after the property/field
        name used in the SDF file. Values that can be converted to float are
        stored as float, all others as strings.

    Raises:
        ValueError: if the file name does not contain '_out.sdf' or does not
        contain exactly one '-'.
        OSError: if the file cannot be opened (e.g., FileNotFoundError).
    """
    if not os.path.isfile(pathname):
        # Only reported here: the error surfaces when the file is opened below
        # (or earlier, if the file name violates the convention).
        print("ERROR: file '%s' not found!" % pathname)
    pattern = re.compile("^> *<.*>$")
    basename = os.path.basename(pathname)
    if '_out.sdf' not in basename:
        raise ValueError("Attempt to extract properties from file that is not an '_out.sdf file!")
    if basename.count('-') != 1:
        raise ValueError("Attempt to extract repetition number from malformulated filename '%s'!" % pathname)
    name_parts = basename.replace('_out.sdf', '').rpartition('-')
    all_values = {}
    # Define values derived from file name
    all_values["NAME"] = name_parts[0]
    all_values["REPETITION"] = name_parts[2]
    # Read values from SDF property fields
    with open(pathname, "r") as input_file:
        lines = input_file.readlines()
    for index, line in enumerate(lines):
        if not pattern.match(line):
            continue
        property_name = re.split('<|>', line)[2]
        # The value is on the line following the header. A header on the very
        # last line has no value line: treat it as an empty value instead of
        # failing with an IndexError.
        if index + 1 < len(lines):
            value_str = lines[index + 1].rstrip()
        else:
            value_str = ''
        if is_float(value_str):
            all_values[property_name] = float(value_str)
        else:
            all_values[property_name] = value_str
    # Collect the values needed to compute the derived quantities.
    # Reference values: prefer what the file declares, fall back to defaults.
    default_values = {
        'G_Ethene': G_Ethene,
        'G_Propene': G_Propene,
        'G_PCy3': G_PCy3,
        'G_HI_precursor': G_HI_precursor,
        'hartree_to_kcalmol': hartree_to_kcalmol,
    }
    loc_values = {}
    for optional_prop, default_value in default_values.items():
        if optional_prop in all_values:
            loc_values[optional_prop] = all_values[optional_prop]
        else:
            print("WARNING: Missing property '%s' in '%s'. Using default value." % (optional_prop, pathname))
            loc_values[optional_prop] = default_value
    # Required values: without all of them no derived quantity is computed.
    calculate_derived = True
    for required_prop in ['freeEnergyX', 'freeEnergyZ', 'freeEnergyD', 'freeEnergyE', 'freeEnergyL']:
        # Check for presence BEFORE reading the value, so that a missing
        # property leads to the warning rather than to a KeyError.
        if required_prop not in all_values:
            print("WARNING: Missing property '%s'. Omitting calculation of barrier in '%s'." % (required_prop, pathname))
            calculate_derived = False
            break
        loc_values[required_prop] = all_values[required_prop]
    # Compute derived values
    if calculate_derived:
        DeltaG_X_kcalmol = loc_values['hartree_to_kcalmol'] * (loc_values['freeEnergyX'] + 2.0*loc_values['G_Ethene'] - loc_values['freeEnergyD'] - 2.0*loc_values['G_Propene'])
        DeltaG_Z_kcalmol = loc_values['hartree_to_kcalmol'] * (loc_values['freeEnergyZ'] - loc_values['freeEnergyD'])
        DeltaG_synthesis_kcalmol = loc_values['hartree_to_kcalmol'] * (loc_values['freeEnergyE'] + loc_values['G_PCy3'] - loc_values['G_HI_precursor'] - loc_values['freeEnergyL'])
        all_values['DeltaG_X_kcalmol'] = DeltaG_X_kcalmol
        all_values['DeltaG_Z_kcalmol'] = DeltaG_Z_kcalmol
        all_values['DeltaG_synthesis_kcalmol'] = DeltaG_synthesis_kcalmol
    # All done
    return all_values


def plot_fitness_and_components(df_to_plot):
    """
    Draws a figure with two stacked panels for the given evaluations: the upper
    panel shows the three descriptors together with the fitness, the lower panel
    shows the four weights. This makes the contribution of descriptors and
    weights to the fitness visible at a glance. Every row is plotted as is
    (i.e., no statistics over repetitions), using the dataframe index as X-axis.

    Args:
        df_to_plot (DataFrame): pandas dataframe with the entries to plot. Must
        provide the columns 'DESCRIPTOR_1', 'DESCRIPTOR_2', 'DESCRIPTOR_3',
        'FITNESS', 'WEIGHT_1', 'WEIGHT_2', 'WEIGHT_3' and 'WEIGHT_4'.

    """
    upper_columns = ['DESCRIPTOR_1', 'DESCRIPTOR_2', 'DESCRIPTOR_3', 'FITNESS']
    lower_columns = ['WEIGHT_1', 'WEIGHT_2', 'WEIGHT_3', 'WEIGHT_4']
    upper_data = df_to_plot[upper_columns]
    lower_data = df_to_plot[lower_columns]
    figure, panels = plt.subplots(2, 1, figsize=(10, 5), gridspec_kw={'height_ratios': [2, 1]})
    upper_panel = panels[0]
    lower_panel = panels[1]

    # Appearance and legend entry of each curve, per panel
    upper_looks = {
        'DESCRIPTOR_1': dict(color='green', linestyle='--', marker='s', legend=r'$D_{1}$'),
        'DESCRIPTOR_2': dict(color='blue', linestyle='--', marker='D', legend=r'$D_{2}$'),
        'DESCRIPTOR_3': dict(color='orange', linestyle='--', marker='^', legend=r'$D_{3}$'),
        'FITNESS': dict(color='red', linestyle='-', marker='s', legend='AF'),
    }
    lower_looks = {
        'WEIGHT_1': dict(color='cyan', linestyle=':', marker='D', markersize=8, legend=r'$w_{1}$'),
        'WEIGHT_2': dict(color='brown', linestyle=':', marker='v', legend=r'$w_{2}$'),
        'WEIGHT_3': dict(color='magenta', linestyle=':', marker='o', legend=r'$w_{3}$'),
        'WEIGHT_4': dict(color='black', linestyle=':', marker='x', legend=r'$w_{4}$'),
    }

    # Upper panel: descriptors and fitness
    for name in upper_data.columns:
        look = upper_looks.get(name)
        if look is None:
            # Generic appearance for any column without a dedicated entry
            look = dict(color='black', linestyle='-', marker='o', legend=name)
        upper_panel.plot(
            upper_data.index,
            upper_data[name],
            color=look['color'],
            linestyle=look['linestyle'],
            marker=look['marker'],
            label=look['legend'],
        )
    upper_panel.set_title('')
    upper_panel.set_xlabel('')
    upper_panel.set_ylabel('Descriptor Value')
    # The X-axis is annotated only on the lower panel
    upper_panel.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    upper_panel.legend()
    upper_panel.grid(True, linestyle='--')

    # Lower panel: weights
    for name in lower_data.columns:
        look = lower_looks.get(name)
        if look is None:
            look = dict(color='black', linestyle='-', marker='o', legend=name)
        # Marker size falls back to 6 when the entry does not define one
        size = look['markersize'] if 'markersize' in look else 6
        lower_panel.plot(
            lower_data.index,
            lower_data[name],
            color=look['color'],
            linestyle=look['linestyle'],
            marker=look['marker'],
            markersize=size,
            label=look['legend'],
        )
    lower_panel.set_title('')
    lower_panel.set_xlabel('')
    lower_panel.set_ylabel('Weight Value')
    lower_panel.set_ylim(0, 1.1)

    # Bold tick labels on the X-axis of the lower panel
    tick_font = font_manager.FontProperties(family='Arial', weight='bold', size=12)
    for tick_label in lower_panel.get_xticklabels():
        tick_label.set_fontproperties(tick_font)

    lower_panel.legend()
    lower_panel.grid(True, linestyle='--')

    plt.tight_layout()
    plt.show()


def plot_one_series(series_name, df_all_evaluations):
    """
    Plots fitness, descriptors and weights for a single catalyst.
    Selects the rows whose 'NAME' equals the given name and hands them to
    plot_fitness_and_components, which draws the two-panel figure. All selected
    rows are plotted independently (i.e., no statistics of repetitions).

    Args:
        series_name (string): the name of the catalyst to limit the plot to.

        df_all_evaluations (DataFrame): pandas dataframe containing all the data for any catalyst.
        Must provide the 'NAME' column.
    """
    belongs_to_series = df_all_evaluations['NAME'].eq(series_name)
    rows_of_series = df_all_evaluations.loc[belongs_to_series]
    plot_fitness_and_components(rows_of_series)


# Shows selected columns for all the rows that belong to one catalyst.
def print_values_for_series(column_names, series_name, df_all_evaluations):
    """
    Prints the requested columns for the rows whose 'NAME' equals the given
    series name (i.e., catalyst name).

    Args:
        column_names (string or list of strings): the column(s) to extract and print.

        series_name (string): the name of the catalyst to limit the output to.

        df_all_evaluations (DataFrame): pandas dataframe containing all the data for any catalyst.
        Must provide the 'NAME' column.
    """
    belongs_to_series = df_all_evaluations['NAME'].eq(series_name)
    rows_of_series = df_all_evaluations.loc[belongs_to_series]
    print(rows_of_series[column_names])
    
    

