In [116]:
#!/usr/bin/env python
import glob
import os
import re
import pandas as pd

In [117]:
# define functions:

# Function to calculate the mean from a string containing space separated numbers.
def calculate_mean(numbers_str):
    numbers = [float(x) for x in numbers_str.split()]
    mean = sum(numbers) / len(numbers)
    return mean

# Function to check if a value can be converted to a float.
def is_float(string: any) -> bool:
    if string is None:
        return False
    try:
        float(string)
        return True
    except ValueError:
        return False

# Function to extract all properties from an sdf file. (takes regex paths as input) 
def extract_properties_from_sdf(pathname):
    if not os.path.isfile(pathname):
        print("ERROR: file '%s' is missing!" % pathname)
        quit(1)
    pattern = re.compile("^> *<.*>$")
    all_values = {}
    with open(pathname, "r") as input_file:
        lines = input_file.readlines()
        for index, line in enumerate(lines):
            if pattern.match(line):
                property_name = re.split('<|>', line)[2]
                value_str = lines[index + 1].rstrip()
                if is_float(value_str):
                    all_values[property_name] = float(value_str)
                else:
                    all_values[property_name] = value_str
    return all_values

In [118]:
# Collect paths to all the sdf files of test set 1 in 'pathnames' variable.
pathnames = []
regex = str(os.getcwd()+"/../../test/test_set_1/*/*_out.sdf").strip('\'')
pathnames.extend(sorted(glob.glob(regex)))


In [119]:
# store paths to catalyst groups in separate variables
H0_paths = []; HI_paths = []; HIIprime_paths = []; HII_paths = []; HC1Ph_paths = []
for pathname in pathnames:
    if "H0_" in pathname:
        H0_paths.append(pathname)
    if "HI_" in pathname:
        HI_paths.append(pathname)    
    if "HIIprime_" in pathname:
        HIIprime_paths.append(pathname)
    if "HII_" in pathname:
        HII_paths.append(pathname)
    if "HC1Ph_" in pathname:
        HC1Ph_paths.append(pathname)
        
print("Total number of Candidates = " + str(len(H0_paths + HI_paths + HIIprime_paths + HII_paths + HC1Ph_paths)))

Total number of Candidates = 30


In [120]:
# Now we calculate the mean of selected sdf properties within each catalyst group.
# The mean values are then stored in separate arrays for further use bellow.
# Finally, we print a simple overview of the fitness, descriptors and weights.

# define lists for catalyst groups(paths) and names.
catalyst_groups = [H0_paths, HI_paths, HIIprime_paths, HII_paths, HC1Ph_paths]
catalyst_name = ['H0', 'HI', 'HIIprime', 'HII', 'HC1Ph']

# define lists for desired properties (averages).
fitnesses = []; d1s = []; d2s = []; d3s = []; w1s = []; w2s = []; w3s = []; w4s = []
A_energies = []; E_energies = []; F_energies = []; C_energies = []; D_energies = []; X_energies = []; Z_energies = []; L_energies = []

# define lists for desired properties (standard deviations).
fitnessStds = []; d1Stds = []; d2Stds = []; d3Stds = []; w1Stds = []; w2Stds = []; w3Stds = []; w4Stds = []

i = 0

# Loop over each catalyst group to extract mean values and standardard deviations. 
for group in catalyst_groups:
    rows = []
    for candidate in group:
        rows.append(extract_properties_from_sdf(candidate))
        
    # Definte pandas datafram with all extracted properties, from all candidates.
    df = pd.DataFrame(rows)

    # colelct mean values from descriptors and weights.
    fitness = str(df.agg({'FITNESS': ['mean']})).split()[2]; fitnesses.append(fitness)
    fitnessStd = str(df.agg({'FITNESS': ['std']})).split()[2]
    print('Fitness: ' + str(round(float(fitness), 2)) + ' (St.dev: ' + str(round(float(fitnessStd), 2)) + ')')
    d1 = str(df.agg({'DESCRIPTOR_1': ['mean']})).split()[2]; d1s.append(d1)
    d2 = str(df.agg({'DESCRIPTOR_2': ['mean']})).split()[2]; d2s.append(d2)
    d3 = str(df.agg({'DESCRIPTOR_3': ['mean']})).split()[2]; d3s.append(d3)
    w1 = str(df.agg({'WEIGHT_1': ['mean']})).split()[2]; w1s.append(w1)
    w2 = str(df.agg({'WEIGHT_2': ['mean']})).split()[2]; w2s.append(w2)
    w3 = str(df.agg({'WEIGHT_3': ['mean']})).split()[2]; w3s.append(w3)
    w4 = str(df.agg({'WEIGHT_4': ['mean']})).split()[2]; w4s.append(w4)

    #collect standard deviations from descriptors and weights.
    d1Stds.append(float(str(df.agg({'DESCRIPTOR_1': ['std']})).split()[2]))
    d2Stds.append(float(str(df.agg({'DESCRIPTOR_2': ['std']})).split()[2]))
    d3Stds.append(float(str(df.agg({'DESCRIPTOR_3': ['std']})).split()[2]))
    w1Stds.append(float(str(df.agg({'WEIGHT_1': ['std']})).split()[2]))
    w2Stds.append(float(str(df.agg({'WEIGHT_2': ['std']})).split()[2]))
    w3Stds.append(float(str(df.agg({'WEIGHT_3': ['std']})).split()[2]))
    w4Stds.append(float(str(df.agg({'WEIGHT_4': ['std']})).split()[2]))

    #collect mean values from potential energies.
    A_energies.append(float(str(df.agg({'freeEnergyA': ['mean']})).split()[2]))
    E_energies.append(float(str(df.agg({'freeEnergyE': ['mean']})).split()[2]))
    F_energies.append(float(str(df.agg({'freeEnergyF': ['mean']})).split()[2]))
    C_energies.append(float(str(df.agg({'freeEnergyC': ['mean']})).split()[2]))
    D_energies.append(float(str(df.agg({'freeEnergyD': ['mean']})).split()[2]))
    X_energies.append(float(str(df.agg({'freeEnergyX': ['mean']})).split()[2]))
    Z_energies.append(float(str(df.agg({'freeEnergyZ': ['mean']})).split()[2]))
    L_energies.append(float(str(df.agg({'freeEnergyL': ['mean']})).split()[2]))

    # print overview.
    print(f"candidate: {catalyst_name[i]}")
    print('D1: ' + str(round(float(d1), 2)))
    print('D2: ' + str(round(float(d2), 2)))
    print('D3: ' + str(round(float(d3), 2)))
    print('w1: ' + str(round(float(w1), 2)))
    print('w2: ' + str(round(float(w2), 2)))
    print('w3: ' + str(round(float(w3), 2)))
    print('w4: ' + str(round(float(w4), 2)))

    i = i + 1



Fitness: 0.0 (St.dev: 0.0)
candidate: H0
D1: 0.0
D2: 1.47
D3: 2.66
w1: 1.0
w2: 0.0
w3: 0.04
w4: 1.0
Fitness: 3.45 (St.dev: 1.71)
candidate: HI
D1: 0.0
D2: 5.17
D3: 0.21
w1: 1.0
w2: 0.69
w3: 0.9
w4: 1.0
Fitness: 13.66 (St.dev: 0.01)
candidate: HIIprime
D1: 0.23
D2: 13.05
D3: 0.38
w1: 1.0
w2: 1.0
w3: 1.0
w4: 1.0
Fitness: 14.95 (St.dev: 0.02)
candidate: HII
D1: 1.0
D2: 13.95
D3: 0.0
w1: 1.0
w2: 1.0
w3: 1.0
w4: 1.0
Fitness: 19.91 (St.dev: 6.86)
candidate: HC1Ph
D1: 5.87
D2: 14.54
D3: 0.0
w1: 0.98
w2: 1.0
w3: 1.0
w4: 1.0


In [121]:
# Extract energies from HQF.
HQF_path = []
HQF_name = ['H0HQF', 'HIHQF', 'HIIprimeHQF', 'HIIHQF', 'HC1PhHQF']
i = 0
for pathname in pathnames:
    print(i)
    if HQF_name[i] in pathname:
        HQF_path.append(pathname)
        i = i + 1


0
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
2
2
2
2
2
2
2
3
3
3
3
3
3
3
3
3
3
3
3


In [122]:
print(HQF_name[3])
print(HQF_path[3])


HIIHQF


IndexError: list index out of range