In [14]:
import warnings
warnings.filterwarnings('ignore')

import os
import json
import zipfile
import pandas as pd
from tqdm.auto import tqdm
from io import TextIOWrapper
from collections import defaultdict

path = '../../dgym-data/analysis/noise/noise_2024-04-21_11-16-18.zip'

# Load every `.json` member of the archive into one long table.
# Each result contributes the rows of its `annotations` table (restricted to
# the three property columns, incomplete rows dropped), tagged with the
# result's `sigma` and a 1-based `trial` counter that numbers the results
# seen so far for that sigma, in archive order.
trial_indices = defaultdict(int)
records = []
skipped = []  # (filename, error) for members that could not be converted
with zipfile.ZipFile(path, 'r') as z:
    for filename in tqdm(z.namelist()):
        if not filename.endswith('.json'):
            continue
        with z.open(filename) as file, TextIOWrapper(file, encoding='utf-8') as text_file:
            try:
                result = json.load(text_file)
                record = pd.DataFrame(
                    result['annotations']
                ).reindex(
                    columns=['ABL1 pIC50', 'Log P', 'Log S']
                ).dropna()

                sigma = result['sigma']
                trial_indices[sigma] += 1

                record['sigma'] = sigma
                record['trial'] = trial_indices[sigma]

                records.append(record)
                result = None  # release the parsed JSON before the next file
            except Exception as error:
                # Loading stays best-effort (a bad member is skipped), but only
                # ordinary errors are caught: KeyboardInterrupt / SystemExit
                # still stop the loop, and every skip is remembered.
                skipped.append((filename, repr(error)))

# warnings are globally silenced in this notebook, so report skips via print.
if skipped:
    print(f'Skipped {len(skipped)} unreadable result file(s); first: {skipped[0]}')

if not records:
    raise ValueError(f'No usable JSON results found in {path!r}')

df = pd.concat(records)
records = None  # drop the per-file frames now that they are concatenated

  0%|          | 0/840 [00:00<?, ?it/s]

In [37]:
from dgym.envs.utility import MultipleUtilityFunction, ClassicUtilityFunction

# Build one utility function per property, each configured with an `ideal`
# and an `acceptable` pair of numbers.
# NOTE(review): the pairs are presumably (lower, upper) bounds of the
# property value — confirm against ClassicUtilityFunction.
docking_utility_function = ClassicUtilityFunction(ideal=(9.5, 13), acceptable=(8, 13))
log_P_utility_function = ClassicUtilityFunction(ideal=(0.5, 1.85), acceptable=(-0.5, 3.5))
log_S_utility_function = ClassicUtilityFunction(ideal=(-3, 1), acceptable=(-4, 1))

# Combine the three into a single composite; the list order here
# (docking, log P, log S) mirrors the property column order
# ['ABL1 pIC50', 'Log P', 'Log S'] selected when the results were loaded.
composite_utility_function = MultipleUtilityFunction(
    [docking_utility_function, log_P_utility_function, log_S_utility_function]
)

In [42]:
# Score every (sigma, trial) group and keep the best utility found in it.
records = []
trial_groups = df.groupby(['sigma', 'trial'])
for (sigma, trial), values in tqdm(trial_groups):
    # NOTE(review): each group frame still contains the `sigma` and `trial`
    # columns, so `values.values` carries them alongside the property
    # columns — confirm the composite utility function ignores the extras.
    utility = composite_utility_function(
        values.values,
        precompute=True,
        method='average',
    )
    record = dict(sigma=sigma, trial=trial, utility=max(utility))
    records.append(record)

  0%|          | 0/840 [00:00<?, ?it/s]

In [43]:
# Show the list of {'sigma', 'trial', 'utility'} dicts as a table (one row per group).
pd.DataFrame(records)

Unnamed: 0,sigma,trial,utility
0,0.0,1,0.903102
1,0.0,2,0.895274
2,0.0,3,0.923008
3,0.0,4,0.847339
4,0.0,5,0.935467
...,...,...,...
835,2.0,36,0.762931
836,2.0,37,0.807495
837,2.0,38,0.874204
838,2.0,39,0.944385
