# Collect Results from the Training, Pick the Best MolDQN
Gather the results from the different training runs and mark the MolDQN agent that performed best

In [1]:
from glob import glob
import pandas as pd
import json
import os

## Get all the Results
Get the results of every model that has been run

In [2]:
# Each sub-directory of `rl_tests` is treated as one run; plain files are skipped
dirs = [entry for entry in glob(os.path.join('rl_tests', '*')) if os.path.isdir(entry)]
print(f'Found {len(dirs)} run directories')

Found 1 run directories


In [3]:
def summarize_run(path: str) -> dict:
    """Summarize the performance of a certain run

    Reads ``config.json`` (required) and, when they are available,
    ``performance.json`` and ``molecules.csv`` from the run directory.

    Args:
        path (str): Path to the ML run
    Returns:
        (dict) Run summary. Always holds the contents of ``config.json`` plus
            ``path`` and ``complete``. ``hostname`` and ``runtime`` are added when
            ``performance.json`` exists. The molecule statistics
            (``episodes_completed``, ``rewards_computed``, ``unique_molecules``,
            ``unique_fraction``, ``best_found``, ``below_5ha``) are added only when
            ``molecules.csv`` can be read and holds at least one row.
    """

    # First, load in the configuration
    with open(os.path.join(path, 'config.json')) as fp:
        output = json.load(fp)
    output['path'] = path

    # Load in the name of the system and some other performance details
    # The presence of the performance file is what marks a run as complete
    perf_path = os.path.join(path, 'performance.json')
    if os.path.isfile(perf_path):
        with open(perf_path) as fp:
            perf = json.load(fp)
        for k in ['hostname', 'runtime']:
            output[k] = perf[k]
        output['complete'] = True
    else:
        output['complete'] = False

    # Assess the run data
    try:
        mols = pd.read_csv(os.path.join(path, 'molecules.csv'))
    except (OSError, ValueError):
        # OSError: the file is missing or unreadable.
        # ValueError: covers pandas' EmptyDataError and ParserError, i.e., a file
        #  that is empty or cannot be parsed.
        # Either way there is nothing to summarize, so return what we have.
        return output

    # A file with only a header row has no rewards to summarize either, and
    #  would otherwise cause a division by zero in `unique_fraction`
    if len(mols) == 0:
        return output

    # Flip the sign of the reward for minimization runs
    #  (none of the statistics below read the `reward` column)
    if not output['maximize']:
        mols['reward'] = -1 * mols['reward']

    # Episodes are numbered from zero, so the count is the largest index plus one
    output['episodes_completed'] = mols['episode'].max() + 1
    output['rewards_computed'] = len(mols)
    output['unique_molecules'] = len(set(mols['smiles']))
    output['unique_fraction'] = output['unique_molecules'] / len(mols)

    # `best_found` is reported with the opposite sign of the stored `u0_atom`,
    #  so a stored value of at least 5 is counted as "below 5 Ha"
    output['best_found'] = -1 * mols['u0_atom'].max()
    output['below_5ha'] = (mols['u0_atom'] >= 5).sum()

    return output

In [4]:
# Build one summary row per run, then rank the runs by how many molecules they found below -5 Ha
results = pd.DataFrame(list(map(summarize_run, dirs)))
results = results.sort_values(by='below_5ha', ascending=False)
results[['path', 'rewards_computed', 'unique_fraction', 'unique_molecules', 'best_found', 'below_5ha']].head()

Unnamed: 0,path,rewards_computed,unique_fraction,unique_molecules,best_found,below_5ha
0,rl_tests/u0_atom_2020-08-05T17.28.06.408880,199293,0.742565,147988,-6.134692,459


In [5]:
# `results` is sorted with the best run first, so the top row is the winner
best_run = results.iloc[0, :]

## Link the best agent to the local directory
So that we can easily access it later

In [6]:
# Remove a link left over from a previous execution; only symbolic links are removed
if os.path.islink('agent.pkl'):
    os.remove('agent.pkl')

# Point the local `agent.pkl` at the agent saved by the best run
os.symlink(
    os.path.join(best_run['path'], 'agent.pkl'),
    'agent.pkl',
)

## Save a list of "top molecules"
So that we can seed the molecular design engine

In [7]:
# Load every molecule sampled during the best run
mols = pd.read_csv(os.path.join(best_run['path'], 'molecules.csv'))

In [8]:
with open('best_mols.json', 'w') as fp:
    # Keep the 1024 rows with the largest `u0_atom`, ordered from smallest to largest
    # NOTE(review): repeated SMILES strings are not removed here - confirm that
    #  duplicates are acceptable in the seed list
    ranked = mols.sort_values(by='u0_atom')
    best_list = ranked.iloc[-1024:]['smiles'].tolist()
    fp.write(json.dumps(best_list))