# Learning Chemical Classification Programs

This uses LLMs to learn programs for classifying chemical structures (SMILES strings) into chemical classes or groupings

In [6]:
import yaml
from pydantic import BaseModel

from c3p.datamodel import ChemicalStructure, ChemicalClass, Dataset

In [7]:
# ensure you have executed Generate-Dataset.ipynb first
# this has a list of chemical classes and their instances
with open("inputs/dataset.json") as f:
    dataset = Dataset.model_validate_json(f.read())
#dataset = Dataset.parse_file("inputs/dataset.json")
filtered_classes = dataset.classes
len(filtered_classes)

615

## Test Utils

In [9]:
from c3p.evaluation import split_to_training_test

In [10]:
a, b = split_to_training_test(filtered_classes, n=3)

In [11]:
a[0].instances

[ChemicalStructure(name='all-trans-retinal', smiles='[H]C(=O)\\C=C(/C)\\C=C\\C=C(/C)\\C=C\\C1=C(C)CCCC1(C)C'),
 ChemicalStructure(name='all-trans-retinol', smiles='C\\C(=C/CO)\\C=C\\C=C(/C)\\C=C\\C1=C(C)CCCC1(C)C'),
 ChemicalStructure(name='all-trans-retinyl ester', smiles='CC(\\C=C\\C=C(C)\\C=C\\C1=C(C)CCCC1(C)C)=C/COC([*])=O'),
 ChemicalStructure(name='all-trans-retinoic acid', smiles='CC(\\C=C\\C1=C(C)CCCC1(C)C)=C/C=C/C(C)=C/C(O)=O'),
 ChemicalStructure(name='all-trans-3,4-didehydroretinoic acid', smiles='C1(C)(C)CC=CC(=C1\\C=C\\C(=C\\C=C\\C(=C\\C(=O)O)\\C)\\C)C')]

In [12]:
b[0].instances

[ChemicalStructure(name='all-trans-3,4-didehydroretinol', smiles='C1(C)(C)C(\\C=C\\C(=C\\C=C\\C(=C\\CO)\\C)\\C)=C(C)C=CC1')]

## Run an individual experiment

In [18]:
from c3p.datamodel import Config

# claude-sonnet seems best so far
config = Config(llm_model_name="lbl/claude-sonnet", max_attempts=5, accuracy_threshold=0.95)

In [19]:
# TODO: do the split at time of program generation
training_set, test_set = split_to_training_test(filtered_classes, n=9999, start=0, proportion_test=0.0)
len(training_set)


615

In [22]:
from c3p.generator import generate_and_test_classifier

def learn_programs(training_set, n=99999):
    results = []
    for test_cls in training_set[0:n]:
        print(f"## {test_cls.name} POS={len(test_cls.instances)} NEG={len(test_cls.negatives)}")
        for result in generate_and_test_classifier(test_cls, config=config):
            print(f"attempt={result.attempt} compiled={result.success} tp={result.num_true_positives} tn={result.num_true_negatives} fp={result.num_false_positives} f1={result.f1}, len=={len(result.code)}")
            results.append(result)
            result.calculate()
    return results

In [23]:
#results = learn_programs(training_set, n=9999)
results = learn_programs(training_set, n=2)

## vitamin A
attempt=0 tp=None tn=None fp=None f1=None
attempt=1 tp=0 tn=20 fp=0 f1=0
attempt=2 tp=0 tn=20 fp=0 f1=0
attempt=3 tp=0 tn=20 fp=0 f1=0
attempt=4 tp=4 tn=20 fp=0 f1=0.8
FAILED: vitamin A err=
Attempt failed: F1 score of 0.8 is too 
## pyrrolobenzodiazepine
attempt=0 tp=None tn=None fp=None f1=None
attempt=1 tp=0 tn=20 fp=0 f1=0
attempt=2 tp=5 tn=20 fp=0 f1=0.8333333333333333
attempt=3 tp=5 tn=20 fp=0 f1=0.8333333333333333
attempt=4 tp=5 tn=20 fp=0 f1=0.8333333333333333
FAILED: pyrrolobenzodiazepine err=
Attempt failed: F1 score of 0.833333333


In [24]:
print(len(results))

10


In [25]:
def calculate_best(results):
    """
    Sets the best flag on the best result for each chemical class
    """
    best_by_cls = {}
    for r in results:
        cid = r.chemical_class.id
        if r.f1 and (cid not in best_by_cls or r.f1 > best_by_cls[cid]):
            best_by_cls[cid] = r.f1
    for r in results:
        r.best = False
        cid = r.chemical_class.id
        if cid in best_by_cls and best_by_cls[cid] == r.f1:
            r.best = True
            
calculate_best(results)

In [26]:
# count of all failed
len([r for r in results if not r.best])

6

In [31]:
best_by_cls = {r.chemical_class.id: r for r in results if r.best}
len(best_by_cls)

2

In [32]:
import pandas as pd


def calc_eval_results(results, min_f1=0):
    eval_results = []
    for result in results:
        if result.f1 < min_f1:
            continue
        # print(result.f1)
        train_cls = result.chemical_class
        code = result.code
        [test_cls] = [c for c in test_set if c.id == train_cls.id]
        # note we suppress the use of the LLM, as we are evaluating the generated code
        # TODO: more elegant way to do this
        for eval_result in generate_and_test_classifier(test_cls, suppress_llm=True, prog=code, config=config):
            eval_results.append(eval_result)
            eval_result.calculate()
            # print(eval_result.f1)
    return pd.DataFrame([r.model_dump() for r in eval_results])



    

In [36]:
eval_df = calc_eval_results([r for r in results if r.best]) 
eval_df

Unnamed: 0,chemical_class,config,code,true_positives,false_positives,true_negatives,false_negatives,attempt,success,best,error,stdout,num_true_positives,num_false_positives,num_true_negatives,num_false_negatives,precision,recall,f1
0,"{'id': 'CHEBI:12777', 'name': 'vitamin A', 'de...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[],[],[],[],0,True,False,,,0,0,0,0,0.0,0,0
1,"{'id': 'CHEBI:131437', 'name': 'pyrrolobenzodi...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[],[],[],[],0,True,False,,,0,0,0,0,0.0,0,0
2,"{'id': 'CHEBI:131437', 'name': 'pyrrolobenzodi...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[],[],[],[],0,True,False,,,0,0,0,0,0.0,0,0
3,"{'id': 'CHEBI:131437', 'name': 'pyrrolobenzodi...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[],[],[],[],0,True,False,,,0,0,0,0,0.0,0,0


In [34]:
from pathlib import Path

results_dir = Path("latest")
results_dir.mkdir(parents=True, exist_ok=True)
with open(results_dir / "results.json", "w") as f:
    import json
    results_objs = [r.model_dump() for r in results]
    f.write(json.dumps(results_objs, indent=2))

In [35]:
#!ls -l latest

In [37]:
def results_as_df(results):
    rows = []
    for r in results:
        r.calculate()
        row = r.model_dump()
        rows.append(row)
    return pd.DataFrame(rows)
        

In [42]:
results_as_df(results)

Unnamed: 0,chemical_class,config,code,true_positives,false_positives,true_negatives,false_negatives,attempt,success,best,error,stdout,num_true_positives,num_false_positives,num_true_negatives,num_false_negatives,precision,recall,f1
0,"{'id': 'CHEBI:12777', 'name': 'vitamin A', 'de...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,,,,,0,False,False,module 'rdkit.Chem.rdMolDescriptors' has no at...,,0,0,0,0,0.0,0.0,0.0
1,"{'id': 'CHEBI:12777', 'name': 'vitamin A', 'de...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[],[],[(C[C@@H]1CN(C(=O)C2=C(C3=CC=CC=C3CO[C@@H]1CN(...,[(C1(C)(C)CC=CC(=C1\C=C\C(=C\C=C\C(=C\C(=O)O)\...,1,True,False,,,0,0,20,6,0.0,0.0,0.0
2,"{'id': 'CHEBI:12777', 'name': 'vitamin A', 'de...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[],[],[(C[C@@H]1CN(C(=O)C2=C(C3=CC=CC=C3CO[C@@H]1CN(...,[(C1(C)(C)CC=CC(=C1\C=C\C(=C\C=C\C(=C\C(=O)O)\...,2,True,False,,,0,0,20,6,0.0,0.0,0.0
3,"{'id': 'CHEBI:12777', 'name': 'vitamin A', 'de...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[],[],[(C[C@@H]1CN(C(=O)C2=C(C3=CC=CC=C3CO[C@@H]1CN(...,[(C1(C)(C)CC=CC(=C1\C=C\C(=C\C=C\C(=C\C(=O)O)\...,3,True,False,,,0,0,20,6,0.0,0.0,0.0
4,"{'id': 'CHEBI:12777', 'name': 'vitamin A', 'de...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,"[(C\C(=C/CO)\C=C\C=C(/C)\C=C\C1=C(C)CCCC1(C)C,...",[],[(C[C@@H]1CN(C(=O)C2=C(C3=CC=CC=C3CO[C@@H]1CN(...,[(C1(C)(C)CC=CC(=C1\C=C\C(=C\C=C\C(=C\C(=O)O)\...,4,True,True,,,4,0,20,2,1.0,0.666667,0.8
5,"{'id': 'CHEBI:131437', 'name': 'pyrrolobenzodi...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,,,,,0,False,False,cannot import name 'rdDecomposition' from 'rdk...,,0,0,0,0,0.0,0.0,0.0
6,"{'id': 'CHEBI:131437', 'name': 'pyrrolobenzodi...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[],[],[([C@@]12(C[C@@H]([C@@H]([C@@]3(N1C(N[C@](C2)(...,[(OC1=C2N[C@@H]([C@]3(N(C=C(/C=C/C)C3)C(=O)C2=...,1,True,False,[21:18:28] SMILES Parse Error: syntax error wh...,,0,0,20,7,0.0,0.0,0.0
7,"{'id': 'CHEBI:131437', 'name': 'pyrrolobenzodi...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[(OC1=C2N[C@@H]([C@]3(N(C=C(/C=C/C)C3)C(=O)C2=...,[],[([C@@]12(C[C@@H]([C@@H]([C@@]3(N1C(N[C@](C2)(...,[(O=C1N2[C@@H](CC(=C2)C3=CC=C(OC)C=C3)C=NC=4C1...,2,True,True,[21:18:50] SMILES Parse Error: syntax error wh...,,5,0,20,2,1.0,0.714286,0.833333
8,"{'id': 'CHEBI:131437', 'name': 'pyrrolobenzodi...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[(OC1=C2N[C@@H]([C@]3(N(C=C(/C=C/C)C3)C(=O)C2=...,[],[([C@@]12(C[C@@H]([C@@H]([C@@]3(N1C(N[C@](C2)(...,[(O=C1N2[C@@H](CC(=C2)C3=CC=C(OC)C=C3)C=NC=4C1...,3,True,True,[21:19:15] SMILES Parse Error: syntax error wh...,,5,0,20,2,1.0,0.714286,0.833333
9,"{'id': 'CHEBI:131437', 'name': 'pyrrolobenzodi...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[(OC1=C2N[C@@H]([C@]3(N(C=C(/C=C/C)C3)C(=O)C2=...,[],[([C@@]12(C[C@@H]([C@@H]([C@@]3(N1C(N[C@](C2)(...,[(O=C1N2[C@@H](CC(=C2)C3=CC=C(OC)C=C3)C=NC=4C1...,4,True,True,[21:19:41] SMILES Parse Error: syntax error wh...,,5,0,20,2,1.0,0.714286,0.833333


In [38]:
#eval_df = results_as_df(eval_results)

In [39]:
eval_df.to_csv( results_dir / "eval_results.csv")

In [40]:
from c3p.stats import calculate_metrics_pandas


def df_stats(df):
    return calculate_metrics_pandas(df.aggregate({"num_true_positives": "sum", "num_true_negatives": "sum", "num_false_positives": "sum",  "num_false_negatives": "sum"}))

In [41]:
df_stats(calc_eval_results([r for r in results if r.best]))

  'accuracy': (tp + tn) / (tp + tn + fp + fn),
  'precision': tp / (tp + fp),
  'recall': tp / (tp + fn),
  'specificity': tn / (tn + fp),
  'f1_score': 2 * tp / (2 * tp + fp + fn),
  'false_positive_rate': fp / (tn + fp),
  'negative_predictive_value': tn / (tn + fn),


total                        0.0
positives                    0.0
negatives                    0.0
actual_positives             0.0
actual_negatives             0.0
accuracy                     NaN
precision                    NaN
recall                       NaN
specificity                  NaN
f1_score                     NaN
false_positive_rate          NaN
negative_predictive_value    NaN
balanced_accuracy            NaN
dtype: float64

In [52]:
df_stats(calc_eval_results(results, min_f1=1.0))

total                        506.0000
positives                     71.0000
negatives                    435.0000
actual_positives              68.0000
actual_negatives             438.0000
accuracy                       0.9862
precision                      0.9296
recall                         0.9706
specificity                    0.9886
f1_score                       0.9496
false_positive_rate            0.0114
negative_predictive_value      0.9954
balanced_accuracy              0.9796
dtype: float64

In [53]:
df_stats(calc_eval_results(results, min_f1=1.0)).to_csv(results_dir / "top_f1_stats.csv")

In [54]:
results_df = results_as_df(results)

In [55]:
results_df.query('best == True')


Unnamed: 0,chemical_class,config,code,true_positives,false_positives,true_negatives,false_negatives,attempt,success,best,error,stdout,num_true_positives,num_false_positives,num_true_negatives,num_false_negatives,precision,recall,f1
3,"{'id': 'CHEBI:12777', 'name': 'vitamin A', 'de...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[(CC(\C=C\C=C(C)\C=C\C1=C(C)CCCC1(C)C)=C/COC([...,[],[(COCC(=O)N[C@@H]1C=C[C@H](O[C@@H]1CO)CC(=O)NC...,[(C1(C)(C)C(\C=C\C(=C\C=C\C(=C\CO)\C)\C)=C(C)C...,3,True,True,,,4,0,19,1,1.0,0.8,0.888889
7,"{'id': 'CHEBI:131437', 'name': 'pyrrolobenzodi...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[(O=C1N2[C@@H](CC(=C2)C3=CC=C(OC)C=C3)C=NC=4C1...,[],[(CN1C(=NN=N1)SC2=NC=NC3=C2C(=CS3)C4=CC=C(C=C4...,[],2,True,True,,,6,0,20,0,1.0,1.0,1.0
13,"{'id': 'CHEBI:131619', 'name': 'C27-steroid', ...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[(CC(C)CCC[C@](C)(O)C1CCC2C3CCC4C[C@@H](O)CC[C...,[(CC(CCC[C@@H](C)[C@H]1CC[C@H]2[C@@H]3CC=C4C[C...,[([H][C@@]12C[C@@H](C)[C@](O)(C(=O)CO)[C@@]1(C...,[([H]C(=O)C1C2CC(O)CCC2(C)C2CCC3(C)C(CCC3([H])...,0,True,True,,,19,2,18,1,0.904762,0.95,0.926829
15,"{'id': 'CHEBI:131619', 'name': 'C27-steroid', ...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[(CC(C)CCC[C@](C)(O)C1CCC2C3CCC4C[C@@H](O)CC[C...,[(CC(CCC[C@@H](C)[C@H]1CC[C@H]2[C@@H]3CC=C4C[C...,[([H][C@@]12C[C@@H](C)[C@](O)(C(=O)CO)[C@@]1(C...,[([H]C(=O)C1C2CC(O)CCC2(C)C2CCC3(C)C(CCC3([H])...,2,True,True,,,19,2,18,1,0.904762,0.95,0.926829
18,"{'id': 'CHEBI:131620', 'name': 'C24-steroid', ...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[([H][C@@]1(CC[C@@]2([H])[C@]3([H])C[C@H](O)[C...,[],[(C1OC2=C(O1)C=C(C=C2)CNC(=O)C[C@H]3C=C[C@H]([...,[],0,True,True,,,12,0,16,0,1.0,1.0,1.0
19,"{'id': 'CHEBI:131621', 'name': 'C19-steroid', ...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[([H][C@@]12CCC3=CC(=O)CC[C@]3(C)[C@@]1([H])CC...,[(O1[C@H]2C3=CC[C@H]4[C@@H](CCC[C@@]4([C@H]3CC...,[(CCNC(=O)NC1=CC2=C(C=C1)OC[C@@H]3[C@@H](CC[C@...,[],0,True,True,,,6,2,18,0,0.75,1.0,0.857143
25,"{'id': 'CHEBI:131697', 'name': 'pyrimidotriazi...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,"[(CC1=NN(C2=NC(=O)N(C(=O)C2=N1)C)C, Contains o...",[],[(CN(C)C1=CC2=C(C=C1)O[C@@H]3[C@H]2C[C@@H](O[C...,[],1,True,True,,,6,0,20,0,1.0,1.0,1.0
32,"{'id': 'CHEBI:131859', 'name': 'oxo-ETE anion'...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[(C(CCC/C=C\C/C=C\CC(/C=C/C=C\CCCCC)=O)(=O)[O-...,[],[(CN1[C@@H]2CC[C@@H](O[C@H]2COC3=C(C1=O)C=C(C=...,[],1,True,True,,,5,0,20,0,1.0,1.0,1.0
34,"{'id': 'CHEBI:131862', 'name': 'HPODE(1-)', 'd...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,"[(C(CCCCCCCC(/C=C/C=C\CCCCC)OO)([O-])=O, Molec...",[],[(C(\CC)=C\C/C=C\C/C=C\C=C\C(C/C=C\CCCC(=O)[O-...,[],1,True,True,,,8,0,20,0,1.0,1.0,1.0
36,"{'id': 'CHEBI:131867', 'name': 'hydroxydocosah...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[(C(C([O-])=O)C/C=C\CC(/C=C/C=C\C/C=C\C/C=C\C/...,[],[(C(\CCCC([O-])=O)=C\C/C=C\C/C=C\C\C=C/C=C/[C@...,[],1,True,True,,,6,0,20,0,1.0,1.0,1.0


In [56]:
results_df.query('best == True').aggregate({"precision": "mean", "recall": "mean", "f1": "mean"})


precision    0.968544
recall       0.963279
f1           0.959676
dtype: float64

In [57]:
results_df.query('best == True and precision == 1.0')

Unnamed: 0,chemical_class,config,code,true_positives,false_positives,true_negatives,false_negatives,attempt,success,best,error,stdout,num_true_positives,num_false_positives,num_true_negatives,num_false_negatives,precision,recall,f1
3,"{'id': 'CHEBI:12777', 'name': 'vitamin A', 'de...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[(CC(\C=C\C=C(C)\C=C\C1=C(C)CCCC1(C)C)=C/COC([...,[],[(COCC(=O)N[C@@H]1C=C[C@H](O[C@@H]1CO)CC(=O)NC...,[(C1(C)(C)C(\C=C\C(=C\C=C\C(=C\CO)\C)\C)=C(C)C...,3,True,True,,,4,0,19,1,1.0,0.8,0.888889
7,"{'id': 'CHEBI:131437', 'name': 'pyrrolobenzodi...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[(O=C1N2[C@@H](CC(=C2)C3=CC=C(OC)C=C3)C=NC=4C1...,[],[(CN1C(=NN=N1)SC2=NC=NC3=C2C(=CS3)C4=CC=C(C=C4...,[],2,True,True,,,6,0,20,0,1.0,1.0,1.0
18,"{'id': 'CHEBI:131620', 'name': 'C24-steroid', ...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[([H][C@@]1(CC[C@@]2([H])[C@]3([H])C[C@H](O)[C...,[],[(C1OC2=C(O1)C=C(C=C2)CNC(=O)C[C@H]3C=C[C@H]([...,[],0,True,True,,,12,0,16,0,1.0,1.0,1.0
25,"{'id': 'CHEBI:131697', 'name': 'pyrimidotriazi...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,"[(CC1=NN(C2=NC(=O)N(C(=O)C2=N1)C)C, Contains o...",[],[(CN(C)C1=CC2=C(C=C1)O[C@@H]3[C@H]2C[C@@H](O[C...,[],1,True,True,,,6,0,20,0,1.0,1.0,1.0
32,"{'id': 'CHEBI:131859', 'name': 'oxo-ETE anion'...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[(C(CCC/C=C\C/C=C\CC(/C=C/C=C\CCCCC)=O)(=O)[O-...,[],[(CN1[C@@H]2CC[C@@H](O[C@H]2COC3=C(C1=O)C=C(C=...,[],1,True,True,,,5,0,20,0,1.0,1.0,1.0
34,"{'id': 'CHEBI:131862', 'name': 'HPODE(1-)', 'd...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,"[(C(CCCCCCCC(/C=C/C=C\CCCCC)OO)([O-])=O, Molec...",[],[(C(\CC)=C\C/C=C\C/C=C\C=C\C(C/C=C\CCCC(=O)[O-...,[],1,True,True,,,8,0,20,0,1.0,1.0,1.0
36,"{'id': 'CHEBI:131867', 'name': 'hydroxydocosah...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[(C(C([O-])=O)C/C=C\CC(/C=C/C=C\C/C=C\C/C=C\C/...,[],[(C(\CCCC([O-])=O)=C\C/C=C\C/C=C\C\C=C/C=C/[C@...,[],1,True,True,,,6,0,20,0,1.0,1.0,1.0
39,"{'id': 'CHEBI:131868', 'name': 'hydroperoxydoc...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[(CC\C=C/C\C=C/CC(OO)\C=C\C=C/C\C=C/C\C=C/CCC(...,[],[(C(C([O-])=O)C/C=C\CC(/C=C/C=C\C/C=C\C/C=C\C/...,[],2,True,True,,,5,0,20,0,1.0,1.0,1.0
44,"{'id': 'CHEBI:131873', 'name': 'EpETE(1-)', 'd...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[(C(\CC)=C\C/C=C\C/C=C\CC1C(C/C=C\CCCC(=O)[O-]...,[],[(C(/C=C\CCCCC(=O)[O-])/C=C\C=C\C=C\[C@H]([C@@...,[],4,True,True,,,7,0,20,0,1.0,1.0,1.0
48,"{'id': 'CHEBI:131874', 'name': 'HEPE(1-)', 'de...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[(C(\CC)=C\C/C=C\C/C=C\C/C=C\C=C\C(CCCC(=O)[O-...,[],[(C(\CC)=C\C/C=C\C/C=C\CC1C(C/C=C\CCCC(=O)[O-]...,[],3,True,True,,,9,0,20,0,1.0,1.0,1.0


In [58]:
results_df.query('success == True')

Unnamed: 0,chemical_class,config,code,true_positives,false_positives,true_negatives,false_negatives,attempt,success,best,error,stdout,num_true_positives,num_false_positives,num_true_negatives,num_false_negatives,precision,recall,f1
0,"{'id': 'CHEBI:12777', 'name': 'vitamin A', 'de...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[(CC(\C=C\C=C(C)\C=C\C1=C(C)CCCC1(C)C)=C/COC([...,[(C\C(\C=C\C1=C(C)C(=O)CCC1(C)C)=C/C=C/C(/C)=C...,[(COCC(=O)N[C@@H]1C=C[C@H](O[C@@H]1CO)CC(=O)NC...,[],0,True,False,,,5,3,16,0,0.625,1.0,0.769231
1,"{'id': 'CHEBI:12777', 'name': 'vitamin A', 'de...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[],[],[(COCC(=O)N[C@@H]1C=C[C@H](O[C@@H]1CO)CC(=O)NC...,[(CC(\C=C\C=C(C)\C=C\C1=C(C)CCCC1(C)C)=C/COC([...,1,True,False,,,0,0,19,5,0.000,0.0,0.000000
2,"{'id': 'CHEBI:12777', 'name': 'vitamin A', 'de...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[],[],[(COCC(=O)N[C@@H]1C=C[C@H](O[C@@H]1CO)CC(=O)NC...,[(CC(\C=C\C=C(C)\C=C\C1=C(C)CCCC1(C)C)=C/COC([...,2,True,False,,,0,0,19,5,0.000,0.0,0.000000
3,"{'id': 'CHEBI:12777', 'name': 'vitamin A', 'de...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[(CC(\C=C\C=C(C)\C=C\C1=C(C)CCCC1(C)C)=C/COC([...,[],[(COCC(=O)N[C@@H]1C=C[C@H](O[C@@H]1CO)CC(=O)NC...,[(C1(C)(C)C(\C=C\C(=C\C=C\C(=C\CO)\C)\C)=C(C)C...,3,True,True,,,4,0,19,1,1.000,0.8,0.888889
4,"{'id': 'CHEBI:12777', 'name': 'vitamin A', 'de...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[],[],[(COCC(=O)N[C@@H]1C=C[C@H](O[C@@H]1CO)CC(=O)NC...,[(CC(\C=C\C=C(C)\C=C\C1=C(C)CCCC1(C)C)=C/COC([...,4,True,False,,,0,0,19,5,0.000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,"{'id': 'CHEBI:138138', 'name': 'epoxy(hydroxy)...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,"[(C(CCCO)C/C=C\C/C=C\CC1C(C/C=C\CCCC(O)=O)O1, ...",[],"[(OC(CCCCCCCC(O)=O)CCC(=O)CC/C=C\CC, Contains ...",[],0,True,True,,,9,0,20,0,1.000,1.0,1.000000
160,"{'id': 'CHEBI:138979', 'name': 'hemisuccinate'...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[([H][C@@]12CCC3=CC(=O)CC[C@]3(C)[C@@]1([H])CC...,[],[(C1(=CC=CC=C1C(=O)OCC(CCCCC(C)C(=O)O)C)C(=O)O...,[],0,True,True,,,27,0,20,0,1.000,1.0,1.000000
161,"{'id': 'CHEBI:139111', 'name': 'alpha-galactos...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[],[],[(C[C@H]1CN([C@H](COC2=C(C=CC(=C2)NC(=O)NC3=C(...,[(CCCCCCCC(=O)N[C@@H](CO[C@H]1O[C@H](CO)[C@H](...,0,True,False,,,0,0,16,9,0.000,0.0,0.000000
162,"{'id': 'CHEBI:139111', 'name': 'alpha-galactos...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[],[],[(C[C@H]1CN([C@H](COC2=C(C=CC(=C2)NC(=O)NC3=C(...,[(CCCCCCCC(=O)N[C@@H](CO[C@H]1O[C@H](CO)[C@H](...,1,True,False,,,0,0,16,9,0.000,0.0,0.000000


In [59]:
results_df.query('success == False')

Unnamed: 0,chemical_class,config,code,true_positives,false_positives,true_negatives,false_negatives,attempt,success,best,error,stdout,num_true_positives,num_false_positives,num_true_negatives,num_false_negatives,precision,recall,f1
5,"{'id': 'CHEBI:131437', 'name': 'pyrrolobenzodi...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,,,,,0,False,False,cannot import name 'rdDecomposition' from 'rdk...,,0,0,0,0,0.0,0.0,0.0
8,"{'id': 'CHEBI:131565', 'name': 'steroid aldehy...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,,,,,0,False,False,cannot import name 'rdDecomposition' from 'rdk...,,0,0,0,0,0.0,0.0,0.0
14,"{'id': 'CHEBI:131619', 'name': 'C27-steroid', ...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,,,,,1,False,False,unsupported operand type(s) for +: '_vecti' an...,,0,0,0,0,0.0,0.0,0.0
24,"{'id': 'CHEBI:131697', 'name': 'pyrimidotriazi...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,,,,,0,False,False,cannot import name 'rdDecomposition' from 'rdk...,,0,0,0,0,0.0,0.0,0.0
45,"{'id': 'CHEBI:131874', 'name': 'HEPE(1-)', 'de...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,,,,,0,False,False,module 'rdkit.Chem.rdMolDescriptors' has no at...,,0,0,0,0,0.0,0.0,0.0
49,"{'id': 'CHEBI:131877', 'name': 'dihydroxyicosa...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,,,,,0,False,False,module 'rdkit.Chem.rdMolDescriptors' has no at...,,0,0,0,0,0.0,0.0,0.0
63,"{'id': 'CHEBI:131901', 'name': 'pyranopyranone...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,,,,,0,False,False,cannot import name 'rdDecomposition' from 'rdk...,,0,0,0,0,0.0,0.0,0.0
81,"{'id': 'CHEBI:132126', 'name': 'dihydroxy-1,4-...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,,,,,0,False,False,cannot import name 'rdDecomposition' from 'rdk...,,0,0,0,0,0.0,0.0,0.0
86,"{'id': 'CHEBI:132157', 'name': 'hydroxy-1,4-na...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,,,,,0,False,False,cannot import name 'rdDecomposition' from 'rdk...,,0,0,0,0,0.0,0.0,0.0
90,"{'id': 'CHEBI:132215', 'name': 'heparin octasa...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,,,,,0,False,False,Python argument types in\n Mol.GetSubstruct...,,0,0,0,0,0.0,0.0,0.0


In [60]:
slim_df = results_df.copy()
slim_df["code"] = ""

In [61]:
slim_df.to_csv(results_dir / "results.csv")

In [62]:
from c3p.generator import safe_name
import yaml

config_yaml = yaml.dump(config.model_dump())
config_yaml = "\n".join([f"# {line}" for line in config_yaml.split("\n")])

for r in results:
    cn = safe_name(r.chemical_class.name)
    prog_dir = results_dir / "programs"
    prog_dir.mkdir(exist_ok=True, parents=True)
    prog_path = f"{prog_dir / cn}.py"
    #print(prog_path)
    with open(prog_path, "w") as f:
        f.write(config_yaml)
        f.write("\n")
        f.write(r.code)
        f.write(f"\n# Attempt={r.attempt}")
        f.write(f"\n# Pr={r.precision}")
        f.write(f"\n# Recall={r.recall}")
        f.write(f"\n# Recall={r.f1}")
    