# Inspect ORD file

In [36]:
import ord_schema
from ord_schema import message_helpers, validations
from ord_schema.proto import dataset_pb2
import os
import fnmatch

In [10]:
# Inspect a raw ORD dataset file.
# The reaction schema is defined here:
# https://github.com/open-reaction-database/ord-schema/blob/main/ord_schema/proto/reaction.proto


# Load the file as a Dataset message. The path is relative, so it resolves
# against the notebook's current working directory.
pb = "../data/ord/02/ord_dataset-02ee2261663048188cf6d85d2cc96e3f.pb.gz"
data = message_helpers.load_message(pb, dataset_pb2.Dataset)

# Show the value of the first identifier of the first reaction
# (in the recorded output this is an atom-mapped reaction SMILES).
data.reactions[0].identifiers[0].value

'[S:1](=[O:4])(=[O:3])=[O:2].[S:5](=[O:9])(=[O:8])([OH:7])[OH:6]>>[OH:8][S:5]([OH:9])(=[O:7])=[O:6].[O:2]=[S:1](=[O:4])=[O:3] |f:2.3|'

In [12]:
# Name stored on the Dataset message currently bound to `data`.
data.name

'uspto-grants-1993_09'

In [37]:
# Can also find ORD file just given the hash:
def open_ord(ord_hash, folder_path='/Users/dsw46/Projects_local/ORDerly/data/ord'):
    """Load the first ORD dataset file whose name contains ``ord_hash``.

    Walks ``folder_path`` recursively and loads the first file whose name
    matches ``*{ord_hash}*`` via ``message_helpers.load_message`` as a
    ``dataset_pb2.Dataset``.

    Args:
        ord_hash: Substring to look for in file names, e.g.
            ``'ord_dataset-85c00026681b46f89ef8634d2b8618c3'``. It is embedded
            in an ``fnmatch`` pattern, so glob metacharacters (``*``, ``?``,
            ``[``) inside it act as wildcards.
        folder_path: Root directory to search. Defaults to the location that
            used to be hard-coded; pass your own path on other machines.

    Returns:
        The loaded Dataset message.

    Raises:
        ValueError: If ``ord_hash`` is empty (the pattern would match every
            file and an arbitrary dataset would be loaded).
        FileNotFoundError: If no file name under ``folder_path`` contains
            ``ord_hash``. A missing ``folder_path`` ends up here too, because
            ``os.walk`` yields nothing for a directory it cannot list.
    """
    if not ord_hash:
        raise ValueError("ord_hash must be a non-empty string.")

    # Build the pattern once instead of on every file.
    pattern = f'*{ord_hash}*'
    for root, _dirs, files in os.walk(folder_path):
        for file in files:
            if fnmatch.fnmatch(file, pattern):
                filepath = os.path.join(root, file)
                return message_helpers.load_message(filepath, dataset_pb2.Dataset)
    raise FileNotFoundError(f"No file found containing {ord_hash} in the name.")



In [39]:
# Look up a dataset by the hash in its file name, then show its name.
# NOTE: this rebinds `data`, replacing the dataset loaded earlier.
data = open_ord('ord_dataset-85c00026681b46f89ef8634d2b8618c3')
data.name

'uspto-grants-2001_07'

# Inspect extracted data from ORD file

In [14]:
import pandas as pd

In [30]:
# Load the extracted table whose file name matches the dataset name shown
# earlier (uspto-grants-1993_09). The path is relative to the notebook's
# working directory.
path = "../data/orderly/uspto_no_trust/extracted_ords/uspto-grants-1993_09.parquet"
df = pd.read_parquet(path)

In [31]:
# Reaction string for index label 0. NOTE: if the index is integer-labelled,
# `[0]` on a Series is a label lookup, not positional; use `.iloc[0]` when the
# first row is what is wanted.
df['rxn_str'][0]

'[S:1](=[O:4])(=[O:3])=[O:2].[S:5](=[O:9])(=[O:8])([OH:7])[OH:6]>>[OH:8][S:5]([OH:9])(=[O:7])=[O:6].[O:2]=[S:1](=[O:4])=[O:3]'

# Inspect cleaned data from ORD file

In [None]:
import pandas as pd

In [33]:
# Load the cleaned dataset (a train split, judging by the file name).
# NOTE: this rebinds `path` and `df`, so the extracted-data frame loaded
# earlier is no longer reachable under those names.
path = "../data/orderly/datasets/orderly_no_trust_no_map_train.parquet"
df = pd.read_parquet(path)

In [34]:
# Reaction string for index label 0. NOTE: if the index is integer-labelled,
# `[0]` on a Series is a label lookup, not positional; use `.iloc[0]` when the
# first row is what is wanted.
df['rxn_str'][0]

'[Cl:1][C:2]1[N:3]=[CH:4][C:5]2[C:10]([CH:11]=1)=[C:9]([N+:12]([O-])=O)[CH:8]=[CH:7][CH:6]=2.O.[OH-].[Na+]>C(O)(=O)C.[Fe]>[Cl:1][C:2]1[N:3]=[CH:4][C:5]2[C:10]([CH:11]=1)=[C:9]([NH2:12])[CH:8]=[CH:7][CH:6]=2'

# Download ORDerly benchmark

In [1]:
import pathlib
import zipfile

import pandas as pd
import requests


def download_benchmark(
    benchmark_zip_file="orderly_benchmark.zip",
    benchmark_directory="orderly_benchmark/",
    version=2,
    timeout=60,
):
    """Download the benchmark archive from figshare and unzip it.

    Args:
        benchmark_zip_file: Path the downloaded zip archive is written to.
        benchmark_directory: Directory the archive is extracted into; created
            (including parents) if it does not exist.
        version: Version number inserted into the figshare download URL.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled connection fails instead of hanging the notebook.

    Raises:
        requests.HTTPError: If figshare answers with an error status. Nothing
            is written to ``benchmark_zip_file`` in that case.
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
    """
    figshare_url = (
        f"https://figshare.com/ndownloader/articles/23298467/versions/{version}"
    )
    print(f"Downloading benchmark from {figshare_url} to {benchmark_zip_file}")
    # Stream to disk rather than holding the whole archive in memory, and check
    # the status first so an error page is never saved as the "zip" file.
    with requests.get(
        figshare_url, allow_redirects=True, stream=True, timeout=timeout
    ) as r:
        r.raise_for_status()
        with open(benchmark_zip_file, "wb") as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)

    print("Unzipping benchmark")
    benchmark_directory = pathlib.Path(benchmark_directory)
    benchmark_directory.mkdir(parents=True, exist_ok=True)
    with zipfile.ZipFile(benchmark_zip_file, "r") as zip_ref:
        zip_ref.extractall(benchmark_directory)


# Fetch and unpack the archive with the default arguments. The function has no
# "already downloaded" check, so re-running this cell downloads again.
download_benchmark()
# Read the train and test tables from the default extraction directory.
train_df = pd.read_parquet("orderly_benchmark/orderly_benchmark_train.parquet")
test_df = pd.read_parquet("orderly_benchmark/orderly_benchmark_test.parquet")

Downloading benchmark from https://figshare.com/ndownloader/articles/23298467/versions/2 to orderly_benchmark.zip
Unzipping benchmark


In [2]:
# Bare expression so the notebook renders the frame (the display is
# abbreviated, hence the "..." row in the recorded output).
train_df

Unnamed: 0,agent_000,agent_001,agent_002,date_of_experiment,extracted_from_file,grant_date,is_mapped,procedure_details,product_000,reactant_000,reactant_001,rxn_str,rxn_time,solvent_000,solvent_001,temperature,yield_000
332116,[OH-],[Na+],,NaT,ord_dataset-bcc0b01d4f58457a8733b10a099f43ba,2015-01-01 00:10:00,True,To N-[6-(1-tert-butyloxycarbonylpiperidin-4-yl...,Cc1c(C(=O)Nc2ccc(C3CCNCC3)nc2)cnn1-c1ccc(C(F)(...,Cc1c(C(=O)Nc2ccc(C3CCN(C(=O)OC(C)(C)C)CC3)nc2)...,,C(OC([N:8]1[CH2:13][CH2:12][CH:11]([C:14]2[N:1...,2.50,O=C(O)C(F)(F)F,ClCCl,25.0,89.4
109219,[OH-],[Pd+2],,NaT,ord_dataset-e967d076b4894c2c854795f019ed3c39,2002-01-01 00:06:00,True,A solution of 240 mg (0.50 mmol) of 3-bromo-4-...,CN1C(=O)CC(c2c(CCCCc3cc4ccccc4[nH]3)[nH]c3cccc...,CN1C(=O)C(Br)=C(c2c(CCCCc3cc4ccccc4[nH]3)[nH]c...,,Br[C:2]1[C:3](=[O:31])[N:4]([CH3:30])[C:5](=[O...,,CO,,,
212362,c1ccc(P(c2ccccc2)c2ccccc2)cc1,,,NaT,ord_dataset-3af92aec23dc4810b92eb0d8c60023ee,2011-01-01 00:03:00,True,"A solution of triphenylphosphine (469 mg, 1.79...",BrCc1cc2ccccc2cc1I,BrC(Br)(Br)Br,OCc1cc2ccccc2cc1I,C1(P(C2C=CC=CC=2)C2C=CC=CC=2)C=CC=CC=1.[C:20](...,4.00,ClCCl,,25.0,
198710,[Fe],Cl,,NaT,ord_dataset-f886e51ba1484c76a94bce1482f1eab9,2010-01-01 00:07:00,True,To a solution of methyl N-methyl-N-[(4′-nitro-...,COC(=O)[C@H](C(C)C)N(C)C(=O)c1ccc(-c2ccc(N)cc2...,COC(=O)[C@H](C(C)C)N(C)C(=O)c1ccc(-c2ccc([N+](...,,[CH3:1][N:2]([C:11]([C:13]1[CH:18]=[CH:17][C:1...,,CCO,,,75.8
243552,CC(C)OC(=O)/N=N/C(=O)OC(C)C,c1ccc(P(c2ccccc2)c2ccccc2)cc1,,NaT,ord_dataset-9cd817a75dfc4fe7ad19d4232772d5ff,2012-01-01 00:07:00,True,"DIAD (23.95 ml, 123 mmol) was added dropwise t...",COC(=O)c1cccc(OC)c1OCCNC(=O)OC(C)(C)C,CC(C)(C)OC(=O)NCCO,COC(=O)c1cccc(OC)c1O,CC(OC(/N=N/C(OC(C)C)=O)=O)C.[OH:15][CH2:16][CH...,0.50,C1CCOC1,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232549,[Li]CCCC,,,NaT,ord_dataset-aaeaab5f3720492494c1cbbdd0ed2820,2012-01-01 00:02:00,True,To a solution of DIPA (15.5 mL) in THF (300 mL...,COc1ccc2ncc(F)c(C=O)c2c1,CN(C)C=O,COc1ccc2ncc(F)cc2c1,[Li]CCCC.[F:6][C:7]1[CH:8]=[N:9][C:10]2[C:15](...,0.08,C1CCOC1,,0.0,52.0
208488,[Na+],[BH3-]C#N,,NaT,ord_dataset-136cfada6ce247b4919085a57363459e,2011-01-01 00:01:00,True,To a cold solution of methyl indole-5-carboxyl...,COC(=O)c1ccc2c(c1)CCN2,COC(=O)c1ccc2[nH]ccc2c1,,[NH:1]1[C:9]2[C:4](=[CH:5][C:6]([C:10]([O:12][...,,CC(=O)O,,25.0,80.2
222435,Cl,[Na+],[OH-],NaT,ord_dataset-52a37d876ddb453e86de0c15fa233d29,2011-01-01 00:09:00,True,"Before the start of the reaction, the loop rea...",CC(C)(C)OOC(=O)c1ccccc1,CC(C)(C)OO,O=C(Cl)c1ccccc1,[C:1]([O:5][OH:6])([CH3:4])([CH3:3])[CH3:2].[O...,,O,,,95.3
225524,Cc1ccc(S(=O)(=O)[O-])cc1,c1cc[nH+]cc1,,NaT,ord_dataset-375a420ee9b042918ddca20f02df37d3,2011-01-01 00:11:00,True,"A solution of N-{3-[1,1-Dimethyl-2-(tetrahydro...",CC(C)(CO)c1cc(NC(=O)C(C)(C)S(=O)(=O)C2CCOCC2)on1,CC(C)(COC1CCCCO1)c1cc(NC(=O)C(C)(C)S(=O)(=O)C2...,,[CH3:1][C:2]([C:12]1[CH:16]=[C:15]([NH:17][C:1...,,CCO,,,81.0


In [3]:
# NOTE(review): machine-specific absolute path - this cell will not run on
# another machine as written. Presumably a locally reproduced train split kept
# for comparison with train_df; confirm.
train_2_df = pd.read_parquet("/Users/dsw46/Projects_local/orderly_reproduce/data/orderly/orderly_ord_train.parquet")

In [4]:
# Render the frame. The recorded output shows columns that train_df's output
# does not (product_001, yield_001 ... yield_004).
train_2_df

Unnamed: 0_level_0,agent_000,agent_001,agent_002,date_of_experiment,extracted_from_file,grant_date,is_mapped,procedure_details,product_000,product_001,...,rxn_str,rxn_time,solvent_000,solvent_001,temperature,yield_000,yield_001,yield_002,yield_003,yield_004
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
79574,[Pd],[C],,NaT,ord_dataset-a6643d22de674f30a85ba57198b82644,1995-01-01 00:03:00,True,In a mixture of 18 ml of ethanol and 9 ml of N...,Nc1ccccc1Oc1cccc(-c2nnn[nH]2)c1,,...,[N+:1]([C:4]1[CH:21]=[CH:20][CH:19]=[CH:18][C:...,,CCO,CN(C)C=O,,96.9,,,,
148136,CCN=C=NCCCN(C)C,Cl,On1nnc2ccccc21,NaT,ord_dataset-82e842e611ef4a05b6e7f9ea0a46d52d,2003-01-01 00:07:00,True,"A mixture of 5-[5-(2,6-dichloro-phenylmethanes...",Cc1[nH]c(/C=C2\C(=O)Nc3ccc(S(=O)(=O)Cc4c(Cl)cc...,,...,[Cl:1][C:2]1[CH:7]=[CH:6][CH:5]=[C:4]([Cl:8])[...,72.0,CN(C)C=O,,40.0,,,,,
79634,[Na+],[O-][I+3]([O-])([O-])[O-],,NaT,ord_dataset-a6643d22de674f30a85ba57198b82644,1995-01-01 00:03:00,True,A solution of sodium periodate (0.214 g.; 0.00...,COc1ccc2c(c1)C(CC(=O)O)=C(C)C2=Cc1ccc(S(C)=O)cc1,,...,I([O-])(=O)(=O)=O.[Na+].[OH2:7].[CH3:8][O:9][C...,8.0,CC(C)=O,CO,25.0,,,,,
329451,c1ccc([P](c2ccccc2)(c2ccccc2)[Pd]([P](c2ccccc2...,O=C([O-])[O-],[K+],NaT,ord_dataset-cfad8b3f00044bcda60a96b019f09872,2013-01-01 00:08:00,True,"As depicted in Reaction 5, 15 g (46 mmol) of 3...",c1ccc(-c2ccc3[nH]c4ccc(-c5ccccc5)cc4c3c2)cc1,,...,Br[C:2]1[CH:3]=[CH:4][C:5]2[NH:6][C:7]3[C:12](...,,Cc1ccccc1,,,73.0,,,,
359709,[H-],[Na+],,NaT,ord_dataset-a86112d52cd54525a5e36d41f18aced2,2014-01-01 00:07:00,True,"Sodium hydride (60% in oil, 19.41 mg, 0.485 mm...",CC(C)(C)OC(=O)CN1C[C@@H]2COCCN2c2nc(Cl)ncc21,,...,[H-].[Na+].[Cl:3][C:4]1[N:13]=[CH:12][C:11]2[N...,0.5,CN(C)C=O,O,25.0,67.2,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182938,[H-],[Na+],,NaT,ord_dataset-844a22e1fcab44a5b59c5e2922b2855a,2007-01-01 00:01:00,True,In DMF (8.0 ml) was suspended sodium hydride (...,CCn1cc(C(=O)OC)c2ccccc21,,...,[H-].[Na+].[NH:3]1[C:11]2[C:6](=[CH:7][CH:8]=[...,,CN(C)C=O,O,0.0,,,,,
123321,[Na+],[OH-],,NaT,ord_dataset-18e9ed24dbd44e98b33bdc22aa7580a8,2001-01-01 00:04:00,True,"A mixture of 6-benzyloxy-1,3-benzoxathiol-2-on...",Oc1cc(OCc2ccccc2)ccc1S,,...,[CH2:1]([O:8][C:9]1[CH:18]=[CH:17][C:12]2[S:13...,16.0,CO,,25.0,,,,,
35293,CC1(C)C2CCC1(CS(=O)(=O)O)C(=O)C2,CCCC[N+](CCCC)(CCCC)CCCC,other,NaT,ord_dataset-1895fe091c3f47afa1ee96a41a250de4,1986-01-01 00:05:00,True,To a stirred solution of 3.58 g of (3S)-2-oxo-...,Oc1cccc2ccc[nH+]c12,,...,C([N+](CCCC)(CCCC)CCCC)CCC.[O:18]=[C:19]1[C@@H...,,CC(Cl)Cl,,,,,,,
392918,O=[Pt]=O,,,NaT,ord_dataset-35c51552812941cda45194a013d34bb9,2015-01-01 00:08:00,True,"2-Aminopropan-1-ol (5 g, 66.57 mmol) and 1-hyd...",CC(CO)NC(C)CO,,...,[NH2:1][CH:2]([CH3:5])[CH2:3][OH:4].[OH:6][CH2...,24.0,CCO,,25.0,,,,,
