In [54]:
import sys, os, re, math, csv
import pandas as pd
from collections import OrderedDict
import collections
import glob, xmltodict
from pprint import pprint
from rdkit import Chem
from pypdf import PdfReader
import cirpy

In [2]:
pdf_path = "./reaxysRN_records/blockO.pdf"
pdf_reader = PdfReader(pdf_path)

In [3]:
text = "" # length 820132 for block A
for page in pdf_reader.pages:
    text += page.extract_text()

In [64]:
# Dictionary to convert solvents (as referenced by reaxys) into smiles for chemprop
solvents = {'acetonitrile': 'CC#N','MeCN': 'CC#N','ethanol': 'CCO','water': 'O','H2O': 'O',
            'dichloromethane': 'ClCCl','toluene': 'Cc1ccccc1','methanol': 'CO',
            '2-methyl-propan-2-ol': 'CC(O)(C)C','isopropyl alcohol': 'CC(O)C','benzene': 'c1ccccc1',
            '1,2-dichloro-benzene': 'Clc1c(Cl)cccc1','1,4-dioxane': 'C1OCCOC1',
            'dimethyl sulfoxide': 'CS(=O)C','dimethylsulfoxide': 'CS(=O)C','chloroform': 'ClC(Cl)Cl',
            'CHCl3': 'ClC(Cl)Cl','dimethylformamide': 'CN(C=O)C','N,N-dimethyl-formamide': 'CN(C=O)C',
            '1-methyl-pyrrolidin-2-one': 'CN1CCCC1=O','hexane': 'CCCCCC','cyclohexane': 'C1CCCCC1',
            'ethyl acetate': 'O=C(OCC)C',
            }

In [5]:
lines = text.splitlines()
rows = [] # list of lists generated. rows[0] is header, 1-999 are entries for block A. 
current_row = []
for line in lines:
    if "Reaxys ID" in line and current_row:
        rows.append(current_row)
        current_row = []
    current_row.append(line)

if current_row:
    rows.append(current_row)

In [35]:
def get_num(s):
    match = re.match(r'^\d+(\.\d+)?', s)
    return match.group() if match else ""

In [81]:
def name_to_smi(name):
    return cirpy.resolve(name, 'smiles')

def cas_to_smi(cas):
    return cirpy.resolve(cas, 'smiles', ['cas_number'])

def inchikey_to_smi(ik):
    return cirpy.resolve(ik, 'smiles', ['stdinchikey'])

In [85]:
def extract_details(row):
    reaxys_id, cas_rn, formula, mw, inchikey = None, None, None, None, None
    id_fields = [] # list to hold the above
    
    uv_entry = []
    uv_entries = []
    
    for i, line in enumerate(row):
        if "Reaxys ID" in line: 
            reaxys_id = line.split(" ")[2].strip()
        if "CAS Registry Number" in line: 
            cas_rn = line.split(":")[-1].strip()
        if "Molecular Formula" in line: 
            formula = line.split(":")[-1].strip()
        if "Molecular Weight" in line: 
            mw = line.split(":")[-1].strip()
        if "InChI Key" in line: 
            inchikey = line.split(":")[-1].strip()
            #solute_smi = inchikey_to_smi(inchikey)
            #solute_smi = Chem.MolToSmiles(Chem.MolFromSmiles(solute_smi))
        
        num_uv_entries = None
        if "UV/VIS Spectroscopy (" in line:
            num_uv_entries = re.search(r'\((.*?)\)',line).group(1)
        
        solvent, solvent_smi = None, None
        uv_abs_maxima, uv_coefs = [], []
        uv_entry_flag = False
        
        if "Solvent (UV/VIS Spectro-" in line:
            solvent = row[i+2]
            try: 
                solvent_smi = solvents[solvent]
            except: 
                pass
            uv_entry.append(solvent)
            uv_entry.append(solvent_smi)
        if "Absorption Maxima (UV/" in line: 
            max_raw = "".join(row[i+2].split()).split(";")
            max_list = [float(get_num(m)) for m in max_raw]
            uv_abs_maxima.append(max_list)
            uv_entry.append(uv_abs_maxima)
        if "Ext./Abs. Coefficient" in line: 
            eac_raw = "".join(row[i+2].split()).split(";")
            eac_list = [float(get_num(e)) for e in eac_raw]
            uv_coefs.append(eac_list)
            uv_entry.append(uv_coefs)
            uv_entry_flag = True # "break" to add individual UV entry to running list of UV entries. 
        if "Log epsilon" in line: # assumed to be mutually exclusive with EAC above.
            eps_raw = "".join(line[12:].split()).split(";")
            eac_list = [10.0**float(get_num(e)) for e in eps_raw]
            uv_coefs.append(eac_list)
            uv_entry.append(uv_coefs)
            uv_entry_flag = True # "break" to add individual UV entry to running list of UV entries. 
        
        '''
        if line.endswith("View in Reaxys") and line != "View in Reaxys": # ~sufficient to target reference entries.
            ref = rmc(line[:-16]) # doesn't get multi-line entries, though -- could clean this up at a later stage.
            uv_entry.append(ref)
        '''
        
        if uv_entry_flag is True:
            uv_entries.append(uv_entry)
            uv_entry = []
            solvent = None
            solvent_smi = None
            uv_abs_maxima, uv_coefs = [], []
            uv_entry_flag = False
            
    id_fields.append((reaxys_id, cas_rn, formula, mw, inchikey)) 
    return id_fields, uv_entries

In [86]:
extracted_entries = [extract_details(rows[i]) for i in range(1, len(rows))]

OverflowError: (34, 'Result too large')

In [77]:
ex_details = extract_details(rows[64])
ex_details

methanol
acetonitrile


([('1437208',
   '58450-01-4',
   'C16H12O6',
   '300.268',
   'UBYOEDLUKKPPPN-UHFFFAOYSA-N')],
 [['methanol',
   'CO',
   [[233.0, 257.0, 291.0, 362.0]],
   [[6456.542290346556,
     6025.595860743575,
     501.18723362727246,
     79.43282347242814]]],
  [[[233.0, 257.0, 362.0]],
   [[6456.542290346556, 501.18723362727246, 79.43282347242814]]],
  ['acetonitrile',
   'CC#N',
   [[380.0, 292.0, 262.0, 236.0]],
   [[6100.0, 9700.0, 34800.0, 26400.0]]]])

In [None]:
# ex_details[0]: reaxys_id, cas_rn, formula, mw, inchikey

In [56]:
pd.DataFrame(ex_details[0])

Unnamed: 0,0,1,2,3,4
0,1437208,58450-01-4,C16H12O6,300.268,UBYOEDLUKKPPPN-UHFFFAOYSA-N


In [76]:
ex_details[1]

[[[[233.0, 257.0, 291.0, 362.0]],
  [[6456.542290346556,
    6025.595860743575,
    501.18723362727246,
    79.43282347242814]]],
 [[[233.0, 257.0, 362.0]],
  [[6456.542290346556, 501.18723362727246, 79.43282347242814]]],
 [[[380.0, 292.0, 262.0, 236.0]], [[6100.0, 9700.0, 34800.0, 26400.0]]]]

In [9]:
rows[64]

['Reaxys ID 1437208 View in Reaxys 64/999',
 'O',
 'O',
 'OO',
 'OH',
 'HO',
 'CAS Registry Number: 58450-01-4',
 'Chemical Name: sydowinin A',
 'Linear Structure Formula: C16H12O6',
 'Molecular Formula: C16H12O6',
 'Molecular Weight: 300.268',
 'Type of Substance: heterocyclic',
 'InChI Key: UBYOEDLUKKPPPN-UHFFFAOYSA-N',
 'Note:  ',
 'UV/VIS Spectroscopy (4)',
 '1 of 4 Solvent (UV/VIS Spectro-',
 'scopy)',
 'methanol',
 'Absorption Maxima (UV/',
 'VIS) [nm]',
 '233; 257; 291; 362',
 'Log epsilon 3.81; 3.78; 2.7; 1.9',
 'Goddard, Mary-Lorène; Mottier, Nicolas; Jeanneret-Gris, Julie; Christen, Danilo; Tabacchi, Raphaël; Abou-Mansour,',
 'Eliane; Journal of Agricultural and Food Chemistry; vol. 62; nb. 34; (2014); p. 8602 - 8607, View in Reaxys',
 '2 of 4 Absorption Maxima (UV/',
 'VIS) [nm]',
 '233; 257; 362Copyright © 2024 Elsevier Life Sciences IP Limited except certain content provided by',
 'third parties. Reaxys is a trademark of Elsevier Life Sciences IP Limited.',
 '75/866 2024-1

In [28]:
# UV.EAC is "Ext./Abs. Coefficient"
# UV.LOGE is "Log epsilon"
## if UV.LOGE, then transformed into units of EAC. 
# UV.AM is "Absorption Maxima (UV/VIS) [nm]"

# TODO: split by semicolons -- ordinal matching

# Do their Chemprop model as a benchmark. 

#All models were trained using Chemprop version 1.5. The models were trained using the MIT
#SuperCloud, a Linux environment with Intel Xeon Gold 6248 CPU (40 cores) and a Nvidia Tesla
#V100 GPU. For reproducibility, the training command line argument was:

#Chemprop_train --data_path train_set.csv --save_dir model_x\
#--num_folds 10 --dataset_type regression --config_path\ hyperopts.json --
#epochs 200 --number_of_molecules 2

# Try: LLMs, Mordred descriptors. 

### comp_SGM_110724.1435
# OceanOptics UV detector en route -- AFZ sending quote details. 