In [34]:
import sys, os, re, math, csv
from collections import OrderedDict
import collections
import glob, xmltodict
from pprint import pprint
from rdkit import Chem
from pypdf import PdfReader

In [84]:
pdf_path = "./reaxysRN_records/blockO.pdf"
pdf_reader = PdfReader(pdf_path)

In [85]:
text = "" # length 820132 for block A
for page in pdf_reader.pages:
    text += page.extract_text()

In [86]:
# XML
# Dictionary to convert solvents (as referenced by reaxys) into smiles for chemprop
solvents = {'acetonitrile': 'CC#N',
            'MeCN': 'CC#N',
            'ethanol': 'CCO',
            'water': 'O',
            'H2O': 'O',
            'dichloromethane': 'ClCCl',
            'toluene': 'Cc1ccccc1',
            'methanol': 'CO',
            '2-methyl-propan-2-ol': 'CC(O)(C)C',
            'isopropyl alcohol': 'CC(O)C',
            'benzene': 'c1ccccc1',
            '1,2-dichloro-benzene': 'Clc1c(Cl)cccc1',
            '1,4-dioxane': 'C1OCCOC1',
            'dimethyl sulfoxide': 'CS(=O)C',
            'dimethylsulfoxide': 'CS(=O)C',
            'chloroform': 'ClC(Cl)Cl',
            'CHCl3': 'ClC(Cl)Cl',
            'dimethylformamide': 'CN(C=O)C',
            'N,N-dimethyl-formamide': 'CN(C=O)C',
            '1-methyl-pyrrolidin-2-one': 'CN1CCCC1=O',
            'hexane': 'CCCCCC',
            'cyclohexane': 'C1CCCCC1',
            'ethyl acetate': 'O=C(OCC)C',
            }


In [161]:
lines = text.splitlines()
rows = [] # list of lists generated. rows[0] is header, 1-999 are entries for block A. 
current_row = []
for line in lines:
    if "Reaxys ID" in line and current_row:
        rows.append(current_row)
        current_row = []
    current_row.append(line)

if current_row:
    rows.append(current_row)

In [166]:
def extract_details(row):
    reaxys_id, cas_rn, formula, mw, inchi_key = None, None, None, None, None
    id_fields = [] # list to hold the above
    
    uv_entry = []
    uv_entries = []
    
    for i, line in enumerate(row):
        if "Reaxys ID" in line: 
            reaxys_id = line.split(" ")[2].strip()
        if "CAS Registry Number" in line: 
            cas_rn = line.split(":")[-1].strip()
        if "Molecular Formula" in line: 
            formula = line.split(":")[-1].strip()
        if "Molecular Weight" in line: 
            mw = line.split(":")[-1].strip()
        if "InChI Key" in line: 
            inchi_key = line.split(":")[-1].strip()
        
        num_uv_entries = None
        if "UV/VIS Spectroscopy (" in line:
            num_uv_entries = re.search(r'\((.*?)\)',line).group(1)
        
        solvent = None
        uv_abs_maxima, uv_coefs = [], []
        uv_entry_flag = False
        
        if "Solvent (UV/Vis Spectro-" in line:
            uv_entry.append(row[i+2])
        if "Absorption Maxima (UV/" in line: 
            max_raw = "".join(row[i+2].split()).split(";")
            max_list = [float(m) for m in max_raw]
            uv_abs_maxima.append(max_list)
            uv_entry.append(uv_abs_maxima)
        if "Ext./Abs. Coefficient" in line: 
            eac_raw = "".join(row[i+2].split()).split(";")
            eac_list = [float(e) for e in eac_raw]
            uv_coefs.append(eac_list)
            uv_entry.append(uv_coefs)
            uv_entry_flag = True # "break" to add individual UV entry to running list of UV entries. 
        if "Log epsilon" in line: # assumed to be mutually exclusive with EAC above.
            eps_raw = "".join(line[12:].split()).split(";")
            eps_list = [float(e) for e in (eps_raw)]
            eac_list = [10**e for e in eps_list]
            uv_coefs.append(eac_list)
            uv_entry.append(uv_coefs)
            uv_entry_flag = True # "break" to add individual UV entry to running list of UV entries. 
        
        '''
        if line.endswith("View in Reaxys") and line != "View in Reaxys": # sufficient to target reference entries.
            ref = rmc(line[:-16])
            uv_entry.append(ref)
        '''
        
        if uv_entry_flag is True:
            uv_entries.append(uv_entry)
            uv_entry = []
            solvent = None
            uv_abs_maxima, uv_coefs = [], []
            uv_entry_flag = False
            
    id_fields.append((reaxys_id, cas_rn, formula, mw, inchi_key)) 
    return id_fields, uv_entries

In [168]:
ex_details = extract_details(rows[63])
ex_details

([('1403165',
   '28656-26-0',
   'C27H22O3',
   '394.47',
   'RPETUZOFZUKMCP-UHFFFAOYSA-N')],
 [[[[303.0, 316.0, 347.0, 357.0]], [[5900.0, 6825.0, 4575.0, 4400.0]]]])

In [169]:
rows[63]

['Reaxys ID 1403165 View in Reaxys 63/999',
 'O',
 'O',
 'O',
 'CAS Registry Number: 28656-26-0',
 'Chemical Name: 3,3-Bis(4-methoxyphenyl)[3H]naphtho[2,1-b]pyran',
 'Linear Structure Formula: C27H22O3',
 'Molecular Formula: C27H22O3',
 'Molecular Weight: 394.47',
 'Type of Substance: heterocyclic',
 'InChI Key: RPETUZOFZUKMCP-UHFFFAOYSA-N',
 'Note:  ',
 'UV/VIS Spectroscopy (8)',
 '1 of 8 Description (UV/VIS',
 'Spectroscopy)',
 'Spectrum',
 'Solvent (UV/VIS Spectro-',
 'scopy)',
 'acetonitrile',
 'Absorption Maxima (UV/',
 'VIS) [nm]',
 '303; 316; 347; 357',
 'Ext./Abs. Coefficient',
 '[l·mol-1cm-1]',
 '5900; 6825; 4575; 4400',
 'Barachevsky, Valery ǧ.; Gorelik, Alexander ǧ.; Kobeleva, ǧlga I.; Valova, ǧatyana ǧ.; Venidiktova, ǧlga V .; Dyes and Pigments;',
 'vol. 184; (2021), View in Reaxys',
 '2 of 8 Description (UV/VIS',
 'Spectroscopy)',
 'Spectrum',
 'Solvent (UV/VIS Spectro-',
 'scopy)',
 'toluene',
 'Absorption Maxima (UV/',
 'VIS) [nm]',
 '474',
 'De Azevedo, Orlando D. C. C.

In [28]:
# UV.EAC is "Ext./Abs. Coefficient"
# UV.LOGE is "Log epsilon"
## if UV.LOGE, then transformed into units of EAC. 
# UV.AM is "Absorption Maxima (UV/VIS) [nm]"

# TODO: split by semicolons -- ordinal matching

# Do their Chemprop model as a benchmark. 

#All models were trained using Chemprop version 1.5. The models were trained using the MIT
#SuperCloud, a Linux environment with Intel Xeon Gold 6248 CPU (40 cores) and a Nvidia Tesla
#V100 GPU. For reproducibility, the training command line argument was:

#Chemprop_train --data_path train_set.csv --save_dir model_x\
#--num_folds 10 --dataset_type regression --config_path\ hyperopts.json --
#epochs 200 --number_of_molecules 2

# Try: LLMs, Mordred descriptors. 