# Your Chemistry, Your Data, Your Insights
#### 02/12/2019


# Jupyter Setup

Assuming we are in the "dm_public/notebooks" directory, let's change to the base directory to keep consistency between paths in Jupyter and Python at the command line. If you did not launch Jupyter from the dm_public directory, do not execute this cell. And be sure to only run it once!

In [1]:
import os

# Step up to the base directory only while the kernel is still inside the
# "notebooks" folder, so that re-running this cell cannot climb further up
# the directory tree.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("..")

# Show the working directory that the relative './data/...' paths resolve against.
print(os.getcwd())

C:\Users\David Pattison\Code\dm_public


Specify some constants referring to the input XML files and schemas. We'll work with these for the rest of this notebook.

# Validation

Use the lxml library to validate that our recipe PCML adheres to the standard.

In [2]:
from lxml import etree

# XSD schema and recipe document for the PCML validation below.
# All paths are relative to the current working directory.
pcml_schema_file = './data/pcml-1.3.4.xsd'
pcml_recipe_file = './data/3a_recipe.pcml'

# XSD schema and the three PCRR documents checked below
# (presumably one file per run of the recipe, judging by the file names).
pcrr_schema_file = './data/pcrr-0.0.1.xsd'
pcrr_recipe_files = ['./data/3a_run_01.pcrr',
                     './data/3a_run_02.pcrr',
                     './data/3a_run_03.pcrr']



In [3]:
def validate_xml(xml_file, schema_file):
    """Validate an XML document against an XSD schema and print the outcome.

    Args:
        xml_file: Path of the XML document to check.
        schema_file: Path of the XSD schema to check it against.

    Returns:
        None. The result is reported with ``print``.

    Raises:
        etree.DocumentInvalid: If the document does not conform to the schema.
        Exception: Any other error raised while parsing or validating is
            reported and re-raised unchanged.
    """
    try:
        xml_doc = etree.parse(xml_file)
        xml_schema = etree.XMLSchema(etree.parse(schema_file))

        # assertValid raises DocumentInvalid rather than returning a flag.
        xml_schema.assertValid(xml_doc)

        print("XML for '{}' is valid".format(xml_file))
    except etree.DocumentInvalid:
        print("ERROR: XML for '{}' is invalid".format(xml_file))
        raise
    except Exception:
        print("There was an error while validating the XML")
        raise

In [4]:
# Validate the PCML recipe against its XSD schema. validate_xml prints the
# outcome and re-raises the underlying error if validation fails.

validate_xml(pcml_recipe_file, pcml_schema_file)


XML for './data/3a_recipe.pcml' is valid


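Next, validate each PCRR file against its XSD schema in the same way. After that, we verify the X.509 signatures of the PCML and PCRR files. As the certificates are self-signed, they are included alongside the data (under `./data/x509`).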

In [5]:
# Validate every PCRR file against the PCRR XSD schema.
for rr in pcrr_recipe_files:
    validate_xml(rr, pcrr_schema_file)
    

XML for './data/3a_run_01.pcrr' is valid
XML for './data/3a_run_02.pcrr' is valid
XML for './data/3a_run_03.pcrr' is valid


In [6]:
from signxml import XMLVerifier, InvalidSignature

def verify_x509_signature(xml_file, cer_file):
    """Verify the XML signature of a file against an X.509 certificate.

    Args:
        xml_file: Path of the signed XML document.
        cer_file: Path of the certificate file; its raw bytes are handed to
            signxml as ``x509_cert``.

    Returns:
        None. The result is reported with ``print``.

    Raises:
        InvalidSignature: If signxml rejects the signature.
        Exception: Any other error (unreadable file, malformed XML, ...) is
            reported and re-raised unchanged.
    """
    try:
        # Read the certificate inside a context manager so the file handle is
        # closed even if reading fails.
        with open(cer_file, "rb") as cert_handle:
            cert = cert_handle.read()
        xml_elem = etree.parse(xml_file)

        # verify() raises on failure; its return value is not needed here.
        XMLVerifier().verify(xml_elem, x509_cert=cert)
        print("X.509 signature of '{}' is valid as generated by DigitalGlassware".format(xml_file))
    except InvalidSignature:
        print("X.509 signature for '{}' is invalid -- it may not have been generated by DigitalGlassware, or has been modified after signing.".format(xml_file))
        raise
    except Exception:
        print("An error occurred whilst verifying X.509 signature of '{}'".format(xml_file))
        raise

In [7]:
# Verify the X.509 signature of the PCML recipe against its certificate file.
pcml_cer_path = "./data/x509/pcml.crt"
verify_x509_signature(pcml_recipe_file, pcml_cer_path)

X.509 signature of './data/3a_recipe.pcml' is valid as generated by DigitalGlassware


In [8]:
# Verify the X.509 signature of every PCRR file; all of them are checked
# against the same certificate file.

pcrr_cer_path = "./data/x509/pcrr.crt"

for rr in pcrr_recipe_files:
    verify_x509_signature(rr, pcrr_cer_path)

X.509 signature of './data/3a_run_01.pcrr' is valid as generated by DigitalGlassware
X.509 signature of './data/3a_run_02.pcrr' is valid as generated by DigitalGlassware
X.509 signature of './data/3a_run_03.pcrr' is valid as generated by DigitalGlassware


# XSLT on PCML

In [9]:
# Apply an XSL transformation to the PCML recipe and save the result as HTML.
import os

pcml_xslt_flow_file = "./data/flow.xsl"
pcml_flow_html_file = "./out/3a_pcml_flow.html"

# pcml_obj is the parsed recipe; later cells query it as well.
pcml_obj = etree.parse(pcml_recipe_file)
xslt_obj = etree.parse(pcml_xslt_flow_file)

transform = etree.XSLT(xslt_obj)
pcml_obj_transform = transform(pcml_obj)
html_xml = etree.tostring(pcml_obj_transform, pretty_print=False)

# The output folder may not exist yet; create it so the write cannot fail
# with FileNotFoundError.
os.makedirs(os.path.dirname(pcml_flow_html_file), exist_ok=True)

# Pin the file encoding so the result does not depend on the platform default.
with open(pcml_flow_html_file, 'w', encoding="utf8") as outfile:
    outfile.write(html_xml.decode("utf8"))

# Extracting Recipe Content

In [10]:
# List all the chemicals used: take the first <chemicals> element in the
# recipe and print the text of the first child of each of its entries.
# NOTE(review): assumes the first child of every entry holds the chemical's
# name -- confirm against the PCML schema.
chem_elem = pcml_obj.find(".//chemicals")
for c in chem_elem:
    print("Chemical: {}".format(c[0].text))
    

Chemical: 1-Naphthoyl chloride
Chemical: N-(4-methylbenzenesulfonyl)naphthalene-1-carbohydrazide
Chemical: 4-dimethylaminopyridine
Chemical: p-toluenesulfonyl hydrazide
Chemical: dichloromethane
Chemical: triethylamine
Chemical: saturated NH₄Cl solution
Chemical: water
Chemical: 10% aqueous citric acid solution
Chemical: saturated sodium chloride solution
Chemical: sodium sulfate
Chemical: dichloromethane
Chemical: hexane
Chemical: 1,3,5-trimethoxybenzene


In [11]:
# Search for a specific safety code.
import itertools

code_to_search = "H318"

# A single <code> element may hold several codes joined by " + ", so compare
# against the individual codes instead of the element's whole text; an exact
# text match would miss any code that only occurs in such a combined entry.
has_code = any(
    code_to_search in (code_elem.text or "").split(" + ")
    for code_elem in pcml_obj.findall(".//safetycode/code")
)
print("{} {} code associated with recipe chemicals".format("Found" if has_code else "Did not find", code_to_search))


Found H318 code associated with recipe chemicals


In [12]:
# Collect the distinct safety codes used anywhere in the recipe.
safety_elem = pcml_obj.findall(".//safetycode/code")
all_s_codes = [code_elem.text for code_elem in safety_elem]

# One entry may combine several codes with " + "; flatten those into a set.
uniq_s_codes = {code for entry in all_s_codes for code in entry.split(" + ")}
print("Found the following unique safety codes:", sorted(uniq_s_codes))

Found the following unique safety codes: ['H-N/A', 'H-Unknown', 'H225', 'H242', 'H301', 'H302', 'H304', 'H310', 'H311', 'H314', 'H315', 'H318', 'H319', 'H331', 'H335', 'H336', 'H351', 'H361d', 'H373', 'H411', 'H412', 'P-Unknown', 'P201', 'P210', 'P261', 'P264', 'P273', 'P280', 'P301', 'P302', 'P303', 'P304', 'P305', 'P308', 'P310', 'P312', 'P313', 'P330', 'P331', 'P337', 'P338', 'P340', 'P351', 'P352', 'P353', 'P361', 'P370', 'P378', 'R-N/A', 'R-Unknown', 'S-N/A', 'S-Unknown']


In [13]:
# Tally how many chemicals carry each "role" attribute value.
from collections import Counter
import pprint

role_elems = pcml_obj.xpath('.//chemicals/chemical')

# Elements without a "role" attribute are counted under None.
role_counts = Counter(chemical.get("role") for chemical in role_elems)

pp = pprint.PrettyPrinter()
pp.pprint(role_counts)

Counter({'reagent': 4,
         'solvent': 3,
         'washing-solution': 3,
         'starting-material': 1,
         'product': 1,
         'quenching-solution': 1,
         'drying-agent': 1})


In [14]:
# Print the duration (end_time minus start_time) of each PCRR file.
from datetime import datetime

# strptime layout: date, literal "T", time with fractional seconds, UTC offset.
# NOTE(review): the timestamps themselves are not visible here -- confirm they
# always carry fractional seconds and an offset, or strptime will raise.
time_fmt = "%Y-%m-%dT%H:%M:%S.%f%z"

for i, rr in enumerate(pcrr_recipe_files, 1):
    
    # iterparse is lazy: next(...) returns the first (event, element) pair whose
    # tag matches, and [1] picks the element.
    # NOTE(review): each call starts a new parse of the same file, so every
    # file is read from the beginning twice.
    start_str = next(etree.iterparse(rr, tag = "start_time"))[1].text
    end_str = next(etree.iterparse(rr, tag = "end_time"))[1].text
    
    start_datetime = datetime.strptime(start_str, time_fmt)
    end_datetime = datetime.strptime(end_str, time_fmt)
    
    # Subtracting two datetimes gives a timedelta, printed as "D day(s), H:MM:SS".
    rr_duration = end_datetime - start_datetime
    print("Duration of recipe run {} is {}".format(i, rr_duration))
    

Duration of recipe run 1 is 1 day, 1:10:35
Duration of recipe run 2 is 1 day, 3:07:34
Duration of recipe run 3 is 1 day, 18:47:00
