# Extracting a Custom Property

In [11]:
from chemdataextractor import Document
from chemdataextractor.model import Compound
from chemdataextractor.doc import Paragraph, Heading

## Example Document

Let's create a simple example document with a single heading followed by a single paragraph:

In [12]:
# Build a small example document from two elements: a heading that names a
# compound with a label, and a paragraph of free text mentioning a boiling point.
d = Document(
    Heading(u'Synthesis of 2,4,6-trinitrotoluene (3a)'),
    Paragraph(u'The procedure was followed to yield a pale yellow solid NH3 (b.p. 240 °C)'))


What does this look like?

In [13]:
# Bare expression as the last line of the cell, so the notebook displays the Document.
d

## Default Parsers

By default, ChemDataExtractor won't extract the boiling point property:

In [14]:
# Serialize the records extracted from the document; the result is displayed as the cell output.
d.records.serialize()

[{u'labels': [u'3a'],
  u'names': [u'2,4,6-trinitrotoluene'],
  u'roles': [u'product']}]

## Defining a New Property Model

The first task is to define the schema of a new property, and add it to the `Compound` model:

In [6]:
from chemdataextractor.model import BaseModel, StringType, ListType, ModelType

class logkp(BaseModel):
    """Schema for a single logkp entry attached to a compound record."""
    # Value text, stored as a string.
    value = StringType()
    # Units text, stored as a string.
    # NOTE(review): the parser in this notebook has its units handling commented out,
    # so this field appears to stay unset - confirm whether it is still needed.
    units = StringType()
    
# Add a list-of-logkp field to the existing Compound model so that compound
# records can carry logkp entries under the key `logkp_points`.
Compound.logkp_points = ListType(ModelType(logkp))

## Writing a New Parser

Next, write parsing rules that specify how to interpret text and convert it into the model:

In [7]:
import re
from chemdataextractor.parse import R, I, W, Optional, merge

# Grammar for a logkp mention: the word "logkp" (any case), an optional
# connecting phrase, then a non-negative integer or decimal number.
#
# Each I(...) matcher is applied to one token at a time, so a multi-word phrase
# such as "equals to" or "range of" has to be spelled as a sequence of
# single-token matchers; a single I('equals to') can never match.
# "range" is followed by an optional second word so that "range", "range of",
# "range is" and "range was" are all accepted by the same alternative.
prefix = (
    R(u'^logkp$', re.I)
    + Optional(
        I('of')
        | I('is')
        | I('was')
        | (I('equals') + I('to'))
        | (I('range') + Optional(I('of') | I('is') | I('was')))
    )
)

# No units rule is defined: only the numeric value is captured.
# Backslashes are escaped explicitly so the pattern text is unchanged while
# avoiding invalid string escape sequences such as '\d'.
value = R(u'^\\d+(\\.\\d+)?$')(u'value')

# Full rule; the matched result is tagged 'logkp' with a nested 'value' element.
logkp_model = (prefix + value)(u'logkp')

In [8]:
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first

class logkpParser(BaseParser):
    """Parser that turns matches of the `logkp_model` grammar into Compound records."""

    # Grammar whose matches are handed to interpret().
    root = logkp_model

    def interpret(self, result, start, end):
        """Yield one Compound holding the logkp value found in `result`.

        `result` is the matched element tree; `start` and `end` are accepted
        but not used here. The value is the first `./value/text()` hit
        (as returned by `first`). If the match has a `./cem` child, its
        name and label texts are copied onto the compound as well.
        """
        matched_value = first(result.xpath('./value/text()'))
        record = Compound(logkp_points=[logkp(value=matched_value)])

        # Only attach names/labels when a chemical-entity element is present.
        entity = first(result.xpath('./cem'))
        if entity is not None:
            record.names = entity.xpath('./name/text()')
            record.labels = entity.xpath('./label/text()')

        yield record


In [9]:
# Set the class-level parser list on Paragraph to a single logkpParser instance.
# NOTE(review): this overwrites whatever list Paragraph.parsers held before, for
# the rest of the kernel session - confirm that dropping the other parsers is intended.
Paragraph.parsers = [logkpParser()]

## Running the New Parser

In [10]:
# Rebind `d` to a new document whose paragraph contains a "logkp 3.79" mention,
# then serialize the extracted records; the result is displayed as the cell output.
d = Document(
    Heading(u'DLSCORE: A Deep Learning algorithm to identify ligands'),
    Paragraph(u'The algorithm identified some of the ligands and their values, such as resorcinol with a logkp 3.79.'))
d.records.serialize()

[{u'logkp_points': [{u'value': u'3.79'}]}]