In [1]:
from lxml import etree
import re
import os
import inspect
from py2neo import Graph, Node, Relationship

# The classes

In [2]:
def reverseDict(dic):
    '''Return a new dict in which every value of `dic` points back to its key.

    When several keys share one value, the key met last while iterating wins.'''
    inverted = {}
    for key, value in dic.items():
        inverted[value] = key
    return inverted

In [3]:
# Lookup tables for the 9-character morphological tag ("postag").
# Each table decodes the one-letter code found at one position of the tag
# (see Morph, which reads positions 0-8 in this same order).
# Every table maps '-' to '-', i.e. "feature not specified".
# All tables are written directly as code -> name.

# position 0: part of speech
# NOTE(review): 't' decodes to 'verb' (same as 'v') and 'g' decodes to 'adverb'
# (same as 'd'); confirm against the tagset documentation that this merge is intended.
pos = {'-': '-',
       'd': 'adverb',
       'n': 'noun',
       'm': 'numeral',
       'p': 'pron',
       'v': 'verb',
       't': 'verb',
       'x': 'irregular',
       'l': 'article',
       'e': 'exclamation',
       'a': 'adjective',
       'r': 'preposition',
       'c': 'conjunction',
       'g': 'adverb',
       'u': 'punctuation',
       }

# position 1: person
person = {'1': '1st', '-': '-', '3': '3rd', '2': '2nd'}

# position 2: number
number = {'s': 'singular', '-': '-', 'p': 'plural', 'd': 'dual'}

# position 3: tense
tense = {'i': 'imperfect',
         'f': 'future',
         'r': 'perfect',
         '-': '-',
         't': 'future_perfect',
         'a': 'aorist',
         'l': 'pluperfect',
         'p': 'present'}

# position 4: mood ('n' was previously misspelled as 'infintive')
mood = {'o': 'optative',
        '-': '-',
        'm': 'imperative',
        'i': 'indicative',
        'n': 'infinitive',
        's': 'subjunctive',
        'p': 'participle'}

# position 5: voice
voice = {'m': 'middle',
         'p': 'passive',
         '-': '-',
         'e': 'mediopassive',
         'a': 'active'}

# position 6: gender
gender = {'n': 'neuter', 'm': 'masculine', 'f': 'feminine', '-': '-'}

# position 7: case
case = {'a': 'accusative',
        'n': 'nominative',
        'v': 'vocative',
        '-': '-',
        'd': 'dative',
        'g': 'genitive'}

# position 8: degree
degree = {'s': 'superl', '-': '-', 'c': 'comp'}

In [4]:
def _setPropIfThere(el, p):
    '''generic function that tries to set a proprety by accessing the appropriate XML attribute;
    if the attribute is not there, an empty string is returned.
    This is used for properties like "cite" or "cid" that might
    or might not be set for all files'''
    try:
        s = el.attrib[p]
    except KeyError:
        s = None
    return s

In [5]:
class Sentence():
    '''A treebank sentence wrapped around a parsed XML sentence element.'''

    def __init__(self, el, **kwargs):
        '''Takes a parsed xml sentence element, returns a Sentence object with a series of properties defined.

        The element must carry the attributes "document_id", "id", "subdoc" and "span"
        (a missing one raises KeyError).

        Optionally, you can pass a series of keyword:value pairs:
        - author, work, genre, chronology (bibliographic information, default None)
        - speaker, meter (content-related information, default "")
        Any other keyword is ignored.'''
        self._element = el
        self._raw = etree.tostring(el, encoding="UTF-8").decode("utf8")
        self._doc_id = el.attrib["document_id"]
        self.sent_id = el.attrib["id"]
        self.subdoc = el.attrib["subdoc"]
        self.span = el.attrib["span"]
        # `_tokens` wraps every word element again on each access: read it once here
        tokens = self._tokens
        self._artificials = [t for t in tokens if type(t) is Artificial_Token]
        self._words = [t for t in tokens if type(t) is Word]
        # (total tokens, artificial nodes, tokens that are not artificial)
        self._stats = (len(tokens), len(self._artificials), len(tokens) - len(self._artificials))
        # last component of the document id + sentence id + '0' in the token slot
        self.address = self._doc_id.split(":")[-1] + "#" + self.sent_id + "#" + '0'

        #Some bibliographic information
        self.author = kwargs.pop("author", None)
        self.work = kwargs.pop("work", None)
        self.genre = kwargs.pop("genre", None)
        self.chronology = kwargs.pop("chronology", None)

        #Content-related information
        self.speaker = kwargs.pop("speaker", "")
        self.meter = kwargs.pop("meter", "")

    def _isArtificial(self, t):
        '''True when the word element `t` carries an "artificial" attribute, False otherwise.'''
        return "artificial" in t.attrib

    @property
    def _tokens(self):
        '''A fresh list with one Artificial_Token or Word per "word" child of the sentence element,
        in the order returned by the xpath query.'''
        tokens = []
        for t in self._element.xpath("word"):
            if self._isArtificial(t):
                tokens.append(Artificial_Token(t))
            else:
                tokens.append(Word(t))
        return tokens

    def printStats(self):
        '''Print the number of tokens, words and artificial nodes of the sentence.'''
        print('''Total tokens:\t{};\nWords:\t{};\nArtificial nodes:\t{}'''.format(self._stats[0],
                                                                                 self._stats[2], self._stats[1]))

    def __str__(self):
        '''returns the plain sentence: the forms of the Word tokens joined by single spaces'''
        # the filter must test the comprehension's own variable (it used to test an undefined `t`)
        return ' '.join([tok.form for tok in self._tokens if type(tok) is Word])

In [6]:
class Token():
    '''Base class shared by words and artificial nodes: whatever both have in common belongs here.'''

    def __init__(self, word_element):
        '''Read the attributes of a parsed word element.

        "id", "form", "relation" and "head" are required (KeyError when missing);
        "lemma" and "postag" are optional and default to None.'''
        self._element = word_element
        self._raw = etree.tostring(word_element, encoding="UTF-8").decode("utf8")
        attrs = word_element.attrib
        self._token_id = attrs["id"]
        self.rank = self._token_id
        self.form = attrs["form"]
        # relation tag exactly as written in the file
        self.original_label = attrs["relation"]
        # the same tag broken at "_", e.g. "SBJ_AP_CO" > [SBJ, AP, CO]
        self._rel_components = self.original_label.split("_")
        self.head = attrs["head"]
        # the first component is the relation proper
        self._relation = self._rel_components[0]
        # artificial nodes may lack these two attributes
        self._lemma_original = _setPropIfThere(word_element, "lemma")
        self.postag = _setPropIfThere(word_element, "postag")

        # Evaluate each computed property once; the results are discarded,
        # but any error they raise surfaces at construction time.
        self.lemma
        self.address
        self._morphology

    def setIsMember(self, tag):
        '''Return 1 when `tag` ("AP" or "CO") is one of the relation components, 0 otherwise.'''
        assert tag in ("AP", "CO"), "the appendix tag must be either AP or CO!"
        return 1 if tag in self._rel_components else 0

    @property
    def lemma(self):
        '''The original lemma with any trailing digits removed; None when there is no lemma.'''
        if self._lemma_original is None:
            return None
        return re.sub('[0-9]+$', '', self._lemma_original)

    @property
    def address(self):
        '''"<last ':'-component of the document id>#<sentence id>#<token id>", read from the parent element.'''
        parent = self._element.getparent()
        doc_part = parent.attrib["document_id"].split(":")[-1]
        return "{}#{}#{}".format(doc_part, parent.attrib["id"], self._token_id)

    @property
    def _morphology(self):
        '''A Morph built from the postag, or None when the token has no postag.'''
        if self.postag is None:
            return None
        return Morph(self.postag)
            

In [7]:
class Word(Token):
    '''A token of the sentence that is not an artificial node.'''

    def __init__(self, word_element):
        '''Initialise the common Token attributes, then the word-only ones.'''
        super().__init__(word_element)
        # optional attributes: None when the file does not provide them
        self.cid = _setPropIfThere(word_element, "cid")
        self.cite = _setPropIfThere(word_element, "cite")
        # 1/0 flags: does the relation tag carry the CO / AP component?
        self.isMemberOfCoord = self.setIsMember("CO")
        self.isMemberOfApos = self.setIsMember("AP")

        # Semantics: placeholders, to be implemented
        self.ne_type = ''
        self.animacy = ''

In [8]:
class Artificial_Token(Token):
    '''A token for a word element that carries the "artificial" attribute.'''

    def __init__(self, word_element):
        '''Initialise the common Token attributes, then the ones specific to artificial nodes.

        "insertion_id" and "artificial" are required (KeyError when missing).'''
        super().__init__(word_element)
        attrs = word_element.attrib
        self._insertion_id = attrs["insertion_id"]
        self.artificial_type = attrs["artificial"]
        # 1/0 flags: does the relation tag carry the CO / AP component?
        self.isMemberOfCoord = self.setIsMember("CO")
        self.isMemberOfApos = self.setIsMember("AP")


In [9]:
class Morph():
    '''The nine morphological features decoded from a 9-character tag.'''

    # feature names, in the order of the tag positions they are decoded from
    _FIELDS = ('pos', 'person', 'number', 'tense', 'mood',
               'voice', 'gender', 'case', 'degree')

    def __init__(self, tag):
        '''Decode `tag` one character at a time through the module-level lookup tables.

        A tag whose length is not 9 fails the assertion; a code missing from
        its table raises KeyError.'''
        assert len(tag) == 9, "Tag: {} is invalid".format(tag)
        # one lookup table per tag position, same order as _FIELDS
        tables = (pos, person, number, tense, mood, voice, gender, case, degree)
        for name, table, code in zip(self._FIELDS, tables, tag):
            setattr(self, name, table[code])

    @property
    def full(self):
        '''All nine features as a dict keyed by feature name.'''
        features = {}
        for name in self._FIELDS:
            features[name] = getattr(self, name)
        return features

Tests

In [12]:
t = s._tokens[4]

In [202]:
setPropDict(t)

TypeError: <lambda>() missing 1 required positional argument: 'n'

In [51]:
t.address

'tlg0011.tlg001.perseus-grc2#2898476#28'

In [125]:
t._morphology.pos

'noun'

In [173]:
inspect.isdatadescriptor(t._element)

False

In [191]:
isinstance(getattr(type(t), "lemma", None), property)

True

In [15]:
properties = inspect.getmembers(t, lambda o: isinstance(o, property))

In [28]:
setPropDict(t)

{'address': 'tlg0011.tlg001.perseus-grc2#2898476#5',
 'animacy': '',
 'cid': '36335582',
 'cite': 'urn:cts:greekLit:tlg0011.tlg001:1',
 'form': 'ἀνθρώπων',
 'head': '1',
 'isMemberOfApos': 0,
 'isMemberOfCoord': 0,
 'lemma': 'ἄνθρωπος',
 'ne_type': '',
 'original_label': 'ATR',
 'postag': 'n-p---mg-',
 'rank': '5'}

In [40]:
_createHeadDep(t)

('tlg0011.tlg001.perseus-grc2#2898476#1',
 'tlg0011.tlg001.perseus-grc2#2898476#5')

# To Graph DB

## Functions

In [10]:
def _createHeadDep(t):
    '''gets a token object (word or artificial);
    returns a tuple (head-address, dependent-address)'''
    add_parts = t.address.split("#")
    return ("{}#{}#{}".format(add_parts[0], add_parts[1], t.head), t.address)

In [11]:
_createHeadDep(t)

NameError: name 't' is not defined

In [None]:
def createRels(Sent, graph):
    '''Takes a Sentence object and a Neo4j graph, and links every token to its head.

    Note that the nodes must have been already created in the DB: the query only
    matches existing nodes by their `address` property, so nothing is created
    for a token whose head or own node is missing.

    One statement is run per token. The relationship goes from the head node to
    the dependent node and its type is the token's `_relation`.'''
    for t in Sent._tokens:
        head_add, dep_add = _createHeadDep(t)
        # A relationship type cannot be passed as a Cypher parameter, so it is
        # interpolated; it is backtick-quoted (embedded backticks doubled) so that
        # an unusual label cannot alter the statement.
        rel_type = "`{}`".format(str(t._relation).replace("`", "``"))
        # MERGE only adds the relationship when it is not there yet
        # (it replaces CREATE UNIQUE, which recent Neo4j versions no longer accept).
        query = '''MATCH (h),(d)
        WHERE h.address = $head AND d.address = $dep
        MERGE (h)-[r:{}]->(d)
        RETURN r'''.format(rel_type)
        # the addresses travel as parameters instead of being pasted into the text
        graph.run(query, {"head": head_add, "dep": dep_add})

In [None]:
def setPropDict(obj):
    '''Takes an object. Returns a dict {name: value} of its members as listed by
    `inspect.getmembers`, leaving out the names that begin with "_" and the bound methods.'''
    props = {}
    for name, value in inspect.getmembers(obj):
        if name.startswith('_') or inspect.ismethod(value):
            continue
        props[name] = value
    return props

In [None]:
def toGraphNodes(s):
    '''Turn a sentence and its tokens into a list of Nodes ready to be pushed to a Neo4j db.

    The first item is the "Sentence" root node; one node per token follows, labelled
    "Token" for Word objects and "Artificial" for anything else. Node properties come
    from setPropDict; when a token has a morphology, its decoded features are added too.
    '''
    nodes = [Node("Sentence", **setPropDict(s))]
    for t in s._tokens:
        label = "Token" if type(t) is Word else "Artificial"
        n = Node(label, **setPropDict(t))
        morph = t._morphology
        if morph:
            # copy each decoded feature onto the node
            for feature, value in morph.full.items():
                n[feature] = value
        nodes.append(n)
    return nodes

In [None]:
def createNodes(graph, nodes):
    '''Call `graph.create` once for every item of `nodes`, in order.'''
    for node in nodes:
        graph.create(node)

## Workflow sample

We start by transforming a sentence into a series of Graph nodes

In [109]:
nodes = toGraphNodes(s)

In [111]:
nodes[30]

(d7cad89:Artificial {address:"tlg0011.tlg001.perseus-grc2#2898476#30",artificial_type:"elliptic",form:"[0]",head:"22",isMemberOfApos:1,isMemberOfCoord:1,original_label:"OBJ_AP_CO",rank:"30"})

We connect to the Graph DB

In [112]:
# The password is read from the NEO4J_PASSWORD environment variable so that the
# credential is not stored in the notebook (KeyError when the variable is unset).
g = Graph(password=os.environ["NEO4J_PASSWORD"])

Very important: we create a **uniqueness constraint** on the node address, so that no two nodes for the same token are added to the DB.

Here we create the nodes and commit them to the DB!

In [113]:
for n in nodes:
    g.merge(n)

And here we create the relations!

In [114]:
createRels(s, g)

In [99]:
nodes[28]

(c311769:Token {address:"tlg0011.tlg001.perseus-grc2#2898476#28",animacy:"",case:"nominative",cid:"36335603",cite:"urn:cts:greekLit:tlg0011.tlg001:3",degree:"-",form:"κακός",gender:"masculine",head:"31",isMemberOfApos:0,isMemberOfCoord:0,lemma:"κακός",mood:"-",ne_type:"",number:"singular",original_label:"PNOM",person:"-",pos:"adjective",postag:"a-s---mn-",rank:"28",tense:"-",voice:"-"})

# Some useful queries

Match all token nodes, regardless of whether they are Tokens or Artificials (Sentence nodes are excluded by the `WHERE` clause)

```cypher
MATCH (t)
WHERE t:Token OR t:Artificial
RETURN t
```

Get a sentence and the descendants

```cypher
MATCH path = (t)<-[*]-(:Sentence)
WHERE t:Token OR t:Artificial
RETURN path
```

Delete all nodes

```cypher
MATCH (n)
OPTIONAL MATCH (n)-[r]-()
DELETE n,r
```

# Clear the DB

**CAREFUL!** The following statement will clear the DB.

In [117]:
#g.delete_all()

# Main loop

In [124]:
# The password is read from the NEO4J_PASSWORD environment variable so that the
# credential is not stored in the notebook (KeyError when the variable is unset).
g = Graph(password=os.environ["NEO4J_PASSWORD"])

In [118]:
# Parse the treebank file of one document.
# NOTE(review): absolute, machine-specific path; the cell only runs where the file
# exists at exactly this location.
x = etree.parse(\
'/Users/fmambrini/Documents/lavoro/treebank/files/AGDT2.X/data/tlg0011.tlg001.perseus-grc2.tb.xml')
# every "sentence" element found anywhere in the parsed document
sents = x.xpath("//sentence")

In [123]:
biblio = {"author" : "Sophocles", "genre" : "tragedy", "chronology" : "5th BCE",
         "work" : "Trachiniae"}

In [125]:
%%time
# Push the whole document to the graph, one sentence at a time.
for s_el in sents:
    # wrap the XML element, attaching the bibliographic keywords to the sentence
    s = Sentence(s_el, **biblio)
    # root node first, then one node per token
    nodes = toGraphNodes(s)
    # the nodes go in before the relations: createRels only matches
    # nodes that are already in the DB, looking them up by address
    for n in nodes:
        g.merge(n)
    createRels(s, g)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.01 µs


# Test

In [363]:
ns = { 'tb' : "http://nlp.perseus.tufts.edu/syntax/treebank/1.5"}

In [364]:
x = etree.parse('/Users/fmambrini/Documents/lavoro/treebank/files/AGDT2.X/data/tlg0011.tlg001.perseus-grc2.tb.xml')

In [79]:
sents = x.xpath("//sentence")

In [82]:
s = Sentence(sents[0], **{"author" : "Sophocles"})

In [83]:
t = s._tokens[0]

In [84]:
t.__dict__

{'_element': <Element word at 0x104954a48>,
 '_lemma_original': 'λόγος1',
 '_raw': '<word xmlns:treebank="http://nlp.perseus.tufts.edu/syntax/treebank/1.5" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" id="1" cid="36335578" form="&#x3BB;&#x3CC;&#x3B3;&#x3BF;&#x3C2;" lemma="&#x3BB;&#x3CC;&#x3B3;&#x3BF;&#x3C2;1" postag="n-s---mn-" head="3" relation="SBJ" cite="urn:cts:greekLit:tlg0011.tlg001:1"/>\n        ',
 '_rel_components': ['SBJ'],
 '_relation': 'SBJ',
 '_token_id': '1',
 'animacy': '',
 'cid': '36335578',
 'cite': 'urn:cts:greekLit:tlg0011.tlg001:1',
 'form': 'λόγος',
 'head': '3',
 'isMemberOfApos': 0,
 'isMemberOfCoord': 0,
 'ne_type': '',
 'original_label': 'SBJ',
 'postag': 'n-s---mn-',
 'rank': '1'}

In [371]:
len("p-s---fn-")

9

In [372]:
s.printStats()

Total tokens:	31;
Words:	29;
Artificial nodes:	2


In [397]:
sentences = []
for s in sents:
    sentences.append(Sentence(s))

In [406]:
s = sentences[0]
t = s._tokens[4]

In [407]:
t._element.attrib["postag"]
#t.lemma

'n-p---mg-'

In [379]:
for k,v in t._morphology.full.items():
    print("{}\t{}".format(k,v))

AttributeError: 'function' object has no attribute 'full'

In [182]:
# the wrapped XML element is stored on the private attribute `_element`
_setPropIfThere(t._element, "form")

'οὐκ'

In [173]:
re.sub('[0-9]+$', '', "λόγος1")
#re.sub('[0-9]+$', '', "λόγο")

'λόγος'

In [274]:
for tok in s._tokens:
    print("{} => {}".format(tok.original_label, tok._relation))
    #print(type(t))

SBJ => SBJ
AuxY => AuxY
PRED => PRED
ATV => ATV
ATR => ATR
ATR => ATR
AuxX => AuxX
AuxC => AuxC
AuxZ => AuxZ
AuxY => AuxY
OBJ_AP => OBJ
ATR => ATR
ATR => ATR
AuxX => AuxX
ADV => ADV
AuxY => AuxY
ADV => ADV
SBJ => SBJ
APOS => APOS
AuxZ => AuxZ
AuxY => AuxY
AuxC => AuxC
PNOM => PNOM
AuxZ => AuxZ
COORD => COORD
AuxC => AuxC
ADV => ADV
PNOM => PNOM
AuxK => AuxK
OBJ_AP_CO => OBJ
OBJ_AP_CO => OBJ


In [196]:
# Build the sentence root node by hand, listing every property explicitly.
# The document id is read from `_doc_id`, the attribute Sentence actually sets.
root = Node("Sentence", sentence_id=s.sent_id,
                    address=s.address,
                    document = s._doc_id,
                    subdoc = s.subdoc,
                    span = s.span,
                   author = s.author,
                   work = s.work,
                   chronology=s.chronology,
                   genre = s.genre,
                   speaker = s.speaker,
                    meter = s.meter
                   )

In [219]:
props

{'animacy': '',
 'artificial_type': 'elliptic',
 'element': <Element word at 0x103d11b08>,
 'form': '[1]',
 'head': '26',
 'isMemberOfApos': 1,
 'isMemberOfCoord': 1,
 'lemma_original': None,
 'ne_type': '',
 'postag': None,
 'relation': 'OBJ',
 'token_id': '31'}

In [226]:
d = {'form': '[1]',
 'head': '26',
 'isMemberOfApos': 1, 'lemma_original': "None"}

n = Node("#Artificial", **d)

In [227]:
n

(a5110f2:`#Artificial` {form:"[1]",head:"26",isMemberOfApos:1,lemma_original:"None"})

In [228]:
n["saint"] = "padre pio"

In [229]:
n

(a5110f2:`#Artificial` {form:"[1]",head:"26",isMemberOfApos:1,lemma_original:"None",saint:"padre pio"})

In [298]:
class Fuffa():
    '''Scratch class: a property whose value depends on the truthiness of `par`.'''

    def __init__(self, par):
        self.par = par

    @property
    def robba(self):
        '''"Hurra!" when `par` is truthy, otherwise the middle item of a fixed three-item list.'''
        if not self.par:
            fallback = ["pd", "pm", "pp"]
            return fallback[1]
        return "Hurra!"

In [299]:
f = Fuffa(None)

In [300]:
f.robba

'pm'

In [42]:
head_add,dep_add = _createHeadDep(t)
query = '''MATCH (h),(d)
WHERE h.address = "{}" AND d.address = "{}"
CREATE (h)-[r:{}]->(d)
RETURN r'''.format(head_add, dep_add,t._relation)

In [43]:
query

'MATCH (h),(d)\nWHERE h.address = "tlg0011.tlg001.perseus-grc2#2898476#1" AND d.address = "tlg0011.tlg001.perseus-grc2#2898476#5"\nCREATE (h)-[r:ATR]->(d)\nRETURN r'

Help on class Graph in module py2neo.database:

class Graph(builtins.object)
 |  The `Graph` class represents a Neo4j graph database. Connection
 |  details are provided using URIs and/or individual settings. For any
 |  given `Graph`, the following protocol combinations are supported:
 |  
 |  - HTTP
 |  - HTTPS
 |  - Bolt + HTTP
 |  - Bolt/TLS + HTTPS
 |  
 |  Note that either HTTP or HTTPS must be enabled to allow for
 |  discovery and for some legacy features to be supported.
 |  
 |  The full set of `settings` supported are:
 |  
 |  Keyword         Description                                    Type(s)         Default
 |  ``bolt``        Use Bolt* protocol (`None` means autodetect)   bool, ``None``  ``None``
 |  ``secure``      Use a secure connection (Bolt/TLS + HTTPS)     bool            ``False``
 |  ``host``        Database server host name                      str             ``'localhost'``
 |  ``http_port``   Port for HTTP traffic                          int             `

In [78]:
toks = s._tokens

In [83]:
t = toks[22]

In [85]:
t._relation

'PNOM'