In [1]:
from lxml import etree
from __init__ import Wn_grid_parser
from datetime import datetime

# TODO
* make sure it validates

## Download dtd

In [2]:
%%bash
# Fetch the WN-LMF DTD from the master branch of globalwordnet/schemas and store it under resources/cili/ (-O writes to that exact path).
wget -O resources/cili/WN-LMF.dtd  https://raw.githubusercontent.com/globalwordnet/schemas/master/WN-LMF.dtd 

--2017-03-06 10:05:40--  https://raw.githubusercontent.com/globalwordnet/schemas/master/WN-LMF.dtd
Resolving raw.githubusercontent.com... 151.101.36.133
Connecting to raw.githubusercontent.com|151.101.36.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8196 (8.0K) [text/plain]
Saving to: 'resources/cili/WN-LMF.dtd'

     0K ........                                              100% 18.6M=0s

2017-03-06 10:05:41 (18.6 MB/s) - 'resources/cili/WN-LMF.dtd' saved [8196/8196]



## LexicalEntry

FROM
```xml
<LexicalEntry id="elektroforetisch-a-1" partOfSpeech="adjective">
       <Lemma writtenForm="elektroforetisch"/>
       <WordForms/>
       <Morphology/>
       <MorphoSyntax/>
       <Sense definition="" id="o_a-73337" provenance="google+bing" senseId="1" synset="eng-30-02718845-a">
         <SenseRelations/>
         <Semantics-adjective/>
         <Pragmatics/>
       </Sense>
</LexicalEntry>
```

TO
```xml
<LexicalEntry id="w1">
    <Lemma writtenForm="grandfather" partOfSpeech="n"/>
    <Sense id="example-10161911-n-1" synset="example-10161911-n">
    </Sense>
</LexicalEntry>
```

TODO:
    1. senseId?
    2. provenance?
    3. mapping pos
    4. synset identifier -> list of definitions

FROM
```xml
<Synset id="eng-30-06618427-n" ili="i71250">
       <Definitions>
         <Definition gloss="(film) an abrupt transition from one scene to another" language="en" provenance="pwn"/>
       </Definitions>
       <SynsetRelations>
         <SynsetRelation provenance="pwn" relType="has_hyperonym" target="eng-30-06401328-n"/>
       </SynsetRelations>
</Synset>

```

TO
```xml
<Synset id="example-1-n" ili="in">
        <Definition>A father&apos;s father; a paternal grandfather</Definition>
        <!-- You can include metadata (such as source) at many points -->
        <!-- The ILI Definition must be at least 20 characters or five words -->
        <ILIDefinition dc:source="https://en.wiktionary.org/wiki/farfar">A father&apos;s father; a paternal grandfather</ILIDefinition>
        <SynsetRelation relType="hypernym" target="example-10162692-n"/>

</Synset>
```

TODO:
    1. mapping relations
    2. check provenance

QUESTIONS:
    1. What is the difference between a sense example and a synset definition?

### Helper functions and dictionaries

In [3]:
# Maps the relation type names used in the source ODWN file (keys) to the
# relType values written to the WN-LMF output (values). Relation types that
# are not listed here are dropped when synset relations are exported.
reltypes_mapping = {'near_antonym': 'antonym',
                    'fuzzynym': 'other',

                    'has_mero_location': 'mero_location',
                    'has_mero_member': 'mero_member',

                    'role_source_direction': 'source_direction',
                    'role_patient': 'patient',
                    'role_result': 'result',
                    'role_instrument': 'instrument',
                    'role_direction': 'direction',
                    'role_location': 'location',
                    'role_agent': 'agent',
                    'role_target_direction': 'target_direction',

                    'has_subevent': 'subevent',

                    # cross-part-of-speech variants collapse onto the plain relation
                    'has_hyperonym': 'hypernym',
                    'has_xpos_hyperonym': 'hypernym',
                    'has_hyponym': 'hyponym',
                    'has_xpos_hyponym': 'hyponym',
                    'has_meronym': 'meronym',
                    'has_holonym': 'holonym',

                    'has_mero_madeof': 'mero_substance',
                    'has_mero_portion': 'mero_portion',
                    'has_mero_part': 'mero_part',

                    'has_holo_portion': 'holo_portion',
                    'has_holo_member': 'holo_member',
                    'has_holo_location': 'holo_location',
                    # part-holonymy is the inverse of 'mero_part'; only
                    # 'has_holo_madeof' corresponds to 'holo_substance'
                    'has_holo_part': 'holo_part',
                    'has_holo_madeof': 'holo_substance',

                    'instance': 'instance_hypernym',

                    'near_synonym': 'eq_synonym'}

In [4]:
def validate(dtd_path, loaded_xml):
    '''
    validate a parsed xml document against a dtd

    :param str dtd_path: full path to dtd
    :param loaded_xml: parsed xml (lxml element or element tree) to validate

    :rtype: tuple
    :return: (succes, message): succes is the result of the dtd validation,
        message is "" when validation succeeded and otherwise the first
        error entry from the dtd error log
    '''
    # the dtd is parsed inside the constructor, so the file handle can be
    # released as soon as the DTD object exists
    with open(dtd_path) as dtd_file:
        dtd = etree.DTD(dtd_file)

    message = ""
    succes = dtd.validate(loaded_xml)
    if not succes:
        errors = dtd.error_log.filter_from_errors()
        # guard the index: a failed validation whose log holds no
        # error-level entries should not raise IndexError here
        if len(errors) > 0:
            message = errors[0]

    return (succes, message)

## Load new synsets information

In [5]:
# odwn synset ids that are excluded even when the file gives them an ILI definition
wrong_ones = set(['odwn-10-104225285-n',
                  'odwn-10-109919923-n',
                  'odwn-10-107611589-n',
                  'odwn-10-102360294-n',
                  'odwn-10-107262080-n'])

rbn_ids = set()   # ids taken from the second column of accepted rows
odwn_ids = {}     # odwn synset id (third column) -> ILI definition (eighth column)

with open('resources/cili/new_synsets_v2.csv') as infile:
    # the first line is skipped (presumably a header row)
    next(infile)
    for line in infile:
        fields = line.strip().split('\t')
        # only rows with at least eight tab-separated fields are usable
        if len(fields) >= 8:
            rbn_id = fields[1]
            odwn_id = fields[2]
            ilidef = fields[7]

            # keep rows with a non-empty definition other than the
            # placeholder 'x' whose synset is not blacklisted
            if ilidef and ilidef != 'x' and odwn_id not in wrong_ones:
                rbn_ids.add(rbn_id)
                odwn_ids[odwn_id] = ilidef

In [6]:
# number of accepted odwn synsets, then number of distinct rbn ids, one per line
print(len(odwn_ids), len(rbn_ids), sep='\n')

78
78


## Starting point

In [7]:
# Parse the existing wordnet that serves as the conversion source.
# NOTE(review): Wn_grid_parser.odwn is presumably the path to the ODWN LMF file — confirm in __init__.py.
old = Wn_grid_parser(Wn_grid_parser.odwn)
# Root element of the parsed source document.
old_root = old.doc.getroot()

In [8]:
# Empty output document: a <LexicalResource> root that declares the
# Dublin Core namespace under the 'dc' prefix.
root = etree.Element('LexicalResource',
                     nsmap={'dc': 'http://purl.org/dc/elements/1.1/'})
tree = etree.ElementTree(root)

# show the still-empty root element
etree.dump(root)

<LexicalResource xmlns:dc="http://purl.org/dc/elements/1.1/"/>


In [9]:
# Metadata attributes of the single <Lexicon> element; the publisher is
# stored as a namespaced Dublin Core attribute (dc:publisher).
lexicon_attributes = {
    "id": "odwn",
    "label": "Open Dutch WordNet",
    "language": "nl",
    "email": "piek.vossen@vu.nl",
    "license": "https://creativecommons.org/licenses/by-sa/4.0/",
    "version": "1.3",
    "citation": "Open Dutch WordNet. Marten Postma and Emiel van Miltenburg and Roxane Segers and Anneleen Schoen and Piek Vossen, Proceedings of the Global WordNet Conference 2016, (2016).",
    "url": "http://wordpress.let.vupr.nl/odwn/",
    "{http://purl.org/dc/elements/1.1/}publisher": "Global Wordnet Association",
    "confidenceScore" : "1.0",
}
dublin_core_nsmap = {"dc" : "http://purl.org/dc/elements/1.1/"}

lexicon_el = etree.Element('Lexicon',
                           attrib=lexicon_attributes,
                           nsmap=dublin_core_nsmap)

# attach the lexicon to the document root
root.append(lexicon_el)

## Select synset ids to add

In [10]:
# Ids of all synsets that will be exported: those with both an ILI and an
# id, plus those listed in odwn_ids.
added_synsets = set()
for synset_obj in old.synsets_get_generator():
    ili = synset_obj.get_ili()
    synset_id = synset_obj.get_id()

    linked_to_ili = ili is not None and bool(synset_id)
    if linked_to_ili or synset_id in odwn_ids:
        added_synsets.add(synset_id)

## Add LexicalEntries

In [11]:
# Emit one <LexicalEntry> (with a <Lemma> and a <Sense>) for every sense
# whose synset is exported; each sense id is emitted at most once.
# (mw not taken into account)
added_sense_ids = set()
for counter, le_obj in enumerate(old.les_get_generator()):

    synset_id = le_obj.get_synset_id()
    sense_id = le_obj.get_sense_id()

    # skip entries without a synset and senses that were already written
    if not synset_id or sense_id in added_sense_ids:
        continue
    # skip entries whose synset is not part of the export
    if synset_id not in added_synsets:
        continue

    # both source prefixes are rewritten to the 'odwn-' prefix
    synset_ili_format = (synset_id
                         .replace('eng-30-', 'odwn-')
                         .replace('odwn-10-', 'odwn-'))

    # TODO: add sense examples
    entry_attrib = {'id': 'w%s' % counter}
    lexical_entry_el = etree.SubElement(lexicon_el, 'LexicalEntry',
                                        attrib=entry_attrib)

    # only the first character of the part of speech is written
    pos = le_obj.get_pos()[0]
    if sense_id == 'c_546616':
        # hard-coded override: this sense is always written as a verb
        pos = 'v'

    lemma_attrib = {'writtenForm': le_obj.get_lemma(),
                    'partOfSpeech': pos}
    etree.SubElement(lexical_entry_el, 'Lemma', attrib=lemma_attrib)

    sense_attrib = {'id': le_obj.get_sense_id(),
                    'synset': synset_ili_format}
    etree.SubElement(lexical_entry_el, 'Sense', attrib=sense_attrib)

    added_sense_ids.add(sense_id)


## Add Tweet (noun + verb)

In [12]:
# Manually added lexical entries for 'tweet' (noun and verb):
# (LexicalEntry id, part of speech, sense id, synset id)
tweet_entries = [('w1000000', 'n', 'r_1', 'odwn-00000001-n'),
                 ('w1000001', 'v', 'r_2', 'odwn-00000002-v')]

for entry_id, tweet_pos, tweet_sense_id, tweet_synset_id in tweet_entries:
    lexical_entry_el = etree.SubElement(lexicon_el, 'LexicalEntry',
                                        attrib={'id': entry_id})

    etree.SubElement(lexical_entry_el, 'Lemma',
                     attrib={'writtenForm': 'tweet',
                             'partOfSpeech': tweet_pos})

    etree.SubElement(lexical_entry_el, 'Sense',
                     attrib={'id': tweet_sense_id,
                             'synset': tweet_synset_id})

    # register the id of the sense that was just written; the verb entry
    # previously registered a leftover loop variable instead of 'r_2'
    added_sense_ids.add(tweet_sense_id)

## Add synsets

In [13]:
# Emit a <Synset> element (definitions + mapped relations) for every
# exportable synset of the source wordnet.
for synset_obj in old.synsets_get_generator():

    ili = synset_obj.get_ili()
    synset_id = synset_obj.get_id()

    # reject synsets without an id before any string operation on the id
    if not synset_id:
        continue

    # synsets listed in odwn_ids are always written with ili='in'
    if synset_id in odwn_ids:
        ili = 'in'

    if ili is None:
        continue

    # both source prefixes are rewritten to the 'odwn-' prefix
    synset_ili_format = synset_id.replace('eng-30-', 'odwn-')
    synset_ili_format = synset_ili_format.replace('odwn-10-', 'odwn-')

    synset_el = etree.Element('Synset',
                              attrib={'id': synset_ili_format,
                                      'ili': ili,
                                      # the last character of the id is the part of speech
                                      'partOfSpeech' : synset_ili_format[-1]})

    # TODO: add language attribute
    # TODO: add Dutch definitions

    if synset_id.startswith('eng'):
        # one <Definition> per English gloss
        for def_en in synset_obj.get_glosses(languages=['en']):
            def_el = etree.Element('Definition')
            def_el.text = def_en
            synset_el.append(def_el)

    elif synset_id.startswith('odwn'):
        # the definition loaded into odwn_ids is used both as
        # <Definition> and as <ILIDefinition>
        def_el = etree.Element('Definition')
        def_el.text = odwn_ids[synset_id]
        synset_el.append(def_el)

        def_el = etree.Element('ILIDefinition')
        def_el.text = odwn_ids[synset_id]
        synset_el.append(def_el)

    for rel_obj in synset_obj.get_all_relations():
        reltype = rel_obj.get_reltype()
        target = rel_obj.get_target()

        # keep only relations with a known mapping whose target is exported;
        # checking first also keeps a missing target away from .replace()
        if reltype not in reltypes_mapping or target not in added_synsets:
            continue

        target_ili_format = target.replace('eng-30-', 'odwn-')
        target_ili_format = target_ili_format.replace('odwn-10-', 'odwn-')

        # TODO: add dc:source
        # NOTE(review): rel_obj.get_provenance() looks like the intended value — confirm
        rel_el = etree.Element('SynsetRelation',
                               attrib={'relType' : reltypes_mapping[reltype],
                                       'target' : target_ili_format})
        synset_el.append(rel_el)

    lexicon_el.append(synset_el)

## Add tweet synsets

In [14]:
# Synsets belonging to the manually added 'tweet' entries:
# (synset id, part of speech, definition text, hypernym target id)
tweet_synsets = [
    ('odwn-00000001-n', 'n',
     'a message or image posted on Twitter', 'odwn-06253690-n'),
    ('odwn-00000002-v', 'v',
     'to post a message or image on Twitter', 'odwn-00742320-v'),
]

for tweet_synset_id, tweet_pos, tweet_gloss, hypernym_id in tweet_synsets:
    synset_el = etree.Element('Synset',
                              attrib={'id': tweet_synset_id,
                                      'ili': 'in',
                                      'partOfSpeech' : tweet_pos})

    # the same text serves as <Definition> and as <ILIDefinition>
    for def_tag in ('Definition', 'ILIDefinition'):
        def_el = etree.SubElement(synset_el, def_tag)
        def_el.text = tweet_gloss

    rel_el = etree.SubElement(synset_el, 'SynsetRelation',
                              attrib={'relType': 'hypernym',
                                      'target': hypernym_id})

    lexicon_el.append(synset_el)

## Validate and save

In [15]:
# validate
dtd_path = 'resources/cili/WN-LMF.dtd'
succes, message = validate(dtd_path, tree)
print(succes)
print(message)

if succes:
    with open('resources/cili/odwn_cili.xml', "wb") as outfile:
        tree.write(outfile,
                   pretty_print=True,
                   xml_declaration=True,
                   encoding='utf-8')

True



In [16]:
%%bash
# Show the beginning of the written file as a quick visual check of the export.
head resources/cili/odwn_cili.xml

<?xml version='1.0' encoding='UTF-8'?>
<LexicalResource xmlns:dc="http://purl.org/dc/elements/1.1/">
  <Lexicon citation="Open Dutch WordNet. Marten Postma and Emiel van Miltenburg and Roxane Segers and Anneleen Schoen and Piek Vossen, Proceedings of the Global WordNet Conference 2016, (2016)." confidenceScore="1.0" email="piek.vossen@vu.nl" id="odwn" label="Open Dutch WordNet" language="nl" license="https://creativecommons.org/licenses/by-sa/4.0/" url="http://wordpress.let.vupr.nl/odwn/" version="1.3" dc:publisher="Global Wordnet Association">
    <LexicalEntry id="w0">
      <Lemma partOfSpeech="a" writtenForm="hydro-elektrisch"/>
      <Sense id="o_a-114723" synset="odwn-02827950-a"/>
    </LexicalEntry>
    <LexicalEntry id="w1">
      <Lemma partOfSpeech="a" writtenForm="elektroforetisch"/>
      <Sense id="o_a-73337" synset="odwn-02718845-a"/>
