# Functions

The original information is in XML format, and DB files are needed for the Wordnet module. This section contains all functions created for extracting relevant information from the XML file and converting it into a suitable format for database creation. Data extraction from the XML format uses BeautifulSoup, and the database files are created using SQLite.

In [1]:
import re
import sqlite3
from bs4 import BeautifulSoup

Getting relevant information from all Wordnet entries from the XML-file.

In [6]:
def wordnet_entries(lexical_entries):
    """Flatten lexical entries into one tuple per sense.

    Every element of ``lexical_entries`` must provide ``find`` and
    ``find_all`` (for example a BeautifulSoup tag). Its ``lemma`` child
    supplies the ``writtenform`` and ``partofspeech`` attributes, and each
    ``sense`` child supplies the ``id`` and ``synset`` attributes.

    Returns a list of ``(lemma, pos, sense, sense_id, estwn_id)`` tuples,
    where ``sense`` is the string of digits that remains in the last
    ``-``-separated part of the sense id.
    """
    rows = []

    for lexical_entry in lexical_entries:
        lemma_tag = lexical_entry.find('lemma')
        part_of_speech = lemma_tag.get('partofspeech')
        written_form = lemma_tag.get('writtenform')

        for sense_tag in lexical_entry.find_all('sense'):
            sense_id = sense_tag.get('id')
            synset_id = sense_tag.get('synset')
            # Only the trailing segment of the sense id carries the sense number.
            last_segment = sense_id.rsplit('-', 1)[-1]
            sense_number = re.sub('[^0-9]', '', last_segment)
            rows.append(
                (written_form, part_of_speech, sense_number, sense_id, synset_id)
            )

    return rows

Finding the name (main phrase of the synset) for each synset. Most names can be found from the definition of the synset.

In [7]:
def synset_names(xml_synsets):
    """Map synset ids to the name stored on their first definition.

    For every synset, the first ``definition`` child is inspected; when it
    carries a ``sourcesense`` attribute, that value is recorded under the
    synset's ``id`` attribute. Synsets without a definition, or whose first
    definition has no ``sourcesense``, are left out of the result.

    Returns a dict ``{synset_id: sourcesense}``. If the same synset id occurs
    more than once, the last occurrence wins.
    """
    names = {}

    for synset in xml_synsets:
        definition = synset.find('definition')
        if definition is None:
            continue

        sourcesense = definition.get('sourcesense')
        # The result is keyed by synset id, so the only meaningful filter is
        # whether a name exists at all; comparing the name itself against the
        # keys could wrongly discard it.
        if sourcesense is not None:
            names[synset.get('id')] = sourcesense

    return names

Creating a list of tuples where each tuple contains relevant information about one lexical entry.

In [8]:
def database_entries(wn_entries, names):
    """Build one database row per Wordnet entry.

    ``wn_entries`` holds ``(lemma, pos, sense, sense_id, estwn_id)`` tuples
    and ``names`` maps synset ids to synset names.

    ``names`` is updated in place: when a synset id has no name yet, the
    sense id of the first entry seen for that synset becomes its name.

    Returns a list of
    ``(id, lemma, pos, sense, synset_name, estwn_id, is_name)`` tuples, where
    ``id`` is ``estwn_id`` with every non-digit character removed and
    ``is_name`` is 1 when the entry's own sense id is the synset name,
    otherwise 0.
    """
    non_digit = re.compile("[^0-9]")
    rows = []

    for lemma, pos, sense, sense_id, estwn_id in wn_entries:
        # Most names are already known; otherwise fall back to this entry's
        # sense id and remember it for the rest of the synset.
        synset_name = names.setdefault(estwn_id, sense_id)
        is_name = 1 if synset_name == sense_id else 0
        numeric_id = non_digit.sub("", estwn_id)
        rows.append(
            (numeric_id, lemma, pos, sense, synset_name, estwn_id, is_name)
        )

    return rows

Creating a list of tuples (of synset relations) where each tuple contains the start vertex, end vertex and specified relation.
The start and end vertex of each relation are the numeric ids of the corresponding synsets.

In [9]:
def database_relations(xml_synsets):
    """Collect the relations of every synset as tuples.

    For each ``synsetrelation`` child of a synset, one tuple
    ``(target_id, synset_id, reltype)`` is produced. Both ids are the
    ``target`` / ``id`` attribute values with all non-digit characters
    removed; ``reltype`` is taken from the relation unchanged.

    Returns the tuples as a list, in document order.
    """
    strip_non_digits = re.compile("[^0-9]").sub
    edges = []

    for synset in xml_synsets:
        source_id = strip_non_digits("", synset.get("id"))
        edges.extend(
            (strip_non_digits("", rel.get("target")), source_id, rel.get("reltype"))
            for rel in synset.find_all("synsetrelation")
        )

    return edges

Creating a list of tuples where each tuple consists of synset name and its definition.

In [10]:
def database_definitions(xml_synsets, names=None):
    """Pair every synset definition with the name of its synset.

    Parameters:
        xml_synsets: iterable of synset elements providing ``find_all`` and
            ``get`` (for example BeautifulSoup tags).
        names: optional mapping ``{synset_id: synset_name}`` used for
            definitions that have no ``sourcesense`` attribute. When omitted,
            the module-level ``sourcesenses`` mapping is used, as before.

    Returns a list of ``(synset_name, definition_text)`` tuples, where
    ``definition_text`` is whatever ``definition.find(text=True)`` returns.

    Raises KeyError when a definition lacks ``sourcesense`` and the mapping
    has no entry for the synset id.
    """
    db_definitions = []

    for synset in xml_synsets:
        for definition in synset.find_all('definition'):
            synset_name = definition.get('sourcesense')
            if synset_name is None:
                # Resolve the fallback mapping lazily so that the global is
                # only required when a definition actually needs it.
                lookup = sourcesenses if names is None else names
                synset_name = lookup[synset.get('id')]
            db_definitions.append((synset_name, definition.find(text=True)))

    return db_definitions

Creating a list of tuples where each tuple consists of a synset sense (literal) and an example for it.

In [11]:
def database_examples(xml_senses):
    """Pair every usage example with the id of the sense it belongs to.

    Each element of ``xml_senses`` must provide ``find_all`` and ``get``.
    For every ``example`` child, one tuple ``(sense_id, example_text)`` is
    produced, where ``sense_id`` is the sense's ``id`` attribute and
    ``example_text`` is whatever ``example.find(text=True)`` returns.

    Returns the tuples as a list; senses without examples add nothing.
    """
    pairs = []

    for sense_tag in xml_senses:
        example_tags = sense_tag.find_all("example")
        if example_tags is None:
            continue
        pairs.extend(
            (sense_tag.get("id"), example_tag.find(text=True))
            for example_tag in example_tags
        )

    return pairs

In [12]:
def create_database(file_path, db_name, create_table, insert, values):
    """(Re)fill a single SQLite table with the given rows.

    Parameters:
        file_path: path of the SQLite database file to open or create.
        db_name: name of the table that ``create_table`` creates; it is
            interpolated into the ``DELETE`` statement as-is, so it must be a
            trusted identifier.
        create_table: ``CREATE TABLE`` statement executed first.
        insert: parameterised ``INSERT`` statement.
        values: iterable of row tuples passed to ``executemany``.

    Existing rows are deleted before inserting, so running the function again
    does not create duplicates. The delete and the inserts are committed
    together (or rolled back together on error), and the connection is always
    closed afterwards.
    """
    conn = sqlite3.connect(file_path)
    try:
        # `with conn` only commits or rolls back; it does not close the
        # connection, hence the surrounding try/finally.
        with conn:
            conn.execute(create_table)
            conn.execute("DELETE FROM {};".format(db_name))
            conn.executemany(insert, values)
    finally:
        conn.close()

# Test

In [21]:
import doctest

In [9]:
# Read the whole Wordnet XML file into memory, then parse it with the 'lxml' parser.
with open("../source_data/estwn-et-2.3.2.xml", encoding="UTF-8") as xml_file:
    data = xml_file.read()

soup = BeautifulSoup(data, 'lxml')

In [10]:
# Take the first element of each kind as a small fixture for the checks below.
entry, synset, sense = (
    soup.find(tag_name) for tag_name in ('lexicalentry', 'synset', 'sense')
)

In [11]:
# Run every extraction function once on the single-element fixtures.
wn_entry = wordnet_entries([entry])
sourcesense = synset_names([synset])
# database_entries adds a fallback name to `sourcesense` for any synset id it does not contain yet.
db_entry = database_entries(wn_entry, sourcesense)
db_relation = database_relations([synset])
# NOTE(review): for a definition without a sourcesense attribute, database_definitions reads the
# module-level `sourcesenses` (plural), not the `sourcesense` dict built here. Confirm that
# `sourcesenses` is defined before this cell runs.
db_definition = database_definitions([synset])
db_example = database_examples([sense])

In [30]:
def wn_entry_len():
    '''
    Return the number of fields in the first tuple built from the sample entry.

    >>> wn_entry_len()
    5
    '''
    rows = wordnet_entries([entry])
    first_row = rows[0]
    return len(first_row)

In [51]:
def sourcesense_test():
    '''
    Check the format of the mapping that synset_names builds for the sample synset.

    Returns True when the first key starts with 'estwn-et-' and the first
    value matches 's-.+-'. Returns False otherwise, including when no name
    was found for the sample synset.

    >>> sourcesense_test()
    True
    '''
    # Inspect the freshly computed mapping, not whatever a module-level
    # variable happens to hold from an earlier cell.
    names = synset_names([synset])
    if not names:
        return False

    first_key, first_value = next(iter(names.items()))
    correct_key = re.match(r'estwn-et-', first_key) is not None
    correct_value = re.match(r's-.+-', first_value) is not None
    return correct_key and correct_value

In [67]:
def db_entry_len():
    '''
    Return the number of fields in the first database row built from the sample fixtures.

    >>> db_entry_len()
    7
    '''
    rows = database_entries(wordnet_entries([entry]), synset_names([synset]))
    return len(rows[0])

In [82]:
def db_entry_id():
    '''
    Check that the id of the first database row can be converted to an integer.

    Returns False when no row was produced or the id is not numeric. Errors
    raised while building the rows are not swallowed, so a broken fixture or
    pipeline shows up as a traceback instead of a silent False.

    >>> db_entry_id()
    True
    '''
    rows = database_entries(wordnet_entries([entry]), synset_names([synset]))
    if not rows:
        return False

    try:
        int(rows[0][0])
    except (TypeError, ValueError):
        return False
    return True

In [86]:
def db_relation_ids():
    '''
    Check that both vertex ids of every relation of the sample synset are numeric.

    Each relation is a (start_id, end_id, reltype) tuple; both ids must be
    convertible to int. Returns True when all of them are (also when the
    sample synset has no relations at all), False otherwise.

    >>> db_relation_ids()
    True
    '''
    try:
        for start_id, end_id, _reltype in database_relations([synset]):
            int(start_id)
            int(end_id)
    except (TypeError, ValueError):
        return False
    return True

In [87]:
# Run the doctests found in this module's docstrings; verbose=True reports every example.
doctest.testmod(verbose=True)

Trying:
    db_entry_id()
Expecting:
    True
ok
Trying:
    db_entry_len()
Expecting:
    7
ok
Trying:
    db_entry_id()
Expecting:
    True
ok
Trying:
    sourcesense_len()
Expecting:
    2
ok
Trying:
    sourcesense_test()
Expecting:
    True
ok
Trying:
    wn_entry_len()
Expecting:
    5
ok
10 items had no tests:
    __main__
    __main__.create_database
    __main__.database_definitions
    __main__.database_entries
    __main__.database_examples
    __main__.database_relations
    __main__.fun
    __main__.ha
    __main__.synset_names
    __main__.wordnet_entries
6 items passed all tests:
   1 tests in __main__.db_entry_id
   1 tests in __main__.db_entry_len
   1 tests in __main__.db_relation_ids
   1 tests in __main__.sourcesense_len
   1 tests in __main__.sourcesense_test
   1 tests in __main__.wn_entry_len
6 tests in 16 items.
6 passed and 0 failed.
Test passed.


TestResults(failed=0, attempted=6)

# Data extraction and import

Using previously created functions to extract relevant data from the XML file and create necessary database files.

In [2]:
# Read the whole Wordnet XML file into memory.
with open("../source_data/estwn-et-2.3.2.xml", encoding="UTF-8") as xml_file:
    data = xml_file.read()

In [3]:
# Parse the XML text with the 'lxml' parser.
soup = BeautifulSoup(data, 'lxml')

Finding all lexical entries, synset objects and senses by their tags

In [4]:
# One find_all pass per tag name: lexical entries, synsets and senses.
entries, synsets, senses = (
    soup.find_all(tag_name) for tag_name in ('lexicalentry', 'synset', 'sense')
)

Extracting necessary information to create database files

In [13]:
# Build the row lists for the database tables.
wn_entries = wordnet_entries(entries)
sourcesenses = synset_names(synsets)
# Order matters: database_entries fills `sourcesenses` in place with fallback names, and
# database_definitions reads that same module-level dict for definitions without a sourcesense.
db_entries = database_entries(wn_entries, sourcesenses)
db_relations = database_relations(synsets)
db_definitions = database_definitions(synsets)
db_examples = database_examples(senses)

There are some discrepancies when it comes to hyponyms and hypernyms in the original XML file. Here these are found and printed out.

In [15]:
# Group relation tuples by type: each dict maps a start vertex to the list of
# end vertices it is connected to through that relation type.
dict_hypernyms = {}
dict_hyponyms = {}
targets_by_reltype = {'hypernym': dict_hypernyms, 'hyponym': dict_hyponyms}

for relation in db_relations:
    bucket = targets_by_reltype.get(relation[2])
    if bucket is not None:
        start = relation[0]
        end = relation[1]
        bucket.setdefault(start, []).append(end)

In [34]:
# Numeric synset id -> synset name, built once. The comprehension keeps its loop
# variable local, so no module-level name is rebound, and for duplicate ids the
# last row wins, exactly as a full scan of db_entries would.
synset_name_by_id = {row[0]: row[4] for row in db_entries}

# Report every hypernym relation that has no matching hyponym relation in the
# opposite direction.
for key, values in dict_hypernyms.items():
    for hyp in values:
        if hyp not in dict_hyponyms or key not in dict_hyponyms[hyp]:
            # .get yields None for ids that have no entry row.
            hyp_tuple = synset_name_by_id.get(hyp)
            key_tuple = synset_name_by_id.get(key)
            print(hyp_tuple, "is hyponym for", key_tuple, "but", key_tuple, "is not hypernym for", hyp_tuple)

s-tsüaanima-v1 is hyponym for s-töötlema-v2 but s-töötlema-v2 is not hypernym for s-tsüaanima-v1
s-dekapeerima-v1 is hyponym for s-puhastama-v1 but s-puhastama-v1 is not hypernym for s-dekapeerima-v1
s-alkoholiseerima-v1 is hyponym for s-lisama-v2 but s-lisama-v2 is not hypernym for s-alkoholiseerima-v1
s-deklasseeruma-v1 is hyponym for s-eralduma-v1 but s-eralduma-v1 is not hypernym for s-deklasseeruma-v1
s-redunka-n1 is hyponym for s-veislane-n1 but s-veislane-n1 is not hypernym for s-redunka-n1
s-trasseerima-v1 is hyponym for s-märkima-v2 but s-märkima-v2 is not hypernym for s-trasseerima-v1
s-tõkendama-v1 is hyponym for s-kohaldama-v1 but s-kohaldama-v1 is not hypernym for s-tõkendama-v1
s-dekapeerima-v1 is hyponym for s-söövitama-v1 but s-söövitama-v1 is not hypernym for s-dekapeerima-v1


In [32]:
# Numeric synset id -> synset name, built once. The comprehension keeps its loop
# variable local, so no module-level name is rebound, and for duplicate ids the
# last row wins, exactly as a full scan of db_entries would.
synset_name_by_id = {row[0]: row[4] for row in db_entries}

# Report every hyponym relation that has no matching hypernym relation in the
# opposite direction.
for key, values in dict_hyponyms.items():
    for hyp in values:
        if hyp not in dict_hypernyms or key not in dict_hypernyms[hyp]:
            # .get yields None for ids that have no entry row.
            hyp_tuple = synset_name_by_id.get(hyp)
            key_tuple = synset_name_by_id.get(key)
            print(hyp_tuple, "is hypernym for", key_tuple, "but", key_tuple, "is not hyponym for", hyp_tuple)

s-gasell-n1 is hypernym for s-redunka-n1 but s-redunka-n1 is not hyponym for s-gasell-n1
s-rabasaar-n1 is hypernym for s-peaksi-n1 but s-peaksi-n1 is not hyponym for s-rabasaar-n1
s-sulfoonamiid-n1 is hypernym for s-streptotsiid-n1 but s-streptotsiid-n1 is not hyponym for s-sulfoonamiid-n1
s-kliiniline_uuring-n1 is hypernym for s-sõeltest-n1 but s-sõeltest-n1 is not hyponym for s-kliiniline_uuring-n1
s-mõlu-n1 is hypernym for s-radiaallaiksus-n1 but s-radiaallaiksus-n1 is not hyponym for s-mõlu-n1


Creating the 'Wordnet entries' database

In [20]:
# Target file, table name and SQL statements for the 'wordnet_entry' table.
wn_entry_db = "..//data//estwn-et-2.3.2//wordnet_entry.db"
wn_entry_name = "wordnet_entry"
wn_entry_create = (
    "CREATE TABLE IF NOT EXISTS wordnet_entry("
    "id INT, literal TEXT, pos TEXT, sense INT, "
    "synset_name TEXT, estwn_id TEXT, is_name INT)"
)
wn_entry_insert = (
    "insert into wordnet_entry("
    "id, literal, pos, sense, synset_name, estwn_id, is_name"
    ") values (?,?,?,?,?,?,?)"
)
create_database(
    file_path=wn_entry_db,
    db_name=wn_entry_name,
    create_table=wn_entry_create,
    insert=wn_entry_insert,
    values=db_entries,
)

Creating the 'Wordnet relations' database

In [15]:
# Target file, table name and SQL statements for the 'wordnet_relation' table.
wn_relation_db = "..//data//estwn-et-2.3.2//wordnet_relation.db"
wn_relation_name = "wordnet_relation"
wn_relation_create = (
    "CREATE TABLE IF NOT EXISTS wordnet_relation("
    "start_vertex INT, end_vertex INT, relation TEXT)"
)
wn_relation_insert = "insert into wordnet_relation(start_vertex, end_vertex, relation) values (?,?,?)"
create_database(
    file_path=wn_relation_db,
    db_name=wn_relation_name,
    create_table=wn_relation_create,
    insert=wn_relation_insert,
    values=db_relations,
)

Creating the 'Wordnet definitions' database

In [32]:
# Target file, table name and SQL statements for the 'wordnet_definition' table.
wn_definitions_db = "..//data//estwn-et-2.3.2//wordnet_definition.db"
wn_definitions_name = "wordnet_definition"
wn_definitions_create = (
    "CREATE TABLE IF NOT EXISTS wordnet_definition("
    "synset_name TEXT, definition TEXT)"
)
wn_definitions_insert = "insert into wordnet_definition(synset_name, definition) values (?,?)"
create_database(
    file_path=wn_definitions_db,
    db_name=wn_definitions_name,
    create_table=wn_definitions_create,
    insert=wn_definitions_insert,
    values=db_definitions,
)

Creating the 'Wordnet examples' database

In [15]:
# Target file, table name and SQL statements for the 'wordnet_example' table.
# NOTE(review): the first column is called synset_name, but database_examples
# fills it with each sense's id attribute. Confirm that this is intended.
wn_examples_db = "..//data//estwn-et-2.3.2//wordnet_example.db"
wn_examples_name = "wordnet_example"
wn_examples_create = (
    "CREATE TABLE IF NOT EXISTS wordnet_example("
    "synset_name TEXT, example TEXT)"
)
wn_examples_insert = "insert into wordnet_example(synset_name, example) values (?,?)"
create_database(
    file_path=wn_examples_db,
    db_name=wn_examples_name,
    create_table=wn_examples_create,
    insert=wn_examples_insert,
    values=db_examples,
)