In [1]:
from bs4 import BeautifulSoup
import re

In [2]:
#Reading the whole EstWN XML file into memory as a single string
with open("../source_data/estwn-et-2.3.2.xml", encoding="UTF-8") as xml_file:
    data = xml_file.read()

In [3]:
#Parsing the XML text with the 'lxml' parser. This parser lowercases tag and attribute names,
#which is why every lookup below uses lowercase names such as 'lexicalentry' and 'writtenform'.
soup = BeautifulSoup(data, features='lxml')

In [4]:
#Collecting every <lexicalentry> element of the parsed document
entries = soup.find_all(name='lexicalentry')

In [5]:
#Building one tuple per sense of every lexical entry:
#(lemma, part of speech, sense number, sense id, synset id)
wn_entries = []

for lexical_entry in entries:
    lemma_tag = lexical_entry.find('lemma')
    lemma = lemma_tag.get('writtenform')
    pos = lemma_tag.get('partofspeech')
    for sense_tag in lexical_entry.find_all('sense'):
        sense_id = sense_tag.get('id')
        estwn_id = sense_tag.get('synset')
        #the sense number is what remains of the last '-'-separated part of the sense id
        #after every non-digit character has been removed
        last_part = sense_id.split('-')[-1]
        sense_number = re.sub('[^0-9]', '', last_part)
        wn_entries.append((lemma, pos, sense_number, sense_id, estwn_id))

In [6]:
#Collecting every <synset> element of the parsed document
synsets = soup.find_all(name='synset')

In [7]:
#Finding the name (main phrase of the synset) for synsets, most of them can be found from the definition of the synset.
#sourcesenses maps synset id -> value of the 'sourcesense' attribute of the synset's first <definition>.
sourcesenses = {}

for s in synsets:
    synset_id = s.get('id')
    #the dictionary is keyed by synset id, so the "already named" check has to look up the synset id
    #(looking up the sourcesense value, as before, could never match a key of this dictionary)
    if synset_id in sourcesenses:
        continue
    d = s.find('definition')
    if d is None:
        continue
    synset_name = d.get('sourcesense')
    #synsets whose first definition has no 'sourcesense' attribute are left out on purpose here
    if synset_name is not None:
        sourcesenses[synset_id] = synset_name

In [8]:
#Creating the rows for the entries table, one tuple per lexical entry sense:
#(numeric synset id, literal, pos, sense number, synset name, estwn id, is_name flag)
db_entries = []

for lemma, pos, sense_number, sense_id, estwn_id in wn_entries:
    #Most of the names were found previously. When a synset has no name yet, the sense id of the
    #first lexical entry seen for it becomes the name and is stored back into sourcesenses,
    #so that later lookups by synset id find it as well.
    synset_name = sourcesenses.setdefault(estwn_id, sense_id)
    #id is obtained from the estwn_id from the XML-file by removing all non-numeric characters from the string
    numeric_id = re.sub("[^0-9]", "", estwn_id)
    #is_name is 1 only for the sense whose id is the synset name itself, 0 otherwise
    is_name = 1 if synset_name == sense_id else 0
    db_entries.append((numeric_id, lemma, pos, sense_number, synset_name, estwn_id, is_name))

In [9]:
#Creating the rows for the relations table. Every <synsetrelation> found inside a synset gives one tuple
#(numeric id of the relation target, numeric id of the synset holding the relation, relation type).
#The numeric ids are the synset ids with every non-digit character removed.
#NOTE(review): the target is stored first (start vertex) and the declaring synset second (end vertex) -
#confirm that this direction is the intended one.
db_relations = []

for synset in synsets:
    synset_id = re.sub("[^0-9]", "", synset.get('id'))
    db_relations.extend(
        (re.sub("[^0-9]", "", relation_tag.get('target')), synset_id, relation_tag.get('reltype'))
        for relation_tag in synset.find_all('synsetrelation')
    )

In [10]:
#Creating the rows for the definitions table: one (synset name, definition text) tuple per <definition>
db_definitions = []

for synset in synsets:
    #find_all returns an empty result when there are no definitions, so no separate check is needed
    for definition in synset.find_all('definition'):
        synset_name = definition.get("sourcesense")
        if synset_name is None:
            #definitions without their own 'sourcesense' use the name recorded for the whole synset
            synset_name = sourcesenses[synset.get('id')]
        #find(text=True) gives the first text node of the definition element
        db_definitions.append((synset_name, definition.find(text=True)))

In [11]:
#Collecting every <sense> element of the parsed document
senses = soup.find_all(name='sense')

In [12]:
#Creating the rows for the examples table: one (sense id, example text) tuple per <example> of a sense
db_examples = []

for sense_tag in senses:
    sense_id = sense_tag.get("id")
    #find_all returns an empty result when a sense has no examples, so no separate check is needed
    for example_tag in sense_tag.find_all('example'):
        #find(text=True) gives the first text node of the example element
        db_examples.append((sense_id, example_tag.find(text=True)))

In [13]:
import sqlite3

In [14]:
def create_database(file_path, db_name, create_table, insert, values):
    """Create (or refresh) one table in an SQLite file and fill it with rows.

    Parameters
    ----------
    file_path : path of the SQLite file, handed to sqlite3.connect
    db_name : name of the table whose existing rows are deleted before inserting
    create_table : SQL statement executed first to make sure the table exists
    insert : parameterised INSERT statement executed once per element of values
    values : iterable of row tuples matching the placeholders of insert

    Raises
    ------
    sqlite3.Error : if any statement fails; the changes of this call are rolled back.
    """
    conn = sqlite3.connect(file_path)
    try:
        #'with conn' commits when the block succeeds and rolls back when it raises,
        #but it does not close the connection - that is done explicitly in 'finally'
        with conn:
            cur = conn.cursor()
            cur.execute(create_table)
            cur.execute("DELETE FROM {};".format(db_name)) #if database exists, deletes all values so there wouldn't be duplicates
            cur.executemany(insert, values)
    finally:
        conn.close()

In [14]:
#Writing the Wordnet entries (the db_entries tuples) into their own SQLite file
wn_entry_db = "..//data//estwn-et-2.3.2//wordnet_entry.db"
wn_entry_name = "wordnet_entry"
wn_entry_create = "CREATE TABLE IF NOT EXISTS wordnet_entry(id INT, literal TEXT, pos TEXT, sense INT, synset_name TEXT, estwn_id TEXT, is_name INT)"
wn_entry_insert = "insert into wordnet_entry(id, literal, pos, sense, synset_name, estwn_id, is_name) values (?,?,?,?,?,?,?)"
create_database(
    file_path=wn_entry_db,
    db_name=wn_entry_name,
    create_table=wn_entry_create,
    insert=wn_entry_insert,
    values=db_entries,
)

In [15]:
#Writing the synset relations (the db_relations tuples) into their own SQLite file
wn_relation_db = "..//data//estwn-et-2.3.2//wordnet_relation.db"
wn_relation_name = "wordnet_relation"
wn_relation_create = "CREATE TABLE IF NOT EXISTS wordnet_relation(start_vertex INT, end_vertex INT, relation TEXT)"
wn_relation_insert = "insert into wordnet_relation(start_vertex, end_vertex, relation) values (?,?,?)"
create_database(
    file_path=wn_relation_db,
    db_name=wn_relation_name,
    create_table=wn_relation_create,
    insert=wn_relation_insert,
    values=db_relations,
)

In [32]:
#Writing the synset definitions (the db_definitions tuples) into their own SQLite file
wn_definitions_db = "..//data//estwn-et-2.3.2//wordnet_definition.db"
wn_definitions_name = "wordnet_definition"
wn_definitions_create = "CREATE TABLE IF NOT EXISTS wordnet_definition(synset_name TEXT, definition TEXT)"
wn_definitions_insert = "insert into wordnet_definition(synset_name, definition) values (?,?)"
create_database(
    file_path=wn_definitions_db,
    db_name=wn_definitions_name,
    create_table=wn_definitions_create,
    insert=wn_definitions_insert,
    values=db_definitions,
)

In [15]:
#Writing the usage examples (the db_examples tuples) into their own SQLite file.
#NOTE(review): the first element of every db_examples tuple is a sense id, so the column
#called synset_name holds sense ids in this table - confirm that readers of the table expect this.
wn_examples_db = "..//data//estwn-et-2.3.2//wordnet_example.db"
wn_examples_name = "wordnet_example"
wn_examples_create = "CREATE TABLE IF NOT EXISTS wordnet_example(synset_name TEXT, example TEXT)"
wn_examples_insert = "insert into wordnet_example(synset_name, example) values (?,?)"
create_database(
    file_path=wn_examples_db,
    db_name=wn_examples_name,
    create_table=wn_examples_create,
    insert=wn_examples_insert,
    values=db_examples,
)