### Convert PropBankPreannotator's frames to verb patterns database

Convert a subset of PropBankPreannotator's frames lexicon to [verb patterns sqlite3 database](https://github.com/estnltk/syntax_experiments/blob/verb_templates/verb_patterns/vp_data2_documentation/patterns.md).

In [1]:
import sqlite3

In [2]:
# Create new database / connect to an existing one.
# The patterns are stored in a local SQLite file next to the notebook.
con = sqlite3.connect("propbank_preannotator_verb_patterns.db")
# Cursor through which the following cells execute their SQL statements
cur = con.cursor()

In [3]:
# Create new patterns table.
# Any existing table is dropped first, so re-running the notebook rebuilds
# the table from scratch instead of appending duplicate rows.
cur.execute("""
DROP TABLE IF EXISTS propbank_patterns
""")

# Columns as they are filled by the conversion cell of this notebook:
#   pat_id        -- running frame number; all rows of one frame share it
#   pattern       -- "<sense_id> <argument name> (<argument description>)"
#   verb_word     -- verb lemma of the frame
#   phrase_nr     -- ordinal of the argument among the frame's extracted arguments
#   phrase_case   -- case label converted from the UD 'case' feature
#   adp           -- lowercased value of the 'casemarker' feature
#   verb_compound, inf_verb -- not set by this notebook (left at the placeholder)
cur.execute("""
    CREATE TABLE propbank_patterns (
       pat_id integer, 
       pattern text,
       verb_word text, 
       verb_compound text, 
       phrase_nr integer, 
       phrase_case text, 
       adp text, 
       inf_verb text);
""")

<sqlite3.Cursor at 0x1729a3297c0>

In [4]:
# Locate PropBankPreannotator's lexicon
import os, os.path
from estnltk.downloader import get_resource_paths
# Path to a local copy of the lexicon. Set this to None to obtain the lexicon
# via estnltk's resource downloader instead.
propbank_lexicon_path = 'propbank_frames.jl'
if propbank_lexicon_path is None:
    # Try to download PropBankPreannotator's lexicon via estnltk's resources
    propbank_resource_dir = get_resource_paths("propbankpreannotator", only_latest=True, download_missing=True)
    if propbank_resource_dir is not None:
        # The frames file lives inside the downloaded resource directory
        propbank_lexicon_path = os.path.join(propbank_resource_dir, 'propbank_frames.jl')
# Check for None explicitly: os.path.exists(None) raises TypeError, which
# would hide the intended error message below.
assert propbank_lexicon_path is not None and os.path.exists(propbank_lexicon_path), \
    f'(!) Illegal path for propbank lexicon: {propbank_lexicon_path}.'

In [5]:
# Load entries from PropBankPreannotator's lexicon.
# The lexicon is a JSON-lines file: one frame entry (JSON object) per line.
# Entries are grouped by verb lemma into frame_lexicon: lemma -> list of entries.
import json
entries_loaded = 0
frame_lexicon = {}
with open(propbank_lexicon_path, 'r', encoding='utf-8') as in_f:
    for entry in in_f:
        #
        # Example entry:
        #
        #  {"sense_id": "eitama_1", 
        #   "lemma": "eitama", 
        #   "class": "KÕNEAKT", 
        #   "description": "", 
        #   "complete": true, 
        #   "arguments": [{"name": "Arg0", "description": "eitaja", "variants": [{"feats": ["deprel=nsubj"]}]}, 
        #                 {"name": "Arg1", "description": "seda", "variants": [{"feats": ["deprel=obj"]}]}]}
        #
        entry = entry.strip()
        if not entry or entry.startswith('#'):
            # Skip blank lines (json.loads('') would raise) and comment lines
            continue
        entry_dict = json.loads(entry)
        lemma = entry_dict['lemma']
        # One lemma can have several frames (senses); keep them in file order
        frame_lexicon.setdefault(lemma, []).append( entry_dict )
        entries_loaded += 1
print(f'Total {entries_loaded} frame entries loaded.')

Total 714 frame entries loaded.


In [6]:
# Lookup table: lowercased UD case value -> case label of morph_extended
ud_to_morph_ext_case_mapping = dict(
    nom='nom',
    gen='gen',
    par='part',
    ill='ill',
    ine='in',
    ela='el',
    all='all',
    ade='ad',
    abl='abl',
    tra='tr',
    ter='term',
    ess='es',
    abe='abes',
    com='kom',
    # additive (aditiiv)
    add='adit',
)

In [7]:
def extract_feats( feats:list ) -> dict:
    """Extract database column values from one argument variant's features.

    Parameters
    ----------
    feats
        List of 'name=value' feature strings, e.g.
        ['deprel=obl', 'case=Gen', 'casemarker=kohta'].

    Returns
    -------
    dict
        May contain the keys:
        * 'phrase_case' -- the 'case' value converted via
          ud_to_morph_ext_case_mapping ('<missing>' if the case is unknown);
        * 'adp' -- the lowercased 'casemarker' value.
        All other features (e.g. 'deprel') are ignored, so the result is
        empty when the variant has neither a case nor a casemarker.
    """
    extracted_feats = {}
    for feat_str in feats:
        assert '=' in feat_str, f'(!) Unexpected feature without "=": {feat_str!r}'
        # Split on the first '=' only, so that a value which itself contains
        # '=' does not break the two-way unpacking
        fname, fval = feat_str.split('=', 1)
        if fname == 'case':
            me_case = ud_to_morph_ext_case_mapping.get(fval.lower(), '<missing>')
            extracted_feats['phrase_case'] = me_case
        elif fname == 'casemarker':
            extracted_feats['adp'] = fval.lower()
    return extracted_feats



def insert_into_db( connection, cursor, feats_dict, table='propbank_patterns', missing_placeholder='' ):
    """Insert a single pattern row into the given table and commit.

    Parameters
    ----------
    connection
        Open database connection; committed after the insert.
    cursor
        Cursor of that connection, used to run the INSERT statement.
    feats_dict
        Mapping from column name to value. Keys that are not table
        columns are ignored.
    table
        Name of the target table.
    missing_placeholder
        Value stored for every column that has no key in feats_dict.
    """
    column_names = ('pat_id', 'pattern', 'verb_word', 'verb_compound',
                    'phrase_nr', 'phrase_case', 'adp', 'inf_verb')
    # Row values in column order; absent columns get the placeholder
    row = tuple(feats_dict.get(name, missing_placeholder) for name in column_names)
    column_list = ','.join(column_names)
    qmarks = ','.join('?' * len(column_names))
    cursor.execute(f"INSERT INTO {table} ({column_list}) VALUES ({qmarks})", row)
    connection.commit()

# Convert every frame of the lexicon into rows of the patterns table.
# cur_pat_id     -- id given to the next frame that yields at least one row
# extracted_args -- number of argument variants written to the database
# discarded_args -- number of argument variants without case / casemarker info
cur_pat_id = 0
extracted_args = 0
discarded_args = 0
for verb_lemma, verb_frames in frame_lexicon.items():
    for frame in verb_frames:
        frame_rows = []
        phrase_nr = 1
        for arg in frame["arguments"]:
            # Different variants features can be used to describe a single argument, e.g.
            #
            #  rääkima  {'name': 'Arg3', 
            #            'description': 'millegi kohta/millest', 
            #            'variants': [{'feats': ['deprel=obl', 'case=Gen', 'casemarker=kohta']}, 
            #                         {'feats': ['deprel=obl', 'case=Ela']}]}
            #
            # All usable variants of one argument share the same phrase_nr.
            rows_before_arg = len(frame_rows)
            for variant in arg['variants']:
                feats_dict = extract_feats(variant['feats'])
                if not feats_dict:
                    # Nothing usable (no case, no casemarker) in this variant
                    discarded_args += 1
                    continue
                pattern_label = f'{frame["sense_id"]} {arg["name"]}'
                description = arg.get('description')
                if description:
                    pattern_label += f' ({description.strip("()")})'
                feats_dict['pattern'] = pattern_label
                feats_dict['phrase_nr'] = phrase_nr
                frame_rows.append( feats_dict )
            if len(frame_rows) > rows_before_arg:
                # At least one variant of this argument was kept
                phrase_nr += 1
        if not frame_rows:
            # Frame produced no rows: it does not consume a pat_id
            continue
        # Finalize patterns: all argument descriptions belonging to the same
        # frame get the same pat_id and the verb lemma, then go to the db
        for feats_dict in frame_rows:
            feats_dict['verb_word'] = verb_lemma
            feats_dict['pat_id'] = cur_pat_id
            insert_into_db( con, cur, feats_dict, table='propbank_patterns' )
        extracted_args += len(frame_rows)
        cur_pat_id += 1
print()
print(f'Extracted argument descriptions: {extracted_args} / {extracted_args+discarded_args}')


Extracted argument descriptions: 626 / 2102


In [8]:
# Close the database connection
con.close()