# Sippycup semantic parsing

In [1]:
import sys
sys.path.append('3rdparty/sippycup')
from annotator import *
from parsing import *

In [2]:
import spacy
nlp = spacy.load("en_core_web_md")

In [3]:
def add_rule(grammar, rule):
    """Register `rule` with `grammar`, dispatching on the shape of its RHS.

    Rules containing optional elements are expanded first. Lexical, unary
    and binary rules are appended to the matching rule table, keyed by their
    RHS. Any other rule whose RHS holds only categories goes through
    `add_n_ary_rule`; a RHS mixing terminals and categories is normalized by
    `make_cat` rather than rejected.
    """
    if contains_optionals(rule):
        add_rule_containing_optional(grammar, rule)
        return

    # Pick the table for the directly indexable shapes, in priority order.
    if is_lexical(rule):
        table = grammar.lexical_rules
    elif is_unary(rule):
        table = grammar.unary_rules
    elif is_binary(rule):
        table = grammar.binary_rules
    else:
        table = None

    if table is not None:
        table[rule.rhs].append(rule)
    elif all(is_cat(symbol) for symbol in rule.rhs):
        add_n_ary_rule(grammar, rule)
    else:
        make_cat(grammar, rule)

def add_rule_containing_optional(grammar, rule):
    """Expand the first optional RHS element of `rule` into two rules.

    One variant keeps the element (with its leading optional marker
    stripped), the other drops it. Both variants are passed back to
    `add_rule`, which expands any optional elements that remain.
    """
    # Locate the first optional element on the RHS (-1 when there is none).
    first = -1
    for position, element in enumerate(rule.rhs):
        if is_optional(element):
            first = position
            break
    assert first >= 0
    assert len(rule.rhs) > 1, 'Entire RHS is optional: %s' % rule

    before, after = rule.rhs[:first], rule.rhs[first + 1:]

    # Variant 1: the optional element stays, minus its marker character.
    kept = (rule.rhs[first][1:],)
    add_rule(grammar, Rule(rule.lhs, before + kept + after, rule.sem))

    # Variant 2: the optional element is removed. A plain-value semantics is
    # reused untouched; a function receives a None placeholder in the slot of
    # the removed element so the remaining indices keep their positions.
    if isinstance(rule.sem, FunctionType):
        sem = lambda sems: rule.sem(sems[:first] + [None] + sems[first:])
    else:
        sem = rule.sem
    add_rule(grammar, Rule(rule.lhs, before + after, sem))

def make_cat(grammar, rule):
    """
    Rewrite `rule` so that its RHS contains only categories.

    Every terminal on the RHS is replaced by a category named
    "$<terminal>__nonterminal". The first time such a category is needed it
    is recorded in `grammar.categories` and a rule rewriting it to the
    terminal is added; later occurrences reuse it. The rewritten rule keeps
    the original LHS and semantics.
    """
    def as_category(symbol):
        # Categories pass through unchanged; terminals get a wrapper category.
        if is_cat(symbol):
            return symbol
        wrapper = "$" + symbol + "__nonterminal"
        if wrapper not in grammar.categories:
            grammar.categories.add(wrapper)
            add_rule(grammar, Rule(wrapper, symbol))
        return wrapper

    categories_only = tuple(as_category(symbol) for symbol in rule.rhs)
    add_rule(grammar, Rule(rule.lhs, categories_only, rule.sem))



In [4]:
def parse_input(grammar, input):
    """Parse `input` with `grammar` and return the parses covering all of it.

    The text is tokenized with the spaCy pipeline `nlp`. The chart is then
    filled span by span: for each end position, start positions are visited
    from the nearest to the farthest, applying annotators, lexical rules,
    binary rules and unary rules in that order. When the grammar has a
    truthy `start_symbol`, only parses rooted in that symbol are returned.
    """
    tokens = [token.text for token in nlp(input)]
    token_count = len(tokens)
    chart = defaultdict(list)

    for end in range(1, token_count + 1):
        for start in reversed(range(end)):
            apply_annotators(grammar, chart, tokens, start, end)
            apply_lexical_rules(grammar, chart, tokens, start, end)
            apply_binary_rules(grammar, chart, start, end)
            apply_unary_rules(grammar, chart, start, end)

    full_span = chart[(0, token_count)]
    root = getattr(grammar, 'start_symbol', None)
    if not root:
        return full_span
    return [candidate for candidate in full_span if candidate.rule.lhs == root]

class Grammar:
    """A set of grammar rules plus the annotators used while parsing.

    Rules passed to the constructor are registered one by one through
    `add_rule`, which files them into the rule tables held on the instance.
    """
    def __init__(self, rules=None, annotators=None, start_symbol='$ROOT'):
        """Build a grammar.

        Args:
            rules: iterable of rules to register; None means no rules.
            annotators: list of annotators stored on the grammar; None means
                a fresh empty list.
            start_symbol: category a parse must be rooted in to be returned
                by `parse_input`; a falsy value disables that filter.
        """
        # None stands in for "empty": a literal [] default would be a single
        # list shared by every Grammar created without that argument.
        rules = [] if rules is None else rules
        self.categories = set()
        self.lexical_rules = defaultdict(list)
        self.unary_rules = defaultdict(list)
        self.binary_rules = defaultdict(list)
        self.annotators = [] if annotators is None else annotators
        self.start_symbol = start_symbol
        for rule in rules:
            add_rule(self, rule)
        print('Created grammar with %d rules.' % len(rules))

    def parse_input(self, input):
        """Returns a list of parses for the given input."""
        return parse_input(self, input)

In [5]:
NumberAnnotator().annotate(['16'])

[('$Number', 16)]

In [6]:
TokenAnnotator().annotate(['foo'])

[('$Token', 'foo')]

In [28]:
class StopWordAnnotator(Annotator):
    """Annotate single tokens that spaCy flags as stop words as $StopWord."""
    def annotate(self, tokens):
        # Only single-token spans are candidates.
        if len(tokens) != 1:
            return []
        word = tokens[0]
        if not nlp(word)[0].is_stop:
            return []
        return [('$StopWord', word)]

class ShowVerbAnnotator(Annotator):
    """Annotate one- or two-token spans opening with a "show"-like verb.

    A span is tagged `$ShowVerb` when its first token is a verb whose spaCy
    similarity to one of the reference verbs reaches `threshold`. Reference
    verbs may carry a companion word (e.g. "tell" + "me"); when the span
    starts with exactly such a verb, the companion must follow it.
    """
    def __init__(self, threshold=0.7):
        """Args:
            threshold: minimum similarity between the span's first token and
                a reference verb for the span to be annotated.
        """
        # (verb, companion word); an empty companion means the verb stands alone.
        self.show_verbs = [("define", ""), ("tell", "me"), ("show", "me")]
        # NOTE(review): indexing this Doc by position in `show_verbs` assumes
        # spaCy keeps each reference verb as a single token — confirm.
        self.spacy_show_toks = nlp(" ".join([verb for verb, _ in self.show_verbs]))
        # Honour the caller's value (it used to be overwritten with 0.7).
        self.threshold = threshold

    def annotate(self, tokens):
        """Return `[('$ShowVerb', tokens)]` for a matching span, else `[]`."""
        # Empty spans have no first token; longer spans are never show verbs.
        if not 1 <= len(tokens) <= 2:
            return []

        spacy_tokens = nlp(" ".join(tokens))
        spacy_token = spacy_tokens[0]
        if spacy_token.pos_ != 'VERB':
            return []

        # If the verb matches in meaning and, in case it requires a
        # follow-up word, that this matches as well, then it's a match.
        for idx, (verb, acc) in enumerate(self.show_verbs):
            spacy_verb = self.spacy_show_toks[idx]
            if spacy_token.similarity(spacy_verb) >= self.threshold:
                # The companion check only applies when the span starts with
                # the reference verb itself, not with a merely similar verb.
                if verb == tokens[0] and acc != "" and (len(tokens) == 1 or tokens[1] != acc):
                    return []
                return [('$ShowVerb', tokens)]
        return []

    
class TokenAnnotatorBuilder(Annotator):
    """Annotate every single token outside an exclusion list with one category."""
    def __init__(self, category_name, excluded):
        Annotator.__init__(self)
        self.category_name = category_name
        self.excluded = excluded

    def annotate(self, tokens):
        """Return `[(category_name, token)]` for an allowed single token, else `[]`."""
        if len(tokens) != 1:
            return []
        (token,) = tokens
        if token in self.excluded:
            return []
        return [(self.category_name, token)]

In [8]:
ShowVerbAnnotator().annotate(['say'])

[('$ShowVerb', ['say'])]

In [9]:
ShowVerbAnnotator().annotate(['define'])

[('$ShowVerb', ['define'])]

In [10]:
ShowVerbAnnotator().annotate(['tell', 'me'])

[('$ShowVerb', ['tell', 'me'])]

In [11]:
TokenAnnotatorBuilder('TokenWithoutQuotes', ['"', '"']).annotate(['Jeff'])

[('TokenWithoutQuotes', 'Jeff')]

In [12]:
# NOTE(review): presumably meant as the chart-cell size limit used while
# parsing. The star-imported helpers read globals from their own module, so
# assigning the name here may have no effect on them — confirm.
MAX_CELL_CAPACITY = 10000

# Grammar definition

We will model the queries after a few intents:

- Definition: asking for a definition of a noun phrase
- Comparison: compare two noun phrases
- Filtering/Details on a given sense: ask for further details on a previously mentioned sense
- Usage of form
- General grammar knowledge

In [94]:
from functools import reduce

def sems_0(sems):
    """Semantics function: pass through the semantics of the first child."""
    first = sems[0]
    return first

def sems_1(sems):
    """Semantics function: pass through the semantics of the second child."""
    second = sems[1]
    return second

def merge_dicts(d1, d2):
    """Merge two semantics dicts, with `d2` winning on shared keys.

    Either argument may be empty or None. When only one side carries data,
    that side is returned as is (not copied); when both do, a new dict is
    returned. If both are empty or None, `d1` is returned unchanged.
    """
    if not d2:
        return d1
    if not d1:
        # Nothing on the left: keep the right-hand data instead of dropping it.
        return d2
    return {**d1, **d2}

def strip_none(sems):
    """Return the truthy entries of `sems`, in order.

    Despite the name, every falsy value is dropped (None, '', 0, empty
    containers), not just None.
    """
    return list(filter(None, sems))

def merge_dicts_singleparam(sems):
    """Semantics function: merge all non-empty child semantics into one dict.

    Children without semantics (None or otherwise falsy) are skipped. The
    remaining ones are folded left to right with `merge_dicts`. Returns `{}`
    when no child carries any semantics.
    """
    meaningful = strip_none(sems)
    # Guard the fold: reduce() without an initial value raises TypeError on
    # an empty sequence, which used to happen for inputs such as [{}].
    if not meaningful:
        return {}
    return reduce(merge_dicts, meaningful)

def to_np(sems):
    """Semantics function: wrap the first non-empty child semantics as {'np': ...}."""
    meaningful = strip_none(sems)
    return dict(np=meaningful[0])

def concatenate(sems):
    """Semantics function: join the non-empty child semantics with single spaces."""
    words = strip_none(sems)
    return " ".join(words)


# Rules for the "definition" intent: a question opener followed by the noun
# phrase to define. The resulting semantics is a dict carrying 'intent' and 'np'.
rules_definition = [
    Rule('$ROOT', '$DefinitionQuery', sems_0),
    # Tag the merged semantics of the query elements with the intent.
    Rule('$DefinitionQuery', '$DefinitionQueryElements',
         lambda sems: merge_dicts({'intent': 'definition'}, sems[0])),
    Rule('$DefinitionQueryElements', '$DefinitionQuestion $NounPhrase',
         merge_dicts_singleparam),
    
    # Question openers; these rules carry no semantics of their own.
    Rule('$DefinitionQuestion', '$ShowVerb ?me'),
    Rule('$DefinitionQuestion', '$WhatDefinition'),
    Rule('$WhatDefinition', 'what is ?$Determiner ?$DefinitionFor'),
    Rule('$WhatDefinition', 'how do you $ShowVerb'),
    Rule('$DefinitionFor', 'meaning $StopWord'),
    Rule('$DefinitionFor', 'sense $StopWord'),
    Rule('$DefinitionFor', 'definition $StopWord'),
    # The noun phrase, bare or wrapped in single or double quotes.
    Rule('$NounPhrase', "$Tokens", to_np),
    Rule('$NounPhrase', "' $Tokens '", to_np),
    Rule('$NounPhrase', '" $Tokens "', to_np),
    # Right-recursive: one token followed by an optional further run of tokens.
    Rule('$Tokens', '$UnquotedToken ?$Tokens', concatenate)
]

# Determiners accepted where a rule references $Determiner; no semantics attached.
rules_determiner = [
    Rule('$Determiner', 'a'),
    Rule('$Determiner', 'an'),
    Rule('$Determiner', 'the'),
]

In [95]:
# Build the definition-only grammar and try it on a first query.
# NOTE(review): `rules_end_of_sentence` is not defined in any cell visible
# here — confirm it is defined before this cell runs.
annotators = [StopWordAnnotator(), ShowVerbAnnotator(), TokenAnnotatorBuilder("$UnquotedToken", ["'", '"', "?"])]
rules = rules_definition + rules_end_of_sentence + rules_determiner
grammar = Grammar(rules=rules, annotators=annotators)
parses = grammar.parse_input('define pi')
parse = parses[0]

Created grammar with 20 rules.


In [96]:
parse.semantics

{'intent': 'definition', 'np': 'pi'}

In [97]:
parses = grammar.parse_input('define "pi"')

In [98]:
parses[0].semantics

{'intent': 'definition', 'np': 'pi'}

In [99]:
def pretty_print(parse, depth=0):
    """Print the parse tree rooted at `parse`, one rule per line.

    Each node shows its rule's LHS and RHS behind a dash prefix two
    characters wide per nesting level. Every non-root node is preceded by a
    "|" line. String children (bare tokens) print nothing.
    """
    if isinstance(parse, str):
        return
    if depth > 0:
        print("|")
    indent = "-" * depth * 2
    print(indent, parse.rule.lhs, parse.rule.rhs)
    for subtree in parse.children:
        pretty_print(subtree, depth + 1)

pretty_print(parses[0])

 $ROOT ('$DefinitionQuery',)
|
-- $DefinitionQuery ('$DefinitionQueryElements',)
|
---- $DefinitionQueryElements ('$DefinitionQuestion', '$NounPhrase')
|
------ $DefinitionQuestion ('$ShowVerb',)
|
-------- $ShowVerb ('define',)
|
------ $NounPhrase ('$"__nonterminal', '$NounPhrase_$"__nonterminal')
|
-------- $"__nonterminal ('"',)
|
-------- $NounPhrase_$"__nonterminal ('$Tokens', '$"__nonterminal')
|
---------- $Tokens ('$UnquotedToken',)
|
------------ $UnquotedToken ('pi',)
|
---------- $"__nonterminal ('"',)


In [100]:
parses = grammar.parse_input("define 'pi'")

In [101]:
parses = grammar.parse_input("tell me the life")

In [102]:
parses[0].semantics

{'intent': 'definition', 'np': 'the life'}

In [103]:
parses = grammar.parse_input("what is an 'apple'")

In [104]:
parses[0].semantics

{'intent': 'definition', 'np': 'apple'}

In [105]:
parses = grammar.parse_input("what is the definition of botanics?")
for parse in parses:
    print(parse.semantics)

### Filter intents

"show me the third sense"

"tell me more about the mathematical meaning"

"show me some examples"

"show me related words"

In [106]:
from text_to_num import alpha2digit

def remove_suffix(word: str, suffix: str) -> str:
    """Return `word` without a trailing `suffix`; unchanged if it does not end with it.

    An empty `suffix` leaves `word` untouched.
    """
    # The `suffix` check matters: every string ends with "", and
    # word[:-0] is word[:0], i.e. the empty string.
    if suffix and word.endswith(suffix):
        return word[:-len(suffix)]
    return word

def convert_ordinal(word: str):
    """Convert an English ordinal word (e.g. "fortieth") to its integer value.

    The ordinal is first rewritten into the matching cardinal: irregular
    forms are substituted, "-ieth" becomes "-y", and a trailing ordinal
    suffix is stripped. The result is passed to `alpha2digit` and read as
    an int.

    Returns:
        The integer value, or None when the rewritten word is not a number.
    """
    # Ordinals whose cardinal cannot be recovered by stripping a suffix.
    # "eighth" and "ninth" belong here: stripping "th" would leave
    # "eigh" / "nin".
    irregular_forms = {"first": "one",
                       "second": "two",
                       "third": "three",
                       "fifth": "five",
                       "eighth": "eight",
                       "ninth": "nine",
                       "twelfth": "twelve"}

    for ordinal, cardinal in irregular_forms.items():
        word = word.replace(ordinal, cardinal)

    # "twentieth" -> "twenty", "fortieth" -> "forty", ...
    word = word.replace("ieth", "y")

    for pattern in ["st", "nd", "rd", "th", "°"]:
        word = remove_suffix(word, pattern)

    converted = alpha2digit(word, "en")
    try:
        return int(converted)
    except (TypeError, ValueError):
        # Not a number after all; any other error is left to surface.
        return None

class OrdinalNumberAnnotator(Annotator):
    """Annotate a single token that `convert_ordinal` can read as $OrdinalNumber."""
    def annotate(self, tokens):
        # Spans longer than one token are never annotated.
        if len(tokens) > 1:
            return []
        number = convert_ordinal(tokens[0])
        # A falsy result (None, or 0) produces no annotation.
        return [('$OrdinalNumber', number)] if number else []

In [107]:
OrdinalNumberAnnotator().annotate(['fortieth'])

[('$OrdinalNumber', 40)]

In [211]:
def foo(type_):
    """Build a semantics function for "<extra> for the <value> <sense>" rules.

    The returned function creates {'type': type_, 'value': sems[4]} and
    merges the semantics found at sems[1] into it via `merge_dicts`.
    """
    def combine(sems):
        filter_part = {'type': type_, 'value': sems[4]}
        return merge_dicts(filter_part, sems[1])
    return combine

# Rules for the "filter" intent: follow-up requests that narrow down a
# previous answer, either by sense ('type'/'value') or by kind of extra
# information ('variant').
rules_filter = [
    Rule('$ROOT', '$FilterQuery', sems_0),
    # Optional show verb, then the filter itself (second RHS slot).
    Rule('$FilterQuery', '?$ShowVerb $FilterQueryElements',
         lambda sems: merge_dicts({'intent': 'filter'}, sems[1])),
    
    # "what about ...": the filter is the third RHS slot.
    Rule('$FilterQuery', 'what about $FilterQueryElements',
         lambda sems: merge_dicts({'intent': 'filter'}, sems[2])),
    
    # ordinal case, e.g. "the first one": the value is the first non-empty
    # child semantics
    Rule('$FilterQueryElements', "?$More the $OrdinalNumber ?$WordSense ?$Only",
         lambda sems: {'type': 'number', 'value': strip_none(sems)[0]}),
    
    # "more about the mathematical case"
    Rule('$FilterQueryElements', "?$More the $UnquotedToken $WordSense ?$Only",
         lambda sems: {'type': 'sense_meaning', "value": strip_none(sems)[0]}),
    
    # some examples
    Rule('$FilterQueryElements', '?$More $Extra', sems_1),
    # some examples for the second case
    # (foo reads sems[4] and sems[1] — presumably the $OrdinalNumber and
    # $Extra slots of this RHS; the commented lambda is the inline equivalent)
    Rule('$FilterQueryElements', '?$More $Extra $StopWord ?$Determiner $OrdinalNumber $WordSense ?$Only',
         #lambda sems: merge_dicts({'type': 'number', 'value': sems[4]}, sems[1])),
         foo('number')),
    
    # some examples for the botanical case
    Rule('$FilterQueryElements', '?$More $Extra $StopWord ?$Determiner $UnquotedToken $WordSense ?$Only',
         # lambda sems: merge_dicts({'type': 'sense_meaning', 'value': sems[4]}, sems[1])),
         foo('sense_meaning')),
    
    
    # Kinds of extra information that can be requested.
    Rule('$Extra', 'examples', {'variant': "example"}),
    Rule('$Extra', 'related words', {'variant': 'related'}),
    
    Rule('$More', "more"),
    Rule('$More', "more about"), # TODO: add optionals for terminals as well
    Rule('$More', "some"),
    
    # Words that can stand for "sense" after the selector.
    Rule("$WordSense", "one"),
    Rule("$WordSense", "sense"),
    Rule("$WordSense", "meaning"),
    Rule("$WordSense", "definition"),
    Rule("$WordSense", "possibility"),
    Rule("$WordSense", "case"),
    
    Rule("$Only", "only"),
    Rule("$Only", "alone"),
    
]

In [212]:
# Second grammar: definition rules plus filter rules, with the ordinal
# annotator added so that $OrdinalNumber can be matched.
annotators = [StopWordAnnotator(), ShowVerbAnnotator(),
                TokenAnnotatorBuilder("$UnquotedToken", ["'", '"', "?"]),
                OrdinalNumberAnnotator()]

# NOTE(review): `rules_end_of_sentence` is not defined in any cell visible
# here — confirm it is defined before this cell runs.
rules_2 = rules_definition + rules_end_of_sentence + rules_determiner + rules_filter

grammar_2 = Grammar(rules=rules_2, annotators=annotators)

Created grammar with 41 rules.


In [213]:
parses = grammar_2.parse_input('tell me the first one')
for parse in parses:
    print(parse.semantics)

{'intent': 'filter', 'type': 'sense_meaning', 'value': 'first'}
{'intent': 'filter', 'type': 'number', 'value': 1}
{'intent': 'definition', 'np': 'the first one'}


In [214]:
for parse in grammar_2.parse_input('tell me the mathematical one only'):
    print(parse.semantics)

{'intent': 'filter', 'type': 'sense_meaning', 'value': 'mathematical'}
{'intent': 'definition', 'np': 'the mathematical one only'}


In [207]:
for parse in grammar_2.parse_input('tell me more examples'):
    print(parse.semantics)

{'intent': 'filter', 'variant': 'example'}
{'intent': 'definition', 'np': 'more examples'}


In [208]:
for parse in grammar_2.parse_input('what about more examples for the first one'):
    print(parse.semantics)

{'intent': 'filter', 'type': 'sense_meaning', 'value': 'first', 'variant': 'example'}
{'intent': 'filter', 'type': 'number', 'value': 1, 'variant': 'example'}


In [209]:
for parse in grammar_2.parse_input('show me more examples for the first one'):
    print(parse.semantics)

{'intent': 'filter', 'type': 'sense_meaning', 'value': 'first', 'variant': 'example'}
{'intent': 'filter', 'type': 'number', 'value': 1, 'variant': 'example'}
{'intent': 'definition', 'np': 'more examples for the first one'}


## Interlude: Question Answering demo

For now we'll ignore the fact that a parsed sentence may (and usually does) yield multiple semantic interpretations.

Instead, we'll hardcode a "simple" priority choice: take the semantics with the greatest number of keys. It should work in a number of situations.

In case the choices are only definitions, pick the one with the shortest np.

When the choice is between two filters, always prefer the number type.

In [248]:
from tools.providers import WiktionaryProvider
from tools.answering import QuestionAnsweringContext, DefinitionIntent, FilterIntent
from IPython.core.display import display, HTML

provider = WiktionaryProvider()

def pick_best_semantics(parses):
    """Choose one semantics dict among the candidate parses.

    Heuristic:
      * if every candidate is a definition, take the one with the shortest
        noun phrase;
      * otherwise take the candidate with the most keys, breaking ties in
        favour of type 'number' over 'sense_meaning' over anything else.
        On a full tie the earliest candidate wins.

    Args:
        parses: objects exposing a `semantics` dict with at least 'intent'.

    Raises:
        ValueError: if `parses` is empty.
    """
    semantics = [parse.semantics for parse in parses]
    if not semantics:
        raise ValueError("no parses to choose from")

    if all(sem["intent"] == "definition" for sem in semantics):
        return min(semantics, key=lambda sem: len(sem["np"]))

    priority = {'sense_meaning': 1, 'number': 2}

    def score(sem):
        # Candidates without a (known) 'type' — definitions, or filters that
        # only carry a 'variant' — rank lowest instead of raising KeyError.
        return len(sem) * 10 + priority.get(sem.get('type'), 0)

    return max(semantics, key=score)

context = QuestionAnsweringContext()

def answer_question(grammar: Grammar, question: str) -> None:
    """Parse `question`, pick one interpretation and display the answer.

    The question is lowercased and stripped of a trailing ".", "?" or "!"
    before parsing. The chosen semantics is printed, then turned into an
    intent handled by the module-level `context`; the resulting message is
    rendered as HTML.

    Args:
        grammar: grammar used to parse the question.
        question: the user's question in natural language.
    """
    question = question.lower()

    for eos in [".", "?", "!"]:
        question = remove_suffix(question, eos)

    parses = grammar.parse_input(question)
    if not parses:
        # Nothing to choose from; report it instead of failing in the picker.
        print("Could not parse the question.")
        return

    best_semantics = pick_best_semantics(parses)

    print(best_semantics)

    if best_semantics['intent'] == 'definition':
        display(HTML(context.handle_intent(DefinitionIntent(best_semantics['np'])).message))
    elif best_semantics['intent'] == 'filter':
        if best_semantics['type'] == 'number':
            display(HTML(context.handle_intent(FilterIntent('single', best_semantics['value'])).message))
        # TODO: filters of any other type (e.g. 'sense_meaning') are not handled yet.

In [249]:
pick_best_semantics(grammar_2.parse_input('what about more examples for the first one'))

{'intent': 'filter', 'type': 'number', 'value': 1, 'variant': 'example'}

In [250]:
pick_best_semantics(grammar_2.parse_input('more examples for the botanics one'))

{'intent': 'filter',
 'type': 'sense_meaning',
 'value': 'botanics',
 'variant': 'example'}

In [251]:
pick_best_semantics(grammar_2.parse_input('define butterfly'))

{'intent': 'definition', 'np': 'butterfly'}

In [252]:
answer_question(grammar_2, "define butterfly")

{'intent': 'definition', 'np': 'butterfly'}
Current state of the entities:  None
Dataset wiktionary/butterfly.json already downloaded. Skipping...
Serializing an answer here...


In [253]:
answer_question(grammar_2, "more about the second one")

{'intent': 'filter', 'type': 'number', 'value': 2}
Current state of the entities:  <tools.answering.DefinitionEntity object at 0x7f340b6563d0>
