# Sippycup semantic parsing

In [1]:
import sys
sys.path.append('3rdparty/sippycup')
from annotator import *
from parsing import *

In [2]:
import spacy
nlp = spacy.load("en_core_web_md")

In [298]:
def add_rule(grammar, rule):
    """Register `rule` in `grammar`, dispatching on the shape of its RHS.

    Rules containing optional elements are expanded first.  Lexical, unary
    and binary rules are appended to the matching rule table, keyed by
    their RHS.  A longer RHS made only of categories is handed to
    `add_n_ary_rule`; anything else (an RHS mixing terminals and
    non-terminals) is rewritten by `make_cat` instead of being rejected.
    """
    if contains_optionals(rule):
        add_rule_containing_optional(grammar, rule)
        return

    # Checked in this order; the first matching shape wins.
    shape_tables = (
        (is_lexical, grammar.lexical_rules),
        (is_unary, grammar.unary_rules),
        (is_binary, grammar.binary_rules),
    )
    for has_shape, table in shape_tables:
        if has_shape(rule):
            table[rule.rhs].append(rule)
            return

    if all(is_cat(symbol) for symbol in rule.rhs):
        add_n_ary_rule(grammar, rule)
        return

    # Mixed terminals and non-terminals: lift the terminals into categories
    # rather than raising an exception.
    make_cat(grammar, rule)

def add_rule_containing_optional(grammar, rule):
    """Expand the first optional RHS element of `rule` into two plain rules.

    One variant keeps the element (with its first character, the optional
    marker, stripped); the other drops the element entirely.  Both variants
    are registered through `add_rule`, so any remaining optional elements
    are expanded in turn.
    """
    rhs = rule.rhs
    opt_idx = next((pos for pos, symbol in enumerate(rhs) if is_optional(symbol)), -1)
    assert opt_idx >= 0
    assert len(rhs) > 1, 'Entire RHS is optional: %s' % rule

    before = rhs[:opt_idx]
    after = rhs[opt_idx + 1:]

    # Variant 1: the optional element is present, minus its marker.
    required = (rhs[opt_idx][1:],)
    add_rule(grammar, Rule(rule.lhs, before + required + after, rule.sem))

    # Variant 2: the optional element is absent.  A constant semantics is
    # reused unchanged; a semantic function still expects one entry per
    # original RHS element, so a None placeholder is spliced in at the
    # position of the removed element.
    if isinstance(rule.sem, FunctionType):
        def reduced_sem(sems):
            return rule.sem(sems[:opt_idx] + [None] + sems[opt_idx:])
    else:
        reduced_sem = rule.sem
    add_rule(grammar, Rule(rule.lhs, before + after, reduced_sem))

def make_cat(grammar, rule):
    """
    Re-register `rule` with every terminal on its RHS lifted to a category.

    Each terminal `t` is replaced by the category `$t__nonterminal`.  The
    first time such a category is needed it is recorded in
    `grammar.categories` and a rule `$t__nonterminal -> t` is added;
    afterwards the existing category is simply reused.  Elements that are
    already categories are left untouched.
    """

    def as_category(symbol):
        # Categories pass through unchanged.
        if is_cat(symbol):
            return symbol
        wrapped = "$" + symbol + "__nonterminal"
        if wrapped not in grammar.categories:
            grammar.categories.add(wrapped)
            add_rule(grammar, Rule(wrapped, symbol))
        return wrapped

    lifted_rhs = tuple(as_category(symbol) for symbol in rule.rhs)
    add_rule(grammar, Rule(rule.lhs, lifted_rhs, rule.sem))



In [299]:
def parse_input(grammar, input):
    """Return every parse of `input` under `grammar`.

    The text is tokenized with the spaCy pipeline `nlp` (the token list is
    printed), then a chart is filled span by span: annotators, lexical
    rules, binary rules and unary rules are applied to each span, in that
    order.  If the grammar has a non-empty `start_symbol`, only parses of
    the whole input rooted in that symbol are returned.
    """
    doc = nlp(input)
    tokens = [tok.text for tok in doc]
    print(tokens)

    n_tokens = len(tokens)
    chart = defaultdict(list)
    # Visit spans by increasing end position and, for a fixed end, by
    # decreasing start position (i.e. shortest span first).
    for end in range(1, n_tokens + 1):
        for start in reversed(range(end)):
            apply_annotators(grammar, chart, tokens, start, end)
            apply_lexical_rules(grammar, chart, tokens, start, end)
            apply_binary_rules(grammar, chart, start, end)
            apply_unary_rules(grammar, chart, start, end)

    full_span_parses = chart[(0, n_tokens)]
    root = getattr(grammar, 'start_symbol', None)
    if not root:
        return full_span_parses
    return [p for p in full_span_parses if p.rule.lhs == root]

class Grammar:
    """A grammar: rule tables indexed by RHS, plus a list of annotators.

    Attributes set by the constructor:
        categories: set of category names (populated while rules are added).
        lexical_rules / unary_rules / binary_rules: `defaultdict(list)`
            tables filled by `add_rule`.
        annotators: the annotators consulted during parsing.
        start_symbol: category a complete parse must be rooted in.
    """

    def __init__(self, rules=None, annotators=None, start_symbol='$ROOT'):
        """Build the grammar and register every rule in `rules`.

        Args:
            rules: iterable of rules to add; None means no rules.
            annotators: list of annotators; None means a fresh empty list.
                A list passed by the caller is stored as-is (not copied).
            start_symbol: root category used to filter parses.
        """
        # None sentinels instead of `[]` defaults: a mutable default would be
        # one list object shared by every Grammar built without arguments.
        if rules is None:
            rules = []
        if annotators is None:
            annotators = []
        self.categories = set()
        self.lexical_rules = defaultdict(list)
        self.unary_rules = defaultdict(list)
        self.binary_rules = defaultdict(list)
        self.annotators = annotators
        self.start_symbol = start_symbol
        for rule in rules:
            add_rule(self, rule)
        print('Created grammar with %d rules.' % len(rules))

    def parse_input(self, input):
        """Returns a list of parses for the given input."""
        return parse_input(self, input)

In [300]:
NumberAnnotator().annotate(['16'])

[('$Number', 16)]

In [301]:
TokenAnnotator().annotate(['foo'])

[('$Token', 'foo')]

In [359]:
class StopWordAnnotator(Annotator):
    """Annotate single-token spans that spaCy flags as stop words."""

    def annotate(self, tokens):
        """Return `[('$StopWord', token)]` for a one-token stop word, else `[]`."""
        if len(tokens) != 1:
            return []
        word = tokens[0]
        # Only the first spaCy token of the re-analysed word is inspected.
        if not nlp(word)[0].is_stop:
            return []
        return [('$StopWord', word)]

class ShowVerbAnnotator(Annotator):
    """Annotate spans of one or two tokens that start with a "show"-like verb.

    The first token of the span must be tagged as a verb by spaCy and be at
    least `threshold`-similar to one of the reference verbs in
    `show_verbs`.  Each reference verb is paired with an accompanying word
    ("" when it needs none, e.g. "tell" goes with "me").
    """

    def __init__(self, threshold=0.7):
        """
        Args:
            threshold: minimum spaCy similarity between the span's first
                token and a reference verb for the span to be annotated.
        """
        self.show_verbs = [("define", ""), ("tell", "me"), ("show", "")]
        # The reference verbs are analysed once, as a single document.
        # Assumes spaCy yields exactly one token per verb so that index i
        # lines up with show_verbs[i] - TODO confirm if verbs are added.
        self.spacy_show_toks = nlp(" ".join([verb for verb, _ in self.show_verbs]))
        # Honour the constructor argument (it was previously ignored in
        # favour of a hard-coded 0.7).
        self.threshold = threshold

    def annotate(self, tokens):
        """Return `[('$ShowVerb', tokens)]` on a match, else `[]`."""
        # An empty span has no first token to inspect; longer spans are
        # never annotated.
        if not tokens or len(tokens) > 2:
            return []

        spacy_token = nlp(" ".join(tokens))[0]
        if spacy_token.pos_ != 'VERB':
            return []

        # If the verb matches in meaning and, in case it requires a
        # follow-up word, that this matches as well, then it's a match.
        # NOTE(review): the follow-up word is only checked when the first
        # token is literally the reference verb; any other two-token span
        # starting with a similar verb is annotated as a whole - confirm
        # this is intended.
        for idx, (verb, acc) in enumerate(self.show_verbs):
            spacy_verb = self.spacy_show_toks[idx]
            if spacy_token.similarity(spacy_verb) >= self.threshold:
                if verb == tokens[0] and acc != "" and (len(tokens) == 1 or tokens[1] != acc):
                    return []
                return [('$ShowVerb', tokens)]
        return []

    
class TokenAnnotatorBuilder(Annotator):
    """Annotator that labels any single token outside an exclusion list."""

    def __init__(self, category_name, excluded):
        """
        Args:
            category_name: category assigned to every accepted token.
            excluded: container of tokens that must not be annotated.
        """
        Annotator.__init__(self)
        self.excluded = excluded
        self.category_name = category_name

    def annotate(self, tokens):
        """Return `[(category_name, token)]` for one accepted token, else `[]`."""
        if len(tokens) != 1 or tokens[0] in self.excluded:
            return []
        return [(self.category_name, tokens[0])]

In [360]:
ShowVerbAnnotator().annotate(['say'])

[('$ShowVerb', ['say'])]

In [361]:
ShowVerbAnnotator().annotate(['define'])

[('$ShowVerb', ['define'])]

In [362]:
ShowVerbAnnotator().annotate(['tell', 'me'])

[('$ShowVerb', ['tell', 'me'])]

In [363]:
TokenAnnotatorBuilder('TokenWithoutQuotes', ['"', '"']).annotate(['Jeff'])

[('TokenWithoutQuotes', 'Jeff')]

In [364]:
# NOTE(review): this binds MAX_CELL_CAPACITY in this namespace only. If the
# intent is to change a limit used by the functions star-imported from
# `parsing`, assigning here presumably has no effect on them - confirm, and
# set the attribute on the `parsing` module instead if so.
MAX_CELL_CAPACITY = 10000

# Grammar definition

We model user queries around the following intents:

- Definition: ask for the definition of a noun phrase
- Comparison: compare two noun phrases
- Filtering/details on a given sense: ask for further details about a previously mentioned sense
- Usage of a form
- General grammar knowledge

In [392]:
from functools import reduce

def sems_0(sems):
    """Semantic function: pass through the semantics of the first child."""
    first = sems[0]
    return first

def sems_1(sems):
    """Semantic function: pass through the semantics of the second child."""
    second = sems[1]
    return second

def merge_dicts(d1, d2):
    """Merge two dicts; on key clashes the value from `d2` wins.

    A falsy argument (None or an empty dict) contributes nothing: the other
    argument is returned as-is, without copying.  When both are non-empty a
    new dict is returned and neither input is modified.
    """
    if not d2:
        return d1
    if not d1:
        # Keep d2's content (an empty dict was returned here before, which
        # silently discarded d2).
        return d2
    return {**d1, **d2}

def strip_none(sems):
    """Return the truthy entries of `sems` as a list, in order.

    Despite the name, every falsy value is dropped, not only None
    (e.g. empty strings, empty dicts and 0 as well).
    """
    return list(filter(None, sems))

def merge_dicts_singleparam(sems):
    """Semantic function: merge all non-empty child semantics into one dict.

    Falsy entries (None, empty dicts, ...) are ignored.  Returns `{}` when
    nothing is left to merge; otherwise folds the remaining entries
    left-to-right with `merge_dicts`, so later children win key clashes.
    """
    present = strip_none(sems)
    # Test the stripped list rather than "all entries are None": an input
    # such as [{}] or [None, {}] also strips down to nothing, and reduce()
    # raises TypeError on an empty sequence without an initial value.
    if not present:
        return {}
    return reduce(merge_dicts, present)

def to_np(sems):
    """Semantic function: wrap the first truthy child semantics as `{'np': ...}`.

    Prints the incoming semantics as a debugging trace.  Raises IndexError
    if no child semantics is truthy.
    """
    print("Semantics: ", sems)
    head = strip_none(sems)[0]
    return {'np': head}

def concatenate(sems):
    """Semantic function: join the truthy child semantics with single spaces."""
    words = strip_none(sems)
    return " ".join(words)


# Rules for the "definition" intent.  An RHS element prefixed with `?` is
# optional; rules given without a third argument carry no semantics function.
rules_definition = [
    # The root simply passes the query's semantics through.
    Rule('$ROOT', '$DefinitionQuery', sems_0),
    # Tag the query elements with the intent name.
    Rule('$DefinitionQuery', '$DefinitionQueryElements',
         lambda sems: merge_dicts({'intent': 'definition'}, sems[0])),
    # A question opener, the noun phrase asked about, and optional final
    # punctuation; the non-empty child semantics are merged into one dict.
    Rule('$DefinitionQueryElements', '$DefinitionQuestion $NounPhrase ?$EndOfSentence',
         merge_dicts_singleparam),
    
    # Question openers: an imperative "show" verb, or a "what is ..." /
    # "how do you ..." form.
    Rule('$DefinitionQuestion', '$ShowVerb ?me'),
    Rule('$DefinitionQuestion', '$WhatDefinition'),
    Rule('$WhatDefinition', 'what is ?$Determiner ?$DefinitionFor'),
    Rule('$WhatDefinition', 'how do you $ShowVerb'),
    Rule('$DefinitionFor', 'meaning $StopWord'),
    Rule('$DefinitionFor', 'sense $StopWord'),
    Rule('$DefinitionFor', 'definition $StopWord'),
    # Noun phrases: a bare token sequence, or one wrapped in single or double
    # quotes.  The quoted forms mix terminals and categories on the RHS.
    Rule('$NounPhrase', "$Tokens", to_np),
    Rule('$NounPhrase', "' $Tokens '", to_np),
    Rule('$NounPhrase', '" $Tokens "', to_np),
    # Right-recursive token sequence; the tokens are joined with spaces.
    Rule('$Tokens', '$UnquotedToken ?$Tokens', concatenate)
]

# Sentence-final punctuation accepted by `$EndOfSentence`.
# NOTE(review): the bare '?' below is meant as a literal terminal, not as the
# optional-element prefix; presumably a lone '?' is not treated as optional -
# confirm against `is_optional`.
rules_end_of_sentence = [
    Rule('$EndOfSentence', '?'),
    Rule('$EndOfSentence', '.'),
    Rule('$EndOfSentence', '!')
]

# Articles accepted by `$Determiner`.
rules_determiner = [
    Rule('$Determiner', 'a'),
    Rule('$Determiner', 'an'),
    Rule('$Determiner', 'the'),
]

In [393]:
# Annotators used by the grammar: stop words, "show"-like verbs, and a
# catch-all `$UnquotedToken` for any single token except the quote characters
# and '?'.
annotators = [StopWordAnnotator(), ShowVerbAnnotator(), TokenAnnotatorBuilder("$UnquotedToken", ["'", '"', "?"])]
# The full rule set is the concatenation of the three rule lists.
rules = rules_definition + rules_end_of_sentence + rules_determiner
grammar = Grammar(rules=rules, annotators=annotators)
parses = grammar.parse_input('define pi')
# Keep the first parse for inspection; this raises IndexError if the input
# produced no parse at all.
parse = parses[0]

Created grammar with 20 rules.
['define', 'pi']
Semantics:  ['define']
Semantics:  ['pi']
Semantics:  ['define pi']


In [367]:
parse.semantics

{'intent': 'definition', 'np': 'pi'}

In [368]:
parses = grammar.parse_input('define "pi"')

['define', '"', 'pi', '"']
Semantics:  ['define']
Semantics:  ['pi']
Semantics:  [None, 'pi', None]


In [369]:
parses[0].semantics

{'intent': 'definition', 'np': 'pi'}

In [323]:
def pretty_print(parse, depth=0):
    """Print a parse tree, one node per line, indented by depth.

    Each node is shown as a run of `2 * depth` dashes followed by the LHS
    and RHS of its rule; every node below the root is preceded by a line
    containing a single "|".  String children (the leaves) print nothing.
    """
    if isinstance(parse, str):
        return
    if depth > 0:
        print("|")
    print("-" * (2 * depth), parse.rule.lhs, parse.rule.rhs)
    for child in parse.children:
        pretty_print(child, depth + 1)

pretty_print(parses[0])

 $ROOT ('$DefinitionQuery',)
|
-- $DefinitionQuery ('$DefinitionQueryElements',)
|
---- $DefinitionQueryElements ('$DefinitionQuestion', '$NounPhrase')
|
------ $DefinitionQuestion ('$ShowVerb',)
|
-------- $ShowVerb ('define',)
|
------ $NounPhrase ('$"__nonterminal', '$NounPhrase_$"__nonterminal')
|
-------- $"__nonterminal ('"',)
|
-------- $NounPhrase_$"__nonterminal ('$Tokens', '$"__nonterminal')
|
---------- $Tokens ('$UnquotedToken',)
|
------------ $UnquotedToken ('pi',)
|
---------- $"__nonterminal ('"',)


In [324]:
parses = grammar.parse_input("define 'pi'")

['define', "'", 'pi', "'"]
Semantics:  ['define']
Semantics:  ['pi']
Semantics:  [None, 'pi', None]


In [370]:
parses = grammar.parse_input("tell me the life")

['tell', 'me', 'the', 'life']
Semantics:  ['tell']
Semantics:  ['me']
Semantics:  ['tell me']
Semantics:  ['the']
Semantics:  ['me the']
Semantics:  ['tell me the']
Semantics:  ['life']
Semantics:  ['the life']
Semantics:  ['me the life']
Semantics:  ['tell me the life']


In [372]:
parses[0].semantics

{'intent': 'definition', 'np': 'the life'}

In [388]:
parses = grammar.parse_input("what is an 'apple'?")

['what', 'is', 'an', "'", 'apple', "'", '?']
Semantics:  ['what']
Semantics:  ['is']
Semantics:  ['what is']
Semantics:  ['an']
Semantics:  ['is an']
Semantics:  ['what is an']
Semantics:  ['apple']
Semantics:  [None, 'apple', None]


In [391]:
parses[0].semantics

{'intent': 'definition', 'np': 'apple'}

In [394]:
parses = grammar.parse_input("what is the definition of botanics?")
parses[0].semantics

['what', 'is', 'the', 'definition', 'of', 'botanics', '?']
Semantics:  ['what']
Semantics:  ['is']
Semantics:  ['what is']
Semantics:  ['the']
Semantics:  ['is the']
Semantics:  ['what is the']
Semantics:  ['definition']
Semantics:  ['the definition']
Semantics:  ['is the definition']
Semantics:  ['what is the definition']
Semantics:  ['of']
Semantics:  ['definition of']
Semantics:  ['the definition of']
Semantics:  ['is the definition of']
Semantics:  ['what is the definition of']
Semantics:  ['botanics']
Semantics:  ['of botanics']
Semantics:  ['definition of botanics']
Semantics:  ['the definition of botanics']
Semantics:  ['is the definition of botanics']
Semantics:  ['what is the definition of botanics']


{'intent': 'definition', 'np': 'the definition of botanics'}

In [399]:
for parse in parses:
    print(parse.semantics)

{'intent': 'definition', 'np': 'the definition of botanics'}
{'intent': 'definition', 'np': 'definition of botanics'}
{'intent': 'definition', 'np': 'botanics'}
