# Sippycup semantic parsing

In [2]:
import sys
sys.path.append('3rdparty/sippycup')
from annotator import *
from parsing import *

In [3]:
import spacy
nlp = spacy.load("en_core_web_md")

In [4]:
def add_rule(grammar, rule):
    """Register `rule` in `grammar`, dispatching on the shape of its RHS.

    Rules with optional elements are expanded first. Lexical, unary and
    binary rules are appended to the matching table, keyed by RHS. An RHS
    made only of categories is handed to `add_n_ary_rule`. Anything else
    (an RHS mixing terminals and categories) goes through `make_cat`.
    """
    if contains_optionals(rule):
        add_rule_containing_optional(grammar, rule)
        return

    if is_lexical(rule):
        grammar.lexical_rules[rule.rhs].append(rule)
        return

    if is_unary(rule):
        grammar.unary_rules[rule.rhs].append(rule)
        return

    if is_binary(rule):
        grammar.binary_rules[rule.rhs].append(rule)
        return

    if all(is_cat(symbol) for symbol in rule.rhs):
        add_n_ary_rule(grammar, rule)
        return

    # Mixed terminals and categories: rewrite instead of rejecting the rule.
    make_cat(grammar, rule)

def add_rule_containing_optional(grammar, rule):
    """Expand the first optional RHS element of `rule` into two plain rules.

    One variant keeps the element (with its first character, the optional
    marker, dropped); the other removes it. Both variants are passed back to
    `add_rule`, so any remaining optional elements are expanded recursively.
    """
    # Locate the first optional element on the RHS.
    first = -1
    for position, symbol in enumerate(rule.rhs):
        if is_optional(symbol):
            first = position
            break
    assert first >= 0
    assert len(rule.rhs) > 1, 'Entire RHS is optional: %s' % rule

    before = rule.rhs[:first]
    after = rule.rhs[(first + 1):]

    # Variant 1: the optional element becomes mandatory.
    mandatory = (rule.rhs[first][1:],)
    add_rule(grammar, Rule(rule.lhs, before + mandatory + after, rule.sem))

    # Variant 2: the optional element is dropped. A constant semantics is
    # reused unchanged; a semantic function receives None in the slot of the
    # removed element so that its positional indices stay valid.
    if isinstance(rule.sem, FunctionType):
        def sem_without_optional(sems):
            return rule.sem(sems[:first] + [None] + sems[first:])
    else:
        sem_without_optional = rule.sem
    add_rule(grammar, Rule(rule.lhs, before + after, sem_without_optional))

def make_cat(grammar, rule):
    """
    Re-add `rule` with every terminal on its RHS replaced by a category.

    Each terminal `t` is mapped to the category `$t__nonterminal`. The first
    time such a category is seen it is recorded in `grammar.categories` and a
    rule `$t__nonterminal -> t` is added; afterwards it is simply reused.
    """

    def as_category(symbol):
        # Categories are kept as they are.
        if is_cat(symbol):
            return symbol
        wrapper = "$" + symbol + "__nonterminal"
        if wrapper not in grammar.categories:
            grammar.categories.add(wrapper)
            add_rule(grammar, Rule(wrapper, symbol))
        return wrapper

    rewritten_rhs = tuple(as_category(symbol) for symbol in rule.rhs)
    add_rule(grammar, Rule(rule.lhs, rewritten_rhs, rule.sem))



In [5]:
def parse_input(grammar, input):
    """Return every parse of `input` under `grammar`.

    The text is tokenized with spaCy, then a chart is filled span by span
    using the annotators and the lexical, binary and unary rules. When the
    grammar defines a non-empty `start_symbol`, only parses of the whole
    input rooted at that symbol are returned.
    """
    tokens = [tok.text for tok in nlp(input)]
    n_tokens = len(tokens)
    chart = defaultdict(list)

    # For each end position, visit start positions from closest to farthest.
    for end in range(1, n_tokens + 1):
        for start in reversed(range(end)):
            apply_annotators(grammar, chart, tokens, start, end)
            apply_lexical_rules(grammar, chart, tokens, start, end)
            apply_binary_rules(grammar, chart, start, end)
            apply_unary_rules(grammar, chart, start, end)

    full_span = chart[(0, n_tokens)]
    start_symbol = getattr(grammar, 'start_symbol', None)
    if not start_symbol:
        return full_span
    return [candidate for candidate in full_span
            if candidate.rule.lhs == start_symbol]

class Grammar:
    """A grammar: rule tables indexed by RHS, plus a list of annotators.

    Args:
        rules: iterable of rules to register through `add_rule`
            (defaults to no rules).
        annotators: list of annotator objects stored on the grammar
            (defaults to a fresh empty list).
        start_symbol: category that a complete parse must be rooted at.
    """

    def __init__(self, rules=None, annotators=None, start_symbol='$ROOT'):
        # None sentinels instead of `[]` defaults: a literal list default is
        # created once and would be shared by every Grammar built without
        # explicit arguments.
        rules = [] if rules is None else rules
        self.categories = set()
        self.lexical_rules = defaultdict(list)
        self.unary_rules = defaultdict(list)
        self.binary_rules = defaultdict(list)
        self.annotators = [] if annotators is None else annotators
        self.start_symbol = start_symbol
        for rule in rules:
            add_rule(self, rule)
        print('Created grammar with %d rules.' % len(rules))

    def parse_input(self, input):
        """Returns a list of parses for the given input."""
        return parse_input(self, input)

In [6]:
NumberAnnotator().annotate(['16'])

[('$Number', 16)]

In [7]:
TokenAnnotator().annotate(['foo'])

[('$Token', 'foo')]

In [8]:
class StopWordAnnotator(Annotator):
    """Annotate single tokens that spaCy flags as stop words."""

    def annotate(self, tokens):
        """Return [('$StopWord', token)] for a one-token stop word, else []."""
        if len(tokens) != 1:
            return []
        word = tokens[0]
        flagged_as_stop = nlp(word)[0].is_stop
        return [('$StopWord', word)] if flagged_as_stop else []

class ShowVerbAnnotator(Annotator):
    """Annotate spans of one or two tokens that act like a "show" verb.

    A span qualifies when its first token is tagged VERB by spaCy and is at
    least `threshold`-similar to one of the reference verbs. A reference verb
    may require an accompanying word (e.g. "tell" + "me").

    Args:
        threshold: minimum spaCy similarity for a token to match a
            reference verb.
    """

    def __init__(self, threshold = 0.7):
        # (verb, accompanying word); "" means no accompanying word is needed.
        self.show_verbs = [("define", ""), ("tell", "me"), ("show", "me")]
        # NOTE(review): annotate() indexes this doc by position, which assumes
        # spaCy yields exactly one token per reference verb - confirm.
        self.spacy_show_toks = nlp(" ".join([verb for verb, _ in self.show_verbs]))
        # Honour the caller's threshold (it used to be hard-coded to 0.7).
        self.threshold = threshold

    def annotate(self, tokens):
        """Return [('$ShowVerb', tokens)] when the span matches, else []."""
        # Only spans of one or two tokens are considered; an empty span has
        # no first token to inspect.
        if not tokens or len(tokens) > 2:
            return []

        spacy_token = nlp(" ".join(tokens))[0]
        if spacy_token.pos_ != 'VERB':
            return []

        # If the verb matches in meaning and, in case it requires a
        # follow-up word, that this matches as well, then it's a match.
        for idx, (verb, acc) in enumerate(self.show_verbs):
            spacy_verb = self.spacy_show_toks[idx]
            if spacy_token.similarity(spacy_verb) >= self.threshold:
                if verb == tokens[0] and acc != "" and (len(tokens) == 1 or tokens[1] != acc):
                    return []
                return [('$ShowVerb', tokens)]
        return []

    
class TokenAnnotatorBuilder(Annotator):
    """Annotate any single token with a fixed category, unless it is excluded."""

    def __init__(self, category_name, excluded):
        super().__init__()
        self.category_name = category_name
        self.excluded = excluded

    def annotate(self, tokens):
        """Return [(category_name, token)] for one non-excluded token, else []."""
        if len(tokens) != 1:
            return []
        candidate = tokens[0]
        if candidate in self.excluded:
            return []
        return [(self.category_name, candidate)]

In [9]:
ShowVerbAnnotator().annotate(['say'])

[('$ShowVerb', ['say'])]

In [10]:
ShowVerbAnnotator().annotate(['define'])

[('$ShowVerb', ['define'])]

In [11]:
ShowVerbAnnotator().annotate(['tell', 'me'])

[('$ShowVerb', ['tell', 'me'])]

In [12]:
TokenAnnotatorBuilder('TokenWithoutQuotes', ['"', '"']).annotate(['Jeff'])

[('TokenWithoutQuotes', 'Jeff')]

In [13]:
MAX_CELL_CAPACITY = 10000

# Grammar definition

We will model the queries after a few intents:

- Definition: asking for a definition of a noun phrase
- Comparison: compare two noun phrases
- Filtering/Details on a given sense: ask for further details on a previously mentioned sense
- Usage of form
- General grammar knowledge

In [698]:
from functools import reduce

def sems_0(sems):
    """Semantic function: return the semantics of the first RHS element."""
    return sems[0]

def sems_1(sems):
    """Semantic function: return the semantics of the second RHS element."""
    return sems[1]

def sems_2(sems):
    """Semantic function: return the semantics of the third RHS element."""
    return sems[2]

def merge_dicts(d1, d2):
    """Merge two semantics dicts; keys of `d2` win on conflict.

    Either argument may be None or empty. If `d2` is falsy, `d1` is returned
    as-is (possibly None). If only `d1` is falsy, a copy of `d2` is returned;
    previously this case returned `{}` and silently dropped `d2`.
    """
    if not d2:
        return d1
    if not d1:
        return {**d2}
    return {**d1, **d2}

def strip_none(sems):
    """Return the truthy entries of `sems` as a list, in order.

    Despite the name, every falsy value (None, '', 0, {}, []) is dropped.
    """
    return list(filter(None, sems))

def merge_dicts_singleparam(sems):
    """Semantic function: merge all non-empty child semantics into one dict.

    Falsy entries (None, {}) are skipped and later entries override earlier
    ones. Returns `{}` when nothing is left to merge. This includes the case
    where entries are falsy but not None, which used to raise TypeError from
    `reduce` on an empty sequence.
    """
    present = [sem for sem in sems if sem]
    if not present:
        return {}
    return reduce(lambda merged, sem: {**merged, **sem}, present)

def to_np(sems):
    """Semantic function: wrap the first truthy child semantics as {'np': ...}."""
    meaningful = [sem for sem in sems if sem]
    return {'np': meaningful[0]}

def concatenate(sems):
    """Semantic function: join the truthy child semantics with single spaces."""
    return " ".join(sem for sem in sems if sem)


# Rules for the "definition" intent: a question part followed by a noun phrase.
rules_definition = [
    # A definition query is a valid root; its semantics is passed through.
    Rule('$ROOT', '$DefinitionQuery', sems_0),
    # Tag the merged semantics of the query elements with the intent.
    Rule('$DefinitionQuery', '$DefinitionQueryElements',
         lambda sems: merge_dicts({'intent': 'definition'}, sems[0])),
    Rule('$DefinitionQueryElements', '$DefinitionQuestion $NounPhrase',
         merge_dicts_singleparam),
    
    # Special case: "what does X mean?"
    Rule('$DefinitionQueryElements', 'what does $NounPhrase mean', sems_2),
    
    # Question openers. Only the bare "$WhoDefinition" variant carries
    # {'isPerson': True}.
    # NOTE(review): the "$ShowVerb ?me $WhoDefinition" variant has no
    # semantics, so it does not set isPerson - confirm this is intended.
    Rule('$DefinitionQuestion', '$ShowVerb ?me ?$Determiner'),
    Rule('$DefinitionQuestion', '$ShowVerb ?me $WhoDefinition'),
    Rule('$DefinitionQuestion', '$ShowVerb ?me $WhatDefinition'),
    Rule('$DefinitionQuestion', '$WhatDefinition'),
    Rule('$DefinitionQuestion', '$WhoDefinition', {'isPerson': True}),
    Rule('$WhoDefinition', 'who $Be'),
    Rule('$WhatDefinition', 'what $Be ?$Determiner ?$DefinitionFor'),
    Rule('$WhatDefinition', 'how do you $ShowVerb'),
    Rule('$DefinitionFor', '$WordSense $StopWord'),
    # A noun phrase is a run of tokens, optionally wrapped in single or
    # double quotes; its semantics is {'np': <joined tokens>}.
    Rule('$NounPhrase', "$Tokens", to_np),
    Rule('$NounPhrase', "' $Tokens '", to_np),
    Rule('$NounPhrase', '" $Tokens "', to_np),
    # Right-recursive: one unquoted token, optionally followed by more.
    Rule('$Tokens', '$UnquotedToken ?$Tokens', concatenate)
]

# Words and phrases accepted as $Determiner.
rules_determiner = [
    Rule('$Determiner', phrase)
    for phrase in ('a', 'an', 'the', 'about the', 'its')
]

# Forms of "to be" accepted as $Be.
rules_be = [
    Rule("$Be", form)
    for form in ("is", "are", "'s", "were", "was")
]

# Nouns accepted as $WordSense, i.e. ways of referring to one sense of a word.
rules_wordsenses = [
    Rule("$WordSense", noun)
    for noun in (
        "one",
        "sense",
        "meaning",
        "definition",
        "definitions",
        "possibility",
        "possibilities",
        "case",
        "field",
    )
]

In [699]:
# Annotators for the definition grammar; quote characters and "?" are never
# treated as $UnquotedToken.
annotators = [StopWordAnnotator(), ShowVerbAnnotator(), TokenAnnotatorBuilder("$UnquotedToken", ["'", '"', "?"])]
# First grammar: definition intent only.
rules = rules_definition + rules_determiner + rules_be + rules_wordsenses
grammar = Grammar(rules=rules, annotators=annotators)
# Smoke test: keep the first parse of a simple query for inspection below.
parses = grammar.parse_input('define pi')
parse = parses[0]

Created grammar with 36 rules.


In [700]:
parse.semantics

{'intent': 'definition', 'np': 'pi'}

In [701]:
parses = grammar.parse_input('define "pi"')

In [702]:
grammar.parse_input('who is apollo')[0].semantics

{'intent': 'definition', 'is_person': True, 'np': 'apollo'}

In [703]:
parses[0].semantics

{'intent': 'definition', 'np': 'pi'}

In [704]:
def pretty_print(parse, depth=0):
    """Print a parse tree, one rule per line, indented by nesting depth.

    Each node is printed as `<dashes> <lhs> <rhs>` with two dashes per depth
    level; every non-root node is preceded by a line containing "|". String
    children (terminals) are not printed.
    """
    if isinstance(parse, str):
        return
    if depth > 0:
        print("|")
    indent = "-" * depth * 2
    print(indent, parse.rule.lhs, parse.rule.rhs)
    for subtree in parse.children:
        pretty_print(subtree, depth + 1)

pretty_print(parses[0])

 $ROOT ('$DefinitionQuery',)
|
-- $DefinitionQuery ('$DefinitionQueryElements',)
|
---- $DefinitionQueryElements ('$DefinitionQuestion', '$NounPhrase')
|
------ $DefinitionQuestion ('$ShowVerb',)
|
-------- $ShowVerb ('define',)
|
------ $NounPhrase ('$"__nonterminal', '$NounPhrase_$"__nonterminal')
|
-------- $"__nonterminal ('"',)
|
-------- $NounPhrase_$"__nonterminal ('$Tokens', '$"__nonterminal')
|
---------- $Tokens ('$UnquotedToken',)
|
------------ $UnquotedToken ('pi',)
|
---------- $"__nonterminal ('"',)


In [705]:
def print_parse(grammar, utterances):
    """Print the semantics of every parse of each utterance.

    For each utterance a separator line and a header are printed, followed by
    one line per parse returned by `grammar.parse_input`.
    """
    separator = "=" * 20
    for utterance in utterances:
        print(separator)
        print("For the utterance " + utterance + ":")
        for result in grammar.parse_input(utterance):
            print(result.semantics)

In [707]:
# parses = grammar.parse_input("define 'pi'")
define_utterances = ["define pie", "tell me about the life", "what is an 'apple'",
                         "what is the definition of botanics", "what does mechanophilia mean",
                         "what is apollo", "who is apollo", "what are ants",
                         "what are definitions for love"]
print_parse(grammar, define_utterances)

For the utterance define pie:
{'intent': 'definition', 'np': 'pie'}
For the utterance tell me about the life:
{'intent': 'definition', 'np': 'about the life'}
{'intent': 'definition', 'np': 'life'}
For the utterance what is an 'apple':
{'intent': 'definition', 'np': 'apple'}
For the utterance what is the definition of botanics:
{'intent': 'definition', 'np': 'the definition of botanics'}
{'intent': 'definition', 'np': 'definition of botanics'}
{'intent': 'definition', 'np': 'botanics'}
For the utterance what does mechanophilia mean:
{'intent': 'definition', 'np': 'mechanophilia'}
For the utterance what is apollo:
{'intent': 'definition', 'np': 'apollo'}
For the utterance who is apollo:
{'intent': 'definition', 'is_person': True, 'np': 'apollo'}
For the utterance what are ants:
{'intent': 'definition', 'np': 'ants'}
For the utterance what are definitions for love:
{'intent': 'definition', 'np': 'definitions for love'}
{'intent': 'definition', 'np': 'love'}


### Filter intents

- "show me the third sense"
- "tell me more about the mathematical meaning"
- "show me some examples"

In [28]:
from text_to_num import alpha2digit

def remove_suffix(word: str, suffix: str):
    """Return `word` without a trailing `suffix`, or unchanged if absent.

    An empty suffix leaves the word untouched. Without the explicit check,
    `word[:-0]` evaluates to `word[:0]` and would return an empty string.
    """
    if suffix and word.endswith(suffix):
        return word[:-len(suffix)]
    return word

def convert_ordinal(word: str):
    """Convert an ordinal word ("third", "fortieth", "21st") to an int.

    The ordinal is rewritten into its cardinal form, passed through
    `alpha2digit`, and parsed with `int`.

    Returns:
        The integer value, or None when the result is not an integer.
    """
    # Irregular ordinals that plain suffix stripping would mangle
    # (e.g. "eighth" -> "eigh", "ninth" -> "nin").
    basic_forms = {"first": "one",
                   "second": "two",
                   "third": "three",
                   "fifth": "five",
                   "eighth": "eight",
                   "ninth": "nine",
                   "twelfth": "twelve"}

    for ordinal, cardinal in basic_forms.items():
        word = word.replace(ordinal, cardinal)

    # "twentieth" -> "twenty", "fortieth" -> "forty", ...
    word = word.replace("ieth", "y")

    # Regular ordinals: drop the ordinal suffix ("fourth", "21st", "3°").
    for pattern in ["st", "nd", "rd", "th", "°"]:
        word = remove_suffix(word, pattern)

    converted = alpha2digit(word, "en")
    try:
        return int(converted)
    except (TypeError, ValueError):
        # Not a number after conversion.
        return None

class OrdinalNumberAnnotator(Annotator):
    """Annotate a single token that `convert_ordinal` turns into a number."""

    def annotate(self, tokens):
        """Return [('$OrdinalNumber', n)] for an ordinal token, else []."""
        if len(tokens) > 1:
            return []
        number = convert_ordinal(tokens[0])
        # A falsy result (None, or 0) is treated as "not an ordinal".
        return [('$OrdinalNumber', number)] if number else []

In [98]:
OrdinalNumberAnnotator().annotate(['fortieth'])

[('$OrdinalNumber', 40)]

In [268]:
import pandas as pd
from tools.dumps import wrap_open

# Load the grammatical categories table into a DataFrame.
with wrap_open("wikidata/grammatical_categories.json") as fp:
    categories = pd.read_json(fp)
    
# Set of category labels, used for membership tests by the annotators.
categories_set = set(categories["entityLabel.value"].to_numpy())

In [273]:
"plural" in categories_set

True

In [450]:
categories[categories['entityLabel.value'].str.startswith("present")]

Unnamed: 0,entity.value,entityLabel.value
136,http://www.wikidata.org/entity/Q192613,present tense
144,http://www.wikidata.org/entity/Q1240211,present perfect
152,http://www.wikidata.org/entity/Q3502553,present subjunctive
156,http://www.wikidata.org/entity/Q7240943,present continuous
157,http://www.wikidata.org/entity/Q9062494,present perfect in English
158,http://www.wikidata.org/entity/Q10345583,present participle
174,http://www.wikidata.org/entity/Q52434162,present imperative
175,http://www.wikidata.org/entity/Q52434245,present infinitive
177,http://www.wikidata.org/entity/Q52434511,present gerund
180,http://www.wikidata.org/entity/Q56682909,present indicative


In [403]:
class POSAnnotator(Annotator):
    """Annotate spans that name a part of speech with a short POS tag."""

    def annotate(self, tokens):
        """Return [('$POS', tag)] when the joined span names a POS, else []."""
        tag_by_name = {
            "noun": "noun",
            "verb": "verb",
            "adjective": "adj",
            "adverb": "adv",
            "pronoun": "pron",
        }
        tag = tag_by_name.get(" ".join(tokens))
        return [('$POS', tag)] if tag else []


class GrammaticalFeatureAnnotator(Annotator):
    """Annotate spans whose text is a known grammatical category label.

    Derives from `Annotator` like every other annotator in this notebook.
    """

    def annotate(self, tokens):
        """Return [('$GrammaticalFeature', text)] if the span is in `categories_set`."""
        candidate = " ".join(tokens)
        if candidate in categories_set:
            return [("$GrammaticalFeature", candidate)]
        return []

In [710]:
def foo(type_):
    """Build a semantic function for "<extra> ... <value> <sense>" filter rules.

    The returned function produces a filter of kind `type_` whose value is
    the fifth child semantics, merged with the second child semantics.
    """
    def build_filter(sems):
        base = {'filtertype': type_, 'value': sems[4]}
        return merge_dicts(base, sems[1])
    return build_filter

def as_constraint(sem):
    """Wrap `sem` in a dict under the 'constraints' key."""
    return dict(constraints=sem)
    

# Rules for the "filter" intent: narrowing down or asking for details about a
# previously mentioned word (a given sense, examples, forms, categories...).
rules_filter = [
    Rule('$ROOT', '$FilterQuery', lambda sems: merge_dicts({'intent': 'filter'}, sems[0])),
    # Tell me about...
    Rule('$FilterQuery', '?$ShowVerb $FilterQueryElements', sems_1),
    # What about...
    Rule('$FilterQuery', 'what about $FilterQueryElements', sems_2),
    # What are the...
    Rule('$FilterQuery', 'what $Be ?$Determiner $FilterQueryElements', lambda sems: sems[3]),
    # "which examples are available?"
    # The category is "$Be"; the previous "$be" spelling named a category
    # that no rule produces, so these two rules could never match.
    Rule('$FilterQuery', 'what $FilterQueryElements $Be $More', sems_1),
    Rule('$FilterQuery', 'which $FilterQueryElements $Be $More', sems_1),
    Rule('$FilterQuery', '$FilterQueryElements', sems_0),


    # ordinal case
    Rule('$FilterQueryElements', "?$More the $OrdinalNumber ?$WordSense ?$Only",
         lambda sems: {'filtertype': 'number', 'value': strip_none(sems)[0]}),

    # "more about the mathematical case"
    Rule('$FilterQueryElements', "?$More the $UnquotedToken $WordSense ?$Only",
         lambda sems: {'filtertype': 'semantic', "value": strip_none(sems)[0]}),

    # some examples
    Rule('$FilterQueryElements', '?$More $Extra', sems_1),
    # some examples for the second case
    Rule('$FilterQueryElements', '?$More $Extra $StopWord ?$Determiner $OrdinalNumber $WordSense ?$Only',
         foo('number')),

    # some examples for the botanical case
    Rule('$FilterQueryElements', '?$More $Extra $StopWord ?$Determiner $UnquotedToken $WordSense ?$Only',
         foo('semantic')),

    # some examples as a verb
    Rule('$FilterQueryElements', '?$More $Extra ?$Filler $StopWord ?$Determiner $POS',
         lambda sems: merge_dicts({'filtertype': 'grammatical', 'requiredPos': sems[5]}, sems[1])),

    # Show me the plural form
    Rule("$FilterQueryElements", "$Determiner $GrammaticalFeature ?form",
         lambda sems: {'filtertype': 'grammatical', 'grammaticalFeature': sems[1]}),

    # Ask for examples, categories or usages
    Rule('$Extra', 'examples', {'variant': "example"}),
    Rule('$Extra', 'categories', {'variant': "categories"}),
    Rule('$Extra', 'usages', {'variant': "usages"}),
    Rule('$Extra', 'senses', {'variant': "senses"}),
    Rule('$Extra', 'parts of speech', {'variant': "pos"}),
    Rule('$Extra', 'conjugate', {'variant': "forms"}),
    Rule('$Extra', 'conjugation', {'variant': "forms"}),
    Rule('$Extra', 'forms', {'variant': "forms"}),

    # Category question where category precedes the rest
    Rule('$FilterQuery', "$FilterCategoryQuery",
         lambda sems: merge_dicts({'filtertype': 'semantic'}, sems[0])),
    # in the field of computer science, what does x mean?
    Rule('$FilterCategoryQuery', "$Category $WhatFilter", sems_0),
    Rule('$FilterCategoryQuery', "$WhatFilter $Category", sems_1),
    # Optional elements are written "?$X"; the previous "$?More" spelling was
    # read as a category that no rule produces, so this rule could never match.
    Rule('$FilterCategoryQuery', "$Category ?$More $Extra", merge_dicts_singleparam),


    Rule('$More', "more"),
    Rule('$More', "more about"),
    Rule('$More', "some"),
    Rule('$More', "some some"),
    Rule('$More', 'possible'),
    Rule('$More', 'available'),



    Rule("$Only", "only"),
    Rule("$Only", "alone"),

    Rule("$Filler", "$StopWord $NounPhrase"),

    Rule("$Category", "in $Determiner $WordSense $StopWord $NounPhrase ?,", lambda sems: {'category': sems[4]['np']}),
    Rule("$WhatFilter", "what does $NounPhrase mean"),
    Rule("$WhatFilter", "what $Be $Determiner $WordSense"),
]


In [711]:
# Annotators for the definition + filter grammar.
annotators = [StopWordAnnotator(), ShowVerbAnnotator(),
                TokenAnnotatorBuilder("$UnquotedToken", ["'", '"', "?", ","]), # commas must split noun phrases
                OrdinalNumberAnnotator(), POSAnnotator(),
                GrammaticalFeatureAnnotator()]

# Second grammar: definition rules plus the filter rules.
rules_2 = rules_definition + rules_determiner + rules_filter + rules_be + rules_wordsenses

grammar_2 = Grammar(rules=rules_2, annotators=annotators)

Created grammar with 74 rules.


In [745]:
filter_utterances = ["tell me the first one", "tell me the mathematical one only",
                         "tell me more examples", "what about more examples for the first one",
                         "show me more examples for the chemistry one", "show me more examples as a noun",
                         "show me more examples of home as a verb", "more examples of home as a verb",
                         "in the field of biology, what is the definition",
                         "what does it mean in the field of computer science",
                         "tell me the singular form",
                         "more examples",
                         "what are the available parts of speech"]

print_parse(grammar_2, filter_utterances)

For the utterance tell me the first one:
{'intent': 'filter', 'filtertype': 'semantic', 'value': 'first'}
{'intent': 'filter', 'filtertype': 'number', 'value': 1}
{'intent': 'definition', 'np': 'the first one'}
{'intent': 'definition', 'np': 'first one'}
For the utterance tell me the mathematical one only:
{'intent': 'filter', 'filtertype': 'semantic', 'value': 'mathematical'}
{'intent': 'definition', 'np': 'the mathematical one only'}
{'intent': 'definition', 'np': 'mathematical one only'}
For the utterance tell me more examples:
{'intent': 'filter', 'variant': 'example'}
{'intent': 'definition', 'np': 'more examples'}
For the utterance what about more examples for the first one:
{'intent': 'filter', 'filtertype': 'semantic', 'value': 'first', 'variant': 'example'}
{'intent': 'filter', 'filtertype': 'number', 'value': 1, 'variant': 'example'}
For the utterance show me more examples for the chemistry one:
{'intent': 'filter', 'filtertype': 'semantic', 'value': 'chemistry', 'variant': '

### Related intent

- What are possible synonyms?
- What are its opposites?
- What are related words? (generic)

In [713]:
# Rules for the "related" intent: derived words, synonyms and antonyms.
rules_derived = [
    # A related query is a valid root; tag its semantics with the intent.
    Rule('$ROOT', '$RelatedQuery', lambda sems: merge_dicts({'intent': 'related'}, sems[0])),
    
    Rule('$RelatedQuery', '?$ShowVerb $RelatedQueryElements', sems_1),
    # What are related senses?
    Rule('$RelatedQuery', 'what $Be $RelatedQueryElements', sems_2),
    Rule('$RelatedQuery', 'which $Be $RelatedQueryElements', sems_2),
    # What senses are related
    Rule('$RelatedQuery', 'what $Word $Be $RelatedQueryElements', lambda sems: sems[3]),
    Rule('$RelatedQuery', '$RelatedQueryElements', sems_0),
    # "<determiner/more> <derived> <word>": only $Derived carries semantics.
    Rule('$RelatedQueryElements', '?$Determiner $Derived ?$Word', sems_1),
    Rule('$RelatedQueryElements', '?$More $Derived ?$Word', sems_1),
    # "<determiner/more> <quality> <derived>": merge the quality's category
    # with the derived filter type.
    Rule('$RelatedQueryElements', '?$Determiner $Quality $Derived', lambda sems: merge_dicts(sems[1], sems[2])),
    Rule('$RelatedQueryElements', '?$More $Quality $Derived', lambda sems: merge_dicts(sems[1], sems[2])),
    
    
     # some examples of the derived words
    Rule('$RelatedQueryElements', '?$More $Extra $StopWord ?$Determiner $Derived ?$Word',
         lambda sems: merge_dicts(sems[1], sems[4])),
         
    # Relation keywords and the filter type each one selects.
    Rule('$Derived', 'derived', {'filtertype': 'derived'}),
    Rule('$Derived', 'synonym', {'filtertype': 'synonym'}),
    Rule('$Derived', 'synonyms', {'filtertype': 'synonym'}),
    Rule('$Derived', 'antonym', {'filtertype': 'antonym'}),
    Rule('$Derived', 'opposites', {'filtertype': 'antonym'}),
    Rule('$Derived', 'antonyms', {'filtertype': 'antonym'}),
    
    # Any unquoted token may act as a quality; it is reported as a category.
    Rule('$Quality', '$UnquotedToken', lambda sems: {'category': sems[0]}),
]

# Nouns accepted as $Word (singular and plural).
rules_words = [
    Rule('$Word', noun)
    for noun in ('word', 'words', 'lexeme', 'lexemes', 'lemma', 'lemmas')
]


In [717]:
# Third grammar: definition, filter and related-word rules together.
rules_3 = rules_be + rules_definition + rules_determiner + rules_filter + rules_words + rules_wordsenses + rules_derived


# Built with the `annotators` list currently in scope.
grammar_3 = Grammar(rules=rules_3, annotators=annotators)

Created grammar with 98 rules.


In [718]:
related_utterances = ["what are derived words", "what are possible derived words",
                          "show me some antonyms", "show me some stylish synonyms",
                          "what are some opposites"]

print_parse(grammar_3, related_utterances)

For the utterance what are derived words:
{'intent': 'related', 'filtertype': 'derived'}
{'intent': 'related', 'filtertype': 'derived'}
{'intent': 'definition', 'np': 'derived words'}
For the utterance what are possible derived words:
{'intent': 'related', 'filtertype': 'derived'}
{'intent': 'definition', 'np': 'possible derived words'}
For the utterance show me some antonyms:
{'intent': 'related', 'filtertype': 'antonym'}
{'intent': 'definition', 'np': 'some antonyms'}
For the utterance show me some stylish synonyms:
{'intent': 'definition', 'np': 'some stylish synonyms'}
For the utterance what are some opposites:
{'intent': 'related', 'filtertype': 'antonym'}
{'intent': 'definition', 'np': 'some opposites'}


## Interlude: Question Answering demo

For now we'll ignore the fact that a parsed sentence may (and usually does) yield multiple semantics.

Instead, we'll hardcode a "simple" priority choice: take the semantics with the greatest number of keys. It should work in a number of situations.

In case the choices are only definitions, pick the one with the shortest np.

In case I have to choose between two filters, always prefer the number type.

In [696]:
print_parse(grammar_3, ["what are definitions for friend"])

For the utterance what are definitions for friend:
{'intent': 'definition', 'np': 'definitions for friend'}


In [735]:
from tools.providers import WiktionaryProvider
from tools.answering import QuestionAnsweringContext, DefinitionIntent, FilterIntent
from IPython.core.display import display, HTML

provider = WiktionaryProvider()

def pick_best_semantics(parses):
    """
    Return the semantics of the most likely parse, or {} if there is none.

    This is a simple heuristic stub, not a learned model, so use with care:

    - if every candidate is a definition, pick the one with the shortest
      noun phrase;
    - otherwise pick the candidate with the most keys, breaking ties by
      filter type (number > semantic > grammatical > anything else).
    """
    if not parses:
        return {}
    semantics = [parse.semantics for parse in parses]

    if all(candidate["intent"] == "definition" for candidate in semantics):
        return min(semantics, key=lambda candidate: len(candidate["np"]))

    priority = {'grammatical': 1, 'semantic': 2, 'number': 3}

    def score(candidate):
        # Filter types without an explicit priority (e.g. 'derived',
        # 'synonym', 'antonym') rank lowest instead of raising KeyError.
        return len(candidate) * 10 + priority.get(candidate.get('filtertype'), 0)

    return max(semantics, key=score)

context = QuestionAnsweringContext()

def answer_question(grammar: Grammar, question: str):
    """Parse `question` with `grammar` and display the answer for its intent.

    The question is lower-cased and a trailing ".", "?" or "!" is removed
    before parsing. The chosen semantics is printed. Definition intents and
    number filters are answered through the shared `context`; anything else,
    including a question with no parse, displays nothing.
    """
    question = question.lower()

    for eos in [".", "?", "!"]:
        question = remove_suffix(question, eos)

    parses = grammar.parse_input(question)
    best_semantics = pick_best_semantics(parses)

    print(best_semantics)

    # `best_semantics` is {} when nothing parsed, so look keys up defensively.
    intent = best_semantics.get('intent')
    if intent == 'definition':
        display(HTML(context.handle_intent(DefinitionIntent(best_semantics['np'])).message))
    elif intent == 'filter':
        # The filter rules store the kind under 'filtertype' (not 'type').
        if best_semantics.get('filtertype') == 'number':
            display(HTML(context.handle_intent(FilterIntent('single', best_semantics['value'])).message))
        # TODO: the other filter types are not handled yet.

In [736]:
pick_best_semantics(grammar_2.parse_input('show me some examples'))

[{'intent': 'filter', 'variant': 'example'}, {'intent': 'definition', 'np': 'some examples'}]


{'intent': 'filter', 'variant': 'example'}

In [737]:
pick_best_semantics(grammar_2.parse_input('more examples for the botanics one'))

[{'intent': 'filter', 'filtertype': 'semantic', 'value': 'botanics', 'variant': 'example'}, {'intent': 'filter', 'filtertype': 'semantic', 'value': 'botanics', 'variant': 'example'}]


{'intent': 'filter',
 'filtertype': 'semantic',
 'value': 'botanics',
 'variant': 'example'}

In [738]:
pick_best_semantics(grammar_2.parse_input('more examples for the first one'))

[{'intent': 'filter', 'filtertype': 'semantic', 'value': 'first', 'variant': 'example'}, {'intent': 'filter', 'filtertype': 'semantic', 'value': 'first', 'variant': 'example'}, {'intent': 'filter', 'filtertype': 'number', 'value': 1, 'variant': 'example'}, {'intent': 'filter', 'filtertype': 'number', 'value': 1, 'variant': 'example'}]


{'intent': 'filter', 'filtertype': 'number', 'value': 1, 'variant': 'example'}

In [739]:
pick_best_semantics(grammar_2.parse_input('show me more examples for the first one'))

[{'intent': 'filter', 'filtertype': 'semantic', 'value': 'first', 'variant': 'example'}, {'intent': 'filter', 'filtertype': 'number', 'value': 1, 'variant': 'example'}, {'intent': 'definition', 'np': 'more examples for the first one'}]


{'intent': 'filter', 'filtertype': 'number', 'value': 1, 'variant': 'example'}

In [740]:
pick_best_semantics(grammar_2.parse_input('define butterfly'))

[{'intent': 'definition', 'np': 'butterfly'}]


{'intent': 'definition', 'np': 'butterfly'}

## Sklearn evaluation

In [746]:
import json
import pandas as pd

with wrap_open("intents/sample.json") as fp:
    dataset = json.load(fp)

In [748]:
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.pipeline import Pipeline
import copy

class SamplePreprocessor(BaseEstimator, TransformerMixin):
    """Turn the raw dataset into sessions whose turns are bare utterances."""

    def fit(self, X=None, Y=None):
        """Nothing to learn; return self."""
        return self

    def transform(self, dataset):
        """Return a deep copy of dataset['sessions'] with turns as utterances.

        Each turn must carry both a 'target' and an 'utterance' key; the
        input dataset is left untouched.
        """
        sessions = copy.deepcopy(dataset)['sessions']
        for session in sessions:
            utterances = []
            for turn in session['turns']:
                turn.pop('target')  # raises KeyError when the target is missing
                utterances.append(turn['utterance'])
            session['turns'] = utterances
        return sessions

# Model input: sessions with each turn reduced to its utterance.
sessions = SamplePreprocessor().fit_transform(dataset)
# Ground truth: the 'target' of every turn, grouped by session.
targets = [[turn['target'] for turn in session['turns']] for session in dataset['sessions']]

In [751]:
class GrammarMatcher(BaseEstimator, TransformerMixin):
    """Pipeline step that parses every utterance with a fixed grammar."""

    def __init__(self, grammar: Grammar):
        self.grammar = grammar

    def fit(self, sessions=None, targets=None):
        """Nothing to learn; return self."""
        return self

    def transform(self, sessions):
        """
        Parse each utterance of each session.

        `sessions` is expected to be the output of SamplePreprocessor. The
        result has one list per session, holding the list of parses of each
        turn.
        """
        return [
            [self.grammar.parse_input(utterance) for utterance in session['turns']]
            for session in sessions
        ]

In [752]:
class GreedyMatcher(BaseEstimator, ClassifierMixin):
    """Final pipeline step: pick one semantics per turn with `pick_best_semantics`."""

    def fit(self, sessions=None, target=None):
        """Nothing to learn; return self."""
        return self

    def predict(self, sessions):
        """Return, for every session, the chosen semantics of each turn.

        `sessions` holds, per session, one list of parses per turn. A turn
        whose parses cannot be ranked yields {} (best effort).
        """
        target = []
        for session in sessions:
            turns = []
            for parses in session:
                try:
                    turns.append(pick_best_semantics(parses))
                except (KeyError, TypeError, AttributeError):
                    # Malformed or incomplete semantics. The previous
                    # `except (e):` referenced an undefined name and turned
                    # any failure into a NameError.
                    turns.append({})
            target.append(turns)
        return target

In [753]:
# End-to-end pipeline: raw dataset -> utterances -> parses -> chosen semantics.
greedy_matcher = Pipeline([('preprocessor', SamplePreprocessor()),
                                ('grammar_matcher', GrammarMatcher(grammar_2)),
                                ('picker', GreedyMatcher())])

# Every step's fit() just returns self; predictions are computed on the same
# dataset.
greedy_matcher.fit(dataset, targets)
output = greedy_matcher.predict(dataset)

[{'intent': 'definition', 'np': 'pi'}]
[{'intent': 'definition', 'np': 'apple'}]
[{'intent': 'definition', 'np': 'the definition of botanics'}, {'intent': 'definition', 'np': 'definition of botanics'}, {'intent': 'definition', 'np': 'botanics'}]
[{'intent': 'definition', 'np': 'mechanophilia'}]
[{'intent': 'definition', 'np': 'apollo'}]
[{'intent': 'definition', 'is_person': True, 'np': 'apollo'}]
[{'intent': 'definition', 'np': 'all examples for stashes'}]
[{'intent': 'filter', 'filtertype': 'semantic', 'value': 'first'}, {'intent': 'filter', 'filtertype': 'number', 'value': 1}, {'intent': 'definition', 'np': 'the first one'}, {'intent': 'definition', 'np': 'first one'}]
[{'intent': 'filter', 'filtertype': 'semantic', 'value': 'mathematical'}, {'intent': 'definition', 'np': 'the mathematical one only'}, {'intent': 'definition', 'np': 'mathematical one only'}]
[{'intent': 'filter', 'filtertype': 'semantic', 'value': 'first', 'variant': 'example'}, {'intent': 'filter', 'filtertype': 'nu

In [758]:
# A bunch of evaluation functions

from sklearn.metrics import precision_recall_fscore_support
import numpy as np

# "perfect" scorers, because they check whether two values are equal
def perfect_accuracy(y_truth, y_pred, debug=False, debug_x=None):
    """Average per-turn fraction of expected key/value pairs that were predicted.

    Args:
        y_truth: expected semantics, one list of dicts per session.
        y_pred: predicted semantics, with the same nesting.
        debug: when true, print every (expected, predicted) pair that is not
            a perfect match.
        debug_x: currently unused; kept for call compatibility. It now has a
            default because a parameter without one cannot follow `debug`.

    Returns:
        The mean score over all turns. A turn scores matched / expected
        pairs; a turn with empty expected semantics scores 1.0 only when the
        prediction is empty too.
    """
    # Flatten the lists-of-lists into one list of turns each.
    truths = [turn for session in y_truth for turn in session]
    predictions = [turn for session in y_pred for turn in session]

    score = []
    for y_t, y_p in zip(truths, predictions):
        if len(y_t) == 0:
            # Nothing was expected: correct only if nothing was predicted.
            score.append(1.0 if len(y_p) == 0 else 0.0)
        else:
            # Compare pair by pair so unhashable values (lists) are supported.
            matched_items = sum(
                1 for key, value in y_t.items()
                if key in y_p and y_p[key] == value
            )
            score.append(matched_items / len(y_t))
        if debug and score[-1] < 1.0:
            print(y_t, y_p)

    return np.average(score)

def intent_match_score(y_truth, y_pred):
    """Precision, recall, F-score and support computed on intent labels only.

    Both arguments are lists of sessions, each a list of semantics dicts.
    An empty semantics is scored under the label ''.
    """
    def intent_labels(sessions):
        flattened = sum(sessions, [])
        return [turn['intent'] if len(turn) else '' for turn in flattened]

    predicted = intent_labels(y_pred)
    expected = intent_labels(y_truth)
    return precision_recall_fscore_support(expected, predicted)

perfect_accuracy(targets, output, debug=True)

{'intent': 'definition', 'np': 'desire', 'quantifier': 'all'} {}
{'intent': 'definition', 'np': 'stashes', 'all': True, 'variant': 'example'} {'intent': 'definition', 'np': 'all examples for stashes'}
{'intent': 'filter', 'filtertype': 'semantic', 'value': 'first', 'variant': 'example'} {'intent': 'filter', 'filtertype': 'number', 'value': 1, 'variant': 'example'}
{'intent': 'filter', 'filtertype': 'semantic', 'value': 'first', 'variant': 'example'} {'intent': 'filter', 'filtertype': 'number', 'value': 1, 'variant': 'example'}
{'intent': 'filter', 'filtertype': 'grammatical', 'required_pos': 'noun', 'variant': 'example'} {'intent': 'filter', 'filtertype': 'grammatical', 'requiredPos': 'noun', 'variant': 'example'}
{'intent': 'filter', 'filtertype': 'grammatical', 'requiredForm': 'singular'} {'intent': 'filter', 'filtertype': 'grammatical', 'grammaticalFeature': 'singular'}
{'intent': 'define', 'np': 'home'} {'intent': 'definition', 'np': 'home'}
{'intent': 'define', 'np': 'home'} {'int

0.34180790960451973

## Question answering

In [390]:
answer_question(grammar_2, "define butterfly")

{'intent': 'definition', 'np': 'butterfly'}
Current state of the entities:  <tools.answering.DefinitionEntity object at 0x7efc2b793a90>
Serializing an answer here...


In [50]:
answer_question(grammar_2, "more about the second one")

{'intent': 'filter', 'type': 'number', 'value': 2}
Current state of the entities:  <tools.answering.DefinitionEntity object at 0x7efc2b793a90>


ValueError: empty range for randrange() (1,1, 0)

In [742]:
!pip3 show scikit-learn

Name: scikit-learn
Version: 0.23.1
Summary: A set of python modules for machine learning and data mining
Home-page: http://scikit-learn.org
Author: None
Author-email: None
License: new BSD
Location: /usr/local/lib/python3.7/dist-packages
Requires: scipy, numpy, threadpoolctl, joblib
Required-by: sklearn, sentence-transformers
