# Parsing tools to analyze free text queries

## Imports and model downloads

In [None]:
# Uncomment and run cell if Stanza is not installed
# Stanza - tokenizing, POS tagging, named entities (Apache v2 license)
#!pip install stanza

In [None]:
import stanza
stanza.download('en')       # This downloads the English models for the neural pipeline
# Build the English pipeline once; get_nouns() and get_conditions() below call it as nlp(text)
nlp = stanza.Pipeline('en')

In [None]:
# Uncomment and run cell if NLTK is not installed
# !pip install nltk

In [None]:
import nltk
# Download NLTK's 'popular' data collection.
# NOTE(review): presumably this covers the tokenizer and POS-tagger data that get_chunks() relies on — confirm
nltk.download('popular')

In [None]:
# Uncomment and run cell if GeoText is not installed
# geotext - detect places (MIT license)
# geograpy is better, but has dependency issues
# !pip install geotext


In [None]:
from geotext import GeoText

### Function definitions

In [None]:
def get_nouns(text):
    """
    Return the words of the query tagged as nouns or proper nouns.

    A period is appended to the text before it is run through the Stanza
    pipeline. The matching words are returned as a flat list of strings,
    in document order.
    """
    noun_tags = {"PROPN", "NOUN"}
    parsed = nlp(text + ".")
    return [
        token.text
        for sent in parsed.sentences
        for token in sent.words
        if token.pos in noun_tags
    ]


def get_chunks(text):
    """
    Extract noun chunks from the text with NLTK.

    The text is split into sentences; each sentence is tokenized and
    POS-tagged, then matched against a chunk grammar: one NN or NNP token,
    optionally followed by a CD token.

    Returns a flat list of the token strings inside the matched chunks,
    in the order they appear.
    """
    # The grammar is the same for every sentence, so the parser is built
    # once up front rather than once per sentence.
    chunker = nltk.RegexpParser("CHUNK: {<NN|NNP><CD>?}")

    chunks = []
    for sent in nltk.sent_tokenize(text):
        tagged = nltk.pos_tag(nltk.word_tokenize(sent))
        tree = chunker.parse(tagged)
        for subtree in tree.subtrees():
            if subtree.label() == 'CHUNK':
                # children of a chunk are (token, tag) pairs; keep the token
                chunks.extend(leaf[0] for leaf in subtree)
    return chunks


def get_where(text):
    """
    Return the first geographical place GeoText detects in the text.

    A detected city takes precedence over a detected country. Returns
    None when neither is found.
    """
    found = GeoText(text)
    # return bbox of place?
    cities = found.cities
    if cities:
        return cities[0]
    countries = found.countries
    if countries:
        return countries[0]
    return None

def get_conditions(text):
    """
    Collect the numeric entities Stanza detects in the text.

    Only entities typed PERCENT, CARDINAL, ORDINAL or QUANTITY are kept;
    their text is returned as a flat list, in document order.
    """
    numeric_types = {"PERCENT", "CARDINAL", "ORDINAL", "QUANTITY"}
    # create chunk matching rules?
    parsed = nlp(text)
    return [
        entity.text
        for sent in parsed.sentences
        # sentences with no (or falsy) entities contribute nothing
        for entity in (sent.ents or ())
        if entity.type in numeric_types
    ]


def process_query(text):
    """
    Parse a free-text query into a dictionary of extracted parameters.

    Keys of the returned dictionary:
        'what'       - platform/collection: nouns from get_nouns
        'where'      - GeoNER: place from get_where (may be None)
        'conditions' - other variables: entities from get_conditions
    """
    return {
        'what': get_nouns(text),
        'where': get_where(text),
        'conditions': get_conditions(text),
    }


## Input query

In [None]:
# Read the query text from standard input.
# Example: Sentinel-2 over Ottawa from april to september 2020 with cloud cover less than 20%
query=input()

In [None]:
# Extract the what/where/conditions parameters from the query entered above
process_query(query) 