In [29]:
from pprint import pprint
from NER_spacy import NER_spacy
from NER_flair import NER_flair
from TER_heideltime import TER_heideltime
from Vars_values_textsearch import Vars_values_textsearch

In [1]:
# Natural-language request shared by the NER / TER demo cells below.
# The exact characters matter: the annotators report character offsets into it.
query = (
    "Sentinel-2 over Ottawa "
    "from april to september 2020 "
    "with cloud cover lower than 10%"
)

In [25]:
# Demo: run the spaCy-backed converter (NER_spacy) on `query`.
# `query` is defined in an earlier cell, so that cell must be executed first.
print("Initializing NER Spacy...")
my_instance = NER_spacy()
# Turn the natural-language query into a structured query object.
# The header is printed first because the detections shown under it are
# emitted while transform_nl2query runs (there is no other print in this cell).
print("\nNER Spacy detected:")
structq = my_instance.transform_nl2query(query)
# Pretty-print whatever to_dict() returns for the structured query.
print("\nStructured query: ")
pprint(structq.to_dict())

Initializing NER Spacy...

NER Spacy detected:
Ottawa 16 22 GPE
april to september 2020 28 51 DATE
Need a datestring parser to get the actual value. We don't know in what format the string date is!
lower than 10 69 82 PERCENT
Need a parser to detect operation! ex: less than -> lt. Default is 'eq'.

Structured query: 
{'annotations': [{'matchingType': 'overlap',
                  'name': 'Ottawa',
                  'position': [16, 22],
                  'text': 'Ottawa',
                  'type': 'location',
                  'value': {'coordinates': [[[-76.36311485399995,
                                              44.94455157800007],
                                             [-75.23249627099995,
                                              45.544858986000065]]],
                            'type': 'Polygon'}},
                 {'position': [28, 51],
                  'target': 'dataDate',
                  'tempex_type': 'range',
                  'text': 'april to september 20

In [24]:
# Demo: run the Flair-backed converter (NER_flair) on the same `query`.
# NOTE: this rebinds `my_instance` and `structq`, which the spaCy cell also assigns.
print("Initializing NER Flair...")
my_instance = NER_flair()
# Turn the natural-language query into a structured query object.
# The header is printed first because the detections shown under it are
# emitted while transform_nl2query runs (there is no other print in this cell).
print("\nNER Flair detected:")
structq = my_instance.transform_nl2query(query)
# Pretty-print whatever to_dict() returns for the structured query.
print("\nStructured query: ")
pprint(structq.to_dict())

Initializing NER Flair...
2021-07-05 12:03:57,831 --------------------------------------------------------------------------------
2021-07-05 12:03:57,832 The model key 'ner-large' now maps to 'https://huggingface.co/flair/ner-english-large' on the HuggingFace ModelHub
2021-07-05 12:03:57,834  - The most current version of the model is automatically downloaded from there.
2021-07-05 12:03:57,835 --------------------------------------------------------------------------------
2021-07-05 12:03:58,000 loading file /home/timea/.flair/models/ner-english-large/07301f59bb8cb113803be316267f06ddf9243cdbba92a4c8067ef92442d2c574.554244d3476d97501a766a98078421817b14654496b86f2f7bd139dc502a4f29

NER Flair detected:
{'text': 'Ottawa', 'start_pos': 16, 'end_pos': 22, 'labels': [LOC (1.0)]}

Structured query: 
{'annotations': [{'matchingType': 'overlap',
                  'name': 'Ottawa',
                  'position': [16, 22],
                  'text': 'Ottawa',
                  'type': 'location',

In [32]:
# Demo: run the HeidelTime-backed converter (TER_heideltime) on `query`.
# The constructor is given the config file name 'heideltime_config.cfg'
# (a relative path; presumably resolved against the working directory -- confirm).
print("Initializing TER Heideltime...")
my_instance = TER_heideltime('heideltime_config.cfg')
# Turn the natural-language query into a structured query object.
# NOTE: this rebinds `my_instance` and `structq`, which the NER cells also assign.
structq = my_instance.transform_nl2query(query)
# Pretty-print whatever to_dict() returns for the structured query.
print("\nStructured query: ")
pprint(structq.to_dict())

Initializing TER Heideltime...
Reading config file:  heideltime_config.cfg
Heideltime returned:
 <?xml version="1.0"?>
<!DOCTYPE TimeML SYSTEM "TimeML.dtd">
<TimeML>
Sentinel-2 over Ottawa from <TIMEX3INTERVAL earliestBegin="2020-04-01T00:00:00" latestBegin="2020-04-30T23:59:59" earliestEnd="2020-09-01T00:00:00" latestEnd="2020-09-30T23:59:59"><TIMEX3 tid="t5" type="DATE" value="2020-04">april</TIMEX3> to <TIMEX3 tid="t4" type="DATE" value="2020-09">september 2020</TIMEX3></TIMEX3INTERVAL> with cloud cover lower than 10%
</TimeML>


april to september 2020 TIMEX3INTERVAL {'earliestBegin': '2020-04-01T00:00:00', 'latestBegin': '2020-04-30T23:59:59', 'earliestEnd': '2020-09-01T00:00:00', 'latestEnd': '2020-09-30T23:59:59'}
april TIMEX3 {'tid': 't5', 'type': 'DATE', 'value': '2020-04'}
september 2020 TIMEX3 {'tid': 't4', 'type': 'DATE', 'value': '2020-09'}

Structured query: 
{'annotations': [{'position': [28, 51],
                  'target': 'dataDate',
                  'tempex_type': '

In [30]:
# Demo: vocabulary-based detection of variables and values.
print("Initializing Vars-values Textsearch...")
myvarval = Vars_values_textsearch("varval_config.cfg")

# This demo uses its own query, worded with climate-specific vocabulary,
# instead of the shared `query` used by the NER / TER cells.
nlquery = (
    "I want the CO2 concentrations and total canopy water storage "
    "used to force the CMIP6 models cell area in ScenarioMIP"
)

# Header goes first: the matches listed under it are emitted during the call.
print("\nTextsearch detected:")
structq = myvarval.transform_nl2query(nlquery)

# Pretty-print whatever to_dict() returns for the structured query.
print("\nStructured query: ")
pprint(structq.to_dict())

Initializing Vars-values Textsearch...
Reading config file:  varval_config.cfg
Adding to textsearch engine vocabulary words for:  cmip6
Adding to textsearch engine vocabulary words for:  peps
Adding to textsearch engine vocabulary words for:  copernicus
Adding to textsearch engine vocabulary words for:  paviccs
Adding to textsearch engine vocabulary words for:  cf_standard_names
Vocabularies successfully added to textsearch engine!

Textsearch detected:
Searching vocabulary words...
TSResult(match='I', norm='I', start=0, end=1, case='upper', is_exact=False)
TSResult(match='CMIP6', norm='CMIP6', start=79, end=84, case='upper', is_exact=False)
TSResult(match='ScenarioMIP', norm='ScenarioMIP', start=105, end=116, case='mixed', is_exact=False)
TSResult(match='CO2', norm='co2', start=11, end=14, case='upper', is_exact=False)
TSResult(match='total canopy water storage', norm='Total Canopy Water Storage', start=34, end=60, case='lower', is_exact=False)
TSResult(match='cell area', norm='cell a