# Data Generators

In [15]:
import duckdb
import jinja2 as j2
import re
import json
from typing import Generator
import pydantic
from itertools import islice
from ruamel.yaml import YAML
from itertools import islice

OSM_TABLENAME = "osm"
ENV = j2.Environment()

def quote(value: str) -> str:
    value = re.sub(r"'", r"\\'", value)
    return f"'{value}'"

ENV.filters["quote"] = quote

class TextToSQL(pydantic.BaseModel):
    sql: str
    text: str

def load_special_phrases() -> Generator[TextToSQL, None, None]:

    df = duckdb.query(
    """
    SELECT
        lower("Word / Phrase") as phrase,
        Key AS key,
        Value AS value
    FROM read_csv_auto('../data/nominatim-special-phrases.csv', HEADER=TRUE)
    WHERE Operator = '-'
    """ 
    ).to_df()
    
    template = """
    SELECT osm_id
    FROM {{tablename}}
    WHERE tags ->> {{key | quote}} = {{value | quote}}
    """
    
    output = []
    for i, row in df.iterrows():
        sql = ENV.from_string(template).render(tablename=OSM_TABLENAME, key=row.key, value=row.value)
        yield TextToSQL(text=row.phrase, sql=sql)

list(islice(load_special_phrases(), 10))

[TextToSQL(sql="\n    SELECT osm_id\n    FROM osm\n    WHERE tags ->> 'aerialway' = 'zip_line'\n    ", text='zip line'),
 TextToSQL(sql="\n    SELECT osm_id\n    FROM osm\n    WHERE tags ->> 'aerialway' = 'zip_line'\n    ", text='zip lines'),
 TextToSQL(sql="\n    SELECT osm_id\n    FROM osm\n    WHERE tags ->> 'aerialway' = 'zip_line'\n    ", text='zip wire'),
 TextToSQL(sql="\n    SELECT osm_id\n    FROM osm\n    WHERE tags ->> 'aerialway' = 'zip_line'\n    ", text='zip wires'),
 TextToSQL(sql="\n    SELECT osm_id\n    FROM osm\n    WHERE tags ->> 'aerialway' = 'zip_line'\n    ", text='zipline'),
 TextToSQL(sql="\n    SELECT osm_id\n    FROM osm\n    WHERE tags ->> 'aerialway' = 'zip_line'\n    ", text='ziplines'),
 TextToSQL(sql="\n    SELECT osm_id\n    FROM osm\n    WHERE tags ->> 'aerialway' = 'zip_line'\n    ", text='zipwire'),
 TextToSQL(sql="\n    SELECT osm_id\n    FROM osm\n    WHERE tags ->> 'aerialway' = 'zip_line'\n    ", text='zipwires'),
 TextToSQL(sql="\n    SELECT osm

In [13]:
def load_id_presets() -> Generator[TextToSQL, None, None]:
    """Yield examples of query text and SQL to run for each preset in the ID tagging schema."""
    tmpl = """
        SELECT *
        FROM osm
        WHERE TRUE
        {% for tag, value in tags.items() %}
        {% if value == "*" %}
        tags ? {{ tag | quote }}
        {% else %}
        tags ->> {{ tag | quote }} = {{ value | quote }}
        {% endif %}
        {% endfor %}
        """    
    with open("../data/id-tagging-schema/presets.json", "r") as f:
        presets = json.load(f)
    with open("../data/id-tagging-schema/translations.en.json", "r") as f:
        translations = json.load(f)

    for k, v in translations["en"]["presets"]["presets"].items():
        if k not in presets:
            print(f"{k} not in presets")
        else:
            presets[k].update(v)

    for k, v in presets.items():
        terms = v.get("terms", "")
        presets[k]["terms"] = [x.strip() for x in terms.split(",") if x.strip() != ""]

    for k, v in presets.items():  
        aliases = v.get("aliases", "")
        presets[k]["aliases"] = [x.strip() for x in aliases.splitlines() if x.strip() != ""]
        
    for p in presets.values():
        queries = [
            p["name"].lower(),
            *[term.lower() for term in p["terms"]],
            *[alias.lower() for alias in p["aliases"]]
        ]
        for q in queries:
            if not q:
                continue
            tags = p["tags"]
            sql = ENV.from_string(tmpl).render(tablename=OSM_TABLENAME, tags=tags)
            yield TextToSQL(text=q, sql=sql)
            
list(islice(load_id_presets(), 10))

[TextToSQL(sql="\n        SELECT *\n        FROM osm\n        WHERE TRUE\n        \n        \n        tags ? 'aerialway'\n        \n        \n        ", text='aerialway feature'),
 TextToSQL(sql="\n        SELECT *\n        FROM osm\n        WHERE TRUE\n        \n        \n        tags ? 'aeroway'\n        \n        \n        ", text='aeroway feature'),
 TextToSQL(sql="\n        SELECT *\n        FROM osm\n        WHERE TRUE\n        \n        \n        tags ? 'amenity'\n        \n        \n        ", text='amenity'),
 TextToSQL(sql="\n        SELECT *\n        FROM osm\n        WHERE TRUE\n        \n        \n        tags ? 'attraction'\n        \n        \n        ", text='attraction'),
 TextToSQL(sql="\n        SELECT *\n        FROM osm\n        WHERE TRUE\n        \n        \n        tags ? 'boundary'\n        \n        \n        ", text='boundary'),
 TextToSQL(sql="\n        SELECT *\n        FROM osm\n        WHERE TRUE\n        \n        \n        tags ? 'building'\n        \n 

In [16]:
yaml = YAML()

def load_queries() -> Generator[TextToSQL, None, None]:
    """Yield some manually defined queries"""
    with open("../data/queries.yml", "r") as f:
        queries = yaml.load(f)
    tmpl = """
    SELECT *
    FROM osm
    WHERE FALSE
    {% for tag in tags %} {# assume at least one tag #}
    {% if tag is string %}
    OR {{ tag }}
    {% else %}
    OR (TRUE
        {% for tv in tag %}
        AND {{ tv }}
        {%  endfor %}
    )
    {% endif %}
    {% endfor %}
    ;
    """
    for q in queries:
        tags = q["tags"]
        sql = ENV.from_string(tmpl).render(tags=tags)
        yield TextToSQL(text=q["text"], sql=sql)

list(islice(load_queries(), 10))

[TextToSQL(sql='\n    SELECT *\n    FROM osm\n    WHERE FALSE\n     \n    \n    OR power=line\n    \n     \n    \n    OR power=minor_line\n    \n     \n    \n    OR power=cable\n    \n    \n    ;\n    ', text='power lines'),
 TextToSQL(sql='\n    SELECT *\n    FROM osm\n    WHERE FALSE\n     \n    \n    OR power=plant\n    \n    \n    ;\n    ', text='powerplants'),
 TextToSQL(sql='\n    SELECT *\n    FROM osm\n    WHERE FALSE\n     \n    \n    OR power=plant\n    \n    \n    ;\n    ', text='power plants'),
 TextToSQL(sql='\n    SELECT *\n    FROM osm\n    WHERE FALSE\n     \n    \n    OR power=substation\n    \n    \n    ;\n    ', text='power stations'),
 TextToSQL(sql='\n    SELECT *\n    FROM osm\n    WHERE FALSE\n     \n    \n    OR cusisine=ethiopian\n    \n    \n    ;\n    ', text='ethiopian'),
 TextToSQL(sql='\n    SELECT *\n    FROM osm\n    WHERE FALSE\n     \n    \n    OR cusisine=ethiopian\n    \n    \n    ;\n    ', text='ethiopian food'),
 TextToSQL(sql='\n    SELECT *\n    