# Data Generators

Examples of generating SQL queries for keywords and 

- https://github.com/openstreetmap/id-tagging-schema
- https://wiki.openstreetmap.org/wiki/Nominatim/Special_Phrases
- hand-generated queries from 

These can be used for training or validation of retrieval.

In [221]:
import duckdb
import jinja2 as j2
import re
import json
from typing import Generator
import pydantic
from itertools import islice
from ruamel.yaml import YAML
from itertools import islice
from typing import Generator

OSM_TABLENAME = "osm"
ENV = j2.Environment()

def quote(value: str) -> str:
    value = re.sub(r"'", r"\\'", value)
    return f"'{value}'"

ENV.filters["quote"] = quote

class TextToSQL(pydantic.BaseModel):
    sql: str
    text: str

def load_special_phrases() -> Generator[TextToSQL, None, None]:

    df = duckdb.query(
    """
    SELECT
        lower("Word / Phrase") as phrase,
        Key AS key,
        Value AS value
    FROM read_csv_auto('../data/nominatim-special-phrases.csv', HEADER=TRUE)
    WHERE Operator = '-'
    """ 
    ).to_df()
    
    template = """
    SELECT osm_id
    FROM {{tablename}}
    WHERE tags ->> {{key | quote}} = {{value | quote}}
    """
    
    output = []
    for i, row in df.iterrows():
        sql = ENV.from_string(template).render(tablename=OSM_TABLENAME, key=row.key, value=row.value)
        yield TextToSQL(text=row.phrase, sql=sql)

list(islice(load_special_phrases(), 10))

[TextToSQL(sql="\n    SELECT osm_id\n    FROM osm\n    WHERE tags ->> 'aerialway' = 'zip_line'\n    ", text='zip line'),
 TextToSQL(sql="\n    SELECT osm_id\n    FROM osm\n    WHERE tags ->> 'aerialway' = 'zip_line'\n    ", text='zip lines'),
 TextToSQL(sql="\n    SELECT osm_id\n    FROM osm\n    WHERE tags ->> 'aerialway' = 'zip_line'\n    ", text='zip wire'),
 TextToSQL(sql="\n    SELECT osm_id\n    FROM osm\n    WHERE tags ->> 'aerialway' = 'zip_line'\n    ", text='zip wires'),
 TextToSQL(sql="\n    SELECT osm_id\n    FROM osm\n    WHERE tags ->> 'aerialway' = 'zip_line'\n    ", text='zipline'),
 TextToSQL(sql="\n    SELECT osm_id\n    FROM osm\n    WHERE tags ->> 'aerialway' = 'zip_line'\n    ", text='ziplines'),
 TextToSQL(sql="\n    SELECT osm_id\n    FROM osm\n    WHERE tags ->> 'aerialway' = 'zip_line'\n    ", text='zipwire'),
 TextToSQL(sql="\n    SELECT osm_id\n    FROM osm\n    WHERE tags ->> 'aerialway' = 'zip_line'\n    ", text='zipwires'),
 TextToSQL(sql="\n    SELECT osm

In [2]:

def load_id_presets() -> Generator[TextToSQL, None, None]:
    """Yield examples of query text and SQL to run for each preset in the ID tagging schema."""
    tmpl = """
        SELECT *
        FROM osm
        WHERE TRUE
        {%- for tag, value in tags.items() %}
        {%- if value == "*" %}
        AND tags ? {{ tag | quote }}
        {%- else %}
        AND tags ->> {{ tag | quote }} = {{ value | quote }}
        {%- endif %}
        {%- endfor %}
        AND (
            FALSE
            {%- if 'area' in geometry %}
            OR (osm_type = 'W' AND ST_GeometryType(geometry) = 'ST_Polygon')
            {%- endif %}
            {%- if 'line' in geometry %}
            OR (osm_type = 'W' AND ST_GeometryType(geometry) = 'ST_LineString')
            {%- endif %}
            {%- if 'point' in geometry %}
            OR (osm_type = 'N')
            {%- endif %}
            {#- TODO figure out what a vertex is #}
            {%- if 'vertex' in geometry %}
            OR (osm_type = 'N')
            {%- endif %}            
            {%- if 'point' in geometry %}
            OR (osm_type = 'R')
            {%- endif %}            
        )
        """    
    with open("../data/id-tagging-schema/presets.json", "r") as f:
        presets = json.load(f)
    with open("../data/id-tagging-schema/translations.en.json", "r") as f:
        translations = json.load(f)
    # Preset fallbacks are some generic presets that are used in the ID schema
    # but not defined in the presets.json file.
    with open("../data/id-tagging-schema/preset-fallbacks.json", "r") as f:
        defaults = json.load(f)        

    for k, v in translations["en"]["presets"]["presets"].items():
        if k not in presets:
            print(f"{k} not in presets")
        else:
            presets[k].update(v)

    for k, v in presets.items():
        terms = v.get("terms", "")
        presets[k]["terms"] = [x.strip() for x in terms.split(",") if x.strip() != ""]

    for k, v in presets.items():  
        aliases = v.get("aliases", "")
        presets[k]["aliases"] = [x.strip() for x in aliases.splitlines() if x.strip() != ""]

    presets.update(defaults)
        
    for p in presets.values():
        queries = [
            p["name"].lower(),
            *[term.lower() for term in p["terms"]],
            *[alias.lower() for alias in p["aliases"]]
        ]
        for q in queries:
            if not q:
                continue
            tags = p["tags"]
            sql = ENV.from_string(tmpl).render(tablename=OSM_TABLENAME, tags=tags, geometry=p["geometry"])
            yield TextToSQL(text=q, sql=sql)
            
list(islice(load_id_presets(), 10))

ImportError: cannot import name 'generator' from 'typing' (/opt/homebrew/Cellar/python@3.10/3.10.8/Frameworks/Python.framework/Versions/3.10/lib/python3.10/typing.py)

In [228]:
yaml = YAML()

def load_queries() -> Generator[TextToSQL, None, None]:
    """Yield some manually defined queries"""
    with open("../data/queries.yml", "r") as f:
        queries = yaml.load(f)
    tmpl = """
    SELECT *
    FROM osm
    WHERE FALSE
    {% for tag in tags %} {# assume at least one tag #}
    {% if tag is string %}
    OR {{ tag }}
    {% else %}
    OR (TRUE
        {% for tv in tag %}
        AND {{ tv }}
        {%  endfor %}
    )
    {% endif %}
    {% endfor %}
    ;
    """
    for q in queries:
        tags = q["tags"]
        sql = ENV.from_string(tmpl).render(tags=tags)
        yield TextToSQL(text=q["text"], sql=sql)

list(islice(load_queries(), 10))

[TextToSQL(sql='\n    SELECT *\n    FROM osm\n    WHERE FALSE\n     \n    \n    OR power=line\n    \n     \n    \n    OR power=minor_line\n    \n     \n    \n    OR power=cable\n    \n    \n    ;\n    ', text='power lines'),
 TextToSQL(sql='\n    SELECT *\n    FROM osm\n    WHERE FALSE\n     \n    \n    OR power=plant\n    \n    \n    ;\n    ', text='powerplants'),
 TextToSQL(sql='\n    SELECT *\n    FROM osm\n    WHERE FALSE\n     \n    \n    OR power=plant\n    \n    \n    ;\n    ', text='power plants'),
 TextToSQL(sql='\n    SELECT *\n    FROM osm\n    WHERE FALSE\n     \n    \n    OR power=substation\n    \n    \n    ;\n    ', text='power stations'),
 TextToSQL(sql='\n    SELECT *\n    FROM osm\n    WHERE FALSE\n     \n    \n    OR cusisine=ethiopian\n    \n    \n    ;\n    ', text='ethiopian'),
 TextToSQL(sql='\n    SELECT *\n    FROM osm\n    WHERE FALSE\n     \n    \n    OR cusisine=ethiopian\n    \n    \n    ;\n    ', text='ethiopian food'),
 TextToSQL(sql='\n    SELECT *\n    