In [4]:
import sqlalchemy as sa
import psycopg2
import pandas as pd
import shapely.wkt
import geopandas as gpd

In [5]:
from app.core.config import get_settings
from sqlalchemy import create_engine

settings = get_settings()
engine = create_engine(settings.db.url.get_secret_value().replace("asyncpg", "psycopg2"),
                          future=True)

In [10]:
from typing import Union, cast, Dict, Any, Generator
from sqlalchemy.engine import Connection, Engine
from sqlalchemy.sql.expression import Executable
from sqlalchemy import text
import pydeck as pdk
from pydeck.data_utils.viewport_helpers import compute_view, bbox_to_zoom_level

from typing import Optional, Union, cast, Dict, Any, Generator


from pydeck.data_utils import compute_view

def geodf_to_viewstate(df: gpd.GeoDataFrame) -> pdk.ViewState:
    bbox = df.total_bounds
    center_lat = (bbox[1] + bbox[3]) / 2
    center_lng = (bbox[0] + bbox[2]) / 2
    zoom_level = bbox_to_zoom_level([list(bbox[:2]), list(bbox[2:])])
    return pdk.ViewState(center_lng, center_lat, zoom_level=zoom_level)

def view_gpd(df: gpd.GeoDataFrame, **kwargs) -> pdk.Deck:
    return pdk.Deck(layers=[pdk.Layer("GeoJsonLayer", df)], 
                    initial_view_state=geodf_to_viewstate(df),
                    **kwargs)

class OsmDb:
    
    def __init__(self, engine: Engine) -> None:
        self._engine = engine

    def run(self, query: Union[str, Executable], **kwargs: Dict[str, Any]) -> Union[gpd.GeoDataFrame, Generator[gpd.GeoDataFrame, None, None]]:
        if isinstance(query, str):
            query = text(query)
        with self._engine.begin() as conn:
            df = gpd.read_postgis(cast(Executable, query), conn, **kwargs)
        return df
    
    def view(self, query: Union[str, Executable], layer_opts: Optional[Dict[str, Any]] = None, **kwargs: Dict[str, Any]) -> pdk.Deck:
        df = self.run(query, **kwargs)
        assert isinstance(df, gpd.GeoDataFrame)
        initial_view_state = geodf_to_viewstate(df)
        return pdk.Deck(layers=[pdk.Layer("GeoJsonLayer", df)], initial_view_state=initial_view_state, **(layer_opts or {}))

osm = OsmDb(engine)

In [11]:
df = osm.run("""
    select st_buildarea(geom) as geom, tags->>'name' AS name
    from osm w
    where category = 'boundary' 
    and tags->>'name' = 'Oakland' 
    and tags->>'boundary' = 'administrative' 
    and tags->>'admin_level'= '8' 
    limit 1
    """)

In [20]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [29]:
with open("questions.txt", "r") as f:
    lines = [line.strip() for line in f]

In [30]:
for i, sent in enumerate(lines):
    print(f"{i}: {sent}")
    doc = nlp(sent)
    for tok in doc:
        print(tok.text, tok.pos_, tok.dep_, tok.ent_type_)
    for np in enumerate(doc.noun_chunks):
        print(f"Noun phrase: {np}")


0: Restaurants in Oakland
Restaurants NOUN ROOT 
in ADP prep 
Oakland PROPN pobj GPE
Noun phrase: (0, Restaurants)
Noun phrase: (1, Oakland)
1: Italian restraunts in Oakland
Italian ADJ amod NORP
restraunts NOUN ROOT 
in ADP prep 
Oakland PROPN pobj GPE
Noun phrase: (0, Italian restraunts)
Noun phrase: (1, Oakland)
2: expensive French restraunts in Oakland
expensive ADJ amod 
French ADJ amod NORP
restraunts NOUN ROOT 
in ADP prep 
Oakland PROPN pobj GPE
Noun phrase: (0, expensive French restraunts)
Noun phrase: (1, Oakland)
3: Small intimate restraunts in Oakland
Small ADJ amod 
intimate ADJ amod 
restraunts NOUN ROOT 
in ADP prep 
Oakland PROPN pobj GPE
Noun phrase: (0, Small intimate restraunts)
Noun phrase: (1, Oakland)
4: Find all hospitals in Oakland, CA.
Find VERB ROOT 
all DET det 
hospitals NOUN dobj 
in ADP prep 
Oakland PROPN pobj GPE
, PUNCT punct 
CA PROPN appos PERSON
. PUNCT punct 
Noun phrase: (0, all hospitals)
Noun phrase: (1, Oakland)
Noun phrase: (2, CA)
5: Give me t

In [31]:
doc = nlp(lines[0])

In [34]:
from spacy import displacy

In [35]:
displacy.render(doc, style="dep", jupyter=True)

In [43]:
# for i, line in enumerate(lines):
#     doc = nlp(line)
#     for tok in doc:
#         if tok.lower_ == "in":
#             print(tok, tok.pos_, tok.dep_)
            

from spacy.matcher import Matcher
pattern = [{"LOWER": "in", "POS": "ADP"}]

matcher = Matcher(vocab=nlp.vocab)
matcher.add("in", [pattern])
matcher(doc, as_spans=True)



[in]

In [46]:
displacy.render(nlp("Find all italian restraunts that are in Oakland"), style="dep", jupyter=True)

In [102]:
from spacy.matcher import DependencyMatcher
pattern = [
  # anchor token: founded
  {
    "RIGHT_ID": "in",
    "RIGHT_ATTRS": {"POS": "ADP", "LOWER": "in"},
  },
  {
    "LEFT_ID": "in",
    "REL_OP": "<<",
    "RIGHT_ID": "trajector",
    "RIGHT_ATTRS": {"POS": {"IN": ["PROPN", "NOUN"]}},
  },
  {
    "LEFT_ID": "in",
    "REL_OP": ">>",
    "RIGHT_ID": "landmark",
    "RIGHT_ATTRS": {"POS": {"IN": ["PROPN", "NOUN"]}}
  },
]
print(doc)
matcher = DependencyMatcher(nlp.vocab)
matcher.add("in", [pattern])
doc = nlp("Find all small italian restraunts that are in Oakland, California")
matches = matcher(doc)
print(matches)
for m in matches:
    match_id, token_ids = m
    for i in range(len(token_ids)):
        print(pattern[i]["RIGHT_ID"] + ":", doc[token_ids[i]].text)

Find all small italian restraunts that are in Oakland
[(3002984154512732771, [7, 4, 8]), (3002984154512732771, [7, 4, 10])]
in: in
trajector: restraunts
landmark: Oakland
in: in
trajector: restraunts
landmark: California


In [104]:
displacy.render(nlp("Pub located in Oakland"), style="dep", jupyter=True)

In [109]:
displacy.render(nlp("Which pubs are located in Oakland"), style="dep", jupyter=True)

In [112]:
displacy.render(nlp("Search for all pubs located in Oakland"), style="dep", jupyter=True)

In [121]:
displacy.render(nlp("Find all pubs located in Oakland and San Francisco but not located in Piedmont"), style="dep", jupyter=True)

In [124]:
displacy.render(nlp("Find all pubs located in Oakland and San Francisco"), style="dep", jupyter=True)


In [125]:
displacy.render(nlp("Find all pubs located in the city of Oakland"))


In [129]:
with open("questions.txt", "r") as f:
    lines = [line.strip() for line in f]


In [130]:
nlp.add_pipe("merge_noun_chunks")

<function spacy.pipeline.functions.merge_noun_chunks(doc: spacy.tokens.doc.Doc) -> spacy.tokens.doc.Doc>

In [134]:
displacy.render(nlp("Find Irish pubs located in the city of Oakland"), style="dep", jupyter=True)

In [137]:
for line in lines:
    displacy.render(nlp(line), style="dep", jupyter=True)

In [149]:
from spacy.symbols import (
    AUX, VERB, NOUN, PROPN,
    agent, attr, aux, auxpass, csubj, csubjpass, dobj, neg, nsubj, nsubjpass, obj, pobj, xcomp,
)
from spacy.tokens import Doc, Span, Token


_NOMINAL_SUBJ_DEPS = {nsubj, nsubjpass}
_CLAUSAL_SUBJ_DEPS = {csubj, csubjpass}
_ACTIVE_SUBJ_DEPS = {csubj, nsubj}
_VERB_MODIFIER_DEPS = {aux, auxpass, neg}


In [331]:
nlp = spacy.load("en_core_web_sm")
from spacy.tokens import Span  # Get the global Span object
    
matcher = DependencyMatcher(nlp.vocab)

matcher.add("main_noun:root", [[
    {
        "RIGHT_ID": "main_noun",
        "RIGHT_ATTRS": {"POS": {"IN": ["PROPN", "NOUN"]}, "DEP": "ROOT"}
    }
]])


matcher.add("main_noun:nsubj", [[
    {
        "RIGHT_ID": "verb",
        "RIGHT_ATTRS": {"POS": "AUX", "dep": "ROOT"}
    },
    {
        "LEFT_ID": "verb",
        "REL_OP": ">",
        "RIGHT_ID": "main_noun",
        "RIGHT_ATTRS": {"DEP": "nsubj"}
    }
]])

# matcher.add("main_noun:nsubjpass", [[
#     {
#         "RIGHT_ID": "verb",
#         "RIGHT_ATTRS": {"POS": "VERB", "dep": "ROOT"}
#     },
#     {
#         "LEFT_ID": "verb",
#         "REL_OP": ">",
#         "RIGHT_ID": "main_noun",
#         "RIGHT_ATTRS": {"DEP": "nsubjpass"}
#     }
# ]])

matcher.add("main_noun:dobj", [[
    {
        "RIGHT_ID": "verb",
        "RIGHT_ATTRS": {"POS": "VERB", "dep": "ROOT"}
    },  
    {
        "LEFT_ID": "verb",
        "REL_OP": ">",
        "RIGHT_ID": "main_noun",
        "RIGHT_ATTRS": {"DEP": "dobj"}
    }    
]])

matcher.add("main_noun:search_for", [[
    {
        "RIGHT_ID": "verb",
        "RIGHT_ATTRS": {"LEMMA": "search", "dep": "ROOT"}
    },  
    {
        "LEFT_ID": "verb",
        "REL_OP": ">",
        "RIGHT_ID": "for",
        "RIGHT_ATTRS": {"DEP": "prep", "LEMMA": "for"}
    },
    {
        "LEFT_ID": "for",
        "REL_OP": ">",
        "RIGHT_ID": "noun",
        "RIGHT_ATTRS": {"DEP": "pobj", "POS": {"IN": ["PROPN", "NOUN"]}}
    }         
]])


def get_noun_chunk(tok):
    for nc in tok.doc.noun_chunks:
        if tok.i >= nc.start and tok.i < nc.end:
            return nc

# def parse_sents(doc):
#     sent = next(doc.sents)
#     root = sent.root
#     if root.pos in (NOUN, PROPN):
#         # print("Noun root", root)
#         return root
#     elif root.pos == VERB:
#         for child in root.children:
#             if child.dep_ == "dobj" and child.pos in {NOUN, PROPN}:
#                 # print("dobj", child)
#                 return child
#         else:
#             print(root)
#     elif root.pos == AUX:
#         for child in root.children:
#             if child.dep_ == "nsubj" and child.pos in {NOUN, PROPN}:
#                 return child
#         else:
#             print(root)
#     else:
#         print(root)


for i, line in enumerate(lines):
    root_noun = None
    doc = nlp(line)
    for sent in doc.sents:
        m = matcher(sent)
        if m:
            for match_id, match_tokens in m:
                if doc.vocab.strings[match_id] == "main_noun:root":
                    # print(nlp.vocab.strings[match_id], [doc[i] for i in match_tokens])
                    root_noun = doc[match_tokens[0]]
                elif nlp.vocab.strings[match_id] in {"main_noun:nsubj", "main_noun:dobj"}:
                    root_noun = doc[match_tokens[1]]
                elif nlp.vocab.strings[match_id] in {"main_noun:search_for"}:
                    root_noun = doc[match_tokens[2]]


Restaurants | ROOT Restaurants
Italian restraunts | ROOT restraunts
expensive French restraunts | ROOT restraunts
Small intimate restraunts | ROOT restraunts
all hospitals | dobj hospitals
the best restaurants | dobj restaurants
the parks | nsubj parks
Which airports | nsubj airports
roads | dobj roads
Which roads | nsubj roads
Pub | ROOT Pub
all pubs | dobj pubs
all pubs | dobj pubs
all pubs | dobj pubs
all pubs | pobj pubs


In [334]:
# located in
# in
# within
for child in root_noun.children:
    loc
        

all det
located acl


In [304]:
doc = nlp("Search for all pubs that are located in Oakland and east of the 580 freeway")
displacy.render(doc, style="dep", jupyter=True)



In [260]:
doc = nlp("Show me all the pubs and restraunts in Oakland")
displacy.render(doc, style="dep", jupyter=True)

In [None]:
patterns = {"located_in": [
    {"LEMMA": "locate", "POS": "VERB"},
    {"LEMMA": "in", "POS": "ADP"},
]}

In [192]:
doc = nlp(lines[0])
displacy.render(doc, style="dep", jupyter=True)

In [1]:
for i, tok in enumerate(next(doc.sents)):
    print(f"{i} {tok} {tok.lemma_} {tok.pos_} {tok.tag_} ({tok.dep_} {tok.head}) {tok.ent_type_}")
    
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)


NameError: name 'doc' is not defined