In [1]:
%pip install overpass pyahocorasick

Collecting overpass
  Downloading overpass-0.7.2-py3-none-any.whl.metadata (6.1 kB)
Collecting osm2geojson<0.3.0,>=0.2.5 (from overpass)
  Downloading osm2geojson-0.2.5.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading overpass-0.7.2-py3-none-any.whl (13 kB)
Building wheels for collected packages: osm2geojson
  Building wheel for osm2geojson (setup.py) ... [?25l[?25hdone
  Created wheel for osm2geojson: filename=osm2geojson-0.2.5-py3-none-any.whl size=13950 sha256=d7cf5493e52a55a4f9b0ad2c52a64c95e11fef6cc67ed979eccd0b74f8cc5e85
  Stored in directory: /root/.cache/pip/wheels/e8/4e/f3/3183652838130e7ba8ede546f048bb546f1d6ec371da6f0cbe
Successfully built osm2geojson
Installing collected packages: osm2geojson, overpass
Successfully installed osm2geojson-0.2.5 overpass-0.7.2


In [57]:
import shutil
import subprocess
import os
import time
import json

import re
import pandas as pd
import geopandas as gpd
import numpy as np
from tqdm import tqdm
import overpass
from ahocorasick import Automaton


import spacy
from spacy.tokens import DocBin

In [3]:
COLAB = True

In [56]:
DATA_DIR = "gdrive/MyDrive/Work/quantify-news/data/" if COLAB else "data/"
PROJECT_DIR = "gdrive/MyDrive/Work/quantify-news/" if COLAB else "./"
TMP_DIR = "/content/" if COLAB else "./"

DATA_TRAIN_BIN_PATH = DATA_DIR + "ner_train.spacy"
DATA_DEV_BIN_PATH = DATA_DIR + "ner_dev.spacy"
DATA_TEST_BIN_PATH = DATA_DIR + "ner_test.spacy"

BEST_MODEL_OPT_PATH = PROJECT_DIR + "models/model-best/"

PARKS_PATH = DATA_DIR + "Parks_2025.csv"
COMM_AREA_PATH = DATA_DIR + "CommAreas_2025.csv"
STREET_NAMES_PATH = DATA_DIR + "StreetNames_2025.csv"
STREET_SEGMENTS_PATH = DATA_DIR + "StreetSegments_2025.geojson"
NEIGHBORHOODS_PATH = DATA_DIR + "Neighborhoods_2025.csv"

In [5]:
if COLAB:
    from google.colab import drive
    drive.mount('/content/gdrive')

    import locale
    print(locale.getpreferredencoding())
    def getpreferredencoding(do_setlocale=True):
        return 'UTF-8'
    locale.getpreferredencoding = getpreferredencoding

Mounted at /content/gdrive
UTF-8


In [6]:
nlp = spacy.load(BEST_MODEL_OPT_PATH)

# Comm Areas

In [7]:
comm_areas = pd.read_csv(COMM_AREA_PATH)

# Streets

In [8]:
street_names = pd.read_csv(STREET_NAMES_PATH)
street_names.columns = [c.strip() for c in street_names.columns]
street_names['street_partial'] = street_names['Street'] + " " + street_names['Suffix']

# Intersections

In [65]:
segments = gpd.read_file(STREET_SEGMENTS_PATH)
segment_names = segments['pre_dir'] + " " + segments['street_nam']
segment_names_full = segments['pre_dir'] + " " + segments['street_nam'] + " " + segments['street_typ']
cross_streets_to = segments['t_cross'].str.replace('|',' ', regex=False).str.lstrip('1234567890').str.replace(r'\s+', ' ',regex=True).str.strip()
cross_streets_from = segments['f_cross'].str.replace('|',' ', regex=False).str.lstrip('1234567890').str.replace(r'\s+', ' ',regex=True).str.strip()


In [69]:
invalid_cross_from = cross_streets_from.str.count(' ') < 2
invalid_cross_to = cross_streets_to.str.count(' ') < 2

In [79]:
def enumerate_cross_streets(segments, crosses, mask):
    return pd.concat([
        (segments + " and " + crosses)[~mask],
        (segments + " & " + crosses)[~mask],
        (crosses + " and " + segments)[~mask],
        (crosses + " & " + segments)[~mask],
    ])
intersections = pd.concat([
    enumerate_cross_streets(segment_names, cross_streets_to, invalid_cross_to),
    enumerate_cross_streets(segment_names, cross_streets_from, invalid_cross_from),
    enumerate_cross_streets(segment_names_full, cross_streets_to, invalid_cross_to),
    enumerate_cross_streets(segment_names_full, cross_streets_from, invalid_cross_from),
    ]).drop_duplicates().dropna().rename('intersection')

# Neighborhoods

In [9]:
neighborhoods = pd.read_csv(NEIGHBORHOODS_PATH)

In [10]:
neighborhood_names = pd.concat([neighborhoods['PRI_NEIGH'], neighborhoods['SEC_NEIGH']]).str.title().drop_duplicates().rename('name')

# Sides

In [11]:
sides = ['West Side', 'South Side', 'North Side', 'Northwest Side', 'Southwest Side']

# Hospitals

In [43]:
hospital_query = """
(
  node["amenity"="hospital"](41.6445,-87.9401,42.0230,-87.5240);
  way["amenity"="hospital"](41.6445,-87.9401,42.0230,-87.5240);
  relation["amenity"="hospital"](41.6445,-87.9401,42.0230,-87.5240);
);
out center;
"""
landmark_query = """
(
  node["building"]["name"](41.6445,-87.9401,42.0230,-87.5240);
  way["building"]["name"](41.6445,-87.9401,42.0230,-87.5240);
  relation["building"]["name"](41.6445,-87.9401,42.0230,-87.5240);
);
out center;
"""
parks_query = """
(
  way["leisure"="park"](41.6445,-87.9401,42.0230,-87.5240);
  relation["leisure"="park"](41.6445,-87.9401,42.0230,-87.5240);
);
out geom;
"""

In [44]:
import overpass
from shapely.geometry import shape
import geopandas as gpd

def query_overpass(query):
    api = overpass.API()
    response = api.get(query)
    result = pd.DataFrame([
        dict(the_geom=feature['geometry'],
            street=feature['properties']['tags'].get('addr:street',None),
            housenumber=feature['properties']['tags'].get('addr:housenumber',None),
            name=feature['properties']['tags'].get('name',None))
        for feature in response['features']
    ])
    result['geometry'] = result['the_geom'].apply(shape)
    result['street'] = result['street'].str.strip()
    result['name'] = result['name'].str.strip()
    result = gpd.GeoDataFrame(result, geometry='geometry').drop(columns=['the_geom'])
    return result

hospitals = query_overpass(hospital_query)
landmarks = query_overpass(landmark_query)
parks = query_overpass(parks_query)

In [48]:
non_landmarks = landmarks['name'].isna() | (landmarks['name'].str.len() <= 2) | landmarks['name'].duplicated(keep=False)
non_hospitals = hospitals['name'].isna()
non_parks = parks['name'].isna()

# Label text

In [30]:
gold_docs = list(DocBin().from_disk(DATA_TRAIN_BIN_PATH).get_docs(nlp.vocab))

In [31]:
df = pd.DataFrame([e.text for d in gold_docs for e in d.ents],columns=['text'])

In [100]:
def contains_substr(texts, keywords):
    pattern = "|".join(rf"{re.escape(keyword)}" for keyword in keywords)
    pattern = re.compile(f"({pattern})", flags=re.IGNORECASE)
    return texts.str.contains(pattern, regex=True)
def contains_words(texts, keywords):
    pattern = "|".join(rf"\b{re.escape(keyword)}\b" for keyword in keywords)
    pattern = re.compile(f"({pattern})", flags=re.IGNORECASE)
    return texts.str.contains(pattern, regex=True)
def contains_fast(texts, keywords):
    texts = texts.str.upper()
    keywords = keywords.str.upper()
    automaton = Automaton()
    for keyword in keywords:
        automaton.add_word(keyword, keyword)
    automaton.make_automaton()

    def contains_match(text):
        return any(automaton.iter(text))
        # TODO: make sure word boundary check is working
        text_lower = text.lower()
        for end_idx, keyword in automaton.iter(text_lower):
            start_idx = end_idx - len(keyword) + 1
            if is_word_boundary(text_lower, start_idx, end_idx + 1):
                return True  # Stop early if a valid match is found
        return False

    return texts.apply(contains_match)  # Much faster than regex for large keyword lists


In [80]:
# df['is_block'] = df['text'].str.contains(r'\d+ block of [A-Za-z0-9]+')
# df['is_community'] = contains_words(df['text'], comm_areas['COMMUNITY'])
# df['is_street_full'] = contains_words(df['text'], street_names['Full Street Name'])
# df['is_street_name'] = contains_words(df['text'], street_names['Street'])
df['is_intersection'] = contains_substr(df['text'], intersections)
# df['is_street_partial'] = contains_substr(df['text'], street_names['street_partial'])
# df['is_neighborhood'] = contains_words(df['text'], neighborhood_names)
# df['is_side'] = contains_words(df['text'], sides)
# df['is_hospital'] = contains_words(df['text'], hospitals[~non_hospitals]['name'])
# df['is_landmark'] = contains_words(df['text'], landmarks[~non_landmarks]['name'])
# df['is_park'] = contains_words(df['text'], parks[~non_parks]['name'])
df['unmatched'] = ~df.filter(like='is_').any(axis=1)

KeyboardInterrupt: 

In [50]:
marginals = pd.concat([
    (~df.filter(like='is_').drop(columns=[c]).any(axis=1) & df[c]).rename(c)
    for c in df.filter(like='is_').columns], axis=1)

In [54]:
marginals.mean().sort_values()

Unnamed: 0,0
is_street_full,0.0
is_community,0.000121
is_hospital,0.002186
is_park,0.0051
is_landmark,0.021979
is_neighborhood,0.024165
is_street_partial,0.07031
is_side,0.078081
is_block,0.186764


In [55]:
df[df['unmatched']]

Unnamed: 0,text,is_block,is_community,is_street_full,is_street_partial,is_neighborhood,is_side,is_hospital,is_landmark,is_park,unmatched
0,Chicago,False,False,False,False,False,False,False,False,False,True
4,Lawndale neighborhood,False,False,False,False,False,False,False,False,False,True
8,Chicago,False,False,False,False,False,False,False,False,False,True
9,Chicago,False,False,False,False,False,False,False,False,False,True
11,Chicago,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...
8218,Chicago’s,False,False,False,False,False,False,False,False,False,True
8225,Chicago,False,False,False,False,False,False,False,False,False,True
8230,Chicago,False,False,False,False,False,False,False,False,False,True
8233,76th and Essex.,False,False,False,False,False,False,False,False,False,True
