In [None]:
# Install the third-party packages imported below as `overpass` and `ahocorasick`.
%pip install overpass pyahocorasick

In [None]:
import shutil
import subprocess
import os
import time
import json

import re
import pandas as pd
import geopandas as gpd
import numpy as np
from tqdm import tqdm
import overpass
from ahocorasick import Automaton


import spacy
from spacy.tokens import DocBin

In [None]:
# Environment switch: when True the notebook uses the Google Drive paths and
# mounts Drive; set to False to use the local relative paths instead.
COLAB = True

In [None]:
# Base directories: Drive-backed locations in Colab, relative paths otherwise.
if COLAB:
    PROJECT_DIR = "gdrive/MyDrive/Work/quantify-news/"
    DATA_DIR = "gdrive/MyDrive/Work/quantify-news/data/"
    TMP_DIR = "/content/"
else:
    PROJECT_DIR = "./"
    DATA_DIR = "data/"
    TMP_DIR = "./"

# Serialized NER splits (the train file is read with spaCy's DocBin later on).
DATA_TRAIN_BIN_PATH = f"{DATA_DIR}ner_train.spacy"
DATA_DEV_BIN_PATH = f"{DATA_DIR}ner_dev.spacy"
DATA_TEST_BIN_PATH = f"{DATA_DIR}ner_test.spacy"

# Directory handed to spacy.load() to obtain the pipeline.
BEST_MODEL_OPT_PATH = f"{PROJECT_DIR}models/model-best/"

# Reference datasets, identified here only by file name.
PARKS_PATH = f"{DATA_DIR}Parks_2025.csv"
COMM_AREA_PATH = f"{DATA_DIR}CommAreas_2025.csv"
STREET_NAMES_PATH = f"{DATA_DIR}StreetNames_2025.csv"
STREET_SEGMENTS_PATH = f"{DATA_DIR}StreetSegments_2025.geojson"
NEIGHBORHOODS_PATH = f"{DATA_DIR}Neighborhoods_2025.csv"

In [None]:
if COLAB:
    # Mount Google Drive at /content/gdrive.
    # NOTE(review): the Colab paths in the config cell are relative
    # ("gdrive/..."), so they presumably rely on the working directory being
    # /content — confirm.
    from google.colab import drive
    drive.mount('/content/gdrive')

    # Print the encoding reported before the override, then replace
    # locale.getpreferredencoding with a stub that always returns 'UTF-8'.
    # NOTE(review): presumably a workaround for tooling that insists on a
    # UTF-8 locale inside Colab — confirm it is still needed.
    import locale
    print(locale.getpreferredencoding())
    def getpreferredencoding(do_setlocale=True):
        return 'UTF-8'
    locale.getpreferredencoding = getpreferredencoding

In [None]:
# Load the spaCy pipeline stored at BEST_MODEL_OPT_PATH; its vocab is needed
# to deserialize the DocBin documents in the next section.
nlp = spacy.load(BEST_MODEL_OPT_PATH)

# Label text

In [None]:
# Read the DocBin at DATA_TRAIN_BIN_PATH and materialize its docs as a list,
# using the loaded pipeline's vocab.
gold_docs = list(DocBin().from_disk(DATA_TRAIN_BIN_PATH).get_docs(nlp.vocab))

In [None]:
# One row per entity span in the gold docs, holding that span's text.
df = pd.DataFrame(
    [ent.text for doc in gold_docs for ent in doc.ents],
    columns=['text'],
)

In [None]:
import string
def contains_substr(texts, keywords):
    """Flag texts that contain any keyword as a case-insensitive substring.

    Args:
        texts: pandas Series of strings to search.
        keywords: iterable of literal strings; regex metacharacters are
            escaped. Missing values (None/NaN) and empty strings are ignored.

    Returns:
        Boolean Series aligned with ``texts``. Missing texts yield False, and
        an empty keyword collection yields all False.
    """
    # An empty alternative would match every text and NaN cannot be escaped,
    # so both are dropped before the pattern is built.
    alternatives = [
        re.escape(keyword)
        for keyword in keywords
        if not pd.isna(keyword) and keyword != ""
    ]
    if not alternatives:
        return pd.Series(False, index=texts.index, name=texts.name, dtype=bool)
    # No capture group: pandas warns when a `contains` pattern has groups.
    pattern = re.compile("|".join(alternatives), flags=re.IGNORECASE)
    return texts.str.contains(pattern, regex=True, na=False)
def contains_words(texts, keywords):
    """Flag texts that contain any keyword as a whole word, ignoring case.

    A keyword counts as a whole word when it is not directly preceded or
    followed by a word character (``\\w``).

    Args:
        texts: pandas Series of strings to search.
        keywords: iterable of literal strings; regex metacharacters are
            escaped. Missing values (None/NaN) and empty strings are ignored.

    Returns:
        Boolean Series aligned with ``texts``. Missing texts yield False, and
        an empty keyword collection yields all False.
    """
    # An empty alternative would match every text and NaN cannot be escaped,
    # so both are dropped before the pattern is built.
    alternatives = [
        re.escape(keyword)
        for keyword in keywords
        if not pd.isna(keyword) and keyword != ""
    ]
    if not alternatives:
        return pd.Series(False, index=texts.index, name=texts.name, dtype=bool)
    # Lookarounds instead of \b: \b next to a keyword edge that is itself a
    # non-word character (e.g. the final "." of "U.S.") demands an adjacent
    # word character, so such keywords could never match as standalone words.
    # For keywords that start and end with word characters this is identical.
    pattern = re.compile(
        r"(?<!\w)(?:" + "|".join(alternatives) + r")(?!\w)",
        flags=re.IGNORECASE,
    )
    return texts.str.contains(pattern, regex=True, na=False)

def contains_fast(texts, keywords, whole_word=True):
    """Flag texts that contain any keyword, using an Aho-Corasick automaton.

    Matching is case-insensitive: texts and keywords are both upper-cased
    before the search.

    Args:
        texts: pandas Series of strings to search.
        keywords: iterable of strings (a Series, list, set, ...). Missing
            values (None/NaN) and empty strings are ignored.
        whole_word: when True, a match only counts if it is delimited on both
            sides by the start/end of the text or by a character from
            ``string.whitespace`` / ``string.punctuation``.

    Returns:
        Boolean Series aligned with ``texts``. Missing texts yield False, and
        an empty keyword collection yields all False.
    """
    needles = [
        keyword.upper()
        for keyword in keywords
        if not pd.isna(keyword) and keyword != ""
    ]
    if not needles:
        # An automaton with no words is never finalized and cannot be
        # searched; with nothing to look for, no text can match.
        return pd.Series(False, index=texts.index, name=texts.name, dtype=bool)

    word_boundaries = set(string.whitespace + string.punctuation)

    automaton = Automaton()
    for needle in needles:
        automaton.add_word(needle, needle)
    automaton.make_automaton()

    def is_word_boundary(text, start, end):
        """Ensures the match text[start:end] is enclosed by word boundaries"""
        before = start == 0 or text[start - 1] in word_boundaries
        after = end == len(text) or text[end] in word_boundaries
        return before and after

    def contains_match(text):
        if not isinstance(text, str):
            return False  # Missing text (None/NaN) cannot contain a keyword
        if not whole_word:
            return any(automaton.iter(text))
        for end_idx, keyword in automaton.iter(text):
            # iter() reports the index of the match's last character.
            start_idx = end_idx - len(keyword) + 1
            if is_word_boundary(text, start_idx, end_idx + 1):
                return True  # Stop early if a valid match is found
        return False

    # All keywords are searched in a single pass over each text.
    return texts.str.upper().apply(contains_match)


In [None]:
# Flag each entity text by the kind of location reference it contains. Every
# flag column is prefixed `is_` so the columns can be selected together below.
# NOTE(review): `intersections` is not defined in any cell above this one, so
# this cell raises NameError on Restart & Run All unless it is defined elsewhere.
# NOTE(review): the commented-out flags reference names (comm_areas,
# street_names, neighborhood_names, sides, hospitals, landmarks, parks) that
# are likewise not defined above — restore them with their loaders or delete.
# df['is_block'] = df['text'].str.contains(r'\d+ block of [A-Za-z0-9]+')
# df['is_community'] = contains_words(df['text'], comm_areas['COMMUNITY'])
# df['is_street_full'] = contains_words(df['text'], street_names['Full Street Name'])
# df['is_street_name'] = contains_words(df['text'], street_names['Street'])
df['is_intersection'] = contains_substr(df['text'], intersections)
# df['is_street_partial'] = contains_substr(df['text'], street_names['street_partial'])
# df['is_neighborhood'] = contains_words(df['text'], neighborhood_names)
# df['is_side'] = contains_words(df['text'], sides)
# df['is_hospital'] = contains_words(df['text'], hospitals[~non_hospitals]['name'])
# df['is_landmark'] = contains_words(df['text'], landmarks[~non_landmarks]['name'])
# df['is_park'] = contains_words(df['text'], parks[~non_parks]['name'])
# True for rows where none of the `is_` flag columns is set.
df['unmatched'] = ~df.filter(like='is_').any(axis=1)

In [None]:
# For every flag, mark the rows where it is the only `is_` flag that is set.
is_flags = df.filter(like='is_')
marginals = pd.concat(
    [
        (~is_flags.drop(columns=[flag]).any(axis=1) & df[flag]).rename(flag)
        for flag in is_flags.columns
    ],
    axis=1,
)

In [None]:
# Share of entity texts matched by exactly one flag and no other, smallest first.
marginals.mean().sort_values()

In [None]:
# Entity texts that none of the flags matched.
df[df['unmatched']]