In [None]:
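# Input format (CSV header row; the column names are read case-sensitively
# as uppercase keys by the code below):
# LON,LAT,NUMBER,STREET,UNIT,CITY,DISTRICT,REGION,POSTCODE,ID,HASH
#
# Each state/country is represented by a separate file.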

import csv
from typing import NamedTuple
from pathlib import Path
import pandas as pd


class OpenAddrEntry(NamedTuple):
    """A single address point together with its location labels.

    Instances are created positionally by ``read_openaddr_csv``, so the
    field order below is part of the contract. The annotations say ``str``,
    but as built by ``read_openaddr_csv`` every string field can be ``None``
    when neither the CSV row nor the file path supplies a value.
    """

    lat: float  # parsed from the row's "LAT" column
    lng: float  # parsed from the row's "LON" column
    city: str  # row "CITY", else the city derived from the file path
    district: str  # row "DISTRICT", else the district derived from the file path
    region: str  # row "REGION", else the region derived from the file path
    country: str  # always derived from the file path, never from the row
    entry_hash: str  # row "HASH" (None if the column is absent)

def parse_openaddr_filename(filename):
    """Derive location labels from a path below ``inputs/openaddr/``.

    The path components after ``inputs/openaddr/`` are interpreted, in
    order, as country, region, district and city. Only the levels that are
    actually present appear in the result, e.g.
    ``inputs/openaddr/us/ca/san_diego.csv`` yields
    ``{"district": "san diego", "region": "ca", "country": "us"}``.

    Args:
        filename: Path (str or path-like) located under ``inputs/openaddr/``.

    Returns:
        A dict with some of the keys ``"city"``, ``"district"``,
        ``"region"`` and ``"country"``.

    Raises:
        ValueError: If ``filename`` is not inside ``inputs/openaddr/``
            (raised by ``Path.relative_to``).
    """
    parts = list(Path(filename).relative_to("inputs/openaddr/").parts)
    if parts:
        # Only the last component is a file name. Strip its extension here,
        # whatever depth the file sits at, so "us/ca.csv" gives region "ca"
        # rather than "ca.csv". Directory components are left untouched.
        parts[-1] = Path(parts[-1]).stem

    result = {}
    if len(parts) >= 4:
        result["city"] = parts[3]
    if len(parts) >= 3:
        # District names use underscores in place of spaces on disk.
        result["district"] = parts[2].replace("_", " ")
    if len(parts) >= 2:
        result["region"] = parts[1]
    if len(parts) >= 1:
        result["country"] = parts[0]
    return result

def read_openaddr_csv(base_fn):
    """Read one address CSV and return its rows as ``OpenAddrEntry`` tuples.

    City, district and region are taken from the row when the column is
    present and non-empty; otherwise they fall back to the values derived
    from the file's path. The country always comes from the path.

    Args:
        base_fn: File path relative to ``inputs/openaddr/``,
            e.g. ``"us/ca/san_diego.csv"``.

    Returns:
        A list with one ``OpenAddrEntry`` per data row, in file order.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If a row lacks the ``LAT`` or ``LON`` column.
        ValueError: If a ``LAT``/``LON`` value cannot be parsed as a float
            (this includes empty strings).
    """
    filename = "inputs/openaddr/" + base_fn
    # Labels implied by the file's location; used as per-row fallbacks.
    ambient = parse_openaddr_filename(filename)

    # newline="" is what the csv module requires so that line breaks inside
    # quoted fields are parsed correctly. The encoding is pinned so decoding
    # does not depend on the machine's locale.
    # NOTE(review): assumes the input files are UTF-8 -- confirm.
    with open(filename, "r", newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        return [
            OpenAddrEntry(
                lat=float(row["LAT"]),
                lng=float(row["LON"]),
                # `or` treats an empty cell the same as a missing column.
                city=row.get("CITY") or ambient.get("city"),
                district=row.get("DISTRICT") or ambient.get("district"),
                region=row.get("REGION") or ambient.get("region"),
                country=ambient.get("country"),
                entry_hash=row.get("HASH"),
            )
            for row in reader
        ]

In [None]:
# Sanity check of the path parser on a district-level file.
# Expected: {'district': 'san diego', 'region': 'ca', 'country': 'us'}
parse_openaddr_filename("inputs/openaddr/us/ca/san_diego.csv")

In [None]:
def points_to_df(points):
    """Build a DataFrame from a sequence of 7-field point records.

    Each record is mapped positionally onto the columns listed below; the
    first two columns are named ``orig_lat`` and ``orig_lng``.
    """
    column_names = [
        "orig_lat",
        "orig_lng",
        "city",
        "district",
        "region",
        "country",
        "entry_hash",
    ]
    return pd.DataFrame(points, columns=column_names)