# Sample addresses from OpenAddresses

In [39]:
# Load the Google Maps API key from a text file and strip surrounding
# whitespace (e.g. a trailing newline). The context manager closes the file
# handle deterministically instead of leaving it to garbage collection.
with open("../../api_keys/google_maps.txt") as key_file:
    GMAPS_API_KEY = key_file.read().strip()

In [40]:
# Format:
# lon,lat,number,street,unit,city,district,region,postcode,id,hash
#
# Each state/country represented by a separate file

import csv
from typing import NamedTuple
from pathlib import Path


class OpenAddrEntry(NamedTuple):
    """One address point with its coordinates and administrative labels.

    Instances are built by read_openaddr_csv: coordinates come from the CSV
    row, while city/district/region fall back to values parsed from the file
    path when the row leaves them empty. The country always comes from the
    file path.
    """
    lat: float  # latitude, from the CSV "LAT" column
    lng: float  # longitude, from the CSV "LON" column
    city: str  # CSV "CITY" value, else the path-derived city (may be None)
    district: str  # CSV "DISTRICT" value, else the path-derived district
    region: str  # CSV "REGION" value, else the path-derived region
    country: str  # first path component below inputs/openaddr/
    entry_hash: str  # CSV "HASH" value; may be empty or None

def parse_openaddr_filename(filename):
    """Derive location labels from a path below ``inputs/openaddr/``.

    The path components are read as country / region / district / city, in
    that order. Only the levels actually present in the path produce a key.

    Args:
        filename: Path located under ``inputs/openaddr/``.

    Returns:
        A dict with any of the keys "city", "district", "region" and
        "country", depending on how deep the path is.

    Raises:
        ValueError: If ``filename`` is not below ``inputs/openaddr/``
            (raised by ``Path.relative_to``).
    """
    segments = Path(filename).relative_to("inputs/openaddr/").parts

    # (key, index into segments, transform). Listed deepest level first so
    # the resulting dict keeps the city -> country insertion order.
    extractors = (
        ("city", 3, lambda part: part),
        # TODO: kind of a hack -- drop the file extension and turn
        # underscores into spaces ("san_diego.csv" -> "san diego").
        ("district", 2, lambda part: Path(part).stem.replace("_", " ")),
        ("region", 1, lambda part: part),
        ("country", 0, lambda part: part),
    )
    return {
        key: transform(segments[position])
        for key, position, transform in extractors
        if len(segments) > position
    }

def read_openaddr_csv(base_fn):
    """Load one OpenAddresses CSV file into a list of OpenAddrEntry tuples.

    Args:
        base_fn: File path relative to ``inputs/openaddr/``,
            e.g. ``"us/ca/san_diego.csv"``.

    Returns:
        A list with one OpenAddrEntry per CSV row. City, district and region
        use the row's value when it is non-empty and otherwise fall back to
        the value parsed from the file path. The country always comes from
        the file path.

    Raises:
        KeyError: If a row has no "LAT" or "LON" column.
        ValueError: If a "LAT"/"LON" value cannot be converted to float.
    """
    # Parse the country, state, and city from the filename
    filename = "inputs/openaddr/" + base_fn
    ambient = parse_openaddr_filename(filename)

    # newline="" lets the csv module do its own newline handling, so quoted
    # fields containing line breaks are parsed correctly.
    with open(filename, "r", newline="") as f:
        reader = csv.DictReader(f)
        return [OpenAddrEntry(
                    float(row["LAT"]),
                    float(row["LON"]),
                    row.get("CITY") or ambient.get("city"),
                    row.get("DISTRICT") or ambient.get("district"),
                    row.get("REGION") or ambient.get("region"),
                    ambient.get("country"),
                    row.get("HASH"),
                ) for row in reader]

In [41]:
# Quick check: a three-level path yields country, region and district, no city
parse_openaddr_filename("inputs/openaddr/us/ca/san_diego.csv")

{'district': 'san diego', 'region': 'ca', 'country': 'us'}

In [54]:
import requests
import pandas as pd
import json

# Transform points to nearest Street View panorama ID
def annotate_streetview(points):
    """Attach the nearest Street View panorama to each address point.

    Queries the Street View metadata endpoint once per point, using the
    point's first two fields (lat, lng) as the location.

    Args:
        points: Iterable of 7-field records laid out like OpenAddrEntry
            (lat, lng, city, district, region, country, entry_hash).

    Returns:
        A DataFrame with one row per input point. The original coordinates
        are kept as ``orig_lat``/``orig_lng``; ``lat``, ``lng`` and
        ``pano_id`` come from the API response (None when the response has
        no such field) and ``status`` is the response's status string.

    Raises:
        KeyError: If a response contains no "status" field.
    """
    # Materialise once: the input is iterated by the request loop and again
    # by the DataFrame constructor, which a one-shot iterator cannot support.
    points = list(points)

    sv_lat = []
    sv_lng = []
    sv_pano = []
    sv_status = []
    for point in points:
        # Call the Street View API to get the nearest panorama
        result = requests.get("https://maps.googleapis.com/maps/api/streetview/metadata?location={},{}&key={}".format(point[0], point[1], GMAPS_API_KEY)).json()
        location = result.get("location", {})
        sv_status.append(result["status"])
        sv_lat.append(location.get("lat"))
        sv_lng.append(location.get("lng"))
        sv_pano.append(result.get("pano_id"))

    # Build the frame from the `points` argument rather than a notebook-level
    # variable, so the rows always line up with the lookups made above.
    df = pd.DataFrame(points, columns=["orig_lat", "orig_lng", "city", "district", "region", "country", "entry_hash"])
    df['lat'] = sv_lat
    df['lng'] = sv_lng
    df['pano_id'] = sv_pano
    df['status'] = sv_status
    return df

# Download panorama images from street view
def streetview_write_point(df_row, output_basename):
    """Download eight Street View images for one panorama, plus metadata.

    For each heading in 0..315 (45-degree steps) an image is requested and
    stored as ``<output_basename>_<heading>.jpg``, next to a JSON file
    holding the row's fields plus the heading, fov and pitch. A heading is
    skipped when its JPEG already exists.

    Args:
        df_row: Row object exposing ``to_dict()``; its dict must contain
            "pano_id".
        output_basename: Path prefix for the output files.

    Returns:
        None. Progress is reported with ``print``.
    """
    for heading in [0, 45, 90, 135, 180, 225, 270, 315]:
        metadata = df_row.to_dict()
        metadata["heading"] = heading
        metadata["fov"] = 45
        # Recorded in the metadata only; the request URL does not send pitch.
        metadata["pitch"] = 0

        url = "https://maps.googleapis.com/maps/api/streetview?size=640x640&pano={pano_id}&fov={fov}&heading={heading}".format(**metadata)
        url += "&key={}".format(GMAPS_API_KEY)
        # TODO: vary the heading, pitch, and FOV

        # Store image & metadata in webdataset format
        # TODO: use the official library?
        jpeg_filename = "{}_{}.jpg".format(output_basename, heading)
        json_filename = "{}_{}.json".format(output_basename, heading)
        if os.path.exists(jpeg_filename):
            print("Skipping", jpeg_filename)
            continue

        print("Downloading", jpeg_filename)
        r = requests.get(url, allow_redirects=True)
        if not r.ok:
            # An HTTP error body is not an image. Writing it would create a
            # bogus .jpg that every later run skips instead of retrying.
            print("Failed", jpeg_filename, "HTTP", r.status_code)
            continue
        with open(jpeg_filename, 'wb') as f:
            f.write(r.content)
        with open(json_filename, 'w') as f:
            json.dump(metadata, f)

## California - Tiny dataset

In [43]:
# County files to draw addresses from: CA Bay Area, Sacramento,
# Los Angeles area and San Diego area
filenames = [
    "us/ca/san_francisco.csv",  # Bay area
    "us/ca/santa_clara.csv",    # Bay area
    "us/ca/san_mateo.csv",      # Bay area
    "us/ca/alameda.csv",        # Bay area
    "us/ca/sacramento.csv",     # Sacramento
    "us/ca/los_angeles.csv",    # Los Angeles
    "us/ca/san_diego.csv",      # San Diego
]

# Pool the entries of every county file into one flat list
all_points = []
for fn in filenames:
    all_points.extend(read_openaddr_csv(fn))

len(all_points)

6089274

In [56]:
# Sample 1000 addresses (without replacement) using a dedicated RNG with a
# fixed seed, so the same all_points list always yields the same sample
import random
r = random.Random(42)
sampled = r.sample(all_points, 1000)
sampled

[OpenAddrEntry(lat=33.3800005, lng=-117.2392053, city='Fallbrook', district='san diego', region='CA', country='us', entry_hash='5ff46c2a403f7164'),
 OpenAddrEntry(lat=37.6333143, lng=-122.4079326, city=None, district='san mateo', region='ca', country='us', entry_hash=''),
 OpenAddrEntry(lat=37.7847249, lng=-122.4832535, city=None, district='san francisco', region='ca', country='us', entry_hash='c97a880bc2f53d79'),
 OpenAddrEntry(lat=34.0185846, lng=-118.1627271, city='Los Angeles', district='los angeles', region='ca', country='us', entry_hash='0b590d1a1c599d7c'),
 OpenAddrEntry(lat=38.6639086, lng=-121.470757, city='SACRAMENTO', district='sacramento', region='ca', country='us', entry_hash='7cd39a5781b39e7d'),
 OpenAddrEntry(lat=38.4517842, lng=-121.4052465, city='ELK GROVE', district='sacramento', region='ca', country='us', entry_hash='5e3f9baea4526042'),
 OpenAddrEntry(lat=37.6651009, lng=-122.0953801, city='HAYWARD', district='alameda', region='ca', country='us', entry_hash='227ec138

In [57]:
# Look up the nearest Street View panorama for every sampled address, then
# keep only the rows whose lookup status is "OK"
# TODO: Split into train/test
annotated = annotate_streetview(sampled)
df = annotated.query('status == "OK"')
df

Unnamed: 0,orig_lat,orig_lng,city,district,region,country,entry_hash,lat,lng,pano_id,status
0,33.380001,-117.239205,Fallbrook,san diego,CA,us,5ff46c2a403f7164,33.380174,-117.239158,LTjkmdrm1Lt3E_x6ao67CA,OK
1,37.633314,-122.407933,,san mateo,ca,us,,37.633230,-122.407711,iS4QqsapOiRWclN8EEvhPw,OK
2,37.784725,-122.483254,,san francisco,ca,us,c97a880bc2f53d79,37.784752,-122.482946,CAoSLEFGMVFpcE5veGdWaGFjRXNOd1NCWndaSE5lUGk2T0...,OK
3,34.018585,-118.162727,Los Angeles,los angeles,ca,us,0b590d1a1c599d7c,34.018503,-118.162541,Hi3Eyi8QxZ3aN_fGjo_vWA,OK
5,38.451784,-121.405247,ELK GROVE,sacramento,ca,us,5e3f9baea4526042,38.451714,-121.405436,JMjXx4cJmsn2abk5LFwqRQ,OK
...,...,...,...,...,...,...,...,...,...,...,...
995,37.256217,-121.837329,SAN JOSE,santa clara,ca,us,457602090c8bc2e5,37.256058,-121.837204,B2uix62bf_b7KiybFN2neQ,OK
996,37.678123,-122.133754,SAN LORENZO,alameda,ca,us,ce62e2966a99f969,37.678111,-122.134039,vCiOkF7q4x7rkNfUGcqBaw,OK
997,37.734176,-122.418610,,san francisco,ca,us,f6c9ff19dabe1224,37.734026,-122.418672,E8KMmql_cfIa0wq9fIVlhQ,OK
998,34.377715,-118.528095,Newhall,los angeles,ca,us,8c7568384c52fb55,34.377777,-118.528336,CAoSLEFGMVFpcE1mSXphTUdoNUMzUXRaMVdMbDBOOFEzRV...,OK


In [58]:
# Rows whose Street View lookup returned any status other than "OK"
annotated.query('status != "OK"')

Unnamed: 0,orig_lat,orig_lng,city,district,region,country,entry_hash,lat,lng,pano_id,status
4,38.663909,-121.470757,SACRAMENTO,sacramento,ca,us,7cd39a5781b39e7d,,,,ZERO_RESULTS
8,32.686165,-116.947294,CHULA VISTA,san diego,CA,us,c7e03f3e48ac7938,,,,ZERO_RESULTS
17,38.400194,-121.339595,ELK GROVE,sacramento,ca,us,f39b83215ab30aef,,,,ZERO_RESULTS
23,32.633313,-116.474016,CAMPO,san diego,CA,us,c4366231c12d5fba,,,,ZERO_RESULTS
24,32.951934,-117.023770,Poway,san diego,CA,us,82191c60825925bc,,,,ZERO_RESULTS
...,...,...,...,...,...,...,...,...,...,...,...
926,33.191045,-117.233456,VISTA,san diego,CA,us,319204838228f93c,,,,ZERO_RESULTS
968,32.736136,-116.827665,Jamul,san diego,CA,us,f1cc0336dd471a3d,,,,ZERO_RESULTS
970,32.930265,-116.883242,Lakeside,san diego,CA,us,cd9438fee59e328e,,,,ZERO_RESULTS
976,33.900155,-118.297361,Gardena,los angeles,ca,us,d3b6a0ef37906d38,,,,ZERO_RESULTS


In [60]:
# Write images and metadata JSONs to disk: one set of files per row of df,
# with the row's index label embedded in the file name prefix
import os
os.makedirs("outputs/california_tiny", exist_ok=True)
for index, row in df.iterrows():
    streetview_write_point(row, "outputs/california_tiny/loc_{}".format(index))

Downloading outputs/california_tiny/loc_0_0.jpg
Skipping outputs/california_tiny/loc_0_45.jpg
Skipping outputs/california_tiny/loc_0_90.jpg
Skipping outputs/california_tiny/loc_0_135.jpg
Skipping outputs/california_tiny/loc_0_180.jpg
Skipping outputs/california_tiny/loc_0_225.jpg
Skipping outputs/california_tiny/loc_0_270.jpg
Skipping outputs/california_tiny/loc_0_315.jpg
Skipping outputs/california_tiny/loc_1_0.jpg
Skipping outputs/california_tiny/loc_1_45.jpg
Skipping outputs/california_tiny/loc_1_90.jpg
Skipping outputs/california_tiny/loc_1_135.jpg
Skipping outputs/california_tiny/loc_1_180.jpg
Skipping outputs/california_tiny/loc_1_225.jpg
Skipping outputs/california_tiny/loc_1_270.jpg
Skipping outputs/california_tiny/loc_1_315.jpg
Skipping outputs/california_tiny/loc_2_0.jpg
Skipping outputs/california_tiny/loc_2_45.jpg
Skipping outputs/california_tiny/loc_2_90.jpg
Skipping outputs/california_tiny/loc_2_135.jpg
Skipping outputs/california_tiny/loc_2_180.jpg
Skipping outputs/calif

In [59]:
# Plot points on map using Folium
import folium

# Start the map at a hard-coded location and zoom level
map_ca = folium.Map(location=[37.4301922691736, -122.16943741588071], zoom_start=10)
# One red circle per row of df, placed at the row's "lat"/"lng" values
for index, row in df.iterrows():
    folium.CircleMarker((row["lat"], row["lng"]), radius=5, color="red").add_to(map_ca)
map_ca