# Create H3 Hexagons for the Data and Pull OSM Tags for Every Hexagon


In [None]:
# !pip install alphashape
# !pip install openpyxl

In [1]:
import sys
import pathlib
import os

sys.path.insert(0, f"{os.environ['HOME']}/Development/green-last-mile/hex2vec")
sys.path.insert(0, f"{os.environ['HOME']}/Development/green-last-mile/amazon-routing-challenge")
# sys.path.insert(0, f"/gcsmount-notebook/codebase/urban_tools")


import json
import alphashape
import h3
import pandas as pd
import geopandas as gpd
# import dtale as dt
# import arc_tools
# import hex_tools as ht

from pathlib import Path
# from almrcc_tools.notebook_setup import GLMFileHandler, MapboxPlot, ORSClient
from src.utils.tesselate_amazon_data import cover_point_array_w_hex, pull_tags_for_hex_gdf
from src.data.utils import TOP_LEVEL_OSM_TAGS

In [2]:
# Directory with the challenge's model-build inputs, resolved under $HOME.
data_path = Path(f"{os.environ['HOME']}/Development/green-last-mile/amazon-routing-challenge/data/almrrc2021-data-training/model_build_inputs")


## Open the Route File


In [None]:
# route_df = arc_tools.get_route_dataframe((data_path / "route_data.json").absolute())
# Load the pre-merged route frame from the shared `data_path` instead of a
# user-specific absolute path, so the notebook runs for any $HOME.
# NOTE: pd.read_pickle can execute arbitrary code stored in the file; only load
# pickles that were produced by this project.
route_df = pd.read_pickle(data_path / "route_df_merged_augmented.pkl")

## Find the Bounding Box of Boston

In [None]:
# Keep only rows whose station code contains "BO", then attach point geometry
# built from the lng/lat columns (WGS84).
is_boston_station = route_df.station_code.str.contains("BO")
route_df = route_df.loc[is_boston_station].reset_index()
delivery_points = gpd.points_from_xy(route_df.lng, route_df.lat)
route_df = gpd.GeoDataFrame(route_df, geometry=delivery_points, crs="EPSG:4326")


In [None]:
# Distinct point geometries among the filtered rows (duplicates collapsed).
unique_lat_lon = route_df["geometry"].unique()

### Cover the Area of Boston Deliveries with H3 - level 5 hexagons

Use level 6 when running on the cluster, because it can't handle level 5.

In [None]:
# H3 resolution passed to geo_to_h3, polyfill, edge_length and cover_point_array_w_hex below.
LEVEL = 5

In [None]:
def geometry_series_to_xy(geometry_series, epgs=32633):
    """Reproject point geometries and return their coordinates as (x, y) tuples.

    Args:
        geometry_series: Object exposing ``to_crs(epsg=...)`` whose result
            provides ``copy()``, ``x`` and ``y`` (e.g. a GeoSeries of points).
        epgs: EPSG code of the target CRS (the parameter name is kept as-is for
            callers). The default, 32633, is WGS 84 / UTM zone 33N.

    Returns:
        A list of ``(x, y)`` tuples in the target CRS, in input order.
    """
    projected = geometry_series.to_crs(epsg=epgs).copy()
    xs, ys = projected.x, projected.y
    return list(zip(xs, ys))


In [None]:
# Distinct H3 cell ids (at LEVEL) for the delivery points. The points were built
# from (lng, lat), so x is longitude and y is latitude; geo_to_h3 is called with (y, x).
route_df["geometry"].apply(lambda x: h3.geo_to_h3(x.y, x.x, LEVEL)).unique()

In [None]:
from shapely.geometry import mapping

# Boston lies in UTM zone 19N (EPSG:32619). Working in the matching zone keeps
# the metre-based buffer below close to true ground distance; zone 33N is
# centred on 15°E and is heavily distorted at Boston's longitude.
UTM_EPSG = 32619

xy = geometry_series_to_xy(unique_lat_lon, epgs=UTM_EPSG)
print("computing alpha shape, this may take a while...")
res = alphashape.alphashape(xy)
# Grow the shape by two hexagon edge lengths (in metres) before covering it.
new_res = res.buffer(2 * h3.edge_length(resolution=LEVEL, unit="m"))
convex_hull_df = gpd.GeoDataFrame(
    geometry=gpd.GeoSeries(new_res),
    crs=f"EPSG:{UTM_EPSG}",
)
convex_hull_df = convex_hull_df.to_crs(epsg=4326)
geojson = mapping(convex_hull_df)

# The frame has a single row, so only the first feature is used. It is read
# directly instead of via a `for ... break` loop that shadowed its own iterable.
feature = geojson["features"][0]
geom = feature["geometry"]
# reverse coordinates in geojson: (lng, lat) pairs become (lat, lng) before
# they are handed to h3.polyfill
geom["coordinates"] = [[j[::-1] for j in i] for i in geom["coordinates"]]
hexes = list(h3.polyfill(geom, LEVEL))


In [None]:
# Show the alpha shape as computed, before buffering.
res

In [None]:
# Show the buffered alpha shape.
new_res

In [None]:
# Cover the unique delivery points with hexagons at resolution LEVEL using the project helper.
# NOTE(review): later cells rely on the result having an 'h3' column and a geometry — confirm against the helper.
hex_gdf = cover_point_array_w_hex(unique_lat_lon, LEVEL)

In [None]:
# Preview the first rows of the hexagon frame.
hex_gdf.head()

In [None]:
# Temporary constant column used as the plotted value for every hexagon.
hex_gdf['tmp'] = 1
# could cast to UTM crs here but meh, doesn't need to be that accurate
center = hex_gdf.centroid.y.mean(), hex_gdf.centroid.x.mean()
# NOTE(review): `ht` is not defined in the visible code — `import hex_tools as ht`
# is commented out in the import cell; restore it before running this cell.
fig = ht.plot_hexagons(hex_gdf, center, value_field="tmp", geometry_field='geometry', show=False )
# Remove the helper column again so hex_gdf is left unchanged.
hex_gdf.drop('tmp', axis=1, inplace=True)
fig.show()

In [None]:
# hex_gdf.head()

## Pull the Tags Asynchronously (speeds up dramatically)

In [None]:
# Directory passed to the tag-pulling helpers and read back from below; created if missing.
data_dir = pathlib.Path("/gcsmount-research-data-staging/osmnx-cities/hexed-raw/Boston, MA")
data_dir.mkdir(exist_ok=True, parents=True)

In [None]:
# _ = await pull_tags_for_hex_gdf(data_dir, hex_gdf, TOP_LEVEL_OSM_TAGS, LEVEL)

### Investigating the Raw Data

In [None]:
# (lat, lng) of a reference point in Boston, and the id of the H3 cell at LEVEL containing it.
Boston_Center = 42.34950116575089, -71.12349065689138
center_h3 = h3.geo_to_h3(*Boston_Center, LEVEL)
center_h3

#### Pull complete Building DF

In [4]:
from src.utils.tesselate_amazon_data import pull_tags_for_hex
from src.data.make_dataset import h3_to_polygon

In [5]:
# Hand-picked H3 cell ids whose tags are pulled in the next cell.
hexes = ["862a30667ffffff", "862a3066fffffff", "862a3074fffffff", "862a33927ffffff", "862a33937ffffff"]

In [6]:
from collections import namedtuple
import asyncio

row = namedtuple("row", ["h3", "geometry"])
await asyncio.gather(*(pull_tags_for_hex(row(target_hex, h3_to_polygon(target_hex))._asdict(), Path(f"{os.environ['HOME']}/Development/green-last-mile"), ['natural']) for target_hex in hexes))

running for hex 862a30667ffffff
running for hex 862a3066fffffff
natural already exists
running for hex 862a3074fffffff
running for hex 862a33927ffffff
running for hex 862a33937ffffff


  return GeometryArray(vectorized.from_shapely(data), crs=crs)
  aout[:] = out


[None, None, None, None, None]

In [None]:
# Temporarily index by H3 id so the centre hexagon's row can be selected with .loc;
# drop=False keeps the 'h3' column in the frame as well.
hex_gdf.set_index('h3', drop=False, inplace=True)

# Pull the 'building' and 'amenity' tags for the centre hexagon.
# NOTE(review): output is presumably written under data_dir / <h3 id>, which is
# where the cells below read building.pkl / amenity.pkl from — confirm in the helper.
await pull_tags_for_hex(
    hex_gdf.loc[center_h3],
    data_dir,
    ['building', 'amenity'],
    simplify_data=False,  # keep extra columns per Nico request
    force_pull=True
)
# Go back to a plain positional index; the 'h3' column is still present.
hex_gdf.reset_index(drop=True, inplace=True)

In [None]:
# Load the pickled building tags for the centre hexagon and preview them.
# NOTE: pd.read_pickle can execute code stored in the file; only load pickles produced by this project.
building_df = pd.read_pickle(data_dir / center_h3 / "building.pkl")
building_df.head()

In [None]:
# Keep only rows whose element_type is 'way'.
building_df = building_df.loc[building_df['element_type'] == 'way']

In [None]:
# filter out all the junk in height
def keep_dig(char):
    """Return True when *char* is a decimal point or a digit character."""
    if char == ".":
        return True
    return str.isdigit(char)

def _height_to_float(raw):
    """Parse a raw ``height`` value into a float.

    Only digits and decimal points are kept (via ``keep_dig``). Values that
    leave nothing parseable — no digits at all, or several decimal points —
    yield 0.0, the same value used for missing heights, instead of raising.
    """
    digits = "".join(filter(keep_dig, str(raw)))
    try:
        return float(digits)
    except ValueError:
        return 0.0


# Missing heights become "0"; assign() returns a new frame, so the column is not
# written onto the filtered slice created by the earlier .loc selection.
building_df = building_df.assign(
    height_filt=building_df['height'].fillna("0").apply(_height_to_float)
)

In [None]:
import plotly.graph_objects as go

# Histogram of the parsed heights, leaving out rows whose height_filt is 0.
nonzero_heights = building_df.loc[building_df['height_filt'] != 0, 'height_filt'].astype(float)

fig = go.Figure()
fig.add_trace(go.Histogram(x=nonzero_heights))


#### What Fraction of Buildings Has No Height?

In [None]:
# Fraction of buildings with no `height` value. The mean of the boolean mask is
# computed by pandas rather than a Python-level sum(), and an empty frame gives
# NaN instead of ZeroDivisionError.
building_df['height'].isna().mean()

In [None]:
# Fraction of buildings with no `access` value. The mean of the boolean mask is
# computed by pandas rather than a Python-level sum(), and an empty frame gives
# NaN instead of ZeroDivisionError.
building_df['access'].isna().mean()

In [None]:
# Fraction of rows with no `building` value. The mean of the boolean mask is
# computed by pandas rather than a Python-level sum(), and an empty frame gives
# NaN instead of ZeroDivisionError.
building_df['building'].isna().mean()

In [None]:
# Export a random sample of up to 2000 buildings for manual inspection. The
# sample size is capped at the frame length because sample() raises when asked
# for more rows than exist (without replacement).
building_df.sample(n=min(2000, len(building_df))).to_excel(
    f"{center_h3}_building.xlsx"
)

In [None]:
# Columns that contain at least one list value are left out of the export.
list_columns = building_df.columns[building_df.applymap(lambda x: isinstance(x, list)).any()]
# Create the output folder first, as is done for the other output directories
# in this notebook; writing into a missing directory fails.
pathlib.Path("keplergl_data").mkdir(exist_ok=True, parents=True)
building_df[building_df.columns.difference(list_columns)].to_file(
    f"keplergl_data/{center_h3}_building.geojson",
    driver="GeoJSON",
)

### Amenity Tag

In [None]:
# Load the pickled amenity tags for the centre hexagon and preview them.
# NOTE: pd.read_pickle can execute code stored in the file; only load pickles produced by this project.
amenity_df = pd.read_pickle(data_dir / center_h3 / "amenity.pkl")
amenity_df.head()

In [None]:
# Fraction of amenities with no `access` value. The mean of the boolean mask is
# computed by pandas rather than a Python-level sum(), and an empty frame gives
# NaN instead of ZeroDivisionError.
amenity_df['access'].isna().mean()

In [None]:
# Columns that contain at least one list value are left out of the GeoJSON export.
holds_list = amenity_df.applymap(lambda x: isinstance(x,list)).any()
list_columns = amenity_df.columns[holds_list]
exportable_columns = amenity_df.columns.difference(list_columns)
amenity_df[exportable_columns].to_file(f"{center_h3}_amenity.geojson", driver="GeoJSON")

## Map Tags to Little Hexes inside of Big Hexes 

In [None]:
from src.utils.tesselate_amazon_data import join_hex_dfs

In [None]:
# Directory for the intermediate (per-hexagon, joined) outputs; created if missing.
interim_path = pathlib.Path("/gcsmount-research-data-staging/osmnx-cities/hexed-interim/Boston, MA")
interim_path.mkdir(exist_ok=True, parents=True)

In [None]:
# join_hex_dfs(
#     data_dir.joinpath("resolution_5"),
#     TOP_LEVEL_OSM_TAGS,
#     target_resolution=9,
#     output_dir=interim_path,
# )

## Join Tags for Each Hexagon

In [None]:
from src.utils.tesselate_amazon_data import group_hex_tags, iterate_hex_dir
from src.data.load_data import load_filter, load_city_tag_h3

In [None]:
# from pathlib import Path

# group_hex_tags(
#     interim_path,
#     TOP_LEVEL_OSM_TAGS,
#     output_dir=interim_path,
#     resolution=9,
#     filter_values=load_filter(Path(f"{os.environ['HOME']}/hex2vec/filters/from_wiki.json")),    
# )

## Join Hexagons into a Big City GDF

In [None]:
from src.utils.tesselate_amazon_data import create_city_from_hex

In [None]:
# Directory that receives the assembled city-level output; created if missing.
city_output_dir = pathlib.Path("/gcsmount-research-data-staging/osmnx-cities/hexed-complete/Boston, MA")
city_output_dir.mkdir(exist_ok=True, parents=True)

In [None]:
# Assemble the city-level frame from the interim directory at resolution 9.
# NOTE(review): drop_all_zero=True presumably removes all-zero columns inside the helper — confirm.
df = create_city_from_hex(
    interim_path,
    output_dir=city_output_dir,
    resolution=9,
    drop_all_zero=True,
)

In [None]:
# df.head()

In [None]:
# drop all zero columns
df.loc[:, (df != 0).any(axis=0)]

In [None]:
# Persist the city frame as city.pkl inside city_output_dir.
df.to_pickle(city_output_dir.joinpath("city.pkl"))