Assign all ETCs in all regions to h3 hexagons, which will be used to limit spatial linkage in the model train/test split.

In [3]:
import h3
import shapely
import pandas as pd
import tobler
import geopandas as gpd

In [4]:
# h3 grid resolution used by assign_hexagons() when covering a region hull
# with hexagons.
h3_resolution = 7

In [5]:
# Root directory of the project data, plus the sub-directories holding the
# processed tessellations and buildings (buildings_dir is not used in the
# cells shown here).
regions_datadir = "/data/uscuni-eurofab/"
tessellations_dir = '/data/uscuni-eurofab/processed_data/tessellations/'
buildings_dir = '/data/uscuni-eurofab/processed_data/buildings/'

# Region hull table; each of its rows is later passed to assign_hexagons().
region_hulls = gpd.read_parquet(
        regions_datadir + "regions/" + "ms_ce_region_hulls.parquet"
    )
# Displayed as the cell output: (number of rows, number of columns).
region_hulls.shape

(474, 1)

In [4]:
def assign_hexagons(region_id, region_hull):
    """Assign all ETCs in a region to h3 hexagons.

    Parameters
    ----------
    region_id
        Identifier used to locate ``tessellation_{region_id}.parquet``
        inside ``tessellations_dir``.
    region_hull
        Row whose first value is the region hull geometry. The hexagons
        built from it are declared as EPSG:4326, so the hull is expected
        in lon/lat coordinates.

    Returns
    -------
    pandas.Series
        Series named ``'hexagons'`` holding one h3 cell id per assigned
        tessellation cell, indexed by the tessellation index and sorted
        by that index.
    """
    # Cover the region hull with h3 cells at the module-level resolution.
    hull_geometry = region_hull.iloc[0]
    cell_ids = h3.geo_to_cells(hull_geometry, res=h3_resolution)
    cell_polygons = [
        shapely.geometry.shape(h3.cells_to_geo([cell_id]))
        for cell_id in cell_ids
    ]
    # Build the hexagon geometries in EPSG:4326 and reproject to EPSG:3035
    # (presumably the CRS of the tessellation - TODO confirm).
    hexagons = gpd.GeoSeries(
        cell_polygons, index=cell_ids, name='geometry', crs='epsg:4326'
    ).to_crs(epsg=3035)

    tess = gpd.read_parquet(
        tessellations_dir + f"tessellation_{region_id}.parquet"
    )

    # Positional pairs (hexagon, tessellation cell) whose geometries intersect.
    hex_pos, tess_pos = tess.sindex.query(hexagons, predicate='intersects')

    # A tessellation cell may intersect several hexagons; keep only the first
    # pair returned for each cell so every cell gets exactly one hexagon.
    first_hit = ~pd.Series(tess_pos).duplicated()
    hex_pos = hex_pos[first_hit]
    tess_pos = tess_pos[first_hit]

    hex_assignments = pd.Series(
        hexagons.index[hex_pos].values,
        tess.index[tess_pos],
        name='hexagons',
    )
    return hex_assignments.sort_index()

In [5]:
%%time
# Reproject the hulls to EPSG:4326 (lon/lat, as used for the h3 cover) and
# write one hexagon-assignment file per region.
hulls_wgs84 = region_hulls.to_crs(epsg=4326)
for region_id, region_hull in hulls_wgs84.iterrows():
    print(region_id)  # progress indicator
    hex_assignments = assign_hexagons(region_id, region_hull)
    # reset_index() turns the tessellation index into a regular column
    # before the table is written.
    out_path = f'/data/uscuni-eurofab/processed_data/hexagons/{region_id}_hexagon.pq'
    hex_assignments.reset_index().to_parquet(out_path)

19
24
33
478
754
817
1049
1485
1677
2415
2513
2707
2785
2790
2820
3228
3307
3313
3357
3540
3661
3762
3806
4271
4285
4640
4763
5175
5189
5320
5429
5874
6337
6351
6477
6858
6881
7113
7381
7411
7640
7693
7728
7921
7924
8014
8087
8147
8440
8659
8927
8960
9560
9840
9887
10197
10283
10600
10673
10764
10875
11024
11178
11550
11623
11640
12080
12222
12247
12347
12401
12546
12614
12649
12695
12736
13285
13496
13497
14086
14327
14383
14605
14836
15151
15308
15362
15415
15540
15560
15646
16446
16582
16687
17219
17720
17763
17808
17857
17874
17951
18006
18143
18215
19325
19474
20008
20063
20356
20573
20597
20811
21128
22022
22398
22633
22770
23227
23621
23941
24079
24141
24683
24737
25065
25497
25588
25814
25964
26146
26642
26773
27374
27700
27783
27997
28059
28060
28237
28601
28751
28795
28961
29387
30259
30571
30662
30938
31101
31696
31807
32671
32890
33094
33150
33415
33718
33769
33803
34553
34902
36043
36064
36227
36327
36396
36457
37246
37360
37371
37637
37789
37937
38374
38429
38584
38606
38

## Explore assignment

In [6]:
# Load the stored assignment for a single region (same path pattern the
# assignment loop writes to) and restore the 'index' column as the index.
region_id = 65806
hex_assignments = pd.read_parquet(
    f'/data/uscuni-eurofab/processed_data/hexagons/{region_id}_hexagon.pq'
)
hex_assignments = hex_assignments.set_index('index')

In [7]:
# Index labels of all tessellation cells assigned to one example hexagon.
in_target_hexagon = hex_assignments.hexagons == '871e354ddffffff'
selected = hex_assignments.loc[in_target_hexagon].index
selected.shape

(1021,)

In [8]:
# Tessellation of the selected region, loaded so the chosen cells can be
# inspected.
tess_path = tessellations_dir + f"tessellation_{region_id}.parquet"
tess = gpd.read_parquet(tess_path)

In [10]:
# tess.loc[selected].explore()