## Morphometric processing

In [1]:
import geopandas as gpd
import pandas as pd
import glob

def get_processed_region_ids(directory):
    """Return the integer region ids encoded in the file names under ``directory``.

    Entries are listed with ``glob.glob(directory + '*')``, so ``directory`` is
    used as a plain prefix and should end with a path separator to list the
    contents of a folder.

    The id is the text between the last underscore and the first following dot
    of the entry's base name, e.g. ``tessellation_123.parquet`` -> ``123``.
    Entries whose name does not yield an integer this way (stray files such as
    a README, sub-folders, ...) are skipped rather than raising ``ValueError``.

    Parameters
    ----------
    directory : str
        Path prefix to scan.

    Returns
    -------
    list of int
        Region ids in the order returned by ``glob`` (not sorted).
    """
    import os

    region_ids = []
    for path in glob.glob(directory + '*'):
        # Parse the base name only, so underscores or dots in parent folders
        # can never leak into the id.
        candidate = os.path.basename(path).split('_')[-1].split('.')[0]
        try:
            region_ids.append(int(candidate))
        except ValueError:
            # Not a "<prefix>_<region id>.<ext>" entry; ignore it.
            continue
    return region_ids

In [2]:
# Root folder of the project data.
regions_datadir = "/data/uscuni-eurofab/"

# Folders outside processed_data/.
regions_buildings_dir = '/data/uscuni-eurofab/regions/buildings/'
overture_streets_dir = '/data/uscuni-eurofab/overture_streets/'

# Folders under processed_data/, one per pipeline step in this notebook.
simplfied_buildings_dir = '/data/uscuni-eurofab/processed_data/simplified_buildings/'
buildings_dir = '/data/uscuni-eurofab/processed_data/buildings/'
streets_dir = '/data/uscuni-eurofab/processed_data/streets/'
enclosures_dir = '/data/uscuni-eurofab/processed_data/enclosures/'
tessellations_dir = '/data/uscuni-eurofab/processed_data/tessellations/'
graph_dir = '/data/uscuni-eurofab/processed_data/neigh_graphs/'
chars_dir = '/data/uscuni-eurofab/processed_data/chars/'

In [3]:
regions_datadir = "/data/uscuni-eurofab/"

# Table of region hulls; its index supplies the region ids that every
# processing loop in this notebook iterates over.
region_hulls = gpd.read_parquet(regions_datadir + "regions/" + "ms_ce_region_hulls.parquet")

# (rows, columns) of the hull table.
region_hulls.shape

(474, 1)

In [4]:
# NOTE(review): looks like a leftover from manual single-region runs. Every
# loop below rebinds `region_id`, so this value is not used by them.
region_id = 53490

## Process buildings

Run the building processing pipeline for all regions

In [None]:
from core.generate_buildings import read_region_buildings, process_region_buildings

In [None]:
def process_single_region_buildings(region_id):
    """Run the building processing pipeline for one region and store the result.

    Reads ``buildings_{region_id}.pq`` from ``regions_buildings_dir``, passes
    the table to ``process_region_buildings`` (simplification tolerance 0.1,
    merge limit 25) and writes the returned table to
    ``buildings_{region_id}.parquet`` inside ``simplfied_buildings_dir``.

    Parameters
    ----------
    region_id
        Identifier used in both the input and the output file name.

    Returns
    -------
    None
        The result is only written to disk.
    """
    print('processing', region_id)

    source_path = regions_buildings_dir + f'buildings_{region_id}.pq'
    target_path = simplfied_buildings_dir + f"buildings_{region_id}.parquet"

    raw_buildings = gpd.read_parquet(source_path)
    processed_buildings = process_region_buildings(
        raw_buildings,
        True,
        simplification_tolerance=.1,
        merge_limit=25,
    )
    processed_buildings.to_parquet(target_path)

In [None]:
%%capture cap
# Only the region id (the index of region_hulls) is needed, so iterate the
# index directly instead of materialising every row.
for region_id in region_hulls.index:
    process_single_region_buildings(region_id)

Copy over the simplified buildings to the processed data folder.

In [None]:
# !cp -r /data/uscuni-eurofab/processed_data/simplified_buildings/ /data/uscuni-eurofab/processed_data/buildings/

## Process streets

Run the street processing pipeline for all regions.

In [None]:
from core.generate_streets import process_region_streets

In [None]:
%%capture cap
# Generate and store the street network of every region; region ids come from
# the index of region_hulls.
for region_id in region_hulls.index:
    streets = process_region_streets(
        region_id,
        overture_streets_dir,
        buildings_dir,
    )
    streets.to_parquet(streets_dir + f'streets_{region_id}.parquet')

In [None]:
# NOTE(review): looks like a leftover from manual single-region runs. The
# loops below rebind `region_id`, so this value is not used by them.
region_id  = 109005

## Process elements

Run the element generating pipeline for all regions.

In [None]:
from core.generate_elements import process_region_elements, generate_enclosures_representative_points, generate_tess

In [None]:
# Region ids parsed from the file names already present in tessellations_dir;
# the element loop below uses them to skip regions that were processed before.
processed_region_ids = get_processed_region_ids(tessellations_dir)


In [None]:
# Resume support: visit only the regions whose id is not listed in
# processed_region_ids.
for region_id in region_hulls.index[~region_hulls.index.isin(processed_region_ids)]:
    enclosures, tesselations = process_region_elements(buildings_dir, streets_dir, region_id)

    # Store the enclosures of this region.
    enclosures.to_parquet(enclosures_dir + f"enclosure_{region_id}.parquet")
    print("Processed enclosures")

    # Store the tessellation of this region.
    tesselations.to_parquet(tessellations_dir + f"tessellation_{region_id}.parquet")
    print("processed tesselations")

## Process graphs

Run the graph generating pipeline for all regions.

In [None]:
from core.generate_ngraphs import process_region_graphs

In [None]:
# Build the graphs of every region; process_region_graphs receives the graph
# output folder followed by the folders of the four element types.
for region_id in region_hulls.index:
    process_region_graphs(
        region_id, graph_dir, buildings_dir, streets_dir, enclosures_dir, tessellations_dir
    )

## Process morphometrics

Run the morphometric character processing pipeline for all regions.

In [None]:
from core.generate_chars import process_single_region_chars, process_building_chars

In [None]:
# Compute the morphometric characters of every region; chars_dir is passed as
# the last argument after the graph and element folders.
for region_id in region_hulls.index:
    process_single_region_chars(
        region_id, graph_dir, buildings_dir, streets_dir, enclosures_dir, tessellations_dir, chars_dir
    )

## Merge data

Merge the building, street, node, enclosure and ETC morphometric characters into a single table per region.

In [None]:
from core.generate_merged_primary_chars import merge_into_primary
from core.utils import used_keys

In [None]:
for region_id in region_hulls.index:
    # Character tables of the five element types of this region.
    tessellation = gpd.read_parquet(chars_dir + f"tessellations_chars_{region_id}.parquet")
    buildings = gpd.read_parquet(chars_dir + f"buildings_chars_{region_id}.parquet")
    enclosures = gpd.read_parquet(chars_dir + f"enclosures_chars_{region_id}.parquet")
    streets = gpd.read_parquet(chars_dir + f"streets_chars_{region_id}.parquet")
    nodes = gpd.read_parquet(chars_dir + f"nodes_chars_{region_id}.parquet")

    # The tessellation table is the base; building characters are aligned on
    # the shared index. The buildings' own nodeID / nID columns are dropped so
    # the tessellation's link columns are the ones used further down.
    merged = pd.merge(
        tessellation.drop(columns=["geometry"]),
        buildings.drop(columns=["nodeID", "geometry", 'nID']),
        left_index=True,
        right_index=True,
        how="left",
    )

    # Enclosure, street and node characters are attached through id columns.
    merged = (
        merged.merge(
            enclosures.drop(columns="geometry"),
            left_on="enclosure_index",
            right_on="eID",
            how="left",
        )
        .merge(streets.drop(columns="geometry"), on="nID", how="left")
        .merge(nodes.drop(columns="geometry"), on="nodeID", how="left")
    )

    # Drop join keys and helper columns.
    # Intentionally not dropped (were commented out): x, y, id, osm_id, index.
    merged = merged.drop(
        columns=[
            "nID",
            "eID",
            "nodeID",
            "mm_len",
            "cdsbool",
            "node_start",
            "node_end",
            "enclosure_index",
        ]
    )

    # Put the tessellation index back on the merged table. This relies on the
    # left merges keeping exactly one row per tessellation row, in order.
    merged = merged.set_index(tessellation.index)

    # Keep only the columns listed in used_keys and store them.
    primary = merged[list(used_keys.keys())]
    primary.to_parquet(chars_dir + f'primary_chars_{region_id}.parquet')

## Generate spatial lag

In [5]:
from core.generate_context import spatially_weighted_partial_lag, parallel_higher_order_context
import numpy as np
from libpysal.graph import read_parquet
import shapely

In [6]:
# Passed as `k` to both context functions below and embedded in the output
# file names.
spatial_lag = 3


# Passed as `kernel` to spatially_weighted_partial_lag and embedded in the
# name of its output file.
kernel ='inverse' 
# Passed as `n_splits` to both context functions below.
n_splits = 10
# Passed as `bandwidth` to spatially_weighted_partial_lag.
# NOTE(review): the meaning of -1 is defined in core.generate_context — confirm.
bandwidth_type = -1

Spatially weighted context lag

In [32]:
%%time

# For every region: compute the spatially weighted lag of the primary
# characters and store it next to them in one parquet file.
for region_id in region_hulls.index:

    print(region_id)

    # Inputs of this region: primary characters, the tessellation graph and
    # the tessellation geometries.
    X_train = pd.read_parquet(chars_dir + f'primary_chars_{region_id}.parquet')
    graph = read_parquet(graph_dir + f"tessellation_graph_{region_id}.parquet")
    tessellation = gpd.read_parquet(
            tessellations_dir + f"tessellation_{region_id}.parquet"
    )
    # Coordinates of one representative point per tessellation geometry.
    centroids = shapely.get_coordinates(tessellation.representative_point())

    # Calculate the lag and keep only the rows present in both tables.
    lag = spatially_weighted_partial_lag(X_train, graph, centroids, kernel=kernel, k=spatial_lag, n_splits=n_splits, bandwidth=bandwidth_type)
    combined_data = X_train.join(lag, how='inner')

    # The file name records the kernel, lag order and bandwidth setting.
    # `bandwidth_type` is the variable defined in the parameter cell; the
    # notebook defines no `bandwidth`, so referencing it raises NameError on a
    # fresh kernel.
    combined_data.to_parquet(f'{chars_dir}lag_chars_{region_id}_{kernel}_{spatial_lag}_{bandwidth_type}.parquet')

19
24
33
478
754
817
1049
1485
1677
2415
2513
2707
2785
2790
2820
3228
3307
3313
3357
3540
3661
3762
3806
4271
4285
4640
4763
5175
5189
5320
5429
5874
6337
6351
6477
6858
6881
7113
7381
7411
7640
7693
7728
7921
7924
8014
8087
8147
8440
8659
8927
8960
9560
9840
9887
10197
10283
10600
10673
10764
10875
11024
11178
11550
11623
11640
12080
12222
12247
12347
12401
12546
12614
12649
12695
12736
13285
13496
13497
14086
14327
14383
14605
14836
15151
15308
15362
15415
15540
15560
15646
16446
16582
16687
17219
17720
17763
17808
17857
17874
17951
18006
18143
18215
19325
19474
20008
20063
20356
20573
20597
20811
21128
22022
22398
22633
22770
23227
23621
23941
24079
24141
24683
24737
25065
25497
25588
25814
25964
26146
26642
26773
27374
27700
27783
27997
28059
28060
28237
28601
28751
28795
28961
29387
30259
30571
30662
30938
31101
31696
31807
32671
32890
33094
33150
33415
33718
33769
33803
34553
34902
36043
36064
36227
36327
36396
36457
37246
37360
37371
37637
37789
37937
38374
38429
38584
38606
38

Equal spatial weights

In [7]:
%%time

# For every region: compute the equally weighted context of the primary
# characters and store it next to them in one parquet file.
for region_id in region_hulls.index:

    print(region_id)

    # Inputs of this region: primary characters and the tessellation graph.
    X_train = pd.read_parquet(chars_dir + f'primary_chars_{region_id}.parquet')
    graph = read_parquet(graph_dir + f"tessellation_graph_{region_id}.parquet")

    lag = parallel_higher_order_context(
        X_train,
        graph,
        k=spatial_lag,
        n_splits=n_splits,
        output_vals=3,
    )

    # Three output columns per input character, labelled in input-column order.
    lag.columns = np.concatenate(
        [[c + "_15", c + "_median", c + "_85"] for c in X_train.columns]
    )

    # Keep only the rows present in both tables and store the result.
    combined_data = X_train.join(lag, how='inner')
    combined_data.to_parquet(f'{chars_dir}lag_chars_{region_id}_unweighted_{spatial_lag}.parquet')

19
24
33
478
754
817
1049
1485
1677
2415
2513
2707
2785
2790
2820
3228
3307
3313
3357
3540
3661
3762
3806
4271
4285
4640
4763
5175
5189
5320
5429
5874
6337
6351
6477
6858
6881
7113
7381
7411
7640
7693
7728
7921
7924
8014
8087
8147
8440
8659
8927
8960
9560
9840
9887
10197
10283
10600
10673
10764
10875
11024
11178
11550
11623
11640
12080
12222
12247
12347
12401
12546
12614
12649
12695
12736
13285
13496
13497
14086
14327
14383
14605
14836
15151
15308
15362
15415
15540
15560
15646
16446
16582
16687
17219
17720
17763
17808
17857
17874
17951
18006
18143
18215
19325
19474
20008
20063
20356
20573
20597
20811
21128
22022
22398
22633
22770
23227
23621
23941
24079
24141
24683
24737
25065
25497
25588
25814
25964
26146
26642
26773
27374
27700
27783
27997
28059
28060
28237
28601
28751
28795
28961
29387
30259
30571
30662
30938
31101
31696
31807
32671
32890
33094
33150
33415
33718
33769
33803
34553
34902
36043
36064
36227
36327
36396
36457
37246
37360
37371
37637
37789
37937
38374
38429
38584
38606
38