# Separate the original GeoJSON

We'll split the original file into three files:
- `hexes.geojson` which only contains the integer `id` column and the hex outlines in the `geometry` column
- `meta.json` (currently not used in the app) which contains properties of the hexes like depth (of water), known presence of disease, whether there's a restoration site, aquaculture, or known wild oyster populations.
- `connectivity.pq` which contains rows of the form `["start_id", "end_id", "time", "depth", "weight"]`

## Tech preparation

In [1]:
import json

import geopandas as gpd
import pandas as pd
from tqdm import tqdm

In [2]:
# only necessary once
!unzip -f -v hex_features_real.geojson.zip

Archive:  hex_features_real.geojson.zip
 Length   Method    Size  Cmpr    Date    Time   CRC-32   Name
--------  ------  ------- ---- ---------- ----- --------  ----
343313593  Defl:N 70552449  79% 09-26-2024 10:31 53ab374e  hex_features_real.geojson
--------          -------  ---                            -------
343313593         70552449  79%                            1 file


## Read original geojson

In [3]:
# Load the original feature collection: one row per hex, holding its properties,
# a "connectivity" string and the hex outline in "geometry".
gdf = gpd.read_file("hex_features_real.geojson")
gdf

Unnamed: 0,id,lon,lat,depth,disease,rest,aqc,pop,connectivity,geometry
0,0,-3.215782,51.464549,5.0,0.0,0.0,0.0,1.0,"{ ""00d-07d_05m"": { ""0"": ""9.5e-02"", ""2"": ""4.4e-...","POLYGON ((-3.07193 51.46473, -3.14361 51.3868,..."
1,1,-3.229353,54.110769,5.0,0.0,0.0,0.0,1.0,"{ ""00d-07d_05m"": { ""1"": ""1.1e-01"", ""10"": ""4.4e...","POLYGON ((-3.07645 54.11096, -3.15262 54.03309..."
2,2,-3.215043,51.308819,15.0,0.0,0.0,0.0,6.0,"{ ""00d-07d_05m"": { ""0"": ""4.0e-04"", ""2"": ""4.1e-...","POLYGON ((-3.07168 51.309, -3.14312 51.23106, ..."
3,3,-3.214311,51.153079,10.0,0.0,0.0,0.0,0.0,"{ ""00d-07d_05m"": { ""2"": ""2.9e-01"", ""3"": ""9.7e-...","POLYGON ((-3.07144 51.15326, -3.14263 51.07532..."
4,4,-3.212150,50.685801,20.0,0.0,0.0,0.0,5.0,"{ ""00d-07d_05m"": { ""4"": ""3.1e-01"", ""5"": ""3.9e-...","POLYGON ((-3.07072 50.68598, -3.1412 50.60802,..."
...,...,...,...,...,...,...,...,...,...,...
8352,8352,-0.838398,55.959414,60.0,0.0,0.0,0.0,0.0,"{ ""00d-07d_05m"": { ""5641"": ""3.1e-04"", ""5642"": ...","POLYGON ((-0.67839 55.95655, -0.76285 55.88026..."
8353,8353,-0.829732,56.114909,60.0,0.0,0.0,0.0,0.0,"{ ""00d-07d_05m"": { ""5626"": ""3.0e-05"", ""5641"": ...","POLYGON ((-0.66908 56.11203, -0.75391 56.03575..."
8354,8354,-0.820983,56.270403,60.0,0.0,0.0,0.0,0.0,"{ ""00d-07d_05m"": { ""5626"": ""4.6e-04"", ""5641"": ...","POLYGON ((-0.65969 56.26751, -0.74488 56.19124..."
8355,8355,-0.812150,56.425896,60.0,0.0,0.0,0.0,0.0,"{ ""00d-07d_05m"": { ""5626"": ""9.6e-06"", ""5641"": ...","POLYGON ((-0.6502 56.42299, -0.73576 56.34673,..."


## Extract hex outlines

In [4]:
# Keep only the hex id and its outline; all other properties are exported separately.
gdf_hexes = gdf[["id", "geometry"]]
gdf_hexes

Unnamed: 0,id,geometry
0,0,"POLYGON ((-3.07193 51.46473, -3.14361 51.3868,..."
1,1,"POLYGON ((-3.07645 54.11096, -3.15262 54.03309..."
2,2,"POLYGON ((-3.07168 51.309, -3.14312 51.23106, ..."
3,3,"POLYGON ((-3.07144 51.15326, -3.14263 51.07532..."
4,4,"POLYGON ((-3.07072 50.68598, -3.1412 50.60802,..."
...,...,...
8352,8352,"POLYGON ((-0.67839 55.95655, -0.76285 55.88026..."
8353,8353,"POLYGON ((-0.66908 56.11203, -0.75391 56.03575..."
8354,8354,"POLYGON ((-0.65969 56.26751, -0.74488 56.19124..."
8355,8355,"POLYGON ((-0.6502 56.42299, -0.73576 56.34673,..."


In [5]:
# Write the outlines to disk. No driver is passed, so the output format is
# presumably inferred from the ".geojson" extension — verify with the installed geopandas.
gdf_hexes.to_file("hexes.geojson")

## Extract metadata

In [6]:
# Per-hex properties without geometry or connectivity: centre coordinates (lon, lat),
# water depth, known disease presence, restoration site, aquaculture and
# known wild oyster population.
df_meta = gdf[['id', 'lon', 'lat', 'depth', 'disease', 'rest', 'aqc', 'pop']]
df_meta

Unnamed: 0,id,lon,lat,depth,disease,rest,aqc,pop
0,0,-3.215782,51.464549,5.0,0.0,0.0,0.0,1.0
1,1,-3.229353,54.110769,5.0,0.0,0.0,0.0,1.0
2,2,-3.215043,51.308819,15.0,0.0,0.0,0.0,6.0
3,3,-3.214311,51.153079,10.0,0.0,0.0,0.0,0.0
4,4,-3.212150,50.685801,20.0,0.0,0.0,0.0,5.0
...,...,...,...,...,...,...,...,...
8352,8352,-0.838398,55.959414,60.0,0.0,0.0,0.0,0.0
8353,8353,-0.829732,56.114909,60.0,0.0,0.0,0.0,0.0
8354,8354,-0.820983,56.270403,60.0,0.0,0.0,0.0,0.0
8355,8355,-0.812150,56.425896,60.0,0.0,0.0,0.0,0.0


In [7]:
# Export the metadata table; no `orient` is given, so pandas' default JSON layout is used.
df_meta.to_json("meta.json")

## Extract connectivity table

In [8]:
# Flatten the nested per-hex connectivity into one record per
# (start_id, end_id, time, depth) combination.
#
# Each "connectivity" cell is a JSON object of the form
#   {"<time>_<depth>": {"<end_id>": "<weight>", ...}, ...}
# It is parsed with json.loads instead of eval: eval would execute any Python
# expression embedded in the data file, whereas json.loads only accepts data.
#
# Only the two required columns are iterated; this avoids building a full row
# Series (geometry included) for every hex.
#
# NOTE(review): end_id is kept as the string key of the JSON object while
# start_id comes from the "id" column — confirm whether consumers of
# connectivity.pq expect integer end ids before changing the type.
records = []
for start_id, raw_conn in tqdm(zip(gdf["id"], gdf["connectivity"]), total=len(gdf)):
    for timedepth, conn in json.loads(raw_conn).items():
        # Keys look like "00d-07d_05m": time window and depth level joined by "_".
        time, depth = timedepth.split("_")
        for end_id, weight in conn.items():
            records.append(
                # Weights are stored as strings such as "9.5e-02"; convert to float.
                {"start_id": start_id, "end_id": end_id, "time": time, "depth": depth, "weight": float(weight)}
            )

100%|█████████████████████████████████████████████████████████████████████████| 8357/8357 [00:31<00:00, 267.46it/s]


In [9]:
# Build the long-format connectivity table in one go: one row per record,
# with columns taken from the record keys.
df_conn = pd.DataFrame.from_records(records)
df_conn

Unnamed: 0,start_id,end_id,time,depth,weight
0,0,0,00d-07d,05m,0.095
1,0,2,00d-07d,05m,0.440
2,0,1019,00d-07d,05m,0.140
3,0,1021,00d-07d,05m,0.018
4,0,5498,00d-07d,05m,0.300
...,...,...,...,...,...
17833835,8356,8352,14d-28d,15m,0.051
17833836,8356,8353,14d-28d,15m,0.092
17833837,8356,8354,14d-28d,15m,0.090
17833838,8356,8355,14d-28d,15m,0.051


In [10]:
# Parquet has built-in compression; written as JSON, this table would be about 1.5 GB.
df_conn.to_parquet("connectivity.pq")  