In [1]:
import pandas as pd
import geopandas as gpd
import math

In [2]:
# Read the block-group shapefile (EJSCREEN 2020 CA, per the file name).
gdf = gpd.read_file(
    "../data/EJSCREEN_2020_CA.shp/EJSCREEN_2020_CA.shp"
)

# Reproject to plain WGS84 longitude/latitude; the source frame `gdf` is left
# untouched and the reprojected copy is what every later cell works on.
cali_repr = gdf.to_crs(
    crs="+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
)

In [8]:
# Discard rows with no P_PM25_D2 value, then renumber the rows 0..n-1 so the
# index is contiguous again.
# NOTE(review): the filter is on P_PM25_D2 while the later bucketing uses
# D_PM25_2 - presumably the two are missing together; confirm.
cali_repr = (
    cali_repr
    .dropna(subset=['P_PM25_D2'])
    .reset_index(drop=True)
)

In [9]:
# aggregate block groups to tracts
# The tract key is the first 11 characters of each row's "ID" (presumably the
# block-group GEOID, whose leading 11 characters identify the tract - confirm).
# Slicing through the vectorized `.str` accessor avoids a Python-level loop of
# label lookups (`cali_repr["ID"][i]`), which was slow and only worked when the
# index happened to be exactly 0..n-1.
tracts = cali_repr["ID"].str[:11].tolist()
cali_repr['tract'] = tracts

In [10]:
# Merge the block groups of each tract into one geometry and average their
# attributes. Only numeric/boolean columns are handed to dissolve: pandas >= 2.0
# raises TypeError when asked for the mean of string columns such as "ID",
# whereas older versions silently dropped them - so the resulting columns are
# the same either way, but this form runs on both.
geometry_col = cali_repr.geometry.name
mean_cols = [
    col
    for col in cali_repr.select_dtypes(include=["number", "bool"]).columns
    if col != "tract"  # the grouping key is added explicitly below
]
cali_tracts = cali_repr[["tract", geometry_col] + mean_cols].dissolve(by='tract', aggfunc='mean')

In [11]:
# assign tracts to buckets by D_PM25_2
# Order the tracts by ascending D_PM25_2 so that the position of a row can be
# used to decide its bucket in the next cell.
cali_tracts = cali_tracts.sort_values(by="D_PM25_2")

In [12]:
# Split the (already sorted) tracts into `num_buckets` consecutive groups of
# `bucket_size` rows each; the last group may be smaller.
num_buckets = 10
bucket_size = math.ceil(len(cali_tracts.index) / num_buckets)

# Row position -> bucket number: positions 0..bucket_size-1 fall in bucket 0,
# the next `bucket_size` positions in bucket 1, and so on.
D_PM25_2_bucket = [
    position // bucket_size
    for position in range(len(cali_tracts.index))
]
cali_tracts['D_PM25_2_bucket'] = D_PM25_2_bucket

In [13]:
# dissolve tracts by bucket
# Drop the tract index and keep just the three columns the bucket-level
# dissolve needs: the value, its bucket number, and the geometry.
cali_attr = cali_tracts.reset_index(drop=True)[
    ["D_PM25_2", "D_PM25_2_bucket", "geometry"]
]

In [14]:
# One row per bucket: geometries of all tracts in the bucket are unioned and
# D_PM25_2 becomes the mean over those tracts.
cali_dissolve = cali_attr.dissolve(
    by='D_PM25_2_bucket',
    aggfunc='mean',
)

In [15]:
# break up MULTIPOLYGONs into individual polygons
# Each output row is [floor(mean D_PM25_2 of the bucket), one polygon].
# Iterating over the rows that actually exist (instead of range(num_buckets))
# avoids a KeyError when fewer buckets than `num_buckets` were populated.
compressed = []
for bucket_mean, bucket_geom in zip(cali_dissolve['D_PM25_2'], cali_dissolve['geometry']):
    # A bucket whose tracts merge into one contiguous shape dissolves to a
    # plain Polygon, which has no `.geoms`; treat it as a single-part geometry.
    parts = getattr(bucket_geom, "geoms", [bucket_geom])
    bucket_value = math.floor(bucket_mean)
    compressed.extend([bucket_value, part] for part in parts)

gdf_compressed = gpd.GeoDataFrame(compressed, columns=["D_PM25_2", "geometry"], crs="EPSG:4326")


In [16]:
# Write the per-polygon frame out as GeoJSON.
gdf_compressed.to_file(
    "../data/EJSCREEN_2020_CA_D_PM25_2_dissolve.geojson",
    driver="GeoJSON",
)