In [8]:
import json
import math
import os
import time

import geopandas as gpd
import mercantile
import pandas as pd
from shapely import geometry
from tqdm import tqdm

In [9]:
def get_bounds(geom):
    """
    Get bounds of a GeoJSON geometry.

    Args:
        geom: GeoJSON-like geometry mapping (a dict with "type" and
            "coordinates"), passed to ``shapely.geometry.shape``.

    Returns:
        Tuple ``(minx, miny, maxx, maxy)`` taken from the shape's ``bounds``.
    """
    # Build the shape from the argument itself. The previous version read the
    # notebook-level `aoi_geom` global, so the parameter was silently ignored.
    shape = geometry.shape(geom)
    minx, miny, maxx, maxy = shape.bounds
    return (minx, miny, maxx, maxy)

# Area of interest: a small axis-aligned rectangle given as a closed GeoJSON
# polygon ring (first and last vertex are the same point).
west, east = -76.15741548689954, -76.15648427005196
south, north = 43.05635088078997, 43.05692144640927

aoi_geom = {
    "coordinates": [[
        [west, north],
        [west, south],
        [east, south],
        [east, north],
        [west, north],
    ]],
    "type": "Polygon",
}

# Bounding box of the AOI, used below to enumerate the covering tiles.
minx, miny, maxx, maxy = get_bounds(aoi_geom)

In [10]:
# The dataset index is keyed by zoom-9 quadkeys, so enumerate tiles at zoom 9 to match
# https://minedbuildings.z5.web.core.windows.net/global-buildings/dataset-links.csv
aoi_tiles = mercantile.tiles(minx, miny, maxx, maxy, zooms=9)
# De-duplicate through a set, then expose the result as a list.
quad_keys = list({mercantile.quadkey(tile) for tile in aoi_tiles})
print(f"The input area spans {len(quad_keys)} tiles: {quad_keys}")

The input area spans 1 tiles: ['030232211']


In [11]:
# Load the dataset index; every column is read as a string (dtype=str).
dataset_links_url = "https://minedbuildings.z5.web.core.windows.net/global-buildings/dataset-links.csv"
df = pd.read_csv(dataset_links_url, dtype=str)
df.head()

Unnamed: 0,Location,QuadKey,Url,Size,UploadDate
0,Abyei,122320113,https://minedbuildings.z5.web.core.windows.net...,74.5KB,2025-02-28
1,Abyei,122320131,https://minedbuildings.z5.web.core.windows.net...,8.3KB,2025-02-28
2,Abyei,122321002,https://minedbuildings.z5.web.core.windows.net...,392.2KB,2025-02-28
3,Abyei,122321003,https://minedbuildings.z5.web.core.windows.net...,72.8KB,2025-02-28
4,Abyei,122321020,https://minedbuildings.z5.web.core.windows.net...,1.2MB,2025-02-28


In [12]:
def save_bounds_index(quad_key, gdf, base_path):
    """Save bounds information to index.json.

    Inserts or replaces the entry for ``quad_key`` in the GeoJSON
    FeatureCollection stored at ``<base_path>/index.json``. The entry is a
    rectangle built from ``gdf.total_bounds`` and carries the quad key and
    the path of the tile's processed file as properties.

    Args:
        quad_key: Quadkey identifying the tile; used as the lookup key.
        gdf: Object exposing ``total_bounds`` as ``(minx, miny, maxx, maxy)``.
        base_path: Directory holding ``index.json``; created if missing.

    Raises:
        ValueError: If any bound is NaN or infinite (presumably what an empty
            GeoDataFrame yields). Such values would be written as ``NaN``,
            which is not valid JSON.
    """
    index_path = os.path.join(base_path, 'index.json')

    # Validate before touching the filesystem so a bad tile cannot leave a
    # half-updated or invalid index behind.
    minx, miny, maxx, maxy = (float(v) for v in gdf.total_bounds)
    if not all(math.isfinite(v) for v in (minx, miny, maxx, maxy)):
        raise ValueError(
            f"Cannot index quad_key {quad_key}: bounds are not finite "
            f"({minx}, {miny}, {maxx}, {maxy})"
        )

    # An empty base_path means the current directory; makedirs('') would fail.
    if base_path:
        os.makedirs(base_path, exist_ok=True)

    # Create or load existing index
    if os.path.exists(index_path):
        with open(index_path, 'r', encoding='utf-8') as f:
            index_dict = json.load(f)
    else:
        index_dict = {
            "type": "FeatureCollection",
            "features": []
        }

    new_feature = {
        "type": "Feature",
        "geometry": {
            "type": "Polygon",
            # Closed ring: the first vertex is repeated at the end.
            "coordinates": [[
                [minx, miny],
                [maxx, miny],
                [maxx, maxy],
                [minx, maxy],
                [minx, miny]
            ]]
        },
        "properties": {
            "quad_key": quad_key,
            "file_path": os.path.join(base_path, f"{quad_key}_processed.json")
        }
    }

    # Tolerate an index that lacks "features", or features whose
    # "properties" is null or missing (both are legal GeoJSON).
    features = index_dict.setdefault("features", [])
    for i, feature in enumerate(features):
        properties = feature.get("properties") or {}
        if properties.get("quad_key") == quad_key:
            features[i] = new_feature  # replace the stale entry in place
            break
    else:
        features.append(new_feature)

    with open(index_path, 'w', encoding='utf-8') as f:
        json.dump(index_dict, f, indent=2)

In [13]:
idx = 0
combined_gdf = gpd.GeoDataFrame()
force_download = False  # set True to re-download tiles that are already cached
error_list = []  # one dict per failed tile: quad_key, url, error
start_time = time.time()
success_count = 0
skipped_count = 0  # tiles whose processed file already exists on disk
bounds_dict = {}
# Output directory. Set GLOBAL_BUILDINGS_DIR to override; the default keeps
# the original hard-coded location.
data_folder = os.environ.get(
    "GLOBAL_BUILDINGS_DIR",
    '/Users/huajunchen/Library/Project/Python/segment-geo/segment_geospatial_api/notebook/global_buildings',
)
# Fail fast if the folder cannot be created, rather than downloading every
# tile and then failing on each write.
os.makedirs(data_folder, exist_ok=True)

# Download the GeoJSON files for each tile that intersects the input geometry
for quad_key in tqdm(quad_keys):
    rows = df[df["QuadKey"] == quad_key]
    if rows.shape[0] != 1:
        # Record tiles with no index row, or several, instead of dropping
        # them silently, so the summary accounts for every quadkey.
        if rows.shape[0] == 0:
            reason = "QuadKey not found in dataset"
        else:
            reason = f"Multiple rows ({rows.shape[0]}) found for QuadKey"
        error_list.append({"quad_key": quad_key, "url": None, "error": reason})
        continue

    url = rows.iloc[0]["Url"]
    json_fn = os.path.join(data_folder, f"{quad_key}_processed.json")

    if not force_download and os.path.exists(json_fn):
        skipped_count += 1
        continue

    try:
        # The tile file is read as line-delimited JSON; each record's
        # "geometry" is turned into a shapely shape.
        tile_df = pd.read_json(url, lines=True)
        tile_df["geometry"] = tile_df["geometry"].apply(geometry.shape)
        gdf = gpd.GeoDataFrame(tile_df, crs=4326)

        # Compute the tile's bounds and record them
        minx, miny, maxx, maxy = gdf.total_bounds
        bounds_dict[quad_key] = {
            "bounds": {
                "minx": float(minx),
                "miny": float(miny),
                "maxx": float(maxx),
                "maxy": float(maxy)
            },
            "file_path": json_fn
        }

        gdf.to_file(json_fn, driver="GeoJSON")

        save_bounds_index(quad_key, gdf, data_folder)
        success_count += 1
    except Exception as e:
        # Best effort: note the failure and carry on with the next tile.
        error_list.append({"quad_key": quad_key, "url": url, "error": str(e)})

# Calculate statistics
total_time = time.time() - start_time
total_files = len(quad_keys)
failed_count = len(error_list)

print("\n" + "="*50)
print("Download Summary:")
print(f"Total files processed: {total_files}")
print(f"Successfully downloaded: {success_count}")
print(f"Skipped (already cached): {skipped_count}")
print(f"Failed downloads: {failed_count}")
for err in error_list:
    print(f"  - {err['quad_key']}: {err['error']}")
print(f"Time elapsed: {total_time:.2f} seconds")
print("="*50)

100%|██████████| 1/1 [00:07<00:00,  7.15s/it]


Download Summary:
Total files processed: 1
Successfully downloaded: 1
Failed downloads: 0
Time elapsed: 7.15 seconds



