In [1]:
def extract_metadata(directory, simplify_tolerance=None, include_wkt=True, decimal_places=4):
    """
    Extract metadata from geospatial datasets in a directory and save it as CSV.

    Walks `directory` recursively and records one row per Shapefile (.shp),
    GeoJSON (.geojson) or GeoTIFF (.tif) file, plus one placeholder row for
    every folder whose path ends in '.gdb'. The rows are written to
    'geospatial_metadata.csv' inside `directory`. Files that fail to process
    are reported on stdout and skipped.

    Parameters:
    - directory (str): The directory containing the datasets.
    - simplify_tolerance (float, optional): Tolerance for simplifying WKT outlines.
      NOTE(review): currently not forwarded; vector outlines are generated with
      a fixed tolerance of 0.001. Kept as-is so existing output does not change.
    - include_wkt (bool, optional): Whether to include the WKT outline in the metadata.
    - decimal_places (int, optional): Number of decimal places to round WKT coordinates.
      NOTE(review): currently not forwarded; outlines are rounded to a fixed
      2 decimal places. Kept as-is so existing output does not change.

    Returns:
    - None
    """
    # List to hold metadata for each file
    metadata = []

    # Supported vector formats by GeoPandas
    vector_formats = {
        '.shp': 'Shapefile',
        '.geojson': 'GeoJSON'
    }

    # Walk through the directory and its subdirectories
    for root, _, files in os.walk(directory):
        # Folder size is identical for every file in this folder, so it is
        # computed at most once, and only if a supported file is found.
        folder_size = None

        for filename in files:
            file_ext = os.path.splitext(filename)[1].lower()
            is_vector = file_ext in vector_formats
            if not is_vector and file_ext != '.tif':
                continue

            filepath = os.path.join(root, filename)
            folder_path = os.path.dirname(filepath)
            folder_name = os.path.basename(folder_path)
            if folder_size is None:
                # Total size of the folder in MB; can switch to KB
                folder_size = get_folder_size(folder_path, unit='MB')

            if is_vector:
                try:
                    metadata.append(
                        _extract_vector_metadata(
                            filepath, filename, folder_name, folder_size,
                            vector_formats[file_ext], include_wkt,
                        )
                    )
                except Exception as e:
                    print(f"Could not read vector file {filename}: {e}")
            else:
                try:
                    metadata.append(
                        _extract_raster_metadata(
                            filepath, filename, folder_name, folder_size, include_wkt,
                        )
                    )
                except Exception as e:
                    print(f"Could not read raster file {filename}: {e}")

        # Additional check for geodatabases (folders with .gdb extension).
        # Their contents are not inspected; only a placeholder row is emitted.
        if root.endswith('.gdb'):
            try:
                gdb_parent_name = os.path.basename(os.path.dirname(root))
                gdb_size = get_folder_size(root, unit='MB')

                # Fill in default values for the geodatabase metadata
                file_metadata = {
                    column_mapping['filename']: os.path.basename(root),
                    column_mapping['folder_name']: gdb_parent_name,
                    column_mapping['crs']: '',
                    column_mapping['file_format']: '',
                    column_mapping['geometry_type']: '',
                    column_mapping['bounding_box']: '',
                    column_mapping['folder_size']: str(gdb_size) + " MB"
                }

                # Add an empty WKT outline column if included
                if include_wkt:
                    file_metadata[column_mapping['wkt_outline']] = ''

                metadata.append(file_metadata)

            except Exception as e:
                print(f"Unexpected error with geodatabase {root}: {e}")

    # Convert the metadata list to a DataFrame
    df = pd.DataFrame(metadata)

    # Save the DataFrame to a CSV file
    output_csv = os.path.join(directory, 'geospatial_metadata.csv')
    df.to_csv(output_csv, index=False)

    print(f'Metadata extraction complete. CSV saved to {output_csv}')


def _extract_vector_metadata(filepath, filename, folder_name, folder_size, file_format, include_wkt):
    """Build the metadata row (a dict keyed by column_mapping) for one vector file."""
    gdf = gpd.read_file(filepath)

    # Label the layer by its first non-null geometry type. A null first
    # geometry reports None, which would break the string handling below.
    geometry_type = 'Unknown'
    if not gdf.empty:
        geom_types = gdf.geom_type.dropna().unique()
        if len(geom_types) > 0:
            geometry_type = (
                geom_types[0].replace("LineString", "Line").replace("MultiPolygon", "Polygon")
                + " data"
            )

    # Defaults used when the value cannot be derived
    spatial_resolution = 'Unknown'
    wkt_outline = None

    if gdf.crs:
        original_crs = gdf.crs.to_string()
        # Convert the original CRS to a resolvable URI if possible
        crs_uri = format_crs_uri(original_crs)

        # DEGREES: bounding box and outline are reported in WGS84 (EPSG:4326)
        if original_crs != 'EPSG:4326':
            gdf = gdf.to_crs(epsg=4326)

        bounds = gdf.total_bounds if not gdf.empty else [None, None, None, None]
        rounded_bounds = [round(coord, 3) if coord is not None else None for coord in bounds]
        bbox = f"{rounded_bounds[0]},{rounded_bounds[1]},{rounded_bounds[2]},{rounded_bounds[3]}"

        if include_wkt:
            wkt_outline = generate_wkt_outline(gdf, simplify_tolerance=0.001, decimal_places=2)

        # METERS: switch to an equal-area projection for the area calculation
        gdf = gdf.to_crs(epsg=6933)
        total_area_km2 = gdf.geometry.area.sum() / 1e6

        # Spatial resolution is the mean of the per-geometry values returned by
        # calculate_avg_vertex_distance; a failure here must not drop the file.
        try:
            avg_vertex_distance = gdf.geometry.apply(calculate_avg_vertex_distance)
            if avg_vertex_distance.notna().any():
                spatial_resolution = round(avg_vertex_distance.dropna().mean(), 3)
        except Exception as e:
            print(f"Could not calculate spatial resolution for {filename}: {e}")
            spatial_resolution = 'Unknown'
    else:
        # Skip CRS-dependent calculations
        print(f"Skipping CRS-related processing for {filename}: No CRS found.")
        crs_uri = 'Unknown'
        bbox = 'Unknown'
        total_area_km2 = 'Unknown'

    # Key order is kept stable because it determines the CSV column order.
    file_metadata = {
        column_mapping['filename']: filename,
        column_mapping['folder_name']: folder_name,
        column_mapping['crs']: crs_uri,
        column_mapping['total_area_km2']: total_area_km2,
        column_mapping['file_format']: file_format,
        column_mapping['geometry_type']: geometry_type,
        column_mapping['bounding_box']: bbox,
        column_mapping['spatial_resolution']: spatial_resolution,
        column_mapping['folder_size']: str(folder_size) + " MB"
    }

    # Add the WKT outline to metadata only when one was produced
    if include_wkt and wkt_outline:
        file_metadata[column_mapping['wkt_outline']] = wkt_outline

    return file_metadata


def _extract_raster_metadata(filepath, filename, folder_name, folder_size, include_wkt):
    """Build the metadata row (a dict keyed by column_mapping) for one GeoTIFF file."""
    with rasterio.open(filepath) as src:
        # Defaults used when the value cannot be derived
        spatial_resolution = 'Unknown'
        wkt_outline = None

        if src.crs:
            # Convert the original CRS to a resolvable URI if possible
            crs_uri = format_crs_uri(src.crs.to_string())

            # Spatial resolution is the mean of the x and y pixel sizes
            pixel_size_x, pixel_size_y = src.res
            spatial_resolution = round((pixel_size_x + pixel_size_y) / 2, 3)

            # Total area = pixel area * pixel count.
            # NOTE(review): this is only km^2 if the CRS units are meters;
            # for a geographic CRS the value is in square degrees / 1e6.
            total_area_km2 = (pixel_size_x * pixel_size_y) * (src.width * src.height) / 1e6

            # The dataset bounds unpack as (left, bottom, right, top)
            left, bottom, right, top = src.bounds

            # Reproject the bounding box to WGS84 if needed
            if src.crs.to_string() != 'EPSG:4326':
                from rasterio.warp import transform_bounds
                left, bottom, right, top = transform_bounds(
                    src.crs, 'EPSG:4326', left, bottom, right, top
                )

            rounded_bounds = [round(coord, 3) for coord in [left, bottom, right, top]]
            bbox = f"{rounded_bounds[0]},{rounded_bounds[1]},{rounded_bounds[2]},{rounded_bounds[3]}"

            if include_wkt:
                wkt_outline = generate_raster_wkt([left, bottom, right, top], decimal_places=2)
        else:
            # Skip CRS-dependent calculations
            print(f"Skipping CRS-related processing for {filename}: No CRS found.")
            crs_uri = 'Unknown'
            bbox = 'Unknown'
            total_area_km2 = 'Unknown'

        # Key order is kept stable because it determines the CSV column order.
        file_metadata = {
            column_mapping['filename']: filename,
            column_mapping['folder_name']: folder_name,
            column_mapping['crs']: crs_uri,
            column_mapping['file_format']: 'GeoTIFF',
            column_mapping['geometry_type']: 'Raster data',
            column_mapping['bounding_box']: bbox,
            column_mapping['spatial_resolution']: spatial_resolution,
            column_mapping['total_area_km2']: total_area_km2,
            column_mapping['folder_size']: str(folder_size) + " MB"
        }

        # Add the WKT outline to metadata only when one was produced
        if include_wkt and wkt_outline:
            file_metadata[column_mapping['wkt_outline']] = wkt_outline

        return file_metadata
