# Extract technical metadata

This recipe will extract technical metadata from a directory of datasets and export it to a CSV file. It requires the python libraries `pandas`, `geopandas`, and `rasterio`. Part 1 writes the filenames, coordinate reference system, file format, resource type, and (optionally) the WKT polygon outline. Part 2 creates CSV files of the attribute table field names and types.

Created 2024-10-18 by Karen Majewicz

In [2]:
import os
import geopandas as gpd
import pandas as pd
import rasterio
from shapely.geometry import Polygon
from shapely.ops import transform
from shapely.geometry import box



## Part 1: Extract metadata to a CSV

In [3]:
# Define a mapping from variable names to desired column headers
column_mapping = {
    'folder_name': 'Folder Name',
    'filename': 'File Name',
    'crs': 'Conforms To',
    'file_format': 'Format',
    'geometry_type': 'Resource Type',
    'bounding_box': 'Bounding Box',
    'wkt_outline': 'Geometry',
    'folder_size': 'File Size'
}

# Define the root directory for the geospatial data
root_directory = 'data'

# Define the output directory for the attribute table CSV files
output_directory = 'codebooks'


In [4]:
# function to add up the files in each dataset folder

def get_folder_size(folder_path, unit='MB', decimal_places=3):
    """
    Calculate the total size of all files in a folder and return it in the specified unit.

    Parameters:
    - folder_path (str): Path to the folder.
    - unit (str): The unit for the size ('bytes', 'KB', 'MB'). Default is 'MB'.
    - decimal_places (int): The number of decimal places to round the size to. Default is 3.

    Returns:
    - float: Total size of the folder contents in the specified unit, rounded to the specified number of decimal places.
    """
    total_size = 0
    for dirpath, _, filenames in os.walk(folder_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            # Add to the total size only if it is a file (not a broken link, etc.)
            if os.path.isfile(fp):
                total_size += os.path.getsize(fp)

    # Convert the total size to the specified unit
    if unit == 'KB':
        total_size /= 1024  # Convert bytes to kilobytes
    elif unit == 'MB':
        total_size /= (1024 * 1024)  # Convert bytes to megabytes

    # Round the total size to the specified number of decimal places
    rounded_size = round(total_size, decimal_places)
    return rounded_size

In [5]:
# function to reformat the CRS into a resolvable URI

def format_crs_uri(crs_string):
    # If the CRS is in the "EPSG:xxxx" format, convert it to a resolvable URI
    if crs_string and crs_string.startswith("EPSG:"):
        epsg_code = crs_string.split(":")[1]
        return f"https://epsg.io/{epsg_code}"
    else:
        # Return the original CRS string if it's not an EPSG code
        return crs_string

In [14]:
# functions to create Geometry values (WKT polygon outlines)

def round_coordinates(geometry, decimal_places=2):
    """
    Round the coordinates of a geometry to the specified number of decimal places.

    Parameters:
    - geometry (Geometry): The input Shapely geometry.
    - decimal_places (int): Number of decimal places to round to.

    Returns:
    - Geometry: The geometry with rounded coordinates.
    """
    if geometry.is_empty:
        return geometry

    # Function to round coordinates
    def rounder(x, y, z=None):
        if z is None:
            return (round(x, decimal_places), round(y, decimal_places))
        else:
            return (round(x, decimal_places), round(y, decimal_places), round(z, decimal_places))

    # Apply the rounding function using transform
    return transform(rounder, geometry)

def generate_wkt_outline(gdf, simplify_tolerance=None, decimal_places=2): #default values of none and 2
    """
    Generate a WKT representation of a generalized outline for the dataset.

    Parameters:
    - gdf (GeoDataFrame): The GeoDataFrame containing the geometries.
    - simplify_tolerance (float, optional): The tolerance for the simplify() method to reduce detail.
    - decimal_places (int, optional): The number of decimal places to round the coordinates.

    Returns:
    - str: The WKT representation of the generalized outline.
    """
    # Create a unified geometry from all geometries in the GeoDataFrame
    unified_geom = gdf.geometry.union_all()

    # Use the convex hull to create a generalized outline
    if not unified_geom.is_empty:
        generalized_outline = unified_geom.convex_hull
    else:
        return 'None'

    # Optionally simplify the outline for further generalization
    if simplify_tolerance is not None:
        generalized_outline = generalized_outline.simplify(simplify_tolerance)

    # Round the coordinates of the outline
    generalized_outline = round_coordinates(generalized_outline, decimal_places)

    # Convert the resulting geometry to WKT using shapely's wkt module
    if isinstance(generalized_outline, Polygon):
        wkt_outline = generalized_outline.wkt
    else:
        return 'None'

    return wkt_outline


def generate_raster_wkt(bbox):
    """
    Generate a WKT representation of a raster bounding box.

    Parameters:
    - bbox (list): The bounding box as [left, bottom, right, top].

    Returns:
    - str: The WKT representation of the bounding box.
    """
    left, bottom, right, top = bbox
    # Create a rectangular polygon from the bounding box
    rectangle = box(left, bottom, right, top)
    # Return the WKT representation of the rectangle
    return rectangle.wkt

In [19]:
# main function to extract a variety of technical metadata values from the datasets

def extract_metadata(directory, simplify_tolerance=None, include_wkt=True, decimal_places=4):
    """
    Extract metadata from geospatial datasets in a directory.

    Parameters:
    - directory (str): The directory containing the datasets.
    - simplify_tolerance (float, optional): Tolerance for simplifying WKT outlines.
    - include_wkt (bool, optional): Whether to include the WKT outline in the metadata.
    - decimal_places (int, optional): Number of decimal places to round WKT coordinates.

    Returns:
    - None
    """
    # List to hold metadata for each file
    metadata = []

    # Supported vector formats by GeoPandas
    vector_formats = {
        '.shp': 'Shapefile',
        '.geojson': 'GeoJSON'
    }

    # Walk through the directory and its subdirectories
    for root, _, files in os.walk(directory):
        for filename in files:
            # Get the file extension
            file_ext = os.path.splitext(filename)[1].lower()

            # Construct the full file path
            filepath = os.path.join(root, filename)

            # Get the name of the enclosing folder
            folder_name = os.path.basename(os.path.dirname(filepath))
            
            # Calculate the total size of the folder in MB; can switch to KB
            folder_size = get_folder_size(os.path.dirname(filepath), unit='MB')

            # Check if the file is a recognized vector format
            if file_ext in vector_formats:
                try:
                    # Read the vector file with GeoPandas
                    gdf = gpd.read_file(filepath)

                    # Get the original CRS
                    original_crs = gdf.crs.to_string() if gdf.crs else 'Unknown'
                    # Convert the original CRS to a resolvable URI if possible
                    crs_uri = format_crs_uri(original_crs)

                    # Reproject to WGS84 (EPSG:4326) if needed for bounding box calculation
                    if gdf.crs and gdf.crs.to_string() != 'EPSG:4326':
                        gdf = gdf.to_crs(epsg=4326)

                    # Calculate and round bounding box
                    bounds = gdf.total_bounds if not gdf.empty else [None, None, None, None]
                    rounded_bounds = [round(coord, 3) if coord is not None else None for coord in bounds]
                    bbox = f"{rounded_bounds[0]},{rounded_bounds[1]},{rounded_bounds[2]},{rounded_bounds[3]}"

                    # Generate WKT outline if include_wkt is True
                    wkt_outline = None
                    if include_wkt:
#                         wkt_outline = generate_wkt_outline(gdf, simplify_tolerance, decimal_places)
                        wkt_outline = generate_wkt_outline(gdf, simplify_tolerance=0.001, decimal_places=2)

                    # Process geometry type
                    geometry_type = gdf.geom_type.unique()[0] if not gdf.empty else 'Unknown'
                    if geometry_type != 'Unknown':
                        geometry_type = geometry_type.replace("LineString", "Line").replace("MultiPolygon", "Polygon") + " data"

                    # Extract metadata
                    file_metadata = {
                        column_mapping['filename']: filename,
                        column_mapping['folder_name']: folder_name,
                        column_mapping['crs']: crs_uri,
                        column_mapping['file_format']: vector_formats[file_ext],
                        column_mapping['geometry_type']: geometry_type,
                        column_mapping['bounding_box']: bbox,
                        column_mapping['folder_size']: str(folder_size) + " MB"
                    }

                    # Add the WKT outline to metadata if included
                    if include_wkt:
                        file_metadata[column_mapping['wkt_outline']] = wkt_outline

                    # Add the metadata to the list
                    metadata.append(file_metadata)

                except Exception as e:
                    print(f"Could not read vector file {filename}: {e}")

            
            # Check if the file is a raster format (e.g., .tif)
            elif file_ext == '.tif':
                try:
                    # Read the raster file with Rasterio
                    with rasterio.open(filepath) as src:
                        # Get the original CRS
                        original_crs = src.crs.to_string() if src.crs else 'Unknown'
                        crs_uri = format_crs_uri(original_crs)

                        # Get the bounding box, handling both object and tuple cases
                        bounds = src.bounds
                        if isinstance(bounds, tuple):
                            left, bottom, right, top = bounds
                        else:
                            left, bottom, right, top = bounds.left, bounds.bottom, bounds.right, bounds.top

                        # Reproject the bounding box to WGS84 if needed
                        if src.crs and src.crs.to_string() != 'EPSG:4326':
                            from rasterio.warp import transform_bounds
                            left, bottom, right, top = transform_bounds(src.crs, 'EPSG:4326', left, bottom, right, top)

                        # Round bounding box coordinates
                        rounded_bounds = [round(coord, 3) for coord in [left, bottom, right, top]]
                        bbox = f"{rounded_bounds[0]},{rounded_bounds[1]},{rounded_bounds[2]},{rounded_bounds[3]}"

                        # Generate WKT outline if include_wkt is True
                        wkt_outline = None
                        if include_wkt:
                            wkt_outline = generate_raster_wkt([left, bottom, right, top])                
                        
                        # Extract metadata
                        file_metadata = {
                            column_mapping['filename']: filename,
                            column_mapping['folder_name']: folder_name,
                            column_mapping['crs']: crs_uri,
                            column_mapping['file_format']: 'GeoTIFF',
                            column_mapping['geometry_type']: 'Raster data',
                            column_mapping['bounding_box']: bbox,
                            column_mapping['folder_size']: str(folder_size) + " MB"
                        }
                        
                        # Add the WKT outline to metadata if included
                        if include_wkt:
                            file_metadata[column_mapping['wkt_outline']] = wkt_outline

                        # Add the metadata to the list
                        metadata.append(file_metadata)

                except Exception as e:
                    print(f"Could not read raster file {filename}: {e}")
            

        # Additional check for geodatabases (folders with .gdb extension)
        if root.endswith('.gdb'):
            try:
                folder_name = os.path.basename(os.path.dirname(root))
                folder_size = get_folder_size(root, unit='MB')

                # Try listing layers in the geodatabase
                try:
                    layers = gpd.io.file.fiona.listlayers(root)
                    for layer in layers:
                        # Read each layer
                        gdf = gpd.read_file(root, layer=layer)

                        # Get the original CRS
                        original_crs = gdf.crs.to_string() if gdf.crs else 'Unknown'
                        crs_uri = format_crs_uri(original_crs)

                        # Reproject to WGS84 (EPSG:4326) if needed
                        if gdf.crs and gdf.crs.to_string() != 'EPSG:4326':
                            gdf = gdf.to_crs(epsg=4326)

                        # Calculate and round bounding box
                        bounds = gdf.total_bounds if not gdf.empty else [None, None, None, None]
                        rounded_bounds = [round(coord, 3) if coord is not None else None for coord in bounds]
                        bbox = f"{rounded_bounds[0]},{rounded_bounds[1]},{rounded_bounds[2]},{rounded_bounds[3]}"

                        # Generate WKT outline if include_wkt is True
                        wkt_outline = None
                        if include_wkt:
                            wkt_outline = generate_wkt_outline(gdf, simplify_tolerance, decimal_places)

                        # Process geometry type
                        geometry_type = gdf.geom_type.unique()[0] if not gdf.empty else 'Unknown'
                        if geometry_type != 'Unknown':
                            geometry_type = geometry_type.replace("LineString", "Line").replace("MultiPolygon", "Polygon") + " data"

                        # Extract metadata
                        file_metadata = {
                            column_mapping['filename']: f"{os.path.basename(root)} - {layer}",
                            column_mapping['folder_name']: folder_name,
                            column_mapping['crs']: crs_uri,
                            column_mapping['file_format']: 'Geodatabase',
                            column_mapping['geometry_type']: geometry_type,
                            column_mapping['bounding_box']: bbox,
                            column_mapping['folder_size']: str(folder_size) + " MB"
                        }

                        # Add the WKT outline to metadata if included
                        if include_wkt:
                            file_metadata[column_mapping['wkt_outline']] = wkt_outline

                        # Add the metadata to the list
                        metadata.append(file_metadata)

                except Exception as e:
                    print(f"Could not read geodatabase {root}: {e}")
                    # Fill in default values for the geodatabase metadata
                    file_metadata = {
                        column_mapping['filename']: os.path.basename(root),
                        column_mapping['folder_name']: folder_name,
                        column_mapping['crs']: '',
                        column_mapping['file_format']: '',
                        column_mapping['geometry_type']: '',
                        column_mapping['bounding_box']: '',
                        column_mapping['folder_size']: str(folder_size) + " MB"
                    }

                    # Add the WKT outline column as 'None' if included
                    if include_wkt:
                        file_metadata[column_mapping['wkt_outline']] = ''

                    # Add the default metadata to the list
                    metadata.append(file_metadata)

            except Exception as e:
                print(f"Unexpected error with geodatabase {root}: {e}")


    # Convert the metadata list to a DataFrame
    df = pd.DataFrame(metadata)

    # Save the DataFrame to a CSV file
    output_csv = os.path.join(directory, 'geospatial_metadata.csv')
    df.to_csv(output_csv, index=False)

    print(f'Metadata extraction complete. CSV saved to {output_csv}')


### Executing the code for Part 1

Run option 1 to process the metadata without extracting the WKT polygon outline
    Run option 2 to obtain the outline. Review the simplify_tolerance value and update if needed. `.01` can be used for decimal degrees.

In [None]:
# Option 1 Exclude WKT outline from the metadata
extract_metadata(root_directory, include_wkt=False)

In [20]:
# Option 2: Include WKT outline in the metadata
extract_metadata(root_directory, simplify_tolerance=.001, include_wkt=True)

  return ogr_read(


Could not read geodatabase data/Building_Footprints_Microsoft/Building_Footprints_Microsoft_IN.gdb: 'NoneType' object has no attribute 'listlayers'
Could not read geodatabase data/Boundaries_Miscellaneous_IGIO/County_Government_Boundaries_IGIO_IN_Apr2018.gdb: 'NoneType' object has no attribute 'listlayers'
Metadata extraction complete. CSV saved to data/geospatial_metadata.csv


## Part 2: Attribute Tables

This function will read the attribute table fields and write them to a CSV in a defined directory.

In [None]:
def extract_attribute_table_info(root_directory, output_dir):
    # Supported vector formats by GeoPandas
    vector_formats = {
        '.shp': 'Shapefile',
        '.geojson': 'GeoJSON'
    }

    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Walk through the directory and its subdirectories
    for root, _, files in os.walk(root_directory):
        for filename in files:
            # Get the file extension
            file_ext = os.path.splitext(filename)[1].lower()

            # Construct the full file path
            filepath = os.path.join(root, filename)

            # Check if the file is a recognized vector format
            if file_ext in vector_formats:
                try:
                    # Read the vector file with GeoPandas
                    gdf = gpd.read_file(filepath)

                    # Extract field information
                    field_info = []
                    for column in gdf.columns:
                        field_metadata = {
                            'Field Name': column,
                            'Data Type': str(gdf[column].dtype),
                            'Unique Values': gdf[column].nunique(),
                            'Null Values': gdf[column].isnull().sum(),
                            'Definition' : '',
                            'Definition Source' : ''
                        }
                        field_info.append(field_metadata)

                    # Convert the field information to a DataFrame
                    field_df = pd.DataFrame(field_info)

                    # Create the output CSV filename
                    output_csv = os.path.join(output_dir, f"{os.path.splitext(filename)[0]}_fields.csv")

                    # Save the DataFrame to a CSV file
                    field_df.to_csv(output_csv, index=False)

#                     print(f"Field information extracted for {filename}. CSV saved to {output_csv}")

                except Exception as e:
                    print(f"Could not read {filename}: {e}")

In [None]:
# Extract attribute table information
extract_attribute_table_info(root_directory, output_directory)