# Import Libraries

In [7]:
import os
import sys
import math
import rasterio
from rasterio.features import rasterize
from rasterio.transform import from_origin
import fiona
import pandas as pd
import geopandas as gpd
from rasterio import features
import numpy as np
import matplotlib.pyplot as plt
from shapely.geometry import Polygon, MultiPolygon

# Import Constants

In [5]:
# Get the current working directory (the notebook's folder when run from Jupyter)
current_dir = os.path.abspath('')

# Search for the 'constants.py' file starting from the current directory and moving up the hierarchy
project_root = current_dir
while not os.path.isfile(os.path.join(project_root, 'constants.py')):
    parent_dir = os.path.dirname(project_root)
    if parent_dir == project_root:
        # dirname() of the filesystem root returns the root itself; without this
        # guard the loop would spin forever when constants.py does not exist.
        raise FileNotFoundError(
            f"Could not find 'constants.py' in '{current_dir}' or any of its parent directories"
        )
    project_root = parent_dir

# Add the project root to the Python path (once, so re-running the cell does not add duplicates)
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
from constants import STUDY_BOUNDARY_PATH, OUTPUT_PATH, LUP_YEAR,ROAD_PATH , RIVER_PATH

In [3]:
# Display the imported constant to confirm what constants.py provides
ROAD_PATH

['/Users/romero61/../../capstone/pyforest/ml_data/features/dissolved_road/dissolved_road.gpkg']

In [8]:
# Load the study-area boundary layer; it is reprojected and used to derive the raster bounds in a later cell
study_boundary = gpd.read_file(STUDY_BOUNDARY_PATH)


Define the desired CRS, resolution, and extent for the output rasters so that they match the tree cover and deforestation rasters.
Then reproject `study_boundary` to the output CRS and take its total bounds as the study-area bounds:

In [9]:
# CRS that the study boundary is reprojected to below
output_crs = 'EPSG:4326'
# Reference extent as a 4-tuple of coordinates.
# NOTE(review): presumably (minx, miny, maxx, maxy) of the reference rasters; it is not
# referenced by any other cell visible here - confirm whether it is still needed.
output_extent = (-62.64186038139295, -25.354320073574613, -57.14929123970096, -19.287457970745013)
# Pixel size as (x, y); the y component is negative here, and vector_to_raster takes abs() of both
output_resolution = (0.00026949458523585647, -0.00026949458523585647)
# Reproject the boundary to the output CRS before measuring its bounds
study_boundary = study_boundary.to_crs(output_crs)
# Bounding box of the reprojected boundary; passed to the rasterization functions as study_area_bounds
study_area_bounds = study_boundary.total_bounds


# Vector to Raster 
This function reads an input vector file (GeoPackage or Shapefile) or an in-memory GeoDataFrame, converts the attribute column to numerical values (or sets it to a single value if one is provided), and burns the features into a single-band GeoTIFF covering the study area.

input_vector: The path to the input vector file (GeoPackage or Shapefile).

output_raster: The path to the output raster file.

attribute: The name of the attribute column in the input vector file that should be used as the pixel value in the output raster.

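study_area_bounds: The bounds of the study area as a tuple (minx, miny, maxx, maxy).

value_mapping: A dictionary that maps each attribute value (as a string) to the integer code written to the raster. It is only used when single_value is not provided.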

single_value (optional, default: None): If provided, all the features in the input vector will be encoded with this single value in the output raster. If not provided, the attribute column values will be used.

resolution (optional): The pixel size (x, y) used to compute the width and height of the output raster. The default is chosen so that the output of vector_to_raster matches the resolution of the Hansen dataset.

dtype (optional, default: 'uint16'): The data type of the output raster pixel values.

Optional preprocessing step (currently disabled): replace NaN/null values and empty strings with 'Uncategorized' before rasterizing:
    #gdf[attribute] = gdf[attribute].apply(lambda x: "Uncategorized" if pd.isna(x) or x in ["", "NA", "None",'NULL','null','na'] else x)


In [10]:
def vector_to_raster(input_vector, output_raster, attribute, study_area_bounds, value_mapping, single_value=None, resolution=(abs(0.00026949458523585647), abs(-0.00026949458523585647)), dtype='uint16'):
    """Burn a vector layer into a single-band GeoTIFF covering the study area.

    Parameters
    ----------
    input_vector : str or geopandas.GeoDataFrame
        Path to a vector file readable by ``gpd.read_file`` or an in-memory
        GeoDataFrame. The caller's GeoDataFrame is not modified.
    output_raster : str
        Path of the GeoTIFF to write.
    attribute : str
        Column whose values (cast to ``str``) are looked up in ``value_mapping``.
        Only read when ``single_value`` is None.
    study_area_bounds : sequence of 4 floats
        ``(minx, miny, maxx, maxy)`` of the output raster in EPSG:4326.
    value_mapping : dict
        Maps each attribute label (as a string) to the numeric code to burn.
        Ignored when ``single_value`` is given.
    single_value : number, optional
        If given, every feature is burned with this value.
    resolution : tuple of 2 floats, optional
        Pixel size ``(x, y)``; the sign of each component is ignored.
    dtype : str, optional
        Data type of the output raster.

    Raises
    ------
    ValueError
        If ``single_value`` is None and ``value_mapping`` is missing, or if the
        attribute column contains labels that are not keys of ``value_mapping``.

    Notes
    -----
    Pixels not covered by any feature are written as 0. Width and height are
    rounded up and the transform is fitted to the bounds, so the pixel size
    stored in the file is ``(maxx - minx) / width`` and ``(maxy - miny) / height``,
    which can be marginally smaller than ``resolution``.
    """
    # Check if input_vector is a GeoDataFrame or a file path
    if isinstance(input_vector, gpd.GeoDataFrame):
        gdf = input_vector
    else:
        # Read the input vector file (GeoPackage or Shapefile) into a GeoDataFrame
        gdf = gpd.read_file(input_vector)
    # Reproject the GeoDataFrame to the desired CRS (EPSG:4326)
    gdf = gdf.to_crs(epsg=4326)

    if single_value is None:
        if value_mapping is None:
            raise ValueError("value_mapping is required when single_value is not provided")

        # Labels are compared as strings, matching how the mapping keys are built
        labels = gdf[attribute].astype(str)
        codes = labels.map(value_mapping)

        # Fail loudly on labels without a code. A plain replace()+astype() would
        # silently turn a numeric-looking label such as '5' into the code 5.
        unmapped_mask = codes.isna()
        if unmapped_mask.any():
            unmapped = sorted(set(labels[unmapped_mask]))
            raise ValueError(
                f"Values in column '{attribute}' are missing from value_mapping: {unmapped}"
            )
        burn_values = codes.astype(dtype)
    else:
        # Every feature gets the same value
        burn_values = [single_value] * len(gdf)

    # Use the study area bounds to define the dimensions and transform of the output raster
    minx, miny, maxx, maxy = study_area_bounds
    width = int(np.ceil((maxx - minx) / abs(resolution[0])))
    height = int(np.ceil((maxy - miny) / abs(resolution[1])))
    out_transform = rasterio.transform.from_bounds(minx, miny, maxx, maxy, width, height)

    # Define the metadata for the output raster file
    out_meta = {
        'driver': 'GTiff',
        'width': width,
        'height': height,
        'count': 1,
        'dtype': dtype,
        'crs': 'EPSG:4326',
        'transform': out_transform
    }

    # Pair each geometry with the value to burn. gdf.geometry is the active
    # geometry column, whatever its name.
    shapes = ((geom, value) for geom, value in zip(gdf.geometry, burn_values))

    # Rasterize before opening the output file so that a failure here does not
    # leave a half-written GeoTIFF behind.
    burned = features.rasterize(
        shapes=shapes,             # The generator of geometry-value tuples
        fill=0,                    # The default value for pixels not covered by any geometry
        out_shape=(height, width), # The shape of the output raster array (rows, columns)
        transform=out_transform,   # Maps pixel coordinates to the coordinate reference system
        dtype=dtype                # The data type of the raster array
    )

    # Write the burned raster array to band 1 of the output raster file
    with rasterio.open(output_raster, 'w', **out_meta) as dst:
        dst.write(burned, 1)

# Functions to call Vector to Raster

In [11]:
# To write text file of value mapping
def write_value_mapping(value_mapping, output_file):
    """Write a value mapping to a text file, one ``key: value`` pair per line.

    Parameters
    ----------
    value_mapping : dict
        Mapping to write; pairs are written in the dictionary's iteration order.
    output_file : str
        Path of the text file to create (an existing file is overwritten).
    """
    # Explicit UTF-8 so labels with non-ASCII characters are written the same
    # way on every platform instead of depending on the locale's default encoding.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(f'{key}: {value}\n' for key, value in value_mapping.items())


In [12]:
def process_columns(input_vector, output_dir, study_area_bounds, resolution, columns, value_mapping, single_value=None, file_name=None):
    """Rasterize one vector layer once per attribute column.

    For every column a sub-folder ``<output_dir>/<column>`` is created and the
    raster is written to ``<file_name>_<column>_raster.tif`` inside it.

    Parameters
    ----------
    input_vector : str or geopandas.GeoDataFrame
        Path to the vector file, or an already loaded GeoDataFrame.
    output_dir : str
        Root folder that receives one sub-folder per column.
    study_area_bounds : sequence of 4 floats
        ``(minx, miny, maxx, maxy)`` passed through to ``vector_to_raster``.
    resolution : tuple of 2 floats
        Pixel size passed through to ``vector_to_raster``.
    columns : iterable of str
        Attribute columns to rasterize.
    value_mapping : dict
        ``{column: {label: code}}``. May be None when ``single_value`` is given.
    single_value : number, optional
        If given, every feature is burned with this value.
    file_name : str, optional
        Base name for the output files. Defaults to the input file's name
        without extension; required when ``input_vector`` is a GeoDataFrame.

    Raises
    ------
    TypeError
        If ``file_name`` is omitted and ``input_vector`` is not a file path.
    """
    if file_name is None:
        if not isinstance(input_vector, (str, os.PathLike)):
            # A GeoDataFrame carries no file name to derive the output name from
            raise TypeError("file_name must be provided when input_vector is not a file path")
        file_name = os.path.splitext(os.path.basename(input_vector))[0]

    # Load the layer once and reuse it for every column instead of re-reading
    # the file on each vector_to_raster call.
    if isinstance(input_vector, gpd.GeoDataFrame):
        layer = input_vector
    else:
        layer = gpd.read_file(input_vector)

    for column in columns:
        column_output_dir = os.path.join(output_dir, column)
        os.makedirs(column_output_dir, exist_ok=True)

        # The per-column mapping is only needed when values come from the attribute column
        if single_value is None:
            column_mapping = value_mapping[column]
        else:
            column_mapping = (value_mapping or {}).get(column)

        output_raster = os.path.join(column_output_dir, f"{file_name}_{column}_raster.tif")
        vector_to_raster(layer, output_raster, column, study_area_bounds, column_mapping, single_value=single_value, resolution=resolution)
        


# Set Output Directory

In [14]:
# Root folder for all rasters written by this notebook: the first entry of OUTPUT_PATH
# plus 'processed_rasters'. Further sub-folders can be appended here if desired.
output_dir = os.path.join(OUTPUT_PATH[0], 'processed_rasters')
# Create the folder if it does not exist yet (no error when it already does)
os.makedirs(output_dir, exist_ok=True)


# Using the function on a folder of shapefiles

The following uses the list of file paths defined in the constants.py file.

In [15]:
def process_files_from_list(file_list, output_dir, study_area_bounds, resolution, columns, single_value=None):
    """Rasterize every file in ``file_list`` using one shared label-to-code mapping per column.

    The function runs in four passes:

    1. read every file and collect the distinct values (as strings) of each column;
    2. number the sorted values of each column starting at 1 and print the mapping;
    3. write each column's mapping to ``<output_dir>/<column>/<column>_global_value_mapping.txt``;
    4. call ``process_columns`` for every file with the shared mapping.
    """
    # Pass 1: gather the distinct labels of each column across all files
    global_value_mapping = {}
    for name in columns:
        global_value_mapping[name] = set()

    for path in file_list:
        layer = gpd.read_file(path)
        for name in columns:
            global_value_mapping[name] |= set(layer[name].astype(str).unique())

    # Pass 2: replace each label set by a {label: code} dictionary and report it
    for name in columns:
        ordered_labels = sorted(global_value_mapping[name])
        codes = dict(zip(ordered_labels, range(1, len(ordered_labels) + 1)))
        global_value_mapping[name] = codes
        print(f"Global value mapping for {name}:")
        for label, code in codes.items():
            print(f"{label}: {code}")

    # Pass 3: store each column's mapping as a text file inside that column's folder
    for name in columns:
        column_output_dir = os.path.join(output_dir, name)
        os.makedirs(column_output_dir, exist_ok=True)
        write_value_mapping(
            global_value_mapping[name],
            f"{column_output_dir}/{name}_global_value_mapping.txt",
        )

    # Pass 4: rasterize every file with the shared mapping
    for path in file_list:
        stem, _extension = os.path.splitext(os.path.basename(path))
        process_columns(
            path,
            output_dir,
            study_area_bounds,
            columns=columns,
            resolution=resolution,
            value_mapping=global_value_mapping,
            single_value=single_value,
            file_name=stem,
        )


In [13]:
# Rasterize the 'land_use_type' column of every file listed in LUP_YEAR onto the study-area grid
process_files_from_list(LUP_YEAR, output_dir, study_area_bounds, output_resolution, columns=['land_use_type'])

Global value mapping for land_use_type:
final_hedgerow: 1
forest_reserve: 2
paddocks: 3
riparian_corridor: 4
