# Import Libraries and Constants

In [1]:
import os
import re
import sys
import numpy as np
import pandas as pd
import tempfile
import shutil
import matplotlib.pyplot as plt
import rasterio

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (confusion_matrix, classification_report, accuracy_score, 
                             precision_score, recall_score, f1_score, roc_auc_score, 
                             precision_recall_curve, roc_curve, auc)
from sklearn.model_selection import (train_test_split, cross_val_score,
                                     GridSearchCV, RandomizedSearchCV)
from scipy.stats import randint as sp_randint

from imblearn.ensemble import BalancedRandomForestClassifier
import joblib
from joblib import dump


In [2]:
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=UserWarning)
simplefilter(action='ignore', category=FutureWarning)

In [3]:
# Absolute path of the directory the notebook is running in
current_dir = os.path.abspath('')

# Walk up the directory hierarchy until a folder containing 'constants.py' is found.
project_root = current_dir
while not os.path.isfile(os.path.join(project_root, 'constants.py')):
    parent_dir = os.path.dirname(project_root)
    # At the filesystem root dirname() returns its argument unchanged; without this
    # guard the loop would spin forever when 'constants.py' does not exist.
    if parent_dir == project_root:
        raise FileNotFoundError(
            f"Could not find 'constants.py' in '{current_dir}' or any of its parent directories."
        )
    project_root = parent_dir

# Add the project root to the Python path (only once, so re-running the cell is idempotent)
if project_root not in sys.path:
    sys.path.append(project_root)



In [4]:
from constants import SERVER_PATH, OUTPUT_PATH, SIMULATION_FEATURES_DIR


In [5]:
# Output folder for this run's prediction rasters -- update the folder name for subsequent runs
output_folder = os.path.join(OUTPUT_PATH[0], 'predictions-log')
# exist_ok=True creates the folder (and missing parents) only when needed,
# without a separate existence check that could race with another process.
os.makedirs(output_folder, exist_ok=True)



# Create Stack

In [6]:
def read_tiff_image(file_path):
    """Open the raster at ``file_path`` and return the data of its first band.

    Parameters
    ----------
    file_path : str
        Path to a raster file readable by rasterio.

    Returns
    -------
    The array produced by ``read(1)`` on the opened dataset (band 1 only).
    """
    with rasterio.open(file_path) as dataset:
        first_band = dataset.read(1)
    return first_band


In [7]:
# Raster file(s) used as model features. Exactly one scenario should be active;
# swap the active line with one of the commented alternatives to predict another scenario.
feature_files = [os.path.join(SIMULATION_FEATURES_DIR[0], 'sim25_raster.tif')]
#feature_files = [os.path.join(SIMULATION_FEATURES_DIR[0], 'sim50_raster.tif')]
#feature_files = [os.path.join(SIMULATION_FEATURES_DIR[0], 'simhedges_raster.tif')]

# Read every feature raster, keeping the 2-D arrays for later shape checks and reshaping
feature_data_arrays = list(map(read_tiff_image, feature_files))

# One flattened (1-D) copy per raster, ready to be stacked as feature columns
feature_data_flat = [band_array.flatten() for band_array in feature_data_arrays]




In [8]:
# Display the resolved feature raster path(s) as a sanity check
feature_files

['/Users/romero61/../../capstone/pyforest/ml_data/output/sim_lup_features/sim25_raster.tif']

In [9]:
# Borrow the georeferencing metadata (profile) from the first feature raster
with rasterio.open(feature_files[0]) as reference_raster:
    profile = reference_raster.profile

# The prediction raster is written as a single float32 band
profile.update(dtype=rasterio.float32, count=1)


In [10]:
# Inspect the profile (raster metadata) that will be used when writing the output
profile

{'driver': 'GTiff', 'dtype': 'float32', 'nodata': -1.0, 'width': 20381, 'height': 22512, 'count': 1, 'crs': CRS.from_epsg(4326), 'transform': Affine(0.00026949458523585647, 0.0, -62.64186038139295,
       0.0, -0.00026949458523585647, -19.287457970745013), 'tiled': False, 'interleave': 'band'}

In [11]:
# Collect the shape of every feature raster array
raster_shapes = [raster_data.shape for raster_data in feature_data_arrays]

# All rasters must share the same dimensions so their flattened pixels line up row-for-row
if len(set(raster_shapes)) > 1:
    print("There are mismatching dimensions:")
    # feature_files is the list the arrays were read from, in the same order
    # (the previous name, raster_files, is not defined anywhere and raised NameError here)
    for file_path, raster_shape in zip(feature_files, raster_shapes):
        print(f"File: {file_path}, Shape: {raster_shape}")
else:
    print("All raster data arrays have the same dimensions.")
    # Report the dimensions of each raster data array
    for i, data_array in enumerate(feature_data_arrays):
        print(f"Raster {i}: {data_array.shape}")


All raster data arrays have the same dimensions.
Raster 0: (22512, 20381)


# Stack and Flatten Data

In [12]:

# Sentinel marking pixels without data in the feature rasters
no_data_value = -1

# Build the feature matrix: one row per pixel, one column per flattened raster
X_flat = np.column_stack(feature_data_flat)

# A row is valid only when none of its feature values equals the NoData sentinel
valid_rows_X = (X_flat != no_data_value).all(axis=1)

# Keep only the valid rows; the mask is reused later to put predictions back in place
X_cleaned = X_flat[valid_rows_X, :]
 

To ensure your data cleaning steps have been applied correctly, you can check the following:

**NoData values have been removed:** Confirm that no NoData values remain in the cleaned data by asserting that `no_data_value` does not occur anywhere in `X_cleaned`.

In [13]:
# Fails if any NoData sentinel survived the cleaning step
assert not np.any(X_cleaned == no_data_value)


This assertion raises an `AssertionError` if any NoData value remains in `X_cleaned`.


In [14]:
# Report how many valid pixels (rows) and features (columns) remain
print(f"Shape of X_cleaned: {X_cleaned.shape}")


Shape of X_cleaned: (109775603, 1)


# Import Trained Model

In [15]:

# Location of the previously trained model on disk.
# NOTE(review): this is an absolute, machine-specific path; consider deriving it from
# the project constants so the notebook runs on other machines.
model_path = '/Users/romero61/github/PYFOREST-ML/05-outputs/BRFC-features-log/best_model.pkl'

# joblib.load unpickles the file, which can execute arbitrary code -- only load trusted files.
model = joblib.load(model_path)



# Predict Deforestation Probabilities on Simulated Land Use Data

When you use the predict_proba method of a classifier, it returns a 2D array where each row corresponds to a data point (in your case, a pixel), and each column corresponds to a class. The value in each cell is the probability that the given data point belongs to the given class, according to the model.

In a binary classification problem, there are two classes: 0 and 1. Therefore, predict_proba returns a 2D array with two columns. The first column (index 0) contains the probabilities for class 0, and the second column (index 1) contains the probabilities for class 1.

So, when you do probabilities[:, 1], you are selecting all rows (:) and the second column (1). This gives you a 1D array containing the probabilities that each data point belongs to class 1.

In the context of the problem, class 1 might represent "deforested" areas. So class_1_probabilities would be an array where each value is the model's estimated probability that the corresponding pixel represents a deforested area.

In [None]:
# Predict class probabilities for every valid pixel and keep only column index 1,
# i.e. the probability of the second class (deforestation, per the notes above).
# NOTE(review): assumes the model's classes are ordered so that index 1 is the
# deforestation class -- confirm against model.classes_.
probabilities = model.predict_proba(X_cleaned)[:, 1]


In [None]:
# List the distinct probability values the model produced
np.unique(probabilities)

array([0.30507248, 0.38903786, 0.49881947, 0.65758311])

In [None]:
# Number of pixels in the raster: X_flat holds one row per pixel and one column per feature.
# Using the row count (rather than the length of X_flat.flatten(), which is
# n_pixels * n_features) keeps this correct when more than one feature raster is used,
# and avoids copying the whole feature matrix.
n_pixels = X_flat.shape[0]

# Start from a flat array filled with the NoData value
probabilities_flat = np.full(n_pixels, no_data_value, dtype=np.float32)

# Put each predicted probability back at the position of the valid pixel it came from
probabilities_flat[valid_rows_X] = probabilities

# Reshape the flat probabilities back into the shape of the original raster
probabilities_reshaped = probabilities_flat.reshape(feature_data_arrays[0].shape)




In [38]:
# Destination GeoTIFF for the predicted probabilities of this scenario
output_file = os.path.join(output_folder, "sim-25-predicition.tiff")

# Write the probability grid as band 1, using the metadata in `profile`
with rasterio.open(output_file, mode='w', **profile) as prediction_raster:
    prediction_raster.write(probabilities_reshaped, indexes=1)