In [None]:
import pandas as pd
import numpy as np
import os
import copy
from pandas.errors import EmptyDataError
from multiprocessing import Pool
import pickle

In [None]:
def read_csv(csv_list):
  """Read every ``.csv`` path in ``csv_list`` into its own DataFrame.

  Paths that do not end in ``.csv`` are ignored. A file that pandas reports
  as empty (``EmptyDataError``) is skipped on its own, so one empty file no
  longer throws away the DataFrames already read from the same batch.

  Args:
    csv_list: iterable of file paths (strings).

  Returns:
    list of DataFrames, one per successfully read CSV, in input order.
    The list is empty when nothing could be read.
  """
  dataframes = []  # DataFrames read so far for this batch
  for file_ in csv_list:
    if not file_.endswith('.csv'):  # Skip non-CSV files
      continue
    try:
      dataframes.append(pd.read_csv(file_))
    except EmptyDataError:
      # An empty file contributes no rows; skip only this file instead of
      # discarding the whole batch.
      continue
  print("Finished doing batch")  # Progress message, one per batch
  return dataframes

def merge_csv(all_csv, processes = 4):
  """Read many CSV files in parallel and merge them into one DataFrame.

  The file list is cut into contiguous batches of
  ``max(1, len(all_csv) // processes)`` files; each batch is read by
  ``read_csv`` in a worker process and the per-batch results are combined
  by ``merge_dataframe``.

  Args:
    all_csv: list of CSV file paths.
    processes: upper bound on the number of worker processes (>= 1).

  Returns:
    The merged DataFrame, or ``None`` when ``all_csv`` is empty or no file
    produced any data.

  Raises:
    ValueError: if ``processes`` is smaller than 1.
  """
  if processes < 1:
    raise ValueError("processes must be at least 1")
  if not all_csv:
    # Nothing to read: avoid starting a pool for no work.
    return None

  # Floor division yields 0 when there are fewer files than processes, and
  # range() rejects a step of 0, so clamp the batch size to at least 1.
  batch_size = max(1, len(all_csv) // processes)
  all_batches = [all_csv[i:i + batch_size] for i in range(0, len(all_csv), batch_size)]

  # Never start more workers than there are batches to hand out.
  with Pool(processes=min(processes, len(all_batches))) as pool:
    dataframes = pool.map(read_csv, all_batches)  # one list of DataFrames per batch

  # Merge the DataFrames from all batches
  return merge_dataframe(dataframes)

def merge_dataframe(dataframes):
  """Merge batches of DataFrames into a single DataFrame.

  Args:
    dataframes: iterable of batches, where each batch is a list of
      DataFrames (empty batches are allowed and contribute nothing).

  Returns:
    One DataFrame holding the rows of every input frame in order, with a
    fresh 0..n-1 index, or ``None`` when there is no frame at all.
  """
  # Flatten first and concatenate once: concatenating inside a loop copies
  # the accumulated result again on every iteration.
  all_frames = [df for batch in dataframes for df in batch]
  if not all_frames:
    return None
  return pd.concat(all_frames, ignore_index=True)

def get_csv_list(folder_path):
  """Return the full paths of the ``.csv`` files in ``folder_path``.

  The files are ordered by the integer found in the third
  underscore-separated token of the file name (``name.split("_")[2]``).
  A CSV file whose name lacks that token, or whose token is not an integer,
  makes the sort raise.
  """
  def numeric_token(file_name):
    # Third "_"-separated piece of the file name, interpreted as an int.
    return int(file_name.split("_")[2])

  names = [entry for entry in os.listdir(folder_path) if entry.endswith(".csv")]
  names.sort(key=numeric_token)
  return [os.path.join(folder_path, name) for name in names]


In [None]:
# Ecoregion labels, one per one-hot column. The position of a label in this
# list is the position of its indicator column, so the order must not change.
l3_regs = [
    '15  Northern Rockies',
    '16  Idaho Batholith',
    '17  Middle Rockies',
    '18  Wyoming Basin',
    '19  Wasatch and Uinta Mountains',
    '20  Colorado Plateaus',
    '21  Southern Rockies',
    '22  Arizona/New Mexico Plateau',
    '25  High Plains',
    '26  Southwestern Tablelands',
    '27  Central Great Plains',
    '41  Canadian Rockies',
    '42  Northwestern Glaciated Plains',
    '43  Northwestern Great Plains',
    '44  Nebraska Sand Hills',
    '46  Northern Glaciated Plains',
    '47  Western Corn Belt Plains',
    '48  Lake Agassiz Plain',
]

#### Add the names of the best features found in the FeatureSelectionBulkOverlap.ipynb notebook. Make sure both lists follow the same feature order that was used to train the model.

In [None]:
# Selected feature names and, at the same position in the second list, the
# transformation applied to that feature (None means the feature is used as-is).
BEST_FEATURES_BY_NAME = [
    "EVI_mean",
    "Fpar_mean_annual",
    "NDVI_mean",
    "NDWI_mean",
    "aspect",
    "elevation",
    "slope",
]
BEST_TRANSFORMATION_FOR_FEATURES = [
    "yeojohnson",  # EVI_mean
    "yeojohnson",  # Fpar_mean_annual
    "yeojohnson",  # NDVI_mean
    "yeojohnson",  # NDWI_mean
    None,          # aspect
    "yeojohnson",  # elevation
    "yeojohnson",  # slope
]

In [8]:
def get_list_and_predict(data=None):
    """Run the pickled models on the merged table and return it with predictions.

    For every row, the model input is the seven feature columns at positions
    3, 6, 9, 11, 15, 17 and 18, followed by a one-hot encoding of the value
    at column position 16 against ``l3_regs`` (all zeros when the value is
    not in ``l3_regs``). Features with a non-None entry in
    ``BEST_TRANSFORMATION_FOR_FEATURES`` are passed through the transformer
    stored in ``Power_transform_params_<feature>.pkl``; the full matrix is
    then scaled with ``Standard_Scaler_Params.pkl`` and fed to
    ``RF_base_model.pkl`` and ``RF_bias_model.pkl``. The prediction is
    ``exp(base + bias)``.

    NOTE(review): the column positions are assumed to line up with
    ``BEST_FEATURES_BY_NAME`` (same order) -- verify against the CSV schema.
    NOTE(review): the final ``exp`` suggests the models were fit on a
    log-transformed target -- confirm against the training notebook.

    Args:
        data: DataFrame to predict on. When omitted, the notebook-level
            global ``dataframes`` is used, which keeps existing zero-argument
            calls working.

    Returns:
        A copy of the input with an ``agbd_prediction`` column inserted at
        column position 2.

    Raises:
        ValueError: if there is no input data (``None``).
    """
    if data is None:
        # Backward compatibility: older callers set the global and pass nothing.
        data = dataframes
    if data is None:
        raise ValueError("No input data to predict on (got None).")

    feature_positions = [3, 6, 9, 11, 15, 17, 18]  # one position per BEST_FEATURES_BY_NAME entry
    region_position = 16  # column holding the ecoregion label

    # Float dtype so transformed values are never truncated on assignment.
    features = data.iloc[:, feature_positions].to_numpy(dtype=float)

    # One-hot encode the ecoregion; labels missing from l3_regs get code -1
    # and therefore keep an all-zero row.
    region_codes = pd.Categorical(data.iloc[:, region_position], categories=l3_regs).codes
    region_one_hot = np.zeros((len(data), len(l3_regs)), dtype=float)
    known_rows = np.flatnonzero(region_codes >= 0)
    region_one_hot[known_rows, region_codes[known_rows]] = 1.0

    X_test = np.hstack([features, region_one_hot])

    # SECURITY: pickle.load executes arbitrary code from the file; only load
    # model artifacts that were produced by a trusted training run.
    for idx, (feature_name, transformation) in enumerate(zip(BEST_FEATURES_BY_NAME, BEST_TRANSFORMATION_FOR_FEATURES)):
        if transformation is None:
            continue  # feature is used untransformed
        power_transform_filename = f"Power_transform_params_{feature_name}.pkl"
        with open(power_transform_filename, 'rb') as f:
            power_transformer = pickle.load(f)

        # Transformers work on 2-D input; reshape the column and flatten it back.
        X_test[:, idx] = power_transformer.transform(X_test[:, idx].reshape(-1, 1)).flatten()

    # Load the model, bias model, and scaler
    with open("RF_base_model.pkl", 'rb') as f:
        rf_base_model = pickle.load(f)
    with open("RF_bias_model.pkl", 'rb') as f:
        rf_bias_model = pickle.load(f)
    with open("Standard_Scaler_Params.pkl", 'rb') as f:
        scaler = pickle.load(f)

    # Scale the transformed data
    X_test = scaler.transform(X_test)

    # Base prediction plus the bias model's correction, mapped back through exp.
    X_pred = rf_base_model.predict(X_test)
    residual_predict = rf_bias_model.predict(X_test)
    new_pred = np.exp(X_pred + residual_predict)

    # DataFrame.copy() is already a deep copy, so the input stays untouched.
    new_df = data.copy()
    new_df.insert(2, 'agbd_prediction', new_pred)

    return new_df


#### Set the state you want to predict for (one of KS, NE, CO, MT, ND, SD, WY)

In [13]:
# State abbreviation to predict for. It is used as the sub-folder name of both
# the input folder (data/<stateName>) and the output folder
# (data/predictions/<stateName>).
stateName = "WY"

In [None]:
# The data we predict on is pre-downloaded and stored under data/{stateName},
# relative to the working directory. The layout is:
#   data
#     stateName
#       year-wise folders
#         all csv files
#
# After running prediction, one CSV per year-wise folder is written to
# data/predictions/{stateName}.

baseDataFolderName = "data"
dataFolderName = os.path.join(baseDataFolderName, stateName)
folderToSavePreds = os.path.join(baseDataFolderName, "predictions", stateName)
os.makedirs(folderToSavePreds, exist_ok=True)
if __name__ == "__main__":
  # sorted() gives a deterministic processing order; os.listdir order is arbitrary.
  for folderToPredict_ in sorted(os.listdir(dataFolderName)):
    folderToPredict = os.path.join(dataFolderName, folderToPredict_)
    if not os.path.isdir(folderToPredict):
      # Stray files next to the year-wise folders cannot be listed as folders.
      continue
    print(f"Predicting for {folderToPredict}")
    csv_year = get_csv_list(folderToPredict)
    if not csv_year:
      print(f"No CSV files found in {folderToPredict}, skipping")
      continue
    # Never ask for more workers than files: the batch size is derived from
    # len(files) // processes and must stay at least 1.
    dataframes = merge_csv(csv_year, processes=min(32, len(csv_year)))
    if dataframes is None:
      print(f"No data could be read from {folderToPredict}, skipping")
      continue
    # get_list_and_predict reads the module-level `dataframes` set just above.
    predicted_dataframes = get_list_and_predict()
    predicted_dataframes.to_csv(os.path.join(folderToSavePreds, f"{folderToPredict_}.csv"))

Predicting for /content/drive/MyDrive/WY/EarthLabWY_year_2007
Finished doing batch
Finished doing batch
Finished doing batch
Finished doing batch
Finished doing batch
Finished doing batch
Finished doing batchFinished doing batchFinished doing batch


Finished doing batch
Finished doing batch
Finished doing batch
Finished doing batch
Finished doing batchFinished doing batch
Finished doing batch

Finished doing batch
Finished doing batchFinished doing batchFinished doing batch


Finished doing batchFinished doing batch

Finished doing batchFinished doing batch

Finished doing batchFinished doing batch

Finished doing batchFinished doing batchFinished doing batch

Finished doing batch

Finished doing batch
Predicting for /content/drive/MyDrive/WY/EarthLabWY_year_2015
Finished doing batch
Finished doing batchFinished doing batch

Finished doing batchFinished doing batch

Finished doing batchFinished doing batch
Finished doing batch
Finished doing batch

Finished doing batchFinished doing b