# Performing land cover predictions using the Random Forest model for subareas

    For each folder containing calculated spectral indices for a given subarea, the following is performed:

    - Reading values from spectral indices for grid points - saving as csv
    - Prediction of land cover classes using the Random Forest model closest in date to the data - saving as csv
    - changing the csv file to a tif file

In [None]:
import os
import time
import rasterio
import gc
import re
import pandas as pd
import pickle
from sklearn.impute import SimpleImputer
from shapely import wkt
import numpy as np
import datetime
from geocube.api.core import make_geocube
import pandas as pd
import geopandas as gpd
import shapely
import warnings
# Silence UserWarning messages raised from sklearn modules.
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")

# Directory holding one grid-point shapefile per subarea, named "<area_name>.shp".
# extract_data() joins this path with "<area_name>.shp", so it has to be a
# directory and not the path of a single .shp file.
shp_folder = '/path/to/grid_points_subareas_folder'
# Directory whose sub-folders ("<area_name>_...") contain the spectral-index TIF files.
base_tif_folder = '/path/to/output_files_location/sentinel2'
# Directory receiving the extracted CSVs, the prediction CSVs and the result GeoTIFFs.
output_csv_folder = '/path/to/save_output_files/sentinel2'
# Directory containing the pickled models ("*.sav") with a "YYYY-MM" date in their name.
model_folder = '/path/to/random_forest_models/'

# Function to extract data from shapefile and TIF folders, process them, and save as CSVs
def extract_data(shp_folder, base_tif_folder, output_csv_folder):
    """Sample raster values at grid points and write one CSV per TIF folder.

    Area names are derived from the first two "_"-separated parts of every
    sub-folder name in ``base_tif_folder``. For each area the point shapefile
    ``<shp_folder>/<area_name>.shp`` is read, and for each matching TIF folder
    (name starting with ``<area_name>_``) every ``*.tif`` file is sampled at
    the point coordinates into a column named after the file.

    Rows containing a missing value or a 0 in any column are dropped before
    the table is written to ``<output_csv_folder>/<tif_folder_name>.csv``.

    Args:
        shp_folder: Directory containing the ``<area_name>.shp`` files.
        base_tif_folder: Directory whose sub-folders hold the TIF files.
        output_csv_folder: Directory for the CSV output; created if missing.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    if not os.path.exists(output_csv_folder):
        os.makedirs(output_csv_folder)
        print("Output folder created:", output_csv_folder)
    else:
        print("Output folder already exists:", output_csv_folder)

    # Function to get values from raster files
    def get_raster_values(tif_file, coords):
        # Keep only the first band value of every sampled point.
        with rasterio.open(tif_file) as src:
            values = [x[0] for x in src.sample(coords)]
        return values

    # Only directories can hold TIF files; stray files in the base folder
    # would otherwise make os.listdir() fail further down.
    tif_folders = sorted(
        entry for entry in os.listdir(base_tif_folder)
        if os.path.isdir(os.path.join(base_tif_folder, entry))
    )

    area_names = {'_'.join(folder.split('_')[:2]) for folder in tif_folders}
    print(f"Found {len(area_names)} full area names in TIF folders.")

    # Sorted so that the processing order is the same on every run.
    for area_name in sorted(area_names):
        shp_file = os.path.join(shp_folder, area_name + ".shp")
        if not os.path.exists(shp_file):
            print(f"No SHP file for area {area_name}.")
            continue

        gc.collect()

        base_grid_df = gpd.read_file(shp_file)
        print(f"Wczytano plik SHP: {shp_file}")

        # Match TIF folders to SHP area
        tif_folders_for_area = [
            os.path.join(base_tif_folder, folder)
            for folder in tif_folders
            if folder.startswith(area_name + "_")
        ]

        if not tif_folders_for_area:
            print(f"No TIF folders for area {area_name}")
            continue

        # The sampling coordinates are the same for every raster of the area.
        coord_list = list(zip(base_grid_df["geometry"].x, base_grid_df["geometry"].y))

        for tif_folder in tif_folders_for_area:
            # Start from a fresh copy of the grid so that columns sampled from
            # one TIF folder never end up in the CSV of another folder.
            grid_df = base_grid_df.copy()

            tif_files_sorted = sorted(
                os.path.join(tif_folder, file)
                for file in os.listdir(tif_folder)
                if file.endswith(".tif")
            )

            for tif_file in tif_files_sorted:
                column_name = os.path.splitext(os.path.basename(tif_file))[0]
                grid_df[column_name] = get_raster_values(tif_file, coord_list)

            # Drop rows with a missing value, then rows with a 0 in any column.
            grid_df_cleaned = grid_df.dropna()
            grid_df_cleaned = grid_df_cleaned[(grid_df_cleaned != 0).all(axis=1)]

            # For "S2_" columns remove characters 3..10 of the name
            # (presumably the acquisition date - TODO confirm naming scheme).
            new_column_names = {
                col: col[:3] + col[11:]
                for col in grid_df_cleaned.columns
                if col.startswith("S2_")
            }
            grid_df_cleaned = grid_df_cleaned.rename(columns=new_column_names)

            # Save to CSV
            # NOTE: the S2 variant used os.path.basename(tif_folder)[:-11] + ".csv";
            # the full folder name is the L8 variant.
            output_csv_name = os.path.basename(tif_folder) + ".csv"  # L8
            output_csv_path = os.path.join(output_csv_folder, output_csv_name)
            grid_df_cleaned.to_csv(output_csv_path, sep=",", index=False, header=True)

            gc.collect()

# Function to match the most appropriate model to the data based on dates
def match_model_to_data(csv_file, model_folder):
    """Return the path of the model whose date is closest to the CSV's date.

    The first ``YYYY-MM`` occurrence in ``csv_file`` is compared with the first
    ``YYYY-MM`` occurrence in the name of every ``*.sav`` file found in
    ``model_folder``; the model with the smallest absolute difference in
    months wins. Model files are examined in sorted name order, so when two
    models are equally close the alphabetically first one is chosen and the
    result does not depend on the directory listing order.

    Args:
        csv_file: CSV file name (or any string) containing a ``YYYY-MM`` date.
        model_folder: Directory containing the ``*.sav`` model files.

    Returns:
        ``os.path.join(model_folder, <model file>)`` for the best match, or
        ``None`` when ``csv_file`` holds no date or no dated model exists.
    """
    date_pattern = re.compile(r"\d{4}-\d{2}")

    def month_index(text):
        # Convert the first "YYYY-MM" in text to a month count, or None.
        found = date_pattern.search(text)
        if found is None:
            return None
        year_month = found.group()
        return int(year_month[:4]) * 12 + int(year_month[5:7])

    csv_month_index = month_index(csv_file)
    if csv_month_index is None:
        return None

    closest_model = None
    closest_date_diff = float('inf')

    for model_file in sorted(os.listdir(model_folder)):
        if not model_file.endswith('.sav'):
            continue
        model_month_index = month_index(model_file)
        if model_month_index is None:
            continue

        # Find the model with the smallest difference; the strict "<" keeps
        # the first (alphabetically smallest) candidate on a tie.
        total_month_diff = abs(model_month_index - csv_month_index)
        if total_month_diff < closest_date_diff:
            closest_date_diff = total_month_diff
            closest_model = model_file

    if closest_model is None:
        return None

    print(f"Matched model {closest_model} for CSV {csv_file}")
    return os.path.join(model_folder, closest_model)

# Function to perform predictions on extracted data
def predict(model_folder, output_csv_folder):
    """Predict a class for every row of the extracted CSV files.

    Every ``AOI0*.csv`` file in ``output_csv_folder`` is paired with a model
    through ``match_model_to_data``. Columns 2..16 (``iloc[:, 2:17]``) are
    used as features, missing values are replaced by the column median, and
    the predictions are written together with column 1 (stored under the
    name "geometry") to ``predicted_<csv_file>`` in the same folder.

    Args:
        model_folder: Directory containing the pickled ``*.sav`` models.
        output_csv_folder: Directory that is read from and written to.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # Sorted so that the processing order is the same on every run.
    csv_files = sorted(
        file for file in os.listdir(output_csv_folder)
        if file.endswith('.csv') and file.startswith("AOI0")
    )

    # Most recently loaded model, reused while consecutive files map to it.
    loaded_model_path = None
    loaded_model = None

    for csv_file in csv_files:
        print(f"\nProcessing CSV file: {csv_file}")

        csv_path = os.path.join(output_csv_folder, csv_file)
        model_path = match_model_to_data(csv_file, model_folder)

        if not model_path:
            print(f"No suitable model found for file {csv_file}")
            continue

        df = pd.read_csv(csv_path)
        if df.empty:
            print(f"No data in file {csv_file}")
            continue

        grid_features = df.iloc[:, 2:17]
        inputation = SimpleImputer(missing_values=np.nan, strategy="median")
        grid_features_filled = inputation.fit_transform(grid_features)

        if model_path != loaded_model_path:
            # SECURITY: unpickling executes arbitrary code - only load model
            # files that come from a trusted source.
            with open(model_path, "rb") as model_file:
                loaded_model = pickle.load(model_file)
            loaded_model_path = model_path

        result = pd.DataFrame(loaded_model.predict(grid_features_filled))
        # Pair column 1 of the input with the predictions, row by row.
        df1 = pd.merge(df.iloc[:, 1], result, left_index=True, right_index=True)
        df1.columns = ["geometry", "pred_class"]

        output_csv_name = f"predicted_{csv_file}"
        output_csv_path = os.path.join(output_csv_folder, output_csv_name)
        df1.to_csv(output_csv_path, sep=",", index=False, header=True)

# Function to convert predicted CSV files to GeoTIFF
def convert_csv_to_geotiff(csv_folder, result_tif):
    """Rasterise every ``predicted*.csv`` file into a GeoTIFF.

    Each CSV is loaded, its "geometry" column is parsed from WKT into a
    GeoDataFrame with CRS EPSG:32631, and the "pred_class" column is gridded
    with ``make_geocube`` at a resolution of 20 CRS units. The raster is saved
    as ``Vector_classification_<name>.tif`` where ``<name>`` is characters
    10..27 of the CSV file name.

    A failure while processing one file is printed and the remaining files
    are still processed.

    Args:
        csv_folder: Directory containing the ``predicted*.csv`` files.
        result_tif: Directory for the GeoTIFF output; created if missing.

    Returns:
        None. Results are written to disk.
    """
    csv_files = [file for file in os.listdir(csv_folder) if file.startswith("predicted") and file.endswith(".csv")]

    res = 20

    for csv_file in csv_files:
        csv_path = os.path.join(csv_folder, csv_file)

        try:
            # Without the output directory every write below would fail.
            os.makedirs(result_tif, exist_ok=True)

            df = pd.read_csv(csv_path)
            grid_prediction_gdf = gpd.GeoDataFrame(data=df, geometry=df['geometry'].apply(shapely.wkt.loads),  crs="EPSG:32631")
            # "predicted_" is 10 characters long; keep the next 18 characters.
            csv_file_short = csv_file[10:28]
            geotif_file = os.path.join(result_tif, f"Vector_classification_{csv_file_short}.tif")
            out_grd = make_geocube(vector_data=grid_prediction_gdf, measurements=["pred_class"],  resolution=(-res, res))

            out_grd["pred_class"].rio.to_raster(geotif_file)

        except Exception as e:
            # Best effort: report the failing file and go on with the next one.
            print("Error processing:", csv_path)
            print("Error:", e)

# Run the whole pipeline and report the CPU time it consumed
# (time.process_time() counts process CPU time, not wall-clock time).
start = time.process_time()

# Each stage: (function, positional arguments, message printed when it is done).
_pipeline_stages = (
    # Extract data for all TIF folders
    (extract_data, (shp_folder, base_tif_folder, output_csv_folder), "Data extraction completed\n"),
    # Perform predictions on the extracted data
    (predict, (model_folder, output_csv_folder), "Predictions completed\n"),
    # Turn the prediction CSVs into GeoTIFFs, stored next to the CSVs
    (convert_csv_to_geotiff, (output_csv_folder, output_csv_folder), "Convert to tif completed\n"),
)

for _stage, _stage_args, _done_message in _pipeline_stages:
    _stage(*_stage_args)
    print(_done_message)

print("Total processing time in minutes:", (time.process_time() - start) / 60)