In [None]:
import os
import geopandas as gpd
import numpy as np
import pandas as pd
import json
import pickle
from datacube.utils import geometry
from deafrica_tools.classification import collect_training_data
from odc.io.cgroups import get_cpu_quota
from sklearn.preprocessing import LabelEncoder

from feature_collection import feature_layers

In [None]:
# Create the data and results directories if they don't exist.
# `exist_ok=True` keeps this cell safe to re-run and avoids the race between a
# separate existence check and the directory creation.
for directory in ("data", "results"):
    os.makedirs(directory, exist_ok=True)

## Read in cleaned data and select as required

This step reads in the cleaned points, and determines which will be used for the training set. We make the following selections:
* Only use points corresponding to a single crop (no multi-cropped fields)
* Remove fallow fields
* Only use classes with 10 or more observations (required for cross-validation during model training and evaluation)

We use the GeoJSON from the previous step because it keeps the full cleaned column names, rather than the truncated cleaned names used for shapefiles.

In [None]:
# Location of the cleaned sample points written by the previous workflow step
path = "../1_Prepare_samples_for_ML/results/cleaned_points.geojson"

# Read the cleaned points into a GeoDataFrame
input_data = gpd.read_file(path)

In [None]:
# Parse the "start" and "end" columns into datetimes; `yearfirst=True` makes
# ambiguous date strings resolve with the year first.
for date_column in ("start", "end"):
    input_data[date_column] = pd.to_datetime(input_data[date_column], yearfirst=True)

### Split into single crops and multiple crops

Fallow fields are also removed from the dataset at this step.

In [None]:
# Boolean masks flagging rows recorded as multi-cropped or as fallow
multiple_crop_condition = input_data["multiple_crops"].eq("yes")
fallow_field_condition = input_data["field_fallow"].eq("yes")

# Fallow fields are excluded from both subsets
not_fallow = ~fallow_field_condition

# Rows not flagged as multi-cropped
single_crops = input_data.loc[not_fallow & ~multiple_crop_condition].copy()

# Rows flagged as multi-cropped
multiple_crops = input_data.loc[not_fallow & multiple_crop_condition].copy()

## Explore and refine single crops dataset

In [None]:
# Number of single-crop observations per crop type
single_crops["primary_crop"].value_counts()

In [None]:
# Drop crop types with fewer than `min_observations` rows: cross-validation is
# not possible on these if only a single pixel is used from each observation.
min_observations = 10

# For every row, the number of observations sharing its primary crop
observations_per_crop = single_crops.groupby("primary_crop")["primary_crop"].transform("count")
enough_observations = observations_per_crop >= min_observations

single_crops_subset = single_crops.loc[enough_observations].reset_index(drop=True).copy()
single_crops_subset

## Map crop types to numeric classes for prediction

This step also saves out the mapping as a JSON file for later use.

In [None]:
# Column holding the crop type to be predicted
field = "primary_crop"

# Fit the label encoder and assign a numeric label to every row in one pass
le = LabelEncoder()
single_crops_subset["label"] = le.fit_transform(single_crops_subset[field])

# Crop types known to the encoder, in label order
classes = list(le.classes_)

# LabelEncoder encodes each class as its position in `classes_`, so the
# class -> label mapping can be read off directly instead of calling
# `transform` once per class.
class_dictionary = {crop_class: label for label, crop_class in enumerate(classes)}
print("Class Dictionary:")
print(class_dictionary)

# Make sure the results directory exists (safe to re-run)
os.makedirs("results", exist_ok=True)

# Export the class dictionary for later use
with open("results/class_labels.json", "w", encoding="utf-8") as f:
    json.dump(class_dictionary, f, ensure_ascii=False, indent=4)

## Prepare geometry for feature extraction

Either points or polygons can be used for extraction. If a point, the method will extract the pixel containing the point. If a polygon, the method will extract all pixels touching or within that polygon. 

The method uses 10m resolution pixels. As such, we recommend buffering the point to a square polygon of 30m across (15m on each side of the point). This should return nine pixels per point collected. Only use this if your points are more than 15 metres from the road or another field on all sides.

In [None]:
# Flag controlling whether the sample geometries are buffered into polygons
use_polygons = True

if use_polygons:
    # Half-width of the buffer, in metres
    buffer_radius_m = 15

    # Reproject to EPSG:6933 so that the buffer distance is expressed in metres
    projected_samples = single_crops_subset.to_crs("EPSG:6933")

    # Buffer each geometry with cap_style=3 (square caps).
    # Note: re-running this cell on its own buffers the geometries again;
    # re-create `single_crops_subset` first if the cell must be repeated.
    projected_samples.geometry = projected_samples.geometry.buffer(
        buffer_radius_m, cap_style=3
    )
    single_crops_subset = projected_samples

## Prepare query for feature extraction

In [None]:
# Earliest "start" and latest "end" across the retained samples
start_date = single_crops_subset["start"].min()
end_date = single_crops_subset["end"].max()

# First day of the month containing the earliest start date
first_of_start_month = pd.Timestamp(year=start_date.year, month=start_date.month, day=1)

# The query window covers the nine months before that month and ends one
# minute before the month begins.
# NOTE(review): `end_date` is computed but not used; both bounds are anchored
# on `start_date`. Confirm this fixed nine-month window is intended.
query_start_date = first_of_start_month - pd.DateOffset(months=9)
query_end_date = first_of_start_month - pd.DateOffset(minutes=1)
print(f"Query start: {query_start_date}")
print(f"Query end: {query_end_date}")

# Components of the general query
time = (query_start_date, query_end_date)
resolution = (-10, 10)
output_crs = "EPSG:6933"

query = dict(time=time, resolution=resolution, output_crs=output_crs)

# Save the query to a pickle file so it can be re-used later
with open("results/query.pickle", "wb") as f:
    pickle.dump(query, f)


## Collect training data

By default, the method below will run in parallel mode, which decreases the amount of time to run feature extraction for each geometry. This will work well as long as your feature collection function (defined in feature_collection.py) is running with no problems. 

### When testing
If you are testing a new feature collection function, it is suggested you set `parallel = False` below to switch back to serial mode. 

You can also pass `gdf=single_crops_subset.iloc[0:5, :].copy()` in the `collect_training_data` call below to run only the first five geometries.

In [None]:
# Set parallel mode on or off (set to False if testing a new feature extraction function).
parallel = True

if parallel:
    # Per the odc documentation, get_cpu_quota() returns None when no CPU
    # quota is set or it cannot be read; fall back to the machine's CPU count
    # instead of failing in round().
    cpu_quota = get_cpu_quota()
    if cpu_quota is None:
        ncpus = os.cpu_count() or 1
    else:
        # A quota below 0.5 would round to 0; always keep at least one worker
        ncpus = max(1, round(cpu_quota))
else:
    ncpus = 1

print("ncpus = " + str(ncpus))

In [None]:
# Arguments for the feature extraction run
collection_settings = {
    "gdf": single_crops_subset,
    "dc_query": query,
    "ncpus": ncpus,
    "field": "label",
    "zonal_stats": None,
    "feature_func": feature_layers,
}

# Run the extraction; the result is unpacked into the column names and the
# model input array
column_names, model_input = collect_training_data(**collection_settings)

## Export training data

In [None]:
# Name and location of the output file
output_file = "results/training_data_multipixel.txt"

# Select every column, in order. Using positions directly (rather than looking
# each name up with list.index) stays correct if two columns share a name and
# avoids a quadratic scan over the names.
model_col_indices = list(range(len(column_names)))

# Export to disk, with the column names joined by spaces as the header.
# NOTE(review): "%4f" sets a minimum field width of 4, not a precision of 4
# decimals; confirm "%.4f" was not intended before changing it.
np.savetxt(
    output_file,
    model_input[:, model_col_indices],
    header=" ".join(column_names),
    fmt="%4f",
)