# Extract the Training Features for the September 2019 to January 2020 season

In [1]:
import os

import geopandas as gpd
import pandas as pd
from deafrica_tools.classification import collect_training_data
from feature_collection import feature_layers
from odc.io.cgroups import get_cpu_quota
from sklearn.model_selection import train_test_split

In [2]:
# --- Datacube query parameters ---
# CRS that the training data is reprojected to and that the query requests.
output_crs = "EPSG:6933"
resolution = (-10, 10)
time_range = ("2019-09", "2020-01")

# --- Input / output locations ---
# Path to the cleaned training data for this season.
training_data_file = "clean_data/sep_2019_to_jan_2020_training_data.geojson"
# Folder the extracted training features are saved to.
output_folder = "results"

# Name of the training-data column holding the numeric class labels.
field = "label"

# --- Train/validation split ---
# Fraction of the samples set aside as the testing (validation) dataset.
test_size = 0.2
# Seed for the split, fixed so the same split is produced on every run.
random_state = 42

In [3]:
# Make sure the output folder exists before anything is written to it;
# an already-existing folder is left untouched.
os.makedirs(name=output_folder, exist_ok=True)

In [4]:
# Read the training data from disk, then reproject it to the analysis CRS.
training_data_raw = gpd.read_file(training_data_file)
training_data = training_data_raw.to_crs(crs=output_crs)

In [5]:
# Bundle the analysis parameters into the datacube query that is passed to
# the feature extraction step. The last line displays it for inspection.
query = {
    "output_crs": output_crs,
    "resolution": resolution,
    "time": time_range,
}
query

{'output_crs': 'EPSG:6933',
 'resolution': (-10, 10),
 'time': ('2019-09', '2020-01')}

In [6]:
# Number of worker processes used to collect the training data, taken from the
# CPU quota allocated to this environment. To run serially (e.g. when testing
# a new feature extraction function), override this with `ncpus = 1`.
cpu_quota = get_cpu_quota()
if cpu_quota is None:
    # No cgroup CPU limit could be read (e.g. when running outside a
    # container), so fall back to the number of CPUs on the machine.
    cpu_quota = os.cpu_count() or 1
# A fractional quota below 0.5 would round to 0 workers; always use at least 1.
ncpus = max(1, round(cpu_quota))
print("ncpus = " + str(ncpus))

ncpus = 15


In [7]:
%%time
column_names, model_input = collect_training_data(
                                    gdf=training_data,
                                    dc_query=query,
                                    ncpus=ncpus,
                                    field=field,
                                    zonal_stats=None,
                                    feature_func=feature_layers
                                    )

Collecting training data in parallel mode


  0%|          | 0/351 [00:00<?, ?it/s]

  _reproject(
  _reproject(
  _reproject(
  _reproject(
  _reproject(
  _reproject(
  _reproject(
  _reproject(
  _reproject(
  _reproject(
  _reproject(
  _reproject(
  _reproject(
  _reproject(
  _reproject(
  _reproject(
  _reproject(
  _reproject(
  _reproject(
  _reproject(
  _reproject(
  _reproject(
  _reproject(


Percentage of possible fails after run 1 = 0.0 %
Removed 0 rows wth NaNs &/or Infs
Output shape:  (5896, 91)
CPU times: user 1.99 s, sys: 376 ms, total: 2.37 s
Wall time: 11min 2s


In [8]:
# Wrap the extracted array in a dataframe labelled with the returned column names.
model_input_df = pd.DataFrame(data=model_input, columns=column_names)

In [9]:
# The first column is treated as the class label; every remaining column is a feature.
all_columns = model_input_df.columns
label_col, feature_cols = all_columns[0], all_columns[1:]

# Separate the feature matrix (X) from the flattened label vector (y).
X = model_input_df.loc[:, feature_cols].to_numpy()
y = model_input_df[label_col].to_numpy().ravel()

In [10]:
# Report how many features were collected and what they are called.
n_features = len(feature_cols)
print("# of training features collected:", n_features)
print("List of training features collected:", feature_cols)

# of training features collected: 90
List of training features collected: Index(['vv_2019-09-30', 'vv_2019-10-31', 'vv_2019-11-30', 'vv_2019-12-31',
       'vv_2020-01-31', 'vh_2019-09-30', 'vh_2019-10-31', 'vh_2019-11-30',
       'vh_2019-12-31', 'vh_2020-01-31', 'red_2019-09-30', 'red_2019-10-31',
       'red_2019-11-30', 'red_2019-12-31', 'red_2020-01-31',
       'green_2019-09-30', 'green_2019-10-31', 'green_2019-11-30',
       'green_2019-12-31', 'green_2020-01-31', 'blue_2019-09-30',
       'blue_2019-10-31', 'blue_2019-11-30', 'blue_2019-12-31',
       'blue_2020-01-31', 'red_edge_1_2019-09-30', 'red_edge_1_2019-10-31',
       'red_edge_1_2019-11-30', 'red_edge_1_2019-12-31',
       'red_edge_1_2020-01-31', 'red_edge_2_2019-09-30',
       'red_edge_2_2019-10-31', 'red_edge_2_2019-11-30',
       'red_edge_2_2019-12-31', 'red_edge_2_2020-01-31',
       'red_edge_3_2019-09-30', 'red_edge_3_2019-10-31',
       'red_edge_3_2019-11-30', 'red_edge_3_2019-12-31',
       'red_edge_3_2020

In [11]:
# Hold out `test_size` of the samples as the validation dataset. The split is
# stratified on the class labels and seeded with `random_state`.
# NOTE(review): if several rows come from the same training polygon, a random
# row-level split may place neighbouring pixels in both subsets - confirm this
# is acceptable for the accuracy assessment.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    stratify=y,
    random_state=random_state,
)

In [12]:
# Rebuild the training split as a dataframe with the same column order as
# `model_input_df`, then save it for use when training the model.
training_csv = os.path.join(output_folder, 'sep_2019_to_jan_2020_training_features_training.csv')
training_df = pd.DataFrame(data=X_train, columns=feature_cols)
training_df[label_col] = y_train
training_df = training_df.loc[:, model_input_df.columns]
training_df.to_csv(training_csv, index=False)

In [13]:
# Rebuild the held-out split as a dataframe with the same column order as
# `model_input_df`, then save it for use in the accuracy assessment.
validation_csv = os.path.join(output_folder, 'sep_2019_to_jan_2020_training_features_validation.csv')
validation_df = pd.DataFrame(data=X_test, columns=feature_cols)
validation_df[label_col] = y_test
validation_df = validation_df.loc[:, model_input_df.columns]
validation_df.to_csv(validation_csv, index=False)

In [14]:
# Number of samples in the training split.
training_df.shape[0]

4716

In [15]:
# Number of samples in the validation split.
validation_df.shape[0]

1180