# Prepare training data

In [1]:
# Load the required python packages.
import json
import os

import geopandas as gpd
import pandas as pd
import numpy as np
import rioxarray
from sklearn.preprocessing import LabelEncoder

In [2]:
# Set the Analysis Parameters

# Folder to write the cleaned data to
output_folder = "clean_data"

# Column containing the non-numeric labels for the crop types.
non_numeric_field = "Crop_type"
# Column containing the numeric labels for the crop types.
numeric_field = "label"

In [3]:
# Create the output folder if it does not exist.
os.makedirs(output_folder, exist_ok=True)

## Prepare the training data for the September 2019 to January 2020 season

In [4]:
# Load the September 2019 to January 2020 crop data.
sep_2019_to_jan_2020_training_data = gpd.read_file("data/2019_training_data.geojson").to_crs("EPSG:4326")
sep_2019_to_jan_2020_training_data.head()

Unnamed: 0,id,Crop_type,training_d,training_1,geometry
0,0.0,Cassava,,2,"MULTIPOLYGON (((34.22980 0.56801, 34.22981 0.5..."
1,0.0,Maize,,1,"MULTIPOLYGON (((34.23115 0.56801, 34.23106 0.5..."
2,0.0,Maize,,1,"MULTIPOLYGON (((34.23271 0.56661, 34.23302 0.5..."
3,0.0,Cassava,,2,"MULTIPOLYGON (((34.23078 0.56774, 34.23084 0.5..."
4,0.0,Maize,,1,"MULTIPOLYGON (((34.23103 0.56790, 34.23124 0.5..."


In [5]:
# Only keep the Crop_type and geometry column.
sep_2019_to_jan_2020_training_data = sep_2019_to_jan_2020_training_data[[non_numeric_field, "geometry"]]
sep_2019_to_jan_2020_training_data.head()

Unnamed: 0,Crop_type,geometry
0,Cassava,"MULTIPOLYGON (((34.22980 0.56801, 34.22981 0.5..."
1,Maize,"MULTIPOLYGON (((34.23115 0.56801, 34.23106 0.5..."
2,Maize,"MULTIPOLYGON (((34.23271 0.56661, 34.23302 0.5..."
3,Cassava,"MULTIPOLYGON (((34.23078 0.56774, 34.23084 0.5..."
4,Maize,"MULTIPOLYGON (((34.23103 0.56790, 34.23124 0.5..."


In [6]:
# View the unique crop types in the crop type column.
sep_2019_to_jan_2020_training_data[non_numeric_field].unique()

array(['Cassava', 'Maize', 'Maize/Soybean', 'Maize/Common bean',
       'Maize/Groundnuts', 'Maize/Cassava', 'Sugarcane', 'Common bean',
       'Soybean', 'Maize/Sorghum', 'Maize/Mixed'], dtype=object)

In [7]:
# Change the values of the crop type field to the required types.
sep_2019_to_jan_2020_training_data[non_numeric_field].replace('Maize/Cassava', 'Maize/Mixed', inplace=True)
sep_2019_to_jan_2020_training_data[non_numeric_field].replace('Maize/Common bean', 'Maize/Mixed', inplace=True)
sep_2019_to_jan_2020_training_data[non_numeric_field].replace('Maize/Groundnuts', 'Maize/Mixed', inplace=True)
sep_2019_to_jan_2020_training_data[non_numeric_field].replace('Maize/Sorghum', 'Maize/Mixed', inplace=True)
sep_2019_to_jan_2020_training_data[non_numeric_field].replace('Maize/Soybean', 'Maize/Mixed', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sep_2019_to_jan_2020_training_data[non_numeric_field].replace('Maize/Cassava', 'Maize/Mixed', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sep_2019_to_jan_2020_training_data[non_numeric_field].replace('Maize/Common bean', 'Maize/Mixed', inplace=True)
The behavior

In [8]:
# Verify
sep_2019_to_jan_2020_training_data[non_numeric_field].unique()

array(['Cassava', 'Maize', 'Maize/Mixed', 'Sugarcane', 'Common bean',
       'Soybean'], dtype=object)

In [9]:
# Fit label encoder to match the non-numerical labels to numeric labels
le = LabelEncoder()
le.fit(sep_2019_to_jan_2020_training_data[non_numeric_field])

In [10]:
# Assign a numeric label for each non-numerical label using the label encoder.
sep_2019_to_jan_2020_training_data[numeric_field] = le.transform(sep_2019_to_jan_2020_training_data[non_numeric_field])
sep_2019_to_jan_2020_training_data.head()

Unnamed: 0,Crop_type,geometry,label
0,Cassava,"MULTIPOLYGON (((34.22980 0.56801, 34.22981 0.5...",0
1,Maize,"MULTIPOLYGON (((34.23115 0.56801, 34.23106 0.5...",2
2,Maize,"MULTIPOLYGON (((34.23271 0.56661, 34.23302 0.5...",2
3,Cassava,"MULTIPOLYGON (((34.23078 0.56774, 34.23084 0.5...",0
4,Maize,"MULTIPOLYGON (((34.23103 0.56790, 34.23124 0.5...",2


In [11]:
# Create a dictionary mapping classes to numeric labels
classes = le.classes_
class_dictionary = {crop_class: int(le.transform([crop_class])[0]) for crop_class in classes}
print("Class Dictionary:")
print(class_dictionary)
# Export class dictionary
with open(os.path.join(output_folder,"class_labels.json"), 'w', encoding='utf-8') as f:
    json.dump(class_dictionary, f, ensure_ascii=False, indent=4)

Class Dictionary:
{'Cassava': 0, 'Common bean': 1, 'Maize': 2, 'Maize/Mixed': 3, 'Soybean': 4, 'Sugarcane': 5}


In [12]:
# Verify the numeric labels in the dataframe with the class dictionary
print(sep_2019_to_jan_2020_training_data[numeric_field].unique())
print("Class Dictionary:")
print(class_dictionary)

[0 2 3 5 1 4]
Class Dictionary:
{'Cassava': 0, 'Common bean': 1, 'Maize': 2, 'Maize/Mixed': 3, 'Soybean': 4, 'Sugarcane': 5}


In [13]:
# Write the cleaned training data to disk.
sep_2019_to_jan_2020_training_data.to_file(os.path.join(output_folder, "sep_2019_to_jan_2020_training_data.geojson"))

## Prepare the training data for the September 2021 to January 2022 season.

In [14]:
# Load the September 2021 to January 2022 crop data.
sep_2021_to_jan_2022_training_data = gpd.read_file("data/2021_training_data.geojson").to_crs("EPSG:4326")
sep_2021_to_jan_2022_training_data.head()

Unnamed: 0,sample_id,landcover,field_id,parcel_id,today,country,overview_p,detail_pho,_id,pinpoint_o,...,crop_mixed,other_mix,crop_agro,dominant,crop_stage,valid,surveyed,layer,path,geometry
0,49485.0,3,70,ke_49485_70,2021-11-29,KE,,,130312824,0.35904800481725374 34.26320855626585 0.0 0.0,...,,,,,field covered,yes,yes,maize_2021,C:\DATA\EO_proposal_October2022\Data_and_metho...,"MULTIPOLYGON (((34.26329 0.35922, 34.26326 0.3..."
1,49485.0,3,35,ke_49485_35,2021-11-29,KE,,,130459988,0.35667001156916456 34.262085334701254 0.0 0.0,...,,,,,field covered,yes,yes,maize_2021,C:\DATA\EO_proposal_October2022\Data_and_metho...,"MULTIPOLYGON (((34.26295 0.35725, 34.26305 0.3..."
2,49485.0,3,25,ke_49485_25,2021-11-29,KE,,,130460250,0.3561828721677358 34.261597493923404 0.0 0.0,...,,,,,field covered,no,yes,maize_2021,C:\DATA\EO_proposal_October2022\Data_and_metho...,"MULTIPOLYGON (((34.26168 0.35657, 34.26180 0.3..."
3,62923.0,3,26,ke_62923_26,2021-11-29,KE,,,130688751,0.3657261164707961 34.35757098663882 0.0 0.0,...,,,,,field covered,yes,yes,maize_2021,C:\DATA\EO_proposal_October2022\Data_and_metho...,"MULTIPOLYGON (((34.35768 0.36563, 34.35766 0.3..."
4,72442.0,3,41,ke_72442_41,2021-11-27,KE,1638000622464.jpg,1638000638151.jpg,126939133,0.730294 34.4240785 1387.097107009286 1.940000...,...,,,,,field covered,yes,yes,maize_2021,C:\DATA\EO_proposal_October2022\Data_and_metho...,"MULTIPOLYGON (((34.42422 0.73094, 34.42441 0.7..."


In [15]:
# Only keep the layer and geometry column.
sep_2021_to_jan_2022_training_data = sep_2021_to_jan_2022_training_data[["layer", "geometry"]]
sep_2021_to_jan_2022_training_data.head()

Unnamed: 0,layer,geometry
0,maize_2021,"MULTIPOLYGON (((34.26329 0.35922, 34.26326 0.3..."
1,maize_2021,"MULTIPOLYGON (((34.26295 0.35725, 34.26305 0.3..."
2,maize_2021,"MULTIPOLYGON (((34.26168 0.35657, 34.26180 0.3..."
3,maize_2021,"MULTIPOLYGON (((34.35768 0.36563, 34.35766 0.3..."
4,maize_2021,"MULTIPOLYGON (((34.42422 0.73094, 34.42441 0.7..."


In [16]:
# Rename the layer column
sep_2021_to_jan_2022_training_data.rename(columns={"layer": non_numeric_field}, inplace=True)
sep_2021_to_jan_2022_training_data.head()

Unnamed: 0,Crop_type,geometry
0,maize_2021,"MULTIPOLYGON (((34.26329 0.35922, 34.26326 0.3..."
1,maize_2021,"MULTIPOLYGON (((34.26295 0.35725, 34.26305 0.3..."
2,maize_2021,"MULTIPOLYGON (((34.26168 0.35657, 34.26180 0.3..."
3,maize_2021,"MULTIPOLYGON (((34.35768 0.36563, 34.35766 0.3..."
4,maize_2021,"MULTIPOLYGON (((34.42422 0.73094, 34.42441 0.7..."


In [17]:
# View the unique crop types in the crop type column.
sep_2021_to_jan_2022_training_data[non_numeric_field].unique()

array(['maize_2021', 'maize_mixed_2021', 'sugarcane_2021'], dtype=object)

In [18]:
# Change the values of the crop type field to the required types.
sep_2021_to_jan_2022_training_data[non_numeric_field].replace('maize_2021', 'Maize', inplace=True)
sep_2021_to_jan_2022_training_data[non_numeric_field].replace('maize_mixed_2021', 'Maize/Mixed', inplace=True)
sep_2021_to_jan_2022_training_data[non_numeric_field].replace('sugarcane_2021', 'Sugarcane', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sep_2021_to_jan_2022_training_data[non_numeric_field].replace('maize_2021', 'Maize', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sep_2021_to_jan_2022_training_data[non_numeric_field].replace('maize_mixed_2021', 'Maize/Mixed', inplace=True)
The behavior will chan

In [19]:
# Verify
sep_2021_to_jan_2022_training_data[non_numeric_field].unique()

array(['Maize', 'Maize/Mixed', 'Sugarcane'], dtype=object)

In [20]:
# Use the label encoder from the previous sections to transform the non-numerical labels to numerical labels.
sep_2021_to_jan_2022_training_data[numeric_field] = le.transform(sep_2021_to_jan_2022_training_data[non_numeric_field])
sep_2021_to_jan_2022_training_data.head()

Unnamed: 0,Crop_type,geometry,label
0,Maize,"MULTIPOLYGON (((34.26329 0.35922, 34.26326 0.3...",2
1,Maize,"MULTIPOLYGON (((34.26295 0.35725, 34.26305 0.3...",2
2,Maize,"MULTIPOLYGON (((34.26168 0.35657, 34.26180 0.3...",2
3,Maize,"MULTIPOLYGON (((34.35768 0.36563, 34.35766 0.3...",2
4,Maize,"MULTIPOLYGON (((34.42422 0.73094, 34.42441 0.7...",2


In [21]:
# Verify the numeric labels
print(sep_2021_to_jan_2022_training_data[numeric_field].unique())
print("Class Dictionary:")
print(class_dictionary)

[2 3 5]
Class Dictionary:
{'Cassava': 0, 'Common bean': 1, 'Maize': 2, 'Maize/Mixed': 3, 'Soybean': 4, 'Sugarcane': 5}


In [22]:
# Write the cleaned training data to disk.
sep_2021_to_jan_2022_training_data.to_file(os.path.join(output_folder, "sep_2021_to_jan_2022_training_data.geojson"))

## Prepare the training data for the February 2021 to August 2021 season

In [23]:
# Load the 2021 maize data.
maize_2021 = gpd.read_file("data/2021_maize.geojson").to_crs("EPSG:4326")
mixed_maize_2021 = gpd.read_file("data/2021_mixed_maize.geojson").to_crs("EPSG:4326")

display(maize_2021.head())
display(mixed_maize_2021.head())

Unnamed: 0,OBJECTID_1,OBJECTID,Name,descriptio,layer,Shape_Leng,Shape_Le_1,Shape_Le_2,Shape_Area,geometry
0,11,11,Day 23 plot 23.1,Maize,,197.091469,0.001773,197.091469,1687.497851,"MULTIPOLYGON (((34.35160 0.57589, 34.35230 0.5..."
1,12,12,Day 24 plot 24.2,Maize,,142.024242,0.00128,142.024242,1091.795501,"MULTIPOLYGON (((34.35127 0.57581, 34.35087 0.5..."
2,13,13,Day 24 plot 24.3 plaughed,Maize,,210.630063,0.001899,210.63007,2814.044204,"MULTIPOLYGON (((34.35024 0.57673, 34.35035 0.5..."
3,15,15,Day 24 plot 24.4,Maize,,112.212733,0.001012,112.212699,979.410877,"MULTIPOLYGON (((34.35526 0.57730, 34.35519 0.5..."
4,16,16,Day 24 plot 24.6,Maize,,359.51338,0.003238,359.513022,6034.076563,"MULTIPOLYGON (((34.35451 0.57792, 34.35381 0.5..."


Unnamed: 0,OBJECTID_1,OBJECTID,Name,descriptio,layer,Shape_Leng,Shape_Le_1,Shape_Le_2,Shape_Area,geometry
0,48,48,Plot 6.7,Maize and groundnuts,plot _Day 6_polygon_16-07-2022,419.974553,0.003784,419.97447,10780.64825,"MULTIPOLYGON (((34.15574 0.47496, 34.15485 0.4..."
1,94,94,Plot 11_2.1,Maize and soyabeans,plot 11_Visited_12-07-2022,207.065394,0.001866,207.065406,2594.238143,"MULTIPOLYGON (((34.22950 0.47267, 34.22942 0.4..."
2,98,98,Plot 11_2.6,Maize and soyabeans,plot 11_Visited_12-07-2022,133.299991,0.001201,133.299991,1092.141129,"MULTIPOLYGON (((34.23625 0.47557, 34.23661 0.4..."
3,119,119,Day 15 plot 15.2_First season maize and soybeans,Maize and soyabeans,plot_Day 15 polygon_25-07-2022,167.754998,0.001511,167.755024,1735.66154,"MULTIPOLYGON (((34.25753 0.60134, 34.25800 0.6..."
4,120,120,Day 15 plot 15.1_Maize only,Maize and soyabeans,plot_Day 15 polygon_25-07-2022,194.08403,0.001749,194.083993,2484.341854,"MULTIPOLYGON (((34.25744 0.60135, 34.25733 0.6..."


In [24]:
# Keep only the descriptio and geometry column.
maize_2021 = maize_2021[["descriptio", "geometry"]]
mixed_maize_2021 = mixed_maize_2021[["descriptio", "geometry"]]

display(maize_2021.head())
display(mixed_maize_2021.head())

Unnamed: 0,descriptio,geometry
0,Maize,"MULTIPOLYGON (((34.35160 0.57589, 34.35230 0.5..."
1,Maize,"MULTIPOLYGON (((34.35127 0.57581, 34.35087 0.5..."
2,Maize,"MULTIPOLYGON (((34.35024 0.57673, 34.35035 0.5..."
3,Maize,"MULTIPOLYGON (((34.35526 0.57730, 34.35519 0.5..."
4,Maize,"MULTIPOLYGON (((34.35451 0.57792, 34.35381 0.5..."


Unnamed: 0,descriptio,geometry
0,Maize and groundnuts,"MULTIPOLYGON (((34.15574 0.47496, 34.15485 0.4..."
1,Maize and soyabeans,"MULTIPOLYGON (((34.22950 0.47267, 34.22942 0.4..."
2,Maize and soyabeans,"MULTIPOLYGON (((34.23625 0.47557, 34.23661 0.4..."
3,Maize and soyabeans,"MULTIPOLYGON (((34.25753 0.60134, 34.25800 0.6..."
4,Maize and soyabeans,"MULTIPOLYGON (((34.25744 0.60135, 34.25733 0.6..."


In [25]:
# Rename the descriptio column to the required field.
maize_2021.rename(columns={"descriptio" : non_numeric_field}, inplace=True)
mixed_maize_2021.rename(columns={"descriptio" : non_numeric_field}, inplace=True)

display(maize_2021.head())
display(mixed_maize_2021.head())

Unnamed: 0,Crop_type,geometry
0,Maize,"MULTIPOLYGON (((34.35160 0.57589, 34.35230 0.5..."
1,Maize,"MULTIPOLYGON (((34.35127 0.57581, 34.35087 0.5..."
2,Maize,"MULTIPOLYGON (((34.35024 0.57673, 34.35035 0.5..."
3,Maize,"MULTIPOLYGON (((34.35526 0.57730, 34.35519 0.5..."
4,Maize,"MULTIPOLYGON (((34.35451 0.57792, 34.35381 0.5..."


Unnamed: 0,Crop_type,geometry
0,Maize and groundnuts,"MULTIPOLYGON (((34.15574 0.47496, 34.15485 0.4..."
1,Maize and soyabeans,"MULTIPOLYGON (((34.22950 0.47267, 34.22942 0.4..."
2,Maize and soyabeans,"MULTIPOLYGON (((34.23625 0.47557, 34.23661 0.4..."
3,Maize and soyabeans,"MULTIPOLYGON (((34.25753 0.60134, 34.25800 0.6..."
4,Maize and soyabeans,"MULTIPOLYGON (((34.25744 0.60135, 34.25733 0.6..."


In [26]:
# View the unique crop types in the crop type column.
maize_2021[non_numeric_field].unique(), mixed_maize_2021[non_numeric_field].unique()

(array(['Maize'], dtype=object),
 array(['Maize and groundnuts', 'Maize and soyabeans', 'Maize and beans',
        'Maize and soyabeans and groundnuts', 'Maize and cassava',
        'Maize and beans  and soya'], dtype=object))

In [27]:
# Change the values of the crop type field to the required types.
maize_2021[non_numeric_field] = "Maize"
mixed_maize_2021[non_numeric_field] = "Maize/Mixed"

In [28]:
# Verify the change in column values
maize_2021[non_numeric_field].unique(), mixed_maize_2021[non_numeric_field].unique()

(array(['Maize'], dtype=object), array(['Maize/Mixed'], dtype=object))

In [29]:
# Concatenate the two geodataframes into one.
feb_2021_to_aug_2021_training_data = pd.concat([maize_2021, mixed_maize_2021])
feb_2021_to_aug_2021_training_data.head()

Unnamed: 0,Crop_type,geometry
0,Maize,"MULTIPOLYGON (((34.35160 0.57589, 34.35230 0.5..."
1,Maize,"MULTIPOLYGON (((34.35127 0.57581, 34.35087 0.5..."
2,Maize,"MULTIPOLYGON (((34.35024 0.57673, 34.35035 0.5..."
3,Maize,"MULTIPOLYGON (((34.35526 0.57730, 34.35519 0.5..."
4,Maize,"MULTIPOLYGON (((34.35451 0.57792, 34.35381 0.5..."


In [30]:
# Check
assert len(feb_2021_to_aug_2021_training_data) == len(maize_2021) + len(mixed_maize_2021)
feb_2021_to_aug_2021_training_data[non_numeric_field].unique()

array(['Maize', 'Maize/Mixed'], dtype=object)

In [31]:
# Use the label encoder from the previous sections to transform the non-numerical labels to numerical labels.
feb_2021_to_aug_2021_training_data[numeric_field] = le.transform(feb_2021_to_aug_2021_training_data[non_numeric_field])
feb_2021_to_aug_2021_training_data.head()

Unnamed: 0,Crop_type,geometry,label
0,Maize,"MULTIPOLYGON (((34.35160 0.57589, 34.35230 0.5...",2
1,Maize,"MULTIPOLYGON (((34.35127 0.57581, 34.35087 0.5...",2
2,Maize,"MULTIPOLYGON (((34.35024 0.57673, 34.35035 0.5...",2
3,Maize,"MULTIPOLYGON (((34.35526 0.57730, 34.35519 0.5...",2
4,Maize,"MULTIPOLYGON (((34.35451 0.57792, 34.35381 0.5...",2


In [32]:
# Verify the numeric labels
print(feb_2021_to_aug_2021_training_data[numeric_field].unique())
print("Class Dictionary:")
print(class_dictionary)

[2 3]
Class Dictionary:
{'Cassava': 0, 'Common bean': 1, 'Maize': 2, 'Maize/Mixed': 3, 'Soybean': 4, 'Sugarcane': 5}


In [33]:
# Reset the index.
feb_2021_to_aug_2021_training_data.reset_index(drop=True, inplace=True)
feb_2021_to_aug_2021_training_data

Unnamed: 0,Crop_type,geometry,label
0,Maize,"MULTIPOLYGON (((34.35160 0.57589, 34.35230 0.5...",2
1,Maize,"MULTIPOLYGON (((34.35127 0.57581, 34.35087 0.5...",2
2,Maize,"MULTIPOLYGON (((34.35024 0.57673, 34.35035 0.5...",2
3,Maize,"MULTIPOLYGON (((34.35526 0.57730, 34.35519 0.5...",2
4,Maize,"MULTIPOLYGON (((34.35451 0.57792, 34.35381 0.5...",2
...,...,...,...
102,Maize/Mixed,"MULTIPOLYGON (((34.26370 0.60301, 34.26488 0.6...",3
103,Maize/Mixed,"MULTIPOLYGON (((34.20228 0.48371, 34.20172 0.4...",3
104,Maize/Mixed,"MULTIPOLYGON (((34.19189 0.50030, 34.19189 0.5...",3
105,Maize/Mixed,"MULTIPOLYGON (((34.19746 0.48995, 34.19742 0.4...",3


In [34]:
# Write the cleaned training data to disk.
feb_2021_to_aug_2021_training_data.to_file(os.path.join(output_folder, "feb_2021_to_aug_2021_training_data.geojson"))