# Prepare the data for upload to Geo Engine and training.

For this notebook the following packages are required:

In [2]:
import geopandas as gpd
import pandas as pd
import numpy as np




Download the EuroCrops data and extract the field polygons for NRW, Germany. The data is available under the [Creative Commons Attribution 4.0 International License](https://creativecommons.org/licenses/by/4.0/). It can be downloaded from [here](https://github.com/maja601/EuroCrops#vectordata_zenodo).

Then read the polygons into a geopandas dataframe:

In [3]:
# Location of the extracted EuroCrops GeoPackage for NRW (2021).
# Adjust this path to wherever the download was unpacked.
nrw_gpkg_path = "~/Downloads/eurocrop/NRW/DE_NRW_2021_EC21_utm32n.gpkg"

# Read all field polygons into a GeoDataFrame and display it.
nrw_crops_2021 = gpd.read_file(nrw_gpkg_path)
nrw_crops_2021

Unnamed: 0,ID,INSPIRE_ID,FLIK,AREA_HA,CODE,CODE_TXT,USE_CODE,USE_TXT,D_PG,CROPDIV,EFA,ELER,WJ,DAT_BEARB,EC_trans_n,EC_hcat_n,EC_hcat_c,geometry
0,4598773,https://geodaten.nrw.de/id/inspire-lu-ts/exist...,DENWLI0544130746,1.5204,311,Winterraps,OE,Ölsaaten,N,N,N,N,2021,2021-03-12,Winter rape,winter_rapeseed_rape,3301060401,"POLYGON ((428647.740 5711831.893, 428651.689 5..."
1,4598772,https://geodaten.nrw.de/id/inspire-lu-ts/exist...,DENWLI0544130596,2.2812,131,Wintergerste,GT,Getreide,N,N,N,N,2021,2021-03-12,Winter barley,winter_barley,3301010401,"POLYGON ((427717.449 5710011.130, 427709.347 5..."
2,4598771,https://geodaten.nrw.de/id/inspire-lu-ts/exist...,DENWLI0544130402,0.8311,115,Winterweichweizen,GT,Getreide,N,N,N,N,2021,2021-03-12,Winter soft wheat,winter_common_soft_wheat,3301010101,"POLYGON ((427337.557 5710068.068, 427332.544 5..."
3,5447571,https://geodaten.nrw.de/id/inspire-lu-ts/exist...,DENWLI0548091835,4.7241,459,Grünland (Dauergrünland),GL,Dauergrünland,Y,N,N,Y,2021,2021-09-24,Grassland (permanent grassland),pasture_meadow_grassland_grass,3302000000,"POLYGON ((376283.353 5665431.250, 376308.653 5..."
4,5447586,https://geodaten.nrw.de/id/inspire-lu-ts/exist...,DENWLI0548091988,6.1005,459,Grünland (Dauergrünland),GL,Dauergrünland,Y,N,N,Y,2021,2021-09-24,Grassland (permanent grassland),pasture_meadow_grassland_grass,3302000000,"POLYGON ((376495.069 5665848.269, 376496.653 5..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
732840,5452946,https://geodaten.nrw.de/id/inspire-lu-ts/exist...,DENWLI0544044277,0.2245,459,Grünland (Dauergrünland),GL,Dauergrünland,Y,N,N,N,2021,2021-09-28,Grassland (permanent grassland),pasture_meadow_grassland_grass,3302000000,"POLYGON ((318722.168 5716489.664, 318717.330 5..."
732841,5453270,https://geodaten.nrw.de/id/inspire-lu-ts/exist...,DENWLI0544053515,0.5144,459,Grünland (Dauergrünland),GL,Dauergrünland,Y,N,N,N,2021,2021-09-28,Grassland (permanent grassland),pasture_meadow_grassland_grass,3302000000,"POLYGON ((334648.317 5714560.041, 334636.516 5..."
732842,5436890,https://geodaten.nrw.de/id/inspire-lu-ts/exist...,DENWLI0543093342,0.7678,424,Ackergras,AF,Ackerfutter,N,N,N,N,2021,2021-09-16,Arable grass,pasture_meadow_grassland_grass,3302000000,"POLYGON ((381352.164 5720357.938, 381350.583 5..."
732843,5479175,https://geodaten.nrw.de/id/inspire-lu-ts/exist...,DENWLI0543123433,0.0406,459,Grünland (Dauergrünland),GL,Dauergrünland,Y,N,N,N,2021,2021-10-15,Grassland (permanent grassland),pasture_meadow_grassland_grass,3302000000,"POLYGON ((412042.260 5718477.153, 412041.755 5..."


To make the example simpler, we just use the centroids of the polygons as points:

(A better approach would be to place multiple sampling points within each polygon.)

In [4]:
# Store the centroid of each row's active geometry in a new "points" column.
nrw_crops_2021["points"] = nrw_crops_2021.geometry.centroid

# Make "points" the active geometry column (the frame is modified in place),
# so the following cells operate on the points.
nrw_crops_2021.set_geometry("points", inplace=True)
nrw_crops_2021


Unnamed: 0,ID,INSPIRE_ID,FLIK,AREA_HA,CODE,CODE_TXT,USE_CODE,USE_TXT,D_PG,CROPDIV,EFA,ELER,WJ,DAT_BEARB,EC_trans_n,EC_hcat_n,EC_hcat_c,geometry,points
0,4598773,https://geodaten.nrw.de/id/inspire-lu-ts/exist...,DENWLI0544130746,1.5204,311,Winterraps,OE,Ölsaaten,N,N,N,N,2021,2021-03-12,Winter rape,winter_rapeseed_rape,3301060401,"POLYGON ((428647.740 5711831.893, 428651.689 5...",POINT (428690.027 5711938.189)
1,4598772,https://geodaten.nrw.de/id/inspire-lu-ts/exist...,DENWLI0544130596,2.2812,131,Wintergerste,GT,Getreide,N,N,N,N,2021,2021-03-12,Winter barley,winter_barley,3301010401,"POLYGON ((427717.449 5710011.130, 427709.347 5...",POINT (427819.337 5710040.545)
2,4598771,https://geodaten.nrw.de/id/inspire-lu-ts/exist...,DENWLI0544130402,0.8311,115,Winterweichweizen,GT,Getreide,N,N,N,N,2021,2021-03-12,Winter soft wheat,winter_common_soft_wheat,3301010101,"POLYGON ((427337.557 5710068.068, 427332.544 5...",POINT (427320.866 5710158.178)
3,5447571,https://geodaten.nrw.de/id/inspire-lu-ts/exist...,DENWLI0548091835,4.7241,459,Grünland (Dauergrünland),GL,Dauergrünland,Y,N,N,Y,2021,2021-09-24,Grassland (permanent grassland),pasture_meadow_grassland_grass,3302000000,"POLYGON ((376283.353 5665431.250, 376308.653 5...",POINT (376382.931 5665589.643)
4,5447586,https://geodaten.nrw.de/id/inspire-lu-ts/exist...,DENWLI0548091988,6.1005,459,Grünland (Dauergrünland),GL,Dauergrünland,Y,N,N,Y,2021,2021-09-24,Grassland (permanent grassland),pasture_meadow_grassland_grass,3302000000,"POLYGON ((376495.069 5665848.269, 376496.653 5...",POINT (376661.541 5665982.356)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
732840,5452946,https://geodaten.nrw.de/id/inspire-lu-ts/exist...,DENWLI0544044277,0.2245,459,Grünland (Dauergrünland),GL,Dauergrünland,Y,N,N,N,2021,2021-09-28,Grassland (permanent grassland),pasture_meadow_grassland_grass,3302000000,"POLYGON ((318722.168 5716489.664, 318717.330 5...",POINT (318725.847 5716507.765)
732841,5453270,https://geodaten.nrw.de/id/inspire-lu-ts/exist...,DENWLI0544053515,0.5144,459,Grünland (Dauergrünland),GL,Dauergrünland,Y,N,N,N,2021,2021-09-28,Grassland (permanent grassland),pasture_meadow_grassland_grass,3302000000,"POLYGON ((334648.317 5714560.041, 334636.516 5...",POINT (334675.862 5714607.930)
732842,5436890,https://geodaten.nrw.de/id/inspire-lu-ts/exist...,DENWLI0543093342,0.7678,424,Ackergras,AF,Ackerfutter,N,N,N,N,2021,2021-09-16,Arable grass,pasture_meadow_grassland_grass,3302000000,"POLYGON ((381352.164 5720357.938, 381350.583 5...",POINT (381364.720 5720469.927)
732843,5479175,https://geodaten.nrw.de/id/inspire-lu-ts/exist...,DENWLI0543123433,0.0406,459,Grünland (Dauergrünland),GL,Dauergrünland,Y,N,N,N,2021,2021-10-15,Grassland (permanent grassland),pasture_meadow_grassland_grass,3302000000,"POLYGON ((412042.260 5718477.153, 412041.755 5...",POINT (412151.911 5718571.206)


Get the bounds of the points. This is the max area we can use for the training data:

In [5]:
# `total_bounds` holds the overall extent; its four entries are read in the
# order xmin, ymin, xmax, ymax.
bounds_array = nrw_crops_2021.total_bounds
xmin, ymin, xmax, ymax = bounds_array

(xmin, ymin, xmax, ymax)

(280415.6351451202, 5577765.099125725, 531691.9986338568, 5820033.360430809)

To limit the size of the training data, we use a smaller area with approximately 50x50km:

In [6]:
# Smaller training area, overriding the full extent computed before.
# Corner coordinates: 421395 5681078 : 476201 5727833
# (original note: Willingen, Lippstadt, Werl).
xmin, ymin = 421395, 5681078
xmax, ymax = 476201, 5727833

# Show the box together with its width and height.
(xmin, ymin, xmax, ymax, xmax - xmin, ymax - ymin)

(421395, 5681078, 476201, 5727833, 54806, 46755)

Use the spatial index to select the points that lie inside the bounding box (the matching rows are returned in sorted order):

In [7]:
from shapely.geometry import box

# Query the spatial index of the active geometry (the centroids) with the
# training bounding box. `sindex.query` returns integer *positions* of the
# matching rows, not index labels, so the rows are selected with `.iloc`.
# (`.loc` only gives the same result while the frame has a default RangeIndex.)
# `sort=True` returns the positions in ascending order.
bbox_positions = nrw_crops_2021.sindex.query(box(xmin, ymin, xmax, ymax), sort=True)
nrw_crops_2021_idx_query = nrw_crops_2021.iloc[bbox_positions]
nrw_crops_2021_idx_query

Unnamed: 0,ID,INSPIRE_ID,FLIK,AREA_HA,CODE,CODE_TXT,USE_CODE,USE_TXT,D_PG,CROPDIV,EFA,ELER,WJ,DAT_BEARB,EC_trans_n,EC_hcat_n,EC_hcat_c,geometry,points
0,4598773,https://geodaten.nrw.de/id/inspire-lu-ts/exist...,DENWLI0544130746,1.5204,311,Winterraps,OE,Ölsaaten,N,N,N,N,2021,2021-03-12,Winter rape,winter_rapeseed_rape,3301060401,"POLYGON ((428647.740 5711831.893, 428651.689 5...",POINT (428690.027 5711938.189)
1,4598772,https://geodaten.nrw.de/id/inspire-lu-ts/exist...,DENWLI0544130596,2.2812,131,Wintergerste,GT,Getreide,N,N,N,N,2021,2021-03-12,Winter barley,winter_barley,3301010401,"POLYGON ((427717.449 5710011.130, 427709.347 5...",POINT (427819.337 5710040.545)
2,4598771,https://geodaten.nrw.de/id/inspire-lu-ts/exist...,DENWLI0544130402,0.8311,115,Winterweichweizen,GT,Getreide,N,N,N,N,2021,2021-03-12,Winter soft wheat,winter_common_soft_wheat,3301010101,"POLYGON ((427337.557 5710068.068, 427332.544 5...",POINT (427320.866 5710158.178)
265,4722182,https://geodaten.nrw.de/id/inspire-lu-ts/exist...,DENWLI0546141785,0.3609,459,Grünland (Dauergrünland),GL,Dauergrünland,Y,N,N,N,2021,2021-04-06,Grassland (permanent grassland),pasture_meadow_grassland_grass,3302000000,"POLYGON ((431497.348 5693793.274, 431491.715 5...",POINT (431527.388 5693772.886)
266,4722181,https://geodaten.nrw.de/id/inspire-lu-ts/exist...,DENWLI0546141073,0.3480,459,Grünland (Dauergrünland),GL,Dauergrünland,Y,N,N,N,2021,2021-04-06,Grassland (permanent grassland),pasture_meadow_grassland_grass,3302000000,"POLYGON ((431525.914 5693538.588, 431503.194 5...",POINT (431535.193 5693614.690)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
732423,5337105,https://geodaten.nrw.de/id/inspire-lu-ts/exist...,DENWLI0545171291,0.8594,115,Winterweichweizen,GT,Getreide,N,N,N,Y,2021,2021-05-17,Winter soft wheat,winter_common_soft_wheat,3301010101,"POLYGON ((472307.957 5696538.366, 472288.909 5...",POINT (472357.075 5696612.529)
732424,5337108,https://geodaten.nrw.de/id/inspire-lu-ts/exist...,DENWLI0545171813,0.4455,115,Winterweichweizen,GT,Getreide,N,N,N,Y,2021,2021-05-17,Winter soft wheat,winter_common_soft_wheat,3301010101,"POLYGON ((471927.729 5697668.163, 472056.412 5...",POINT (472016.875 5697690.039)
732425,5337107,https://geodaten.nrw.de/id/inspire-lu-ts/exist...,DENWLI0545171599,0.7622,115,Winterweichweizen,GT,Getreide,N,N,N,Y,2021,2021-05-17,Winter soft wheat,winter_common_soft_wheat,3301010101,"POLYGON ((471960.746 5696305.116, 471977.685 5...",POINT (471981.413 5696219.338)
732426,5337106,https://geodaten.nrw.de/id/inspire-lu-ts/exist...,DENWLI0545171567,4.2636,411,Silomais (als Hauptfutter),AF,Ackerfutter,N,N,N,Y,2021,2021-05-17,Silage maize (as staple feed),green_silo_maize,3301090400,"POLYGON ((471606.399 5697163.552, 471654.688 5...",POINT (471704.064 5697043.769)


Now let's see how the labels are distributed.

First, for "USE_TXT":

In [8]:
# Count the non-null entries of every column per "USE_TXT" label.
rows_by_use_txt = nrw_crops_2021_idx_query.groupby(by="USE_TXT")
xyz = rows_by_use_txt.count()
xyz

Unnamed: 0_level_0,ID,INSPIRE_ID,FLIK,AREA_HA,CODE,CODE_TXT,USE_CODE,D_PG,CROPDIV,EFA,ELER,WJ,DAT_BEARB,EC_trans_n,EC_hcat_n,EC_hcat_c,geometry,points
USE_TXT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Ackerfutter,5241,5241,5241,5241,5241,5241,5241,5241,5241,5241,5241,5241,5241,5241,5241,5241,5241,5241
Dauergrünland,19596,19596,19596,19596,19596,19596,19596,19596,19596,19596,19596,19596,19596,19596,19596,19596,19596,19596
Dauerkulturen,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326
Eiweißpflanzen,728,728,728,728,728,728,728,728,728,728,728,728,728,728,728,728,728,728
Energiepflanzen,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70
Gemüse,195,195,195,195,195,195,195,195,195,195,195,195,195,195,195,195,195,195
Getreide,13971,13971,13971,13971,13971,13971,13971,13971,13971,13971,13971,13971,13971,13971,13971,13971,13971,13971
Hackfrüchte,1099,1099,1099,1099,1099,1099,1099,1099,1099,1099,1099,1099,1099,1099,1099,1099,1099,1099
Küchenkräuter,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
Sonstiges,778,778,778,778,778,778,778,778,778,778,778,778,778,778,778,778,778,778


Then for "EC_hcat_c":

In [9]:
# Count the non-null entries of every column per "EC_hcat_c" code.
rows_by_hcat_code = nrw_crops_2021_idx_query.groupby(by="EC_hcat_c")
xyz = rows_by_hcat_code.count()
xyz

Unnamed: 0_level_0,ID,INSPIRE_ID,FLIK,AREA_HA,CODE,CODE_TXT,USE_CODE,USE_TXT,D_PG,CROPDIV,EFA,ELER,WJ,DAT_BEARB,EC_trans_n,EC_hcat_n,geometry,points
EC_hcat_c,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
3301010101,6220,6220,6220,6220,6220,6220,6220,6220,6220,6220,6220,6220,6220,6220,6220,6220,6220,6220
3301010103,77,77,77,77,77,77,77,77,77,77,77,77,77,77,77,77,77,77
3301010201,74,74,74,74,74,74,74,74,74,74,74,74,74,74,74,74,74,74
3301010203,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13
3301010301,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3304040000,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
3306000000,233,233,233,233,233,233,233,233,233,233,233,233,233,233,233,233,233,233
3306010000,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24
3308000000,937,937,937,937,937,937,937,937,937,937,937,937,937,937,937,937,937,937


In [10]:
# Count the non-null entries of every column per "USE_CODE" label.
rows_by_use_code = nrw_crops_2021_idx_query.groupby(by="USE_CODE")
xyz = rows_by_use_code.count()
xyz

Unnamed: 0_level_0,ID,INSPIRE_ID,FLIK,AREA_HA,CODE,CODE_TXT,USE_TXT,D_PG,CROPDIV,EFA,ELER,WJ,DAT_BEARB,EC_trans_n,EC_hcat_n,EC_hcat_c,geometry,points
USE_CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
AF,5241,5241,5241,5241,5241,5241,5241,5241,5241,5241,5241,5241,5241,5241,5241,5241,5241,5241
DA,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326
EP,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70
EW,728,728,728,728,728,728,728,728,728,728,728,728,728,728,728,728,728,728
GL,19596,19596,19596,19596,19596,19596,19596,19596,19596,19596,19596,19596,19596,19596,19596,19596,19596,19596
GM,195,195,195,195,195,195,195,195,195,195,195,195,195,195,195,195,195,195
GT,13971,13971,13971,13971,13971,13971,13971,13971,13971,13971,13971,13971,13971,13971,13971,13971,13971,13971
HF,1099,1099,1099,1099,1099,1099,1099,1099,1099,1099,1099,1099,1099,1099,1099,1099,1099,1099
HP,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
OE,1419,1419,1419,1419,1419,1419,1419,1419,1419,1419,1419,1419,1419,1419,1419,1419,1419,1419


In [None]:
# Numeric id for each "USE_CODE" label: the codes are numbered consecutively,
# starting at 1, in the order listed here.
code_id_map = {
    use_code: class_id
    for class_id, use_code in enumerate(
        (
            'AF', 'DA', 'EP', 'EW', 'GL', 'GM', 'GT',
            'HF', 'HP', 'OE', 'PA', 'SF', 'SL', 'ZP',
        ),
        start=1,
    )
}

Get the points with IDs for later use:

In [12]:
# Keep only the identifier columns, the point geometry and the two land-use
# label columns.
nrw_crops_2021_points_code = nrw_crops_2021_idx_query[
    [
        "ID",
        "points",
        "INSPIRE_ID",
        "USE_CODE",
        "USE_TXT",
    ]
]
nrw_crops_2021_points_code

Unnamed: 0,ID,points,INSPIRE_ID,USE_CODE,USE_TXT
0,4598773,POINT (428690.027 5711938.189),https://geodaten.nrw.de/id/inspire-lu-ts/exist...,OE,Ölsaaten
1,4598772,POINT (427819.337 5710040.545),https://geodaten.nrw.de/id/inspire-lu-ts/exist...,GT,Getreide
2,4598771,POINT (427320.866 5710158.178),https://geodaten.nrw.de/id/inspire-lu-ts/exist...,GT,Getreide
265,4722182,POINT (431527.388 5693772.886),https://geodaten.nrw.de/id/inspire-lu-ts/exist...,GL,Dauergrünland
266,4722181,POINT (431535.193 5693614.690),https://geodaten.nrw.de/id/inspire-lu-ts/exist...,GL,Dauergrünland
...,...,...,...,...,...
732423,5337105,POINT (472357.075 5696612.529),https://geodaten.nrw.de/id/inspire-lu-ts/exist...,GT,Getreide
732424,5337108,POINT (472016.875 5697690.039),https://geodaten.nrw.de/id/inspire-lu-ts/exist...,GT,Getreide
732425,5337107,POINT (471981.413 5696219.338),https://geodaten.nrw.de/id/inspire-lu-ts/exist...,GT,Getreide
732426,5337106,POINT (471704.064 5697043.769),https://geodaten.nrw.de/id/inspire-lu-ts/exist...,AF,Ackerfutter


Generate a new GeoPackage file with the points:

In [10]:
# Use every selected point. An earlier (disabled) variant instead drew a sample
# per class with `.groupby("CODE").sample(random_state=1337, replace=True)`,
# optionally with `frac=0.1`, followed by `reset_index(inplace=True)`.
# NOTE(review): "CODE" is not among the columns kept in
# `nrw_crops_2021_points_code`, so that variant would need the column added first.
#
# `drop=False` (the default) keeps the previous row labels as an "index" column
# and gives the frame a fresh 0..n-1 index.
group_sample = nrw_crops_2021_points_code.reset_index(drop=False)

In [11]:
# Select the points inside the training bounding box once more via the spatial
# index. `sindex.query` returns integer *positions*, so the rows are selected
# with `.iloc` (label-based `.loc` only matches while the index is 0..n-1).
bbox_positions = group_sample.sindex.query(box(xmin, ymin, xmax, ymax), sort=True)
group_sample_idx_query = group_sample.iloc[bbox_positions]

# Write the selected points to a GeoPackage file in the working directory.
group_sample_idx_query.to_file("group_sample_frac1_inspireId_use_utm32n.gpkg", driver="GPKG")
