# Import, clean, and merge pre-existing validation data over Africa

Generating a preliminary cropland validation dataset using pre-existing datasets.

In [1]:
import sys
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.geometry import Point
sys.path.append('../Scripts')
from deafrica_plotting import map_shapefile

## GFSAD Training/Validation data

Data generated for creating global crop extent maps.  African classification method [here](https://www.mdpi.com/2072-4292/9/10/1065).  
Getting data from two sources:
1. Subsample of Africa [training data points](https://web.croplands.org/app/data/search?page=1&page_size=200)
2. [Map reference points](https://lpdaac.usgs.gov/products/gfsad30afcev001/), used for accuracy assessments

> Joining both datasets together to produce a preliminary cropland validation dataset.

Definition of cropland:
- “…lands cultivated with plants harvested for food, feed, and fiber, include both seasonal crops (e.g., wheat, rice, corn, soybeans, cotton) and continuous plantations (e.g., coffee, tea, rubber, cocoa, oil palms). Cropland fallow are lands uncultivated during a season or a year but are farmlands and are equipped for cultivation, including plantations (Teluguntla et al., 2015). Cropland extent includes all planted crops and fallow lands. Non-croplands include all other land cover classes other than croplands and cropland fallow.”
---
`Notes`
- Crop Class == 1 in the training data points. 
- In the map reference dataset, the 'Ref' column has cropland/non-cropland labels.


In [2]:
# Load the GFSAD training points from CSV (comma-separated, the pandas default).
file = "data/training_validation/GFSAD2015/GFSAD_training_data.csv"
df = pd.read_csv(file)

# Turn each (lon, lat) pair into a shapely Point and use those points as the
# geometry of a GeoDataFrame in EPSG:4326. The raw coordinate columns are
# dropped because the geometry column now carries that information.
points = [Point(lon, lat) for lon, lat in zip(df.lon, df.lat)]
GFSAD_train = gpd.GeoDataFrame(
    df.drop(columns=['lon', 'lat']),
    geometry=points,
    crs='epsg:4326',
)

In [3]:
# Load the African country boundaries, then clip the training points to
# Africa via an intersection overlay.
afr = gpd.read_file('data/african_countries.shp')
GFSAD_train_afr = gpd.overlay(
    GFSAD_train,
    afr,
    how='intersection',
)
# Optional visual check of the clipped points:
# GFSAD_train_afr.plot()

In [4]:
# Add a binary 'class' column: 1 where land_use_type == 1 (cropland in the
# training data), 0 for every other land-use value.
is_crop = GFSAD_train_afr['land_use_type'] == 1
GFSAD_train_afr['class'] = np.where(is_crop, 1, 0)

In [5]:
# Load the GFSAD map reference points.
crop_ref = gpd.read_file(
    'data/training_validation/GFSAD2015/19171_global_ref_dataset_africa.shp'
)

# Binary 'class' label: 1 where the 'Ref' column reads 'Cropland', else 0.
is_cropland = crop_ref['Ref'] == 'Cropland'
crop_ref['class'] = np.where(is_cropland, 1, 0)

In [6]:
# Join the two GeoDataFrames into a single validation set.
# The merged frame is labelled with crop_ref's CRS, so the training points are
# reprojected first whenever both CRSs are defined and differ. Without this,
# their coordinates would be tagged with a CRS they were never expressed in.
target_crs = crop_ref.crs
train_points = GFSAD_train_afr
if (target_crs is not None
        and train_points.crs is not None
        and train_points.crs != target_crs):
    train_points = train_points.to_crs(target_crs)

GFSAD_cropland_validation = gpd.GeoDataFrame(
    pd.concat([crop_ref, train_points], ignore_index=True),
    crs=target_crs,
)

In [7]:
# Clean the GeoDataFrame by removing columns that are not needed any more.
# Note: drop() raises KeyError if any listed column is absent, so this cell
# cannot be re-run on an already-cleaned frame.
unneeded_columns = [
    'Ref', 'Lat', 'Long', 'Zone', 'id', 'year', 'month',
    'country', 'land_use_type', 'crop_primary', 'crop_secondary',
    'water', 'intensity', 'source_type', 'source_class',
    'source_description', 'use_validation', 'ID',
    'CODE', 'COUNTRY',
]
GFSAD_cropland_validation = GFSAD_cropland_validation.drop(columns=unneeded_columns)

In [8]:
# Report how many points fall in each class (1 = cropland, 0 = non-cropland).
n_cropland = (GFSAD_cropland_validation['class'] == 1).sum()
n_non_cropland = (GFSAD_cropland_validation['class'] == 0).sum()
print("Cropland data points = " + str(n_cropland))
print("Non-Cropland data points = " + str(n_non_cropland))
# Optional interactive map of the points, coloured by class:
# map_shapefile(GFSAD_cropland_validation, attribute='class')

Cropland data points = 1088
Non-Cropland data points = 1626


In [10]:
# Write the merged GFSAD validation points to a shapefile.
gfsad_validation_path = "data/training_validation/GFSAD2015/cropland_prelim_validation_GFSAD.shp"
GFSAD_cropland_validation.to_file(gfsad_validation_path)

***
## UPDATE 12/5/2020

Not using code below anymore, instead going to rely on the GFSAD dataset as it is high quality and large enough for our purposes.

---

## [Bayas et al. (2017)](https://www.nature.com/articles/sdata2017136) Global Crop Reference Dataset 

Collected in Sept 2016 using geo-wiki.
Reference data from here:
- Any 30m cell classified as crop: http://store.pangaea.de/Publications/See_2017/crop_all.zip
- Control dataset, validated cells classified as crop: http://store.pangaea.de/Publications/See_2017/crop_con.zip 

Definition of cropland:
- "...the definition used for the campaign follows that of GEOGLAM/JECAM.  ‘The annual cropland from a remote sensing perspective is a piece of land of a minimum of 0.25 ha (minimum width of 30 m) that is sowed/planted and harvestable at least once within the 12 months after the sowing/planting date. The annual cropland produces an herbaceous cover and is sometimes combined with some tree or woody vegetation’. According to this GEOGLAM/JECAM definition, perennial crops, agroforestry plantations, palm oil, coffee, tree crops and fallows are not included in the cropland class"

> Dataset contains only 'cropland' points, no other land classes. As the dataset contains nearly 120,000 points, it's probably best to randomly sample the shapefile with `df.sample(n=2000)`.


In [None]:
# Choose which reference CSV to load; swap the commented line to use the
# control dataset instead of the full one.
file = "data/training_data/global_crop_reference_dataset_See2017.csv"
# file = "data/training_data/global_crop_reference_dataset_control.csv"

# Comma is the default pandas delimiter, so no explicit separator is needed.
df = pd.read_csv(file)

# Preview the first rows.
df.head()

In [None]:
# Build point geometries from the cell-centroid coordinates and wrap the
# table in a GeoDataFrame (EPSG:4326). The coordinate columns are dropped
# because the geometry column replaces them.
centroids = [Point(x, y) for x, y in zip(df.centroid_X, df.centroid_Y)]
crop_train = gpd.GeoDataFrame(
    df.drop(columns=['centroid_X', 'centroid_Y']),
    geometry=centroids,
    crs='epsg:4326',
)

In [None]:
# Load the African country boundaries used to clip the reference points.
africa_boundary_path = 'data/african_countries.shp'
afr = gpd.read_file(africa_boundary_path)

In [None]:
# Clip the crop reference points to Africa via an intersection overlay.
crop_train_afr = gpd.overlay(
    crop_train,
    afr,
    how='intersection',
)

In [None]:
# This dataset holds only cropland observations, so every row is labelled
# class 1 (cropland).
crop_train_afr['class'] = 1

In [None]:
# Randomly subsample up to 2000 points, with a fixed seed for reproducibility.
# DataFrame.sample raises ValueError when n exceeds the number of rows (for
# example if the smaller control CSV was loaded), so the request is capped at
# the number of points actually available.
n_points = min(2000, len(crop_train_afr))
crop_train_afr = crop_train_afr.sample(n=n_points, random_state=1)

In [None]:
# Optional map of the sampled points (too many points to plot when the full
# '...all_data.csv' is loaded):
# map_shapefile(crop_train_afr, attribute='ID')

# Number of points kept after clipping and sampling.
crop_train_afr.shape[0]

In [None]:
# Write the sampled Africa crop reference points to a shapefile.
crop_ref_sample_path = "data/training_data/globalCropRefernceData_Africa_2016_2000points.shp"
crop_train_afr.to_file(crop_ref_sample_path)

## CrowdVal project data

Collected using geo-wiki by/for the ESA CCI Land Cover Team to assist in validating their prototype 20m Sentinel 2A landcover product.
Data available from here: https://geo-wiki.org/Application/index.php

Class Key:
* cropland == 4
* built-up == 8

> Ignoring South Africa data at the moment because it was validated at 10m resolution and it is unclear how to upscale it to 20m pixels.

In [None]:
# Load the CrowdVal validation points, one shapefile per country.
kenya = gpd.read_file(
    'data/training_data/CrowdVal/CrowdVal_kenya_final_points.shp'
)
ivy_coast = gpd.read_file(
    'data/training_data/CrowdVal/CrowdVal_Cote_dIvoire_final_points.shp'
)
gabon = gpd.read_file(
    'data/training_data/CrowdVal/CrowdVal_Gabon_final_points.shp'
)
# South Africa is deliberately left out for now:
# south_afr = gpd.read_file('data/training_data/CrowdVal/CrowdVal_southafrica_final_points.shp')

In [None]:
# The class code lives under a different column name per country (GRID_CODE
# for Kenya, ValValue for the other two). Copy it into a common int8 'class'
# column so the three frames share one attribute.
kenya['class'] = kenya['GRID_CODE'].astype(np.int8)
ivy_coast['class'] = ivy_coast['ValValue'].astype(np.int8)
gabon['class'] = gabon['ValValue'].astype(np.int8)

In [None]:
# Total number of CrowdVal points across the three countries.
sum(len(country) for country in (kenya, ivy_coast, gabon))

In [None]:
# Ivory Coast: total number of points, then how many are cropland (class 4).
print(len(ivy_coast))
print((ivy_coast['class'] == 4).sum())

In [None]:
# Write each cleaned CrowdVal frame (now carrying the common 'class' column)
# to its own shapefile. The three writes are independent of one another.
kenya.to_file('data/training_data/CrowdVal/cleaned/kenya_crowdval_cleaned.shp')
ivy_coast.to_file('data/training_data/CrowdVal/cleaned/ivory_coast_crowdval_cleaned.shp')
gabon.to_file('data/training_data/CrowdVal/cleaned/gabon_crowdval_cleaned.shp')

## Merge datasets into one common cropland/non-cropland training dataset



In [None]:
# Open every dataset that feeds the merged training set.
# NOTE(review): the control and GFSAD shapefiles read below are not written by
# any cell shown in this notebook -- confirm they exist before running.

# Crop reference data: the 2000-point subsample (swap in the commented line to
# load the full dataset instead) plus the control set.
# crop_ref = gpd.read_file("data/training_data/globalCropRefernceData_Africa_2016_allData.shp")
crop_ref = gpd.read_file(
    "data/training_data/globalCropRefernceData_Africa_2016_2000points.shp"
)
crop_ref_control = gpd.read_file(
    "data/training_data/globalCropRefernceData_Africa_2016_control.shp"
)

# GFSAD training points.
gfsad = gpd.read_file("data/training_data/GFSAD_training_Africa.shp")

# Cleaned CrowdVal points, one file per country.
gabon = gpd.read_file(
    'data/training_data/CrowdVal/cleaned/gabon_crowdval_cleaned.shp'
)
ivy_coast = gpd.read_file(
    'data/training_data/CrowdVal/cleaned/ivory_coast_crowdval_cleaned.shp'
)
kenya = gpd.read_file(
    'data/training_data/CrowdVal/cleaned/kenya_crowdval_cleaned.shp'
)

In [None]:
# Reclassify to a binary label: 1 = crop, 0 = non-crop.
# CrowdVal encodes cropland as class 4.
gabon['class'] = np.where(gabon['class'].eq(4), 1, 0)
kenya['class'] = np.where(kenya['class'].eq(4), 1, 0)
ivy_coast['class'] = np.where(ivy_coast['class'].eq(4), 1, 0)

# GFSAD encodes cropland as class 1; everything else becomes 0.
gfsad['class'] = np.where(gfsad['class'].eq(1), 1, 0)

In [None]:
# Optional subsetting: keep a random fraction of each CrowdVal dataset, since
# they can be very large. The sample size is truncated to a whole number of
# rows and the seed is fixed so the subset is reproducible.
perc = 0.1
# crop_ref is not subsampled here (the 2000-point file is already a subsample):
# crop_ref = crop_ref.sample(n=int(len(crop_ref)*perc), random_state=1)
kenya, gabon, ivy_coast = (
    country.sample(n=int(len(country) * perc), random_state=1)
    for country in (kenya, gabon, ivy_coast)
)

In [None]:
# Merge all the datasets together into one GeoDataFrame.
# The result is labelled with crop_ref's CRS, so any input whose CRS is
# defined and differs from it is reprojected first. Without this, its
# coordinates would be tagged with a CRS they were never expressed in.
gdf_list = [crop_ref, gfsad, gabon, ivy_coast, kenya, crop_ref_control]
target_crs = crop_ref.crs
aligned_gdfs = [
    gdf.to_crs(target_crs)
    if (target_crs is not None and gdf.crs is not None and gdf.crs != target_crs)
    else gdf
    for gdf in gdf_list
]
train = gpd.GeoDataFrame(
    pd.concat(aligned_gdfs, ignore_index=True),
    crs=target_crs,
)

In [None]:
# Keep only the geometry and the binary class label.
train = train.filter(items=['geometry', 'class'])

In [None]:
# Number of non-crop (class 0) points in the merged set.
(train['class'] == 0).sum()

In [None]:
# Optional visual checks of the merged points:
# map_shapefile(train.sample(n=500), attribute='class')
# train.plot(figsize=(10,10), markersize=5)

# Total number of points in the merged training set.
train.shape[0]

In [None]:
# Write the merged cropland / non-cropland points to a shapefile.
merged_output_path = 'data/training_data/cropland_prelim_validation.shp'
train.to_file(merged_output_path)