# Import, clean, and merge training data over Africa

In [1]:
import sys
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.geometry import Point
sys.path.append('../Scripts')
from deafrica_plotting import map_shapefile

## GFSAD Training data

Data generated for creating global crop extent maps.  African classification method [here](https://www.mdpi.com/2072-4292/9/10/1065).  Getting data from here: https://web.croplands.org/app/data/search?page=1&page_size=200

Definition of cropland:
- “…lands cultivated with plants harvested for food, feed, and fiber, include both seasonal crops (e.g., wheat, rice, corn, soybeans, cotton) and continuous plantations (e.g., coffee, tea, rubber, cocoa, oil palms). Cropland fallow are lands uncultivated during a season or a year but are farmlands and are equipped for cultivation, including plantations (Teluguntla et al., 2015). Cropland extent includes all planted crops and fallow lands. Non-croplands include all other land cover classes other than croplands and cropland fallow.”

Crop Class == 1


In [2]:
# Read the GFSAD reference samples and preview the first rows.
file = "data/training_data/GFSAD_training_data.csv"
df = pd.read_csv(file, sep=",")
df.head()

Unnamed: 0,id,year,month,lat,lon,country,land_use_type,crop_primary,crop_secondary,water,intensity,source_type,source_class,source_description,use_validation
0,165750,2015,7,34.081668,68.181839,Afghanistan,1,0,0,0,0,derived,,labeled_vhri,False
1,141512,2015,4,32.180843,62.988052,Afghanistan,1,0,0,0,0,derived,,labeled_vhri,False
2,137125,2016,1,41.027053,19.620895,Albania,1,0,0,0,0,derived,,labeled_vhri,False
3,164279,2016,7,36.194971,0.466232,Algeria,1,0,0,0,0,derived,,labeled_vhri,False
4,156753,2015,9,35.508195,1.856003,Algeria,1,0,0,0,0,derived,,labeled_vhri,False


In [3]:
# Build one point per row from the lon/lat columns, then attach the points to
# the remaining attribute columns as an EPSG:4326 GeoDataFrame.
gfsad_points = [Point(lon, lat) for lon, lat in zip(df['lon'], df['lat'])]
GFSAD_train = gpd.GeoDataFrame(
    df.drop(columns=['lon', 'lat']),
    geometry=gfsad_points,
    crs='epsg:4326',
)

In [4]:
# African country polygons, used below to clip the GFSAD points.
africa_shp = 'data/african_countries.shp'
afr = gpd.read_file(africa_shp)

In [5]:
# Keep only the GFSAD samples that intersect the African country layer.
GFSAD_train_afr = gpd.overlay(df1=GFSAD_train, df2=afr, how='intersection')
# GFSAD_train_afr.plot()

In [6]:
# Expose the GFSAD land-use code under the shared 'class' column name.
GFSAD_train_afr = GFSAD_train_afr.assign(
    **{'class': GFSAD_train_afr['land_use_type']}
)

In [8]:
# Number of GFSAD samples that fall inside Africa.
GFSAD_train_afr.shape[0]

966

In [10]:
# map_shapefile(GFSAD_train_afr, attribute='class')

In [None]:
# Persist the Africa-only GFSAD samples; the merge section reads this file.
gfsad_africa_shp = "data/training_data/GFSAD_training_Africa.shp"
GFSAD_train_afr.to_file(gfsad_africa_shp)

## [Bayas et al. (2017)](https://www.nature.com/articles/sdata2017136) Global Crop Reference Dataset 

Collected in Sept 2016 using geo-wiki.
Reference data from here:
- Any 30m cell classified as crop: http://store.pangaea.de/Publications/See_2017/crop_all.zip
- Control dataset, validated cells classified as crop: http://store.pangaea.de/Publications/See_2017/crop_con.zip 

Definition of cropland:
- "...the definition used for the campaign follows that of GEOGLAM/JECAM.  The annual cropland from a remote sensing perspective is a piece of land of a minimum of 0.25 ha (minimum width of 30 m) that is sowed/planted and harvestable at least once within the 12 months after the sowing/planting date. The annual cropland produces an herbaceous cover and is sometimes combined with some tree or woody vegetation’. According to this GEOGLAM/JECAM definition, perennial crops, agroforestry plantations, palm oil, coffee, tree crops and fallows are not included in the cropland class"

Dataset contains only 'cropland' points, no other land classes. As the dataset contains nearly 120,000 points, it's probably best to randomly sample the shapefile with `df.sample(n=2000)`.


In [20]:
# Pick which reference CSV to load: the full dataset (commented out) or the
# control subset, then preview the first rows.
# file = "data/training_data/global_crop_reference_dataset_See2017.csv"
file = "data/training_data/global_crop_reference_dataset_control.csv"
df = pd.read_csv(file, sep=",")
df.head()

Unnamed: 0,locationid,userid,centroid_X,centroid_Y
0,1642116,222222,-1.25119,52.952381
1,1642116,222222,-1.250595,52.952381
2,1642116,222222,-1.25119,52.952976
3,1642116,222222,-1.250595,52.952976
4,1642116,222222,-1.25119,52.953571


In [21]:
# Build one point per row from the centroid columns, then attach the points to
# the remaining attribute columns as an EPSG:4326 GeoDataFrame.
crop_points = [
    Point(cx, cy) for cx, cy in zip(df['centroid_X'], df['centroid_Y'])
]
crop_train = gpd.GeoDataFrame(
    df.drop(columns=['centroid_X', 'centroid_Y']),
    geometry=crop_points,
    crs='epsg:4326',
)

In [23]:
# African country polygons, used below to clip the crop reference points.
africa_shp = 'data/african_countries.shp'
afr = gpd.read_file(africa_shp)

In [24]:
# Keep only the crop reference samples that intersect the African country layer.
crop_train_afr = gpd.overlay(df1=crop_train, df2=afr, how='intersection')

In [25]:
# Every row in this dataset is cropland, so label them all as class 1.
crop_train_afr = crop_train_afr.assign(**{'class': 1})

In [26]:
# map_shapefile(crop_train_afr, attribute='ID') #can't plot all points if loading '...all_data.csv'
# Number of crop reference samples that fall inside Africa.
crop_train_afr.shape[0]

514

In [None]:
# The CSV loaded in this section is the *control* subset, and the merge section
# reads "..._control.shp", so write to that name. (This cell previously wrote
# "..._allData.shp", which left the merge step without its input file.)
# Use the "..._allData.shp" name instead when loading the full See2017 CSV.
crop_train_afr.to_file("data/training_data/globalCropRefernceData_Africa_2016_control.shp")

## CrowdVal project data

Collected using geo-wiki by/for the ESA CCI Land Cover Team to assist in validating their prototype 20m Sentinel 2A landcover product.
Data available from here: https://geo-wiki.org/Application/index.php

Class Key:
* cropland == 4
* built-up == 8

> Ignoring South Africa data at the moment because it was validated at 10m resolution and it is unclear how to upscale it to 20m pixels

In [16]:
# Open the CrowdVal point layers (one shapefile per country).
kenya, ivy_coast, gabon = (
    gpd.read_file(shp_path)
    for shp_path in (
        'data/training_data/CrowdVal/CrowdVal_kenya_final_points.shp',
        'data/training_data/CrowdVal/CrowdVal_Cote_dIvoire_final_points.shp',
        'data/training_data/CrowdVal/CrowdVal_Gabon_final_points.shp',
    )
)
# south_afr = gpd.read_file('data/training_data/CrowdVal/CrowdVal_southafrica_final_points.shp')

In [17]:
# Give every layer a common 'class' attribute. The source column holding the
# class code differs between layers, so pair each frame with its column.
for frame, code_column in (
    (kenya, 'GRID_CODE'),
    (ivy_coast, 'ValValue'),
    (gabon, 'ValValue'),
):
    frame['class'] = frame[code_column].astype('int8')

In [19]:
# Total number of CrowdVal points across the three countries.
sum(len(frame) for frame in (kenya, ivy_coast, gabon))

8741

In [None]:
# Export each cleaned CrowdVal layer to its own shapefile.
for frame, out_path in (
    (gabon, 'data/training_data/CrowdVal/cleaned/gabon_crowdval_cleaned.shp'),
    (ivy_coast, 'data/training_data/CrowdVal/cleaned/ivory_coast_crowdval_cleaned.shp'),
    (kenya, 'data/training_data/CrowdVal/cleaned/kenya_crowdval_cleaned.shp'),
):
    frame.to_file(out_path)

## Merge datasets into one common cropland-non cropland training dataset

In [28]:
# Re-open every cleaned dataset from disk so the merge starts from saved files.
# crop_ref = gpd.read_file("data/training_data/globalCropRefernceData_Africa_2016_allData.shp")
crop_ref, gfsad = (
    gpd.read_file(shp_path)
    for shp_path in (
        "data/training_data/globalCropRefernceData_Africa_2016_control.shp",
        "data/training_data/GFSAD_training_Africa.shp",
    )
)
gabon, ivy_coast, kenya = (
    gpd.read_file(shp_path)
    for shp_path in (
        'data/training_data/CrowdVal/cleaned/gabon_crowdval_cleaned.shp',
        'data/training_data/CrowdVal/cleaned/ivory_coast_crowdval_cleaned.shp',
        'data/training_data/CrowdVal/cleaned/kenya_crowdval_cleaned.shp',
    )
)

In [29]:
# Reclassify the CrowdVal and GFSAD layers to 1 = crop, 0 = non-crop.
# Each frame is paired with the code that means "cropland" in its own scheme.
# NOTE: run this cell once per load; after the first run 'class' only holds
# 0/1, so re-running would map every CrowdVal row to 0.
for frame, crop_code in (
    (gabon, 4),
    (kenya, 4),
    (ivy_coast, 4),
    (gfsad, 1),
):
    frame['class'] = np.where(frame['class'] == crop_code, 1, 0)

In [30]:
# Optional subsetting: keep a random fraction of each CrowdVal layer, since
# they can be very large. The fixed random_state makes the draw repeatable.
perc = 0.5
# crop_ref = crop_ref.sample(n=int(len(crop_ref)*perc), random_state=1)
kenya, gabon, ivy_coast = (
    frame.sample(n=int(len(frame) * perc), random_state=1)
    for frame in (kenya, gabon, ivy_coast)
)

In [31]:
# Merge all the datasets together.
# The merged frame is stamped with crop_ref's CRS, so any layer that carries a
# different CRS is reprojected first; otherwise its coordinates would be
# silently mislabelled. Layers with no CRS set are passed through unchanged
# (as before, they are assumed to already be in the target CRS).
target_crs = crop_ref.crs
gdf_list = [
    gdf
    if target_crs is None or gdf.crs is None or gdf.crs == target_crs
    else gdf.to_crs(target_crs)
    for gdf in (crop_ref, gfsad, gabon, ivy_coast, kenya)
]
train = gpd.GeoDataFrame(pd.concat(gdf_list, ignore_index=True), crs=target_crs)

In [32]:
# Keep only the geometry and the binary class label.
kept_columns = ['geometry', 'class']
train = train.filter(items=kept_columns)

In [33]:
# map_shapefile(train.sample(n=500), attribute='class')
# train.plot(figsize=(10,10), markersize=5)
# Number of points in the merged training set.
train.shape[0]

5850

In [34]:
# Write the merged crop / non-crop training points to disk.
training_shp = 'data/training_data/cropland_training_data.shp'
train.to_file(training_shp)

## Extracting random points from polygon training dataset

In [65]:
# Polygon training dataset that random points are drawn from below.
polys_shp = "data/training_data/GLCNMO_2008_crop_africa.shp"
gdf_polys = gpd.read_file(polys_shp)

#### Method 1: random points within bounding coords of entire shapefile

Works but is no good for a sparse set of polygons as very few points land inside the polygons

In [70]:
# find the bounds of your geodataframe
x_min, y_min, x_max, y_max = gdf_polys.total_bounds

# set sample size
n = 100000
# seeded generator so the same candidate points are drawn on every run
rng = np.random.default_rng(1)
# generate random data within the bounds
x = rng.uniform(x_min, x_max, n)
y = rng.uniform(y_min, y_max, n)

# convert them to a points GeoSeries; the coordinates were drawn from the
# polygons' bounds, so the points carry the polygons' CRS
gdf_points = gpd.GeoSeries(gpd.points_from_xy(x, y), crs=gdf_polys.crs)
# only keep those points within polygons (union built once, then reused as the mask)
all_polys = gdf_polys.unary_union
gdf_points = gdf_points[gdf_points.within(all_polys)]

#### Method 2: Random points per polygon

This works, but currently returns points appended as columns to the original df; they need to be reshaped into a more usable format where the class matches the point

In [12]:
from shapely.geometry import Point
def random_point_in_shp(shp, max_tries=None):
    """Return a random ``Point`` that lies inside ``shp``.

    Uses rejection sampling: candidate points are drawn uniformly from the
    bounding box of ``shp`` and the first candidate that ``shp.contains`` is
    returned.

    Parameters
    ----------
    shp : shapely geometry
        Geometry to sample from. Must expose ``bounds``, ``is_empty`` and
        ``contains``.
    max_tries : int, optional
        Maximum number of candidates to draw before giving up. ``None`` (the
        default) keeps drawing until a candidate is accepted.

    Returns
    -------
    shapely.geometry.Point
        A point for which ``shp.contains(point)`` is true.

    Raises
    ------
    ValueError
        If ``shp`` is empty (no candidate could ever be accepted).
    RuntimeError
        If ``max_tries`` candidates were rejected.
    """
    if shp.is_empty:
        raise ValueError("cannot sample a point from an empty geometry")

    # The bounding box does not change between draws, so read it once.
    x_min, y_min, x_max, y_max = shp.bounds

    tries = 0
    while max_tries is None or tries < max_tries:
        candidate = Point(
            np.random.uniform(x_min, x_max),
            np.random.uniform(y_min, y_max),
        )
        if shp.contains(candidate):
            return candidate
        tries += 1

    raise RuntimeError(
        "no point found inside the geometry after {} tries".format(max_tries)
    )

In [59]:
# Empty GeoDataFrame placeholder, displayed to confirm it has no rows/columns.
res = gpd.GeoDataFrame(data=None)
res

In [76]:
# Draw random points inside every polygon; each draw is stored as its own
# 'Point<k>' column next to the original geometry.
# The polygon frame loaded in this section is `gdf_polys`. The loop previously
# used `geo_df`, which is not defined in this notebook and raised a NameError
# on a fresh kernel.
n_points_per_poly = 25
for num in range(n_points_per_poly):
    gdf_polys['Point{}'.format(num)] = gdf_polys['geometry'].apply(random_point_in_shp)
    