# Update training data with manually drawn polygons

This notebook merges manually drawn crop/non-crop polygons (drawn in either QGIS or ArcGIS) with the training data collected using Collect Earth.

During each iteration of this procedure, update the suffix of the output file with the date of creation in the format YYYYMMDD. This helps keep track of which iteration of training data is used for which set of classifications.

***

*Filename guide:*

* `<aez>_training_data_<YYYYMMDD>.geojson`: The training dataset that includes CEO data, manually collected polygons, and any pre-existing datasets.
* `ceo_td_polys.geojson`: training data polygons retrieved from Collect Earth. These are combined with the manually collected polygons and any pre-existing datasets to produce the `<aez>_training_data_<date_of_creation>.geojson` file.


In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt

## Analysis Parameters

In [2]:
# Date stamp (YYYYMMDD) that is appended to the exported training-data
# filename; update it on every iteration of this procedure.
date_suffix = '20211029'

# Training-data polygons retrieved from Collect Earth
# (shouldn't need to change this).
ceo_td_path = 'data/ceo_td_polys.geojson'

# The file you've been adding new training-data polygons to in GIS.
manual_poly_path = 'data/sahel_manual_crop_polys.shp'

# Data provided by IPs (cleaned).
extras_path = 'data/senegal_crop.geojson'

## Open vector files

In [3]:
#add manually collected polygons
manual = gpd.read_file(manual_poly_path)
ceo = gpd.read_file(ceo_td_path)
extras = gpd.read_file(extras_path)

## Reclassify Class field

In [4]:
# Swap the text labels on the manual polygons for numeric codes
# ('crop' -> 1, 'non-crop' -> 0); any other value is left untouched.
for label, code in (('crop', 1), ('non-crop', 0)):
    manual['Class'] = np.where(manual['Class'] == label, code, manual['Class'])

## Merge files together

In [5]:
# Stack the three layers into one table with a fresh 0..n-1 index.
layers = [manual, ceo, extras]
training_data = pd.concat(layers, ignore_index=True)

## Ensure class is in integer type

In [6]:
# After the merge the labels can be a mix of types, so force the whole
# column to integers before counting and exporting.
class_as_int = training_data['Class'].astype(int)
training_data['Class'] = class_as_int

## Counts for each class

In [7]:
# Tally the merged samples overall and per class (1 = crop, 0 = non-crop).
n_total = len(training_data)
n_crop = int((training_data['Class'] == 1).sum())
n_noncrop = int((training_data['Class'] == 0).sum())

print(f'No. of samples: {n_total}')
print(f'Crop samples = {n_crop}')
print(f'Non-Crop samples = {n_noncrop}')

No. of samples: 5075
Crop samples = 1814
Non-Crop samples = 3261


In [8]:
# Remove the 'smpl_class' column before export. errors='ignore' keeps this
# cell re-runnable: it rebinds `training_data`, so without it a second
# execution raises KeyError because the column is already gone.
training_data = training_data.drop(columns='smpl_class', errors='ignore')

## Export to disk

This file will be the new training data to pass into the `1_Extract_training_data.ipynb` notebook

In [9]:
# Build the dated output filename, then write the merged training data
# as GeoJSON.
training_data_path = f'data/sahel_training_data_{date_suffix}.geojson'
training_data.to_file(training_data_path, driver='GeoJSON')

## Cleaning IP provided training data from Senegal

In [None]:
# Shapefiles of parcels supplied for Senegal, one per file
# (Kédougou, Kolda, Tambacounda).
a_path, b_path, c_path = (
    'data/Parcelles Kédougou 21 Décembre.shp',
    'data/Parcelles Kolda 21 Décembre.shp',
    'data/Parcelles Tambacounda 21 Décembre.shp',
)

### Open

In [None]:
# Read each parcel shapefile into its own GeoDataFrame.
a, b, c = map(gpd.read_file, (a_path, b_path, c_path))

### combine all datasets

In [None]:
# Stack the three parcel layers into one table with a fresh 0..n-1 index.
_all = pd.concat([a, b, c], ignore_index=True)

### calculate area in hectares and filter

In [None]:
# Reproject to EPSG:6933 (a metre-based equal-area grid) so that polygon
# areas are meaningful, then convert m^2 to hectares.
projected = _all.to_crs('epsg:6933')
projected['area_ha'] = projected['geometry'].area / 10000

# Keep only parcels between 0.1 ha and 1.0 ha (both bounds inclusive).
in_size_range = projected['area_ha'].between(0.1, 1.0)
_all = projected[in_size_range]

### Sample crop-specific datasets

In [None]:
n = 25     # default number of parcels to draw per crop code
seed = 42  # fixed seed so a re-run draws the same parcels


def _sample_crop(code, size):
    """Draw `size` random parcels whose 'CodeSpé' equals `code`.

    The draw is seeded with `seed` so the exported sample is reproducible.
    Like the unseeded version, it raises ValueError if fewer than `size`
    parcels carry that code (sampling is without replacement).
    """
    subset = _all[_all['CodeSpé'] == code]
    return subset.sample(n=size, random_state=seed).reset_index(drop=True)


# NOTE(review): the crop meaning of each code is inferred from the variable
# names only -- confirm against the data provider's code list.
rice_low = _sample_crop('1', n)
rice_high = _sample_crop('2', n)
millet = _sample_crop('3', n)
maize = _sample_crop('4', 50)  # more for maize
peas = _sample_crop('6', n)

### concat and clean-up

In [None]:
# Stack the per-crop samples into a single table.
crop_samples = [rice_low, rice_high, millet, maize, peas]
out = pd.concat(crop_samples, ignore_index=True)

# Every attribute column except the geometry is discarded; the list is
# built before 'Class' is added so that the new label column survives.
cols = [name for name in out.columns if name != 'geometry']
out['Class'] = 1  # all of these parcels are crop
out = out.drop(columns=cols)
out.head()

In [None]:
# Reproject the sampled parcels to EPSG:4326 and write them as a shapefile.
# NOTE(review): the merge step reads `extras_path` =
# 'data/senegal_crop.geojson', but this cell writes 'data/senegal_crop.shp'
# -- confirm how the GeoJSON version is produced from this output.
out_4326 = out.to_crs('epsg:4326')
out_4326.to_file('data/senegal_crop.shp')