# Update training data with manually drawn polygons

This notebook merges manually drawn crop/non-crop polygons (drawn in either QGIS or ArcGIS) with the training data collected using Collect Earth.

During each iteration of this procedure, update the suffix of the output file with the date of creation in the format YYYYMMDD. This helps keep track of which iteration of the training data was used for which set of classifications.

***

*Filename guide:*

* `<aez>_training_data_<YYYYMMDD>.geojson`: The training dataset that includes CEO data, manually collected polygons, and any pre-existing datasets.
* `ceo_td_polys.geojson`: training data polygons retrieved from Collect Earth. These are combined with the manually collected polygons and any pre-existing datasets to produce the `<aez>_training_data_<YYYYMMDD>.geojson` file.


In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt

## Analysis Parameters

In [None]:
date_suffix='20211209' # keep date suffix for all code run using this training data

ceo_td_path = 'data/ceo_td_polys.geojson' #shouldn't need to change this

manual_poly_path = 'data/southeast_manual_crop_polys.shp' #the file you've been adding new TD polygons too in GIS.

aez = 'data/Southern_SE.shp'

## Open vector files

In [None]:
#add manually collected polygons
manual = gpd.read_file(manual_poly_path)
ceo = gpd.read_file(ceo_td_path)
aez = gpd.read_file(aez)

## Clip CEO data to sub-region

In [None]:
# Keep only the parts of the CEO polygons that fall inside the sub-region boundary.
# overlay() does not reproject its inputs, so bring the boundary into the CEO
# layer's CRS first; a CRS mismatch would otherwise give a wrong or empty result.
# Layers with an undefined CRS are passed through unchanged.
aez_for_clip = aez
if ceo.crs is not None and aez.crs is not None and aez.crs != ceo.crs:
    aez_for_clip = aez.to_crs(ceo.crs)
ceo = gpd.overlay(ceo, aez_for_clip, how='intersection')

## Reclassify Class field

In [None]:
# Recode the text labels of the manual polygons to numeric class codes.
# Values that match neither label are left exactly as they were.
class_codes = {'crop': 1, 'non-crop': 0}
for label, code in class_codes.items():
    is_label = manual['Class'] == label
    manual['Class'] = np.where(is_label, code, manual['Class'])

## Merge files together

In [None]:
# Stack the manual and CEO polygons into one table with a fresh 0..n-1 index.
# The two layers come from different file formats and may declare different
# CRSs; a mixed-CRS concat is either rejected by geopandas or ends up holding
# geometries in inconsistent coordinates, so align the manual layer to the CEO
# CRS first. Layers with an undefined CRS are passed through unchanged.
manual_aligned = manual
if manual.crs is not None and ceo.crs is not None and manual.crs != ceo.crs:
    manual_aligned = manual.to_crs(ceo.crs)
training_data = pd.concat([manual_aligned, ceo], ignore_index=True)

## Ensure class is in integer type

In [None]:
# Cast the merged class labels to integers; the cast raises if any value
# cannot be converted.
class_labels = training_data['Class']
training_data['Class'] = class_labels.astype(int)

## Counts for each class

In [None]:
# Report the total sample count and the number of rows in each class.
n_total = len(training_data)
n_crop = int((training_data['Class'] == 1).sum())
n_non_crop = int((training_data['Class'] == 0).sum())

print('No. of samples: ' + str(n_total))
print('Crop samples = ' + str(n_crop))
print('Non-Crop samples = ' + str(n_non_crop))

## Export to disk

This file will be the new training data to pass into the `1_Extract_training_data.ipynb` notebook.

In [None]:
# Write the merged training data as GeoJSON, stamped with the run's date suffix.
output_path = 'data/Southeast_training_data_' + date_suffix + '.geojson'
training_data.to_file(output_path, driver='GeoJSON')