![Digital Earth Pacific](../dep.png)

### Tonga LULC Field Data Alignment and Cleanup across Multiple Surveys

In [None]:
import os
import sys
sys.path.insert(0, '..')
import depal as dep
import geopandas as gpd
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Start the depal helper before any data is loaded.
# NOTE(review): what init() actually sets up is defined inside depal and is not visible in this notebook.
dep.init()

#### Load Both Surveys

Your code goes in the cells below. Add more cells here by clicking on the + button above.

In [None]:
# Read the two field-survey point layers: the first is a GeoJSON file, the
# second a GeoPackage. gdf1 / gdf2 keep that order for the cells below.
survey_paths = ("datapoints_010423.geojson", "datapoints_260723.gpkg")
gdf1, gdf2 = (gpd.read_file(path) for path in survey_paths)

#### Cleanup and Merge

In [None]:
# Survey 1 stores the label under "LULC_class"; give it the same column name
# that survey 2 uses.
gdf1 = gdf1.rename(columns={"LULC_class": "LULC"})

# Survey 2 spreads the label over several category columns. Fill the gaps in
# "LULC" from each of them in turn, so the first non-null value found wins.
fallback_columns = ['Agriculture', 'Coastal', 'Forest', 'Hazards', 'Mining', 'Other_LULC']
for column in fallback_columns:
    gdf2['LULC'] = gdf2['LULC'].fillna(gdf2[column])

# Points that still have no label after all fallbacks are dropped.
gdf2 = gdf2.dropna(subset=["LULC"])

In [None]:
# Stack both surveys into one frame.
# If the two layers carry different coordinate reference systems, reproject
# survey 2 onto survey 1's CRS first so every point ends up in the same
# coordinate space. Layers without a CRS are left untouched.
if gdf1.crs is not None and gdf2.crs is not None and gdf1.crs != gdf2.crs:
    survey2_aligned = gdf2.to_crs(gdf1.crs)
else:
    survey2_aligned = gdf2

# ignore_index=True gives the merged frame a fresh 0..n-1 index; without it the
# row labels of the two surveys repeat.
gdf3 = pd.concat([gdf1, survey2_aligned], ignore_index=True)

In [None]:
# Keep only the class label and the point geometry. The explicit copy makes
# gdf independent of gdf3, so the in-place relabelling done in later cells
# edits this frame and not a slice of the merged one.
gdf = gdf3[['LULC', 'geometry']].copy()

# Drop points without a usable label: empty strings and missing values.
# (Survey 2 already had its missing labels removed; this applies the same rule
# to rows coming from survey 1.)
has_label = gdf['LULC'].notna() & (gdf['LULC'] != "")
gdf = gdf[has_label].copy()

In [None]:
# Number of labelled points left after merging both surveys and dropping empty labels.
len(gdf)

In [None]:
# Count the points per raw label and show the (label, count) pairs as a table,
# most frequent label first.
summary = gdf['LULC'].value_counts()
pd.DataFrame(list(summary.items()))

#### Alignment (Change Composition of Classes Here)

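Baseline Identified Classes (note: the mapping cell below currently folds the agroforestry labels into `Forest_Land`, writes the mining class as `Mining`, and `Roads` is dropped in the removal step further down):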

- Settlements
- Forest_Land (Natural, Plantation)
- Grass_Shrub_Land
- Crop_Land_Vegetation
- Bare_Burnt_Land
- Sand
- Wetland_Mangroves
- Agro_Forestry (Trees, Crops, Pasture, Coconuts)
- Roads
- Mining_Quarry

In [None]:
# Class alignment table: each target class lists the raw survey labels that are
# merged into it. Edit these lists to change the composition of a class.
# Labels that appear in none of the lists are left unchanged.
LULC_GROUPS = {
    #Settlements
    'Settlements': [
        'Infrastructure',
        'Solar_panels',
    ],
    #Forest_Land (Agro, Natural, Plantation)
    'Forest_Land': [
        'Forest_land',
        'Low_density_forest_palm',
        'Natural_scattered_forest',
        'Natural_Dense_forest',
        'Agroforestry_coconuts',
        'Agroforestry',
        'Hardwood_tree_species',
        'Softwood_tree_species',
        'Integrated_Livestock_Agriculture',
    ],
    #Crop_Land_Vegetation
    'Crop_Land_Vegetation': [
        'Vegetation',
        'Cropland',
        'Monocropping',
        'Mixed_cropping',
        'Cropland_Agriculture',
        'Weeds',
    ],
    #Grass_Shrub_Land
    'Grass_Shrub_Land': [
        'Shrubs',
        'Grassland',
        'Shrubland',
        'Lawn_grass',
        'Natural_Shrubs',
        'Grazing_land',
        'Pasture_land',
    ],
    #Wetland_Mangroves
    'Wetland_Mangroves': [
        'Mangroves',
        'Wetland',
        'Mudflats',
        # 'Shallow_ocean',  # currently not merged into this class
    ],
    #Bare_Burnt_Land
    'Bare_Burnt_Land': [
        'Bare_land',
        'Burned_land',
    ],
    #Roads
    'Roads': [
        'Roads_paved',
        'Roads_unpaved',
    ],
    #Mining
    'Mining': [
        'Rock',
        'Quarry',
        'Mining areas',
        'Active_mining',
        'Closed_abandoned_mining',
    ],
}

# Flatten to {raw label: target class}. No target name is also a raw label, so
# one simultaneous replacement gives the same result as applying the merges one
# after another.
LULC_ALIGNMENT = {
    raw_label: target_class
    for target_class, raw_labels in LULC_GROUPS.items()
    for raw_label in raw_labels
}

# Relabel the whole column in a single pass.
gdf['LULC'] = gdf['LULC'].replace(LULC_ALIGNMENT)

#### Remove classes of smaller sample sizes

In [None]:
#Remove smaller sample size
# Classes that are excluded from the output data set.
remove_list = ['Shallow_ocean', 'Coral_reef', 'Seagrass', 'Deep_ocean', 'Seagrasses', 'Aquaculture', 'Invasive_species', 'Climbing_vines', 'Surface_water', 'Seaweed', 'Roads']

# One membership test instead of one filter pass per class. The explicit copy
# keeps the result independent of the previous frame, because the next step
# adds a column to it.
gdf = gdf[~gdf["LULC"].isin(remove_list)].copy()

#### Codify Class Codes

In [None]:
# Give every class an integer code, numbered from 1 in order of first
# appearance in the data (so the numbering depends on the row order).
# factorize() marks missing labels with -1; after the +1 shift they get code 0
# and do not use up a class number.
codes, classes = pd.factorize(gdf['LULC'])
class_list = list(classes)
gdf['code'] = codes + 1

#### Summarise

In [None]:
# Points per class after alignment and removal, shown as a (class, count)
# table with the largest class first.
summary = gdf['LULC'].value_counts()
pd.DataFrame(list(summary.items()))

In [None]:
# Final number of points after class alignment and removal of the excluded classes.
len(gdf)

#### Save output for Machine Learning Process

In [None]:
# Write the cleaned points (LULC label, numeric code, geometry) to layer "LULC"
# of a GeoPackage file in the working directory.
# NOTE(review): `overwrite=True` is forwarded to the underlying writer; confirm
# it is accepted by the I/O engine in use.
gdf.to_file("datapoints_final.gpkg", layer='LULC', driver="GPKG", overwrite=True)

In [None]:
# Counterpart to dep.init() at the top of the notebook; run last.
# NOTE(review): what cleanup() releases is defined inside depal and is not visible in this notebook.
dep.cleanup()