# Forest Data Explore
**Source:** Sassan Satchi from NASA JPL provided annual forest flux data for us to use in TRACE Places.
**Goal:** Look at single year of data to better understand dataset. Fairly messy scratchwork in here.

@mattyarri

**Last Updated:** December 3, 2021

In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
import statsmodels
import numpy as np
import geopandas as gpd
from keplergl import KeplerGl

pd.options.mode.chained_assignment = None  # default='warn'

from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pio.templates.default = "none"
%config InlineBackend.figure_format ='retina'

## Data Read-in + Messy Column Check

In [15]:
import pandas
url = "https://docs.google.com/spreadsheets/d/18jO9Rccs_pWSBVUEGbQWRVOAbddvLr4d/gviz/tq?tqx=out:csv"
df = pandas.read_csv(url)
df['Label'] = df['Country'] + ', ' + df['Province']
df.head()

Unnamed: 0.1,Unnamed: 0,Country,Province,Forest Area (ha),Nonforest Area (ha),Forest loss Yr-2019 (ha),Forest C Yr-2019 (GgC),Nonforest C Yr-2019 (GgC),Total C Yr-2019 (GgC),Deforestation Emission Yr-2019 (GgC),Degradation Emission Yr-2019 (GgC),Fire Emission Yr-2019 (GgC),Removal Yr-2019 (tons C),Label
0,16,Afghanistan,Khost,5781.328964,26786.93542,6.954579,0.587734,4644.850254,4645.438194,0.132179,0.0,0.593043,44975.32547,"Afghanistan, Khost"
1,17,Afghanistan,Kunar,118422.9614,113126.77,14.02864,1706.720829,12172.55783,13879.27914,0.656149,5.9968,0.0,-214932.546,"Afghanistan, Kunar"
2,19,Afghanistan,Laghman,15730.00183,30937.96692,0.659929,13.940511,5662.818432,5676.758289,0.025304,0.139418,0.0,-69262.23636,"Afghanistan, Laghman"
3,21,Afghanistan,Nangarhar,11270.89844,26970.36133,0.286653,72.677568,9697.94178,9770.619392,0.013591,0.0,0.0,174216.9857,"Afghanistan, Nangarhar"
4,23,Afghanistan,Nuristan,53666.25977,84450.43945,0.876654,497.455329,27199.81384,27697.26944,0.043111,0.424497,0.0,229840.9194,"Afghanistan, Nuristan"


In [4]:
df.sample(10)

Unnamed: 0.1,Unnamed: 0,Country,Province,Forest Area (ha),Nonforest Area (ha),Forest loss Yr-2019 (ha),Forest C Yr-2019 (GgC),Nonforest C Yr-2019 (GgC),Total C Yr-2019 (GgC),Deforestation Emission Yr-2019 (GgC),Degradation Emission Yr-2019 (GgC),Fire Emission Yr-2019 (GgC),Removal Yr-2019 (tons C)
658,898,Ghana,Volta,459665.3,1321548.0,6756.485086,11469.54346,49239.15482,60708.69827,418.146253,110.987842,1780.133128,-3045126.0
507,676,Dominican Republic,Santiago,209841.9,47572.6,808.502594,10539.91795,2800.414324,13340.33203,39.313544,33.849936,0.0,5209.476
1129,1535,Lesotho,Mokhotlong,586.8011,36201.73,1.178126,0.196975,234.380171,234.577149,0.009999,0.0,3.664005,137959.4
547,743,Ecuador,Napo,1063027.0,69848.48,1919.636177,99059.30328,4073.725224,103133.0261,259.499371,277.521193,0.0,-1450686.0
1985,2776,Tajikistan,Khatlon,6642.281,68711.57,0.662491,0.0,18104.46739,18104.46739,0.00024,0.0,0.012292,-545603.9
1895,2657,Syria,Hamah,28137.15,22825.67,704.535824,176.52525,6310.898781,6487.423897,41.677795,0.0,75.397484,-179740.3
876,1151,Ireland,Roscommon,13341.49,44650.4,414.689639,902.3754,2633.725882,3536.101341,28.482206,0.0,0.0,53635.11
1126,1530,Lesotho,Butha-Buthe,142.1505,34718.21,1.057096,0.0,547.990739,547.990739,0.006007,0.0,4.121695,-55593.54
2056,2899,Turkey,Ordu,389200.0,89669.19,384.405743,20464.7274,3594.987869,24059.71527,28.380558,0.0,0.479194,333046.3
1485,2097,Philippines,Cavite,47659.31,22322.27,97.412782,2645.177841,1848.346829,4493.525028,6.702966,11.742316,0.0,-21516.59


In [6]:
# Let's check real quick that the columns add in the way we think they do
df[['Forest C Yr-2019 (GgC)','Nonforest C Yr-2019 (GgC)','Deforestation Emission Yr-2019 (GgC)',
    'Degradation Emission Yr-2019 (GgC)','Fire Emission Yr-2019 (GgC)','Removal Yr-2019 (tons C)',
    'Total C Yr-2019 (GgC)']]

Unnamed: 0,Forest C Yr-2019 (GgC),Nonforest C Yr-2019 (GgC),Deforestation Emission Yr-2019 (GgC),Degradation Emission Yr-2019 (GgC),Fire Emission Yr-2019 (GgC),Removal Yr-2019 (tons C),Total C Yr-2019 (GgC)
0,0.587734,4644.850254,0.132179,0.000000,0.593043,4.497533e+04,4645.438194
1,1706.720829,12172.557830,0.656149,5.996800,0.000000,-2.149325e+05,13879.279140
2,13.940511,5662.818432,0.025304,0.139418,0.000000,-6.926224e+04,5676.758289
3,72.677568,9697.941780,0.013591,0.000000,0.000000,1.742170e+05,9770.619392
4,497.455329,27199.813840,0.043111,0.424497,0.000000,2.298409e+05,27697.269440
...,...,...,...,...,...,...,...
2372,3193.665028,92745.086670,149.902105,15.953777,5682.972431,-2.566144e+06,95938.758850
2373,2083.210468,84284.034730,38.672268,0.346057,119.506553,2.351748e+05,86367.240910
2374,1062.944412,121544.433600,76.554701,2.450490,2247.374058,-1.671693e+06,122607.376100
2375,97.951524,60811.439510,5.060663,0.000000,142.735392,7.221533e+05,60909.393310


In [32]:
component_columns = ['Forest C Yr-2019 (GgC)','Nonforest C Yr-2019 (GgC)','Deforestation Emission Yr-2019 (GgC)',
    'Degradation Emission Yr-2019 (GgC)','Fire Emission Yr-2019 (GgC)']
df['Test Sum'] = df[component_columns].sum(axis=1)
df[['Test Sum','Total C Yr-2019 (GgC)']]

(df['Test Sum'] - df['Total C Yr-2019 (GgC)']).sum() # Check for parity everywhere
px.histogram((df['Test Sum'] - df['Total C Yr-2019 (GgC)']),title = 'Parity Check for Total C')

printmd("**--> Hmm, definitely off a little bit here still**")

Unnamed: 0,Test Sum,Total C Yr-2019 (GgC)
0,4646.163210,4645.438194
1,13885.931608,13879.279140
2,5676.923665,5676.758289
3,9770.632938,9770.619392
4,27697.736777,27697.269440
...,...,...
2372,101787.580012,95938.758850
2373,86525.770076,86367.240910
2374,124933.757260,122607.376100
2375,61057.187089,60909.393310


5213535.893855852

**--> Hmm, definitely off a little bit here still**

In [25]:
#Let's plot these components real quick just to visually take a look at how they add up
t_df = df[component_columns + ['Total C Yr-2019 (GgC)','Label']].sample(4)
t_df
fig = px.bar(t_df.melt(id_vars = 'Label'),facet_row='Label',x='variable',y='value',color='variable',height = 600)
fig = fig.update_yaxes(matches=None)
fig

Unnamed: 0,Forest C Yr-2019 (GgC),Nonforest C Yr-2019 (GgC),Deforestation Emission Yr-2019 (GgC),Degradation Emission Yr-2019 (GgC),Fire Emission Yr-2019 (GgC),Total C Yr-2019 (GgC),Label
1407,39403.69415,18566.19072,9.627111,78.001089,759.752929,57969.88297,"Nepal, Far-Western"
625,64496.9101,44455.88303,307.256013,0.0,0.0,108952.774,"France, Provence-Alpes-Côte d'Azur"
45,0.0,30526.46065,0.019449,0.0,51.746096,30526.46065,"Argentina, Mendoza"
2262,40054.77524,69240.34882,211.677373,224.162757,937.174618,109295.1279,"Venezuela, Barinas"


In [39]:
#It looks like the "Total C" column is solely a sum of "Forest C" and "Nonforest C"? Let's check that to be sure
df['Test Sum'] = df['Forest C Yr-2019 (GgC)'] + df['Nonforest C Yr-2019 (GgC)']
df[['Test Sum','Total C Yr-2019 (GgC)']]
(df['Test Sum'] - df['Total C Yr-2019 (GgC)']).sum() # Check for parity everywhere
px.histogram((df['Test Sum'] - df['Total C Yr-2019 (GgC)']),title = 'Parity Check for Total C')
printmd('**--> This looks fairly confirmed as just being the sum of Forest and Non-Forest Flux. Error of 1 is essentially zero**')

Unnamed: 0,Test Sum,Total C Yr-2019 (GgC)
0,4645.437988,4645.438194
1,13879.278659,13879.279140
2,5676.758943,5676.758289
3,9770.619348,9770.619392
4,27697.269169,27697.269440
...,...,...
2372,95938.751698,95938.758850
2373,86367.245198,86367.240910
2374,122607.378012,122607.376100
2375,60909.391034,60909.393310


-1.7853421096266118

**--> This looks fairly confirmed as just being the sum of Forest and Non-Forest Flux. Error of 1 is essentially zero**

In [35]:
#Let's plot that total C column and see if they're all positive values or not
px.histogram(df['Total C Yr-2019 (GgC)'],title = 'Distribution of "Total C" by Subprovince')

#Let's plot all of the subcomponents as well, just to get a sense of sign distributions
for c in component_columns:
    px.histogram(df[c],title = c + ': Subprovince Distribution')
    
#Plot removal as well
px.histogram(df['Removal Yr-2019 (tons C)'], title = 'Removal Yr-2019 (tons C): Subprovince Distribution')

In [40]:
#Triple check: let's make sure carbon removal isn't in the total column?
df['Test Sum'] = df['Forest C Yr-2019 (GgC)'] + df['Nonforest C Yr-2019 (GgC)'] + df['Removal Yr-2019 (tons C)']
df[['Test Sum','Total C Yr-2019 (GgC)']]
(df['Test Sum'] - df['Total C Yr-2019 (GgC)']).sum() # Check for parity everywhere
px.histogram((df['Test Sum'] - df['Total C Yr-2019 (GgC)']),title = 'Parity Check for Total C')
printmd('**--> Definitely not, the carbon removal column adds a lot of noise/error**')

Unnamed: 0,Test Sum,Total C Yr-2019 (GgC)
0,4.962076e+04,4645.438194
1,-2.010533e+05,13879.279140
2,-6.358548e+04,5676.758289
3,1.839876e+05,9770.619392
4,2.575382e+05,27697.269440
...,...,...
2372,-2.470205e+06,95938.758850
2373,3.215420e+05,86367.240910
2374,-1.549086e+06,122607.376100
2375,7.830627e+05,60909.393310


-3403165801.2485766

**--> Definitely not, the carbon removal column adds a lot of noise/error**