## SQL Database for deforestation

### Import Dependencies

In [1]:
import os
import pandas as pd
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func
from sqlalchemy import text

In [2]:
# Where the SQLite database file lives, relative to the working directory
database_filename = 'deforestation_data.sqlite'
destination_folder = 'data'
destination_path = os.path.join(destination_folder, database_filename)

# SQLAlchemy engine pointing at that SQLite file
engine = create_engine(
    f'sqlite:///{destination_path}'
)

In [3]:
# Read the raw deforestation CSV into a single DataFrame.
# low_memory=False: per the pandas docs, the file is not parsed in chunks,
# which avoids mixed-type inference within a column.
deforestation = pd.read_csv(
    'data/deforestation_raw.csv',
    low_memory=False,
)

# Preview the first rows to confirm the file was read
print(deforestation.head())

       country  threshold      area  extent_2000  extent_2010  gain_2000-2020  \
0  Afghanistan          0  64385715     64385715     64385715           10741   
1  Afghanistan         10  64385715       432115       126247           10741   
2  Afghanistan         15  64385715       302660       106867           10741   
3  Afghanistan         20  64385715       284357       105733           10741   
4  Afghanistan         25  64385715       254867        72395           10741   

   loss_2001  loss_2002  loss_2003  loss_2004  ...  emissions_2019  \
0        103        214        267        225  ...             NaN   
1         92        190        253        207  ...             NaN   
2         91        186        247        205  ...             NaN   
3         89        180        245        203  ...             NaN   
4         89        180        245        202  ...             NaN   

   emissions_2020  emissions_2021  emissions_2022  emissions_2023  \
0             NaN      

In [4]:
# Write the DataFrame to the `deforestation` table. if_exists='replace' swaps
# out any existing table of that name, and index=False keeps the DataFrame
# index out of the table.
deforestation.to_sql('deforestation', con=engine, if_exists='replace', index=False)

# Read the whole table back so the write can be verified
deforestation_from_db = pd.read_sql('SELECT * FROM deforestation', con=engine)

# Fail loudly if the round trip lost or gained rows/columns, instead of
# relying only on a visual check of the first rows
assert deforestation_from_db.shape == deforestation.shape, (
    f'Round-trip mismatch: wrote {deforestation.shape}, '
    f'read back {deforestation_from_db.shape}'
)

# Check the data loaded
print(deforestation_from_db.head())

       country  threshold      area  extent_2000  extent_2010  gain_2000-2020  \
0  Afghanistan          0  64385715     64385715     64385715           10741   
1  Afghanistan         10  64385715       432115       126247           10741   
2  Afghanistan         15  64385715       302660       106867           10741   
3  Afghanistan         20  64385715       284357       105733           10741   
4  Afghanistan         25  64385715       254867        72395           10741   

   loss_2001  loss_2002  loss_2003  loss_2004  ...  emissions_2019  \
0        103        214        267        225  ...             NaN   
1         92        190        253        207  ...             NaN   
2         91        186        247        205  ...             NaN   
3         89        180        245        203  ...             NaN   
4         89        180        245        202  ...             NaN   

   emissions_2020  emissions_2021  emissions_2022  emissions_2023  \
0             NaN      

In [5]:
# Missing-value report per column.
# Count the nulls once, then derive the percentage from those counts.
missing_values = deforestation.isnull().sum()
missing_percentage = missing_values * 100 / len(deforestation)

# Percentages first, a blank separator line, then the absolute counts
print(missing_percentage)
print()
print(missing_values)

country                        0.000000
threshold                      0.000000
area                           0.000000
extent_2000                    0.000000
extent_2010                    0.000000
                                ...    
subnational1                   6.316916
iso                           84.706237
loss__year                    84.706237
tree_cover_loss               84.706237
tree_cover_loss_from_fires    84.706237
Length: 63, dtype: float64

country                           0
threshold                         0
area                              0
extent_2000                       0
extent_2010                       0
                              ...  
subnational1                   1888
iso                           25317
loss__year                    25317
tree_cover_loss               25317
tree_cover_loss_from_fires    25317
Length: 63, dtype: int64


In [None]:
# Rank columns from the highest to the lowest share of missing values.
# NOTE: despite its name, `missing_values_sorted` holds percentages
# (it is derived from `missing_percentage`), not raw counts.
missing_values_sorted = missing_percentage.sort_values(ascending=False)

# Display the sorted percentages of missing values
print(missing_values_sorted)

In [None]:
# Print pandas' concise summary of the DataFrame (DataFrame.info)
deforestation.info()

In [None]:
# Display descriptive statistics for the DataFrame (DataFrame.describe);
# as the cell's last expression, the result is rendered as the cell output
deforestation.describe()