EDA of final dataset after data engineering

In [None]:
# general usage ML libraries 
import math
import numpy as np 
import pandas as pd
import geopandas as gpd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sn

pd.set_option('display.max_rows', 25)

# vector operations
import geopandas as gpd
import shapely.geometry
import shapely.wkt

# raster operations
import os
from osgeo import gdal
import rasterio.mask 
import rasterstats
import rasterio 
from rasterio.plot import show
from rasterio.warp import reproject, Resampling, calculate_default_transform
from rasterio.features import rasterize

import subprocess

# CRS transformations
import pyproj

# interpretability
import shap

# text processing
import textwrap

In [None]:
# Root folder of the engineered dataset. Every raster path below is derived
# from it, so pointing the notebook at another copy of the data only requires
# changing this one constant.
DATA_PATH = 'data_final/'

# Land cover rasters and the layers stored under neigh/, one file per year
# listed in LC_YEAR.
LC_YEAR = [1990, 2000, 2006, 2012, 2018]
LC_PATHS = [f'{DATA_PATH}lc/LC_{year}_ND_REC.tif' for year in LC_YEAR]
LC_MODE_3_PATHS = [f'{DATA_PATH}neigh/mode/LC_{year}_MODE_3.tif' for year in LC_YEAR]
LC_MODE_5_PATHS = [f'{DATA_PATH}neigh/mode/LC_{year}_MODE_5.tif' for year in LC_YEAR]
LC_AGRI_3_PATHS = [f'{DATA_PATH}neigh/agri/LC_{year}_PER_AGRI_3.tif' for year in LC_YEAR]
LC_FOREST_3_PATHS = [f'{DATA_PATH}neigh/forest/LC_{year}_PER_FOREST_3.tif' for year in LC_YEAR]

# Distance to urban areas, one raster per land-cover year.
DISTANCE_PATHS = [f'{DATA_PATH}distance/URB_DIST_{year}.tif' for year in LC_YEAR]

# Single raster of fire years (most recent fire year on top).
FIREYEAR = f'{DATA_PATH}fires/FIREYEAR.tif'

# Binary fire rasters, one per time period. FIRE_YEAR holds the period labels
# and YEAR_PATHS the matching file names; the two lists are index-aligned.
FIRE_YEAR = ['2001-2005', '2012-2017', '2018', '2019', '2020', '2021', '2022']
YEAR_PATHS = ['FIRE_ANTE_2006.tif', 'FIRE_2012_TO_2018.tif', 'FIRE_2018.tif', 'FIRE_2019.tif',
              'FIRE_2020.tif', 'FIRE_2021.tif', 'FIRE_2022.tif']
assert len(FIRE_YEAR) == len(YEAR_PATHS), 'FIRE_YEAR and YEAR_PATHS must stay index-aligned'

FIREYEAR_PATHS = [f'{DATA_PATH}fires/{year_path}' for year_path in YEAR_PATHS]

# Topographic data.
DEM_PATH = f'{DATA_PATH}topographic/dem.tif'
SLOPE_PATH = f'{DATA_PATH}topographic/SLOPE_ND.tif'
ASPECT_PATH = f'{DATA_PATH}topographic/aspect.tif'
HILLSHADE_PATH = f'{DATA_PATH}topographic/hillshade.tif'

# Protected areas.
PROTECTED_PATH = f'{DATA_PATH}protected/raster_natural_protected.tif'

# Vegetation seasonal data: one raster per (year, variable, season).
# The list is ordered year-major, then variable, then season.
VEGETATION_YEAR = ['2017', '2018', '2019', '2020', '2021']
VEGETATION_SEASON = ['season1', 'season2']
VEGETATION_VARS = ['QFLAG', 'MINV', 'MAXV', 'AMPL', 'RSLOPE', 'LSLOPE']
VEGETATION_PATHS = [
    f'{DATA_PATH}season/{year}/merged_{year}_{season}_{var}.tif'
    for year in VEGETATION_YEAR
    for var in VEGETATION_VARS
    for season in VEGETATION_SEASON
]

# Vegetation indices; their years are the vegetation years shifted by one
# (2018..2022).
VEG_IND_VARS = ['ndvi', 'vi', 'evi']
VEG_IND_YEAR = VEGETATION_YEAR[1:] + ['2022']


# Population density.
DENSITY_PATH = f'{DATA_PATH}population/population_density_mode.tif'

# Values used for the "distance to populated areas" layers.
# NOTE(review): presumably population-density classes; confirm against the
# cells that consume DENSITY_VALUES.
DENSITY_VALUES = ['2', '3', '4']

# Emissivity and land surface temperature, one raster per year in TEMP_YEAR
# (both families live in the emissivity/ folder).
TEMP_YEAR = ['2018', '2019', '2020', '2021', '2022']
EMISSIVITY_PATHS = [f'{DATA_PATH}emissivity/emissivity_{year}.tif' for year in TEMP_YEAR]
LST_PATHS = [f'{DATA_PATH}emissivity/lst_{year}.tif' for year in TEMP_YEAR]

In [None]:
# Open one rasterio dataset per fire period, keyed by the period label.
# FIRE_YEAR and FIREYEAR_PATHS are index-aligned lists, so they are paired
# with zip. Building the dict directly (instead of a list comprehension run
# for its side effect) also keeps the cell from echoing a list of None.
# The datasets are left open; nothing in this cell closes them.
fireyear_raster = {
    year: rasterio.open(path) for year, path in zip(FIRE_YEAR, FIREYEAR_PATHS)
}

# Empty containers; they are not populated in this cell.
fireyear_array = {}
fireyear_array_bin = {}

In [None]:
import xml.etree.ElementTree as ET

# Load the land-cover legend metadata and keep a handle on its root element.
tree = ET.parse('metadata/land_cover_meta.xml')
root = tree.getroot()

# One dictionary entry per <paletteEntry> element, keyed by the integer class
# value. The stored label is the part that follows the first ' - ' separator
# of the element's "label" attribute.
palette_entries = {
    int(entry.get('value')): {
        'color': entry.get('color'),
        'label': entry.get('label').split(' - ')[1],
    }
    for entry in root.iter('paletteEntry')
}

# Class 0 is always rendered as white "No Data", replacing whatever the
# metadata file may have declared for that value.
palette_entries[0] = {'color': '#ffffff', 'label': 'No Data'}

print(palette_entries)

In [None]:
def wrap_label(label):
    """Return ``label`` wrapped to lines of at most 20 characters.

    The wrapped lines are joined with newline characters; an empty label
    yields an empty string.
    """
    return textwrap.fill(label, width=20)

In [None]:
# Legend lookup ordered by ascending class value:
# class value -> (colour, label wrapped for display in the legend).
color_map = {}
for class_value in sorted(palette_entries):
    entry = palette_entries[class_value]
    color_map[class_value] = (entry['color'], wrap_label(entry['label']))
print(color_map)

In [None]:
# Read the last land-cover raster listed in LC_PATHS. The context manager
# closes the dataset as soon as the pixel values are in memory.
with rasterio.open(LC_PATHS[-1]) as src:
    raster = src.read().squeeze()

# Pixel count per class value.
print(np.bincount(raster.ravel()))

# Create a figure and axis
fig, ax = plt.subplots(figsize=(15, 15))

# color_map is ordered by class value, so colour i belongs to class_values[i].
class_values = list(color_map)
colors_list = [color for color, label in color_map.values()]
cmap = ListedColormap(colors_list)

# Map every class value to its own colour explicitly. Without a norm, imshow
# stretches the colours linearly between the smallest and largest value found
# in the raster, which only matches the legend when the class values are
# contiguous and all present. The bin edges sit half a unit below each class
# value (plus one closing edge above the last class).
boundaries = [value - 0.5 for value in class_values] + [class_values[-1] + 0.5]
norm = matplotlib.colors.BoundaryNorm(boundaries, cmap.N)

# Categorical data: nearest-neighbour rendering avoids blending class codes
# into values that belong to no class.
im = ax.imshow(raster, cmap=cmap, norm=norm, interpolation='nearest')

# Legend with one patch per class, in the same order as the colormap.
legend_elements = [
    matplotlib.patches.Patch(facecolor=color, edgecolor='k', label=label)
    for color, label in color_map.values()
]
ax.legend(handles=legend_elements, loc='lower left', ncol=2)

# Save the figure
plt.savefig('plots/land_cover.png', bbox_inches='tight')

# Show the plot
plt.show()

In [None]:
# Shapefile holding the historical inventory of forest-fire burnt areas.
# Unlike the rasters configured under DATA_PATH, this file is addressed
# relative to the parent directory (../data).
historical_burnt_areas_path = '../data/historical_burnt_areas/effis_layer/modis.ba.poly.shp'

In [None]:
# Load the burnt-area shapefile with geopandas and preview the first rows.
burnt_areas = gpd.read_file(historical_burnt_areas_path)
burnt_areas.head()

In [None]:
# Parse FIREDATE into pandas datetimes (the column is overwritten in place),
# so that date components such as the year can be extracted from it.
burnt_areas['FIREDATE'] = pd.to_datetime(burnt_areas['FIREDATE'])

In [None]:
# Calendar year of each fire. FIREDATE must already be a datetime column;
# the vectorised .dt accessor replaces a row-by-row apply/lambda.
burnt_areas['FIREYEAR'] = burnt_areas['FIREDATE'].dt.year
burnt_areas.head()

In [None]:
# Keep the French records whose province is one of the two Pyrenean
# departments of interest, then reproject the selection to EPSG:3035.
in_france = burnt_areas['COUNTRY'] == 'FR'
in_pyrenees = burnt_areas['PROVINCE'].isin(['Pyrénées-Atlantiques', 'Hautes-Pyrénées'])
burnt_areas_pyrenees = burnt_areas[in_france & in_pyrenees].to_crs(epsg='3035')

In [None]:
# Number of burnt-area records per fire year in the selected departments,
# most frequent year first.
burnt_areas_pyrenees['FIREYEAR'].value_counts()

In [None]:
# Span of fire years covered by the data; `bins` is the number of years.
first_year = burnt_areas_pyrenees['FIREYEAR'].min()
last_year = burnt_areas_pyrenees['FIREYEAR'].max()
bins = last_year - first_year + 1
print(bins)
print(len(burnt_areas_pyrenees[burnt_areas_pyrenees.FIREYEAR == 2021]))

# One unit-wide bin centred on every year. Passing only the bin count makes
# pandas split [first_year, last_year] into bins slightly narrower than one
# year, so the bars progressively drift away from their year ticks.
bin_edges = np.arange(first_year, last_year + 2) - 0.5

# Set the style of the plot
sn.set_style('darkgrid')

# Set the font size of the tick labels
sn.set_context('talk', font_scale=0.5)

# Create the histogram
ax = burnt_areas_pyrenees['FIREYEAR'].hist(bins=bin_edges, rwidth=0.9, color='orange')

# Axis label and tick formatting (the title is intentionally left out)
#ax.set_title('Number of Fires per year in our area of interest')
ax.set_xlabel('Year', fontsize=18)
ax.tick_params(axis='both', which='major', labelsize=12)
plt.tight_layout()

# Save the figure
plt.savefig('plots/fire_years_histogram.png', bbox_inches='tight')

# Show the plot
plt.show()

In [None]:
# Shapefile with the boundaries of the French departments (simplified
# 2017-01-02 release, according to the folder and file names).
french_departments_boundaries_path = '../data/departments_boundaries/departements-20170102-simplified/departements-20170102.shp'

In [None]:
# Restore matplotlib's original rc parameters, undoing the seaborn style and
# context applied for the histogram, so the maps below use default styling.
sn.reset_orig()

In [None]:
# Two side-by-side maps of the fire rasters with the department outlines.
fig = plt.figure(figsize=(15, 10))

years = ['before 2022', 'in 2022']
year_paths = ['data_final/fires/FIREYEAR_TRAIN.tif', 'data_final/fires/FIRE_2022.tif']

# The outlines and the colormap are identical for both panels, so they are
# prepared once instead of being re-read and reprojected on every iteration.
contours = gpd.read_file(french_departments_boundaries_path)
contours = contours[(contours.code_insee == '64') | (contours.code_insee == '65')].to_crs(epsg='3035')
colormap = ListedColormap(["white", "red"])

for idx, (period_label, p) in enumerate(zip(years, year_paths)):
    ax = plt.subplot2grid((1, 2), (0, idx), colspan=1)

    # show() receives (dataset, band index) and reads band 1 itself, so the
    # dataset only has to stay open for the duration of the call.
    with rasterio.open(p) as src:
        show((src, 1), ax=ax, cmap=colormap)

    contours.plot(ax=ax, facecolor='none', edgecolor='black')

    ax.set_title(f'Fires reported {period_label}')

# Save the figure
plt.savefig('plots/fires.png', bbox_inches='tight')
# Show the plot
plt.show()

In [None]:
# Seaborn look for this chart: dark grid with a reduced font scale.
sn.set_style('darkgrid')
sn.set_context('talk', font_scale=0.5)

# Proportion of pixels with fire reports, keyed by time period.
# The values are hard-coded; they are not computed in this cell.
fire_pixel_share = {
    '2001-2005': 0.02346117155329486,
    '2012-2017': 0.004508364275833487,
    '2018': 0.0005876928337913102,
    '2019': 0.03140133376777727,
    '2020': 0.0070092236453937785,
    '2021': 0.019533750863363748,
    '2022': 0.013105371033784043,
}
years = list(fire_pixel_share)
pixel_distribution = list(fire_pixel_share.values())

plt.bar(x=years, height=pixel_distribution, color='turquoise')

# Axis labels and tick sizes; the x tick labels are set smaller afterwards
# than the general tick size. No title is drawn.
#plt.title('Pixels with fire reports proportion distribution', fontsize=20)
plt.xlabel('Time period', fontsize=15)
plt.ylabel('Proportion of pixels with fire reports', fontsize=15)
plt.tick_params(axis='both', which='major', labelsize=12)
plt.tick_params(axis='x', which='major', labelsize=9)
plt.tight_layout()

# Write the chart to disk, then display it.
plt.savefig('plots/pixel_distribution.png', bbox_inches='tight')
plt.show()

In [None]:
import rasterio
from rasterio.mask import mask
from shapely.geometry import mapping

# Load the last fire raster listed in FIREYEAR_PATHS: its affine transform
# and its pixel values as a masked array. The dataset is closed on exit.
with rasterio.open(FIREYEAR_PATHS[-1]) as src:
    transform = src.transform
    data = src.read(masked=True)

# Department boundaries of France.
dep_boundaries = gpd.read_file(french_departments_boundaries_path, encoding='utf-8')
# Not rendered: a notebook cell only displays its last expression.
dep_boundaries.head()

# Restrict to the two departments of interest (INSEE codes 64 and 65).
pyrenees_boundaries = dep_boundaries[dep_boundaries.code_insee.isin(['64', '65'])]
pyrenees_boundaries.head()


In [None]:
# These four imports repeat names already imported in the first cell.
import numpy as np 
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
# Lowers the pandas row display limit to 4; the first cell had set it to 25,
# so every DataFrame shown from here on is truncated more aggressively.
pd.set_option('display.max_rows', 4)

In [None]:
# Read the French department boundaries (UTF-8 attribute encoding) and
# preview the first rows.
dep_boundaries = gpd.read_file(french_departments_boundaries_path, encoding='utf-8')
dep_boundaries.head()

In [None]:
# Paths to the raw input datasets, all relative to the parent directory.

# Boundaries of the French departments
french_departments_boundaries_path = '../data/departments_boundaries/departements-20170102-simplified/departements-20170102.shp'

# Historical inventory of Forest Fire Burnt Area
historical_burnt_areas_path = '../data/historical_burnt_areas/effis_layer/modis.ba.poly.shp'

# Digital Elevation Model and derived topographic variables.
# NOTE(review): `hillshare_path` looks like a typo for "hillshade"; the name
# is kept unchanged because other cells may reference it.
eudem_path = '../data/eudem/euden/EUD_CP-DEMS_3500025000-AA.tif'
slope_path = '../data/eudem/slope/EUD_CP-SLOP_3500025000-AA.tif'
aspect_path = '../data/eudem/aspect/EUD_CP-ASPC_3500025000-AA.tif'
hillshare_path = '../data/eudem/hillshade/EUD_CP-HLSD_3500025000-AA.tif'

# Land Cover. The second path used to be assigned to `land_cover_1990_path`
# as well, silently discarding the 1990 path; it now has its own name.
# NOTE(review): the 2018 path still points at a CLC1990 file name inside the
# land_cover_2018 folder; confirm the actual file name on disk.
land_cover_1990_path = '../data/corine_cover/land_cover_1990/u2000_clc1990_v2020_20u1_raster100m/DATA/U2000_CLC1990_V2020_20u1.tif'
land_cover_2018_path = '../data/corine_cover/land_cover_2018/u2000_clc1990_v2020_20u1_raster100m/DATA/U2000_CLC1990_V2020_20u1.tif'

# Protected area map
protected_areas_path = '../data/protected_areas/ens/ens.shp'

#### Boundaries of French departments

In [None]:
# Read the French department boundaries (UTF-8 attribute encoding) and
# preview the first rows. This repeats the read performed in an earlier cell.
dep_boundaries = gpd.read_file(french_departments_boundaries_path, encoding='utf-8')
dep_boundaries.head()

In [None]:
# Restrict the boundaries to the two departments of interest, identified by
# their INSEE codes (64 and 65), and preview the result.
departments_of_interest = ['64', '65']
pyrenees_boundaries = dep_boundaries[dep_boundaries.code_insee.isin(departments_of_interest)]
pyrenees_boundaries.head()

In [None]:
# Display the area of interest. The title is set on the Axes returned by
# GeoDataFrame.plot(), and its spelling now matches the department names
# used elsewhere in the notebook ('Pyrénées').
ax = pyrenees_boundaries.plot()
ax.set_title('Pyrénées')
plt.show()

In [None]:
# Load the burnt-area shapefile with geopandas and preview the first rows.
# This rebinds `burnt_areas`, so columns added to the earlier frame
# (e.g. FIREYEAR) are not present on this fresh copy.
burnt_areas = gpd.read_file(historical_burnt_areas_path)
burnt_areas.head()

In [None]:
# Keep the French records located in the two Pyrenean departments.
# NOTE(review): the original statement was cut off right after the second
# PROVINCE comparison, leaving a syntax error. The missing tail was rebuilt
# from the identical filter used earlier in the notebook ('Hautes-Pyrénées').
# No reprojection is applied here; confirm whether the lost part of the line
# also called .to_crs(...).
burnt_areas_pyrenees = burnt_areas[
    (burnt_areas['COUNTRY'] == 'FR')
    & (
        (burnt_areas['PROVINCE'] == 'Pyrénées-Atlantiques')
        | (burnt_areas['PROVINCE'] == 'Hautes-Pyrénées')
    )
]

In [None]:
# Bar chart of the ten communes with the most burnt-area records
# (value_counts sorts by descending frequency, so the first ten are the top ten).
top_communes = burnt_areas_pyrenees['COMMUNE'].value_counts().head(10)
ax = top_communes.plot(kind='bar')
ax.set_title('Top 10 communes with the highest number of fires')

plt.show()