In [1]:
import pandas as pd
import geopandas as gpd
import xarray as xr
import pyproj
from tqdm import tqdm
import numpy as np

In [2]:
import os
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature

In [4]:
# Report the interpreter version and the versions of the key libraries.
def check_python_version():
    """Print the Python version followed by four library versions.

    Each entry is written as two lines: a "<Name> version" heading and then
    the version string itself. Covers pandas, geopandas, xarray and pyproj.
    Returns nothing.
    """
    import sys

    # (heading, version string) pairs, listed in the order they are printed.
    report = (
        ("Python version", sys.version),
        ("Pandas version", pd.__version__),
        ("Geopandas version", gpd.__version__),
        ("Xarray version", xr.__version__),
        ("Pyproj version", pyproj.__version__),
    )
    for heading, version in report:
        print(heading)
        print(version)

check_python_version()

Python version
3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]
Pandas version
2.2.2
Geopandas version
0.14.2
Xarray version
2023.6.0
Pyproj version
3.6.1


In [7]:
# Read the downsampled fire-label table and the separate 2023 OOT label table
# (OOT presumably means "out-of-time" hold-out; confirm with the data owner).
fire_label = pd.read_parquet(
    path='../Clean_Data/Model_Data/fire_label_downsample.parquet',
)
fire_label_OOT = pd.read_parquet(
    path='../Clean_Data/Model_Data/fire_label_data_2023_OOT.parquet',
)

In [8]:
# Show the label table's (rows, columns) together with its per-column dtypes.
fire_label.shape, fire_label.dtypes

((649228, 5),
 lon               float64
 lat               float64
 day        datetime64[ns]
 IS_FIRE             int32
 month               int32
 dtype: object)

In [9]:
# Same overview for the 2023 OOT label table: (rows, columns) and dtypes.
fire_label_OOT.shape, fire_label_OOT.dtypes

((5834976, 4),
 lon               float64
 lat               float64
 day        datetime64[ns]
 IS_FIRE             int32
 dtype: object)

In [10]:
# Preview the first rows of the downsampled label table.
fire_label.head()

Unnamed: 0,lon,lat,day,IS_FIRE,month
164175,-124.308333,40.233333,2013-07-12,1,7
203998,-124.266667,40.316667,2007-07-27,1,7
228923,-124.266667,40.191667,2007-10-25,1,10
230275,-124.266667,40.191667,2011-07-08,1,7
230695,-124.266667,40.191667,2012-08-31,1,8


In [3]:
# Force a garbage-collection pass to release memory held by objects that are
# no longer referenced. The cell displays the integer returned by gc.collect().
import gc
gc.collect()

54

In [11]:
# Read the merged feature table. It is large (the recorded shape in this
# notebook is about 135 million rows), so this is the memory-heavy step.
features = pd.read_parquet(
    path='../Clean_Data/Weather_Data_w_Veg_Filter_Fire_12KM_Merged.parquet',
)

In [11]:
# Show the (rows, columns) of the feature table.
features.shape

(134727600, 17)

In [12]:
# Attach the feature columns to every fire label by matching on location
# (lon, lat) and day. The inner join keeps only labels with a feature row.
mod_data = features.merge(fire_label, on=['lon', 'lat', 'day'], how='inner')

# The join keys include float coordinates, so a key mismatch would silently
# drop labels and a repeated key would silently duplicate them. Fail loudly
# if the merged row count no longer equals the number of labels.
assert len(mod_data) == len(fire_label), (
    f"feature/label merge changed the row count: "
    f"{len(fire_label)} labels -> {len(mod_data)} merged rows"
)
mod_data.shape

(649228, 19)

In [13]:
# Discard the 'month' column that came in with the label table.
mod_data = mod_data.drop('month', axis=1)

In [14]:
# List the remaining columns of the merged table.
mod_data.columns

Index(['lon', 'lat', 'day', 'dead_fuel_moisture_1000hr',
       'dead_fuel_moisture_100hr', 'pdsi', 'category', 'air_temperature_x',
       'relative_humidity_x', 'max_wind_speed', 'air_temperature_y',
       'relative_humidity_y', 'precipitation_amount', 'specific_humidity',
       'surface_downwelling_shortwave_flux_in_air', 'wind_from_direction',
       'wind_speed', 'IS_FIRE'],
      dtype='object')

In [15]:
# Persist the merged label + feature table so later steps can reload it
# without repeating the join against the full feature table.
mod_data.to_parquet(
    path='../Clean_Data/Model_Data/fire_label_weather_downsampled.parquet',
)

In [16]:
# Attach the feature columns to every 2023 OOT label by matching on location
# (lon, lat) and day. The inner join keeps only labels with a feature row.
mod_data_OOT = features.merge(fire_label_OOT, on=['lon', 'lat', 'day'], how='inner')

# The join keys include float coordinates, so a key mismatch would silently
# drop labels and a repeated key would silently duplicate them. Fail loudly
# if the merged row count no longer equals the number of labels.
assert len(mod_data_OOT) == len(fire_label_OOT), (
    f"feature/OOT-label merge changed the row count: "
    f"{len(fire_label_OOT)} labels -> {len(mod_data_OOT)} merged rows"
)
mod_data_OOT.shape

(5834976, 18)

In [18]:
# List the columns of the merged OOT table.
mod_data_OOT.columns

Index(['lon', 'lat', 'day', 'dead_fuel_moisture_1000hr',
       'dead_fuel_moisture_100hr', 'pdsi', 'category', 'air_temperature_x',
       'relative_humidity_x', 'max_wind_speed', 'air_temperature_y',
       'relative_humidity_y', 'precipitation_amount', 'specific_humidity',
       'surface_downwelling_shortwave_flux_in_air', 'wind_from_direction',
       'wind_speed', 'IS_FIRE'],
      dtype='object')

In [19]:
# Persist the merged 2023 OOT label + feature table.
mod_data_OOT.to_parquet(
    path='../Clean_Data/Model_Data/fire_label_weather_2023_OOT.parquet',
)

--- Merge all features and labels together ---

In [21]:
# Reload the two label + feature tables that were written to Parquet earlier
# in this notebook.
fire_label_weather_downsampled = pd.read_parquet(
    path='../Clean_Data/Model_Data/fire_label_weather_downsampled.parquet',
)
fire_label_weather_2023_OOT = pd.read_parquet(
    path='../Clean_Data/Model_Data/fire_label_weather_2023_OOT.parquet',
)

In [20]:
# Read the per-location lookup tables (keyed by lon/lat) for vegetation and slope.
veg_data = pd.read_parquet(
    path='../Clean_Data/lon_lat_pair_weather_match_veg.parquet',
)
slope_data = pd.read_parquet(
    path='../Clean_Data/lon_lat_pair_weather_match_slope.parquet',
)

In [22]:
# Inspect the column dtypes of both lookup tables.
veg_data.dtypes, slope_data.dtypes

(lon         float64
 lat         float64
 type        float32
 distance    float64
 dtype: object,
 lon          float64
 lat          float64
 slope_avg    float32
 slope_max    float32
 distance     float64
 dtype: object)

In [23]:
# Remove the 'distance' column and turn the numeric 'type' code into a string.
# NOTE(review): casting to str turns any missing value into the literal text
# 'nan'; confirm 'type' has no missing values.
veg_data = veg_data.drop(columns=['distance']).astype({'type': str})

In [24]:
# Remove the 'distance' column; the slope values and lon/lat keys are kept.
slope_data = slope_data.drop('distance', axis=1)

In [25]:
# Count locations that appear more than once in veg_data. The table is later
# joined on (lon, lat), so a repeated coordinate pair would duplicate rows in
# that join even when its other columns differ. A whole-row duplicate check
# would miss that case, so the check is restricted to the join keys.
veg_data.duplicated(subset=['lon', 'lat']).sum()

0

In [26]:
# Count locations that appear more than once in slope_data. The table is later
# joined on (lon, lat), so a repeated coordinate pair would duplicate rows in
# that join even when its slope values differ. A whole-row duplicate check
# would miss that case, so the check is restricted to the join keys.
slope_data.duplicated(subset=['lon', 'lat']).sum()

0

In [27]:
# Compare the (rows, columns) of the four tables that feed the final joins.
fire_label_weather_downsampled.shape, fire_label_weather_2023_OOT.shape, veg_data.shape, slope_data.shape

((649228, 18), (5834976, 18), (17703, 3), (62160, 4))

In [28]:
# Left-join veg_data, then slope_data, onto the downsampled label + feature
# table by location (lon, lat). Rows without a match keep missing values in
# the added columns.
mod_data_downsampled = (
    fire_label_weather_downsampled
    .merge(veg_data, on=['lon', 'lat'], how='left')
    .merge(slope_data, on=['lon', 'lat'], how='left')
)

# A left join only adds rows when a (lon, lat) pair repeats in a lookup table.
# Fail loudly on that fan-out instead of silently duplicating samples.
assert len(mod_data_downsampled) == len(fire_label_weather_downsampled), (
    f"veg/slope join changed the row count: "
    f"{len(fire_label_weather_downsampled)} -> {len(mod_data_downsampled)}"
)
mod_data_downsampled.shape

(649228, 21)

In [29]:
# Left-join veg_data, then slope_data, onto the 2023 OOT label + feature table
# by location (lon, lat). Rows without a match keep missing values in the
# added columns.
mod_data_OOT = (
    fire_label_weather_2023_OOT
    .merge(veg_data, on=['lon', 'lat'], how='left')
    .merge(slope_data, on=['lon', 'lat'], how='left')
)

# A left join only adds rows when a (lon, lat) pair repeats in a lookup table.
# Fail loudly on that fan-out instead of silently duplicating samples.
assert len(mod_data_OOT) == len(fire_label_weather_2023_OOT), (
    f"veg/slope join changed the row count: "
    f"{len(fire_label_weather_2023_OOT)} -> {len(mod_data_OOT)}"
)
mod_data_OOT.shape

(5834976, 21)

In [30]:
# Remove the lon/lat columns, then write the downsampled table to Parquet.
mod_data_downsampled = mod_data_downsampled.drop(['lon', 'lat'], axis=1)
mod_data_downsampled.to_parquet(
    path='../Clean_Data/Model_Data/Features_w_Label_downsampled.parquet',
)

In [31]:
# Remove the lon/lat columns, then write the 2023 OOT table to Parquet.
mod_data_OOT = mod_data_OOT.drop(['lon', 'lat'], axis=1)
mod_data_OOT.to_parquet(
    path='../Clean_Data/Model_Data/Features_w_Label_2023_OOT.parquet',
)