In [1]:
import pandas as pd
import geopandas as gpd
import xarray as xr
import pyproj
from tqdm import tqdm
import numpy as np

In [2]:
import os
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature

In [3]:
# Report the interpreter version and the versions of the core packages
def check_python_version():
    """Print the Python version, then the pandas, geopandas, xarray and pyproj versions.

    Each entry is printed as a heading line followed by the version string.
    """
    import sys

    version_report = [
        ("Python version", sys.version),
        ("Pandas version", pd.__version__),
        ("Geopandas version", gpd.__version__),
        ("Xarray version", xr.__version__),
        ("Pyproj version", pyproj.__version__),
    ]
    for heading, version in version_report:
        print(heading)
        print(version)


check_python_version()

Python version
3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]
Pandas version
2.2.2
Geopandas version
0.14.2
Xarray version
2023.6.0
Pyproj version
3.6.1


In [9]:
# Force a full garbage-collection pass (presumably to free memory before the
# large label table is loaded in the next cell). The value displayed by the
# cell is gc.collect()'s return value: the number of unreachable objects found.
import gc
gc.collect()

0

In [10]:
# Load the 2001-2020 fire-label table (with fire-size columns) from parquet
fire_label_path = '../Clean_Data/Fire_Data/calfire_fod_fpa_2001_2020_fire_label_w_size.parquet'
fire_label = pd.read_parquet(fire_label_path)

In [11]:
# Size of the loaded label table as (rows, columns)
fire_label.shape

(126571680, 7)

In [12]:
# Column dtypes of the loaded label table
fire_label.dtypes

lon                                 float64
lat                                 float64
day                          datetime64[ns]
IS_FIRE                               int32
NWCG_CAUSE_CLASSIFICATION            object
min_FIRE_SIZE                       float64
max_FIRE_SIZE                       float64
dtype: object

In [13]:
# Preview the first rows of the label table
fire_label.head()

Unnamed: 0,lon,lat,day,IS_FIRE,NWCG_CAUSE_CLASSIFICATION,min_FIRE_SIZE,max_FIRE_SIZE
0,-124.391667,40.441667,2001-01-01,0,,0.0,0.0
1,-124.391667,40.441667,2001-01-02,0,,0.0,0.0
2,-124.391667,40.441667,2001-01-03,0,,0.0,0.0
3,-124.391667,40.441667,2001-01-04,0,,0.0,0.0
4,-124.391667,40.441667,2001-01-05,0,,0.0,0.0


In [14]:
# Date range covered by the labels: (earliest day, latest day)
day_values = fire_label['day']
(day_values.min(), day_values.max())

(Timestamp('2001-01-01 00:00:00'), Timestamp('2020-12-31 00:00:00'))

Confirmed, as expected: the fire labels span 2001-01-01 through 2020-12-31, so there are no fire labels after 2020.

In [16]:
# Derive the calendar year of every record, store it as 'Year',
# then display the number of rows per year.
year_of_day = fire_label['day'].dt.year
fire_label['Year'] = year_of_day
fire_label['Year'].value_counts()

Year
2004    6388855
2002    6371811
2001    6368061
2003    6367296
2014    6363672
2016    6351446
2013    6350317
2015    6350163
2012    6332613
2005    6329204
2017    6323457
2018    6320812
2019    6318460
2010    6309207
2006    6305972
2011    6294488
2020    6293301
2009    6288737
2008    6281645
2007    6262163
Name: count, dtype: int64

In [17]:
# Save one parquet file per year for evaluation.
# NOTE: the helper 'Year' column is part of fire_label at this point, so it is
# written to every per-year file as well.
eval_label_dir = '../Clean_Data/Model_Data/Evaluation/Fire_Label'
# Create the output folder up front so to_parquet does not fail when it is missing
os.makedirs(eval_label_dir, exist_ok=True)
# One groupby pass splits the table by year without re-scanning the whole
# frame once per year; it also covers exactly the years present in the data.
for year, year_rows in fire_label.groupby('Year', sort=True):
    year_rows.to_parquet(f'{eval_label_dir}/calfire_fod_fpa_{year}_fire_label_w_size.parquet')

In [19]:
# Share of rows flagged as fire, expressed in percent
100 * fire_label['IS_FIRE'].mean()

0.11995890391910734

In [23]:
# Downsample the no-fire rows to a fixed number of negatives per fire row.
# NOTE: this rebinds `fire_label` to the downsampled table; reload the parquet
# before re-running any cell that needs the full data.
NEG_PER_POS = 100  # IS_FIRE == 0 rows kept for every IS_FIRE == 1 row
fire_0 = fire_label[fire_label['IS_FIRE'] == 0]
fire_1 = fire_label[fire_label['IS_FIRE'] == 1]
# Cap the request at the number of available negatives: sample() without
# replacement raises a ValueError when asked for more rows than exist.
n_negatives = min(len(fire_0), NEG_PER_POS * len(fire_1))
fire_0 = fire_0.sample(n=n_negatives, random_state=1)  # fixed seed for reproducibility
fire_label = pd.concat([fire_0, fire_1])

In [26]:
# Persist the downsampled 2001-2020 label table.
downsample_dir = '../Clean_Data/Model_Data/Downsample/Fire_Label'
# Make sure the target folder exists so the write does not fail when it is missing
os.makedirs(downsample_dir, exist_ok=True)
fire_label.to_parquet(f'{downsample_dir}/calfire_fod_fpa_2001_2020_fire_label_w_size_downsample.parquet')

In [24]:
# Rows per year in fire_label after the downsampling cell has rebound it
fire_label['Year'].value_counts()

Year
2004    773255
2001    772091
2014    771848
2003    770840
2016    768860
2002    768633
2015    768244
2013    767888
2005    767770
2017    767543
2012    766053
2018    765725
2006    765484
2010    764617
2019    764282
2020    764017
2009    762709
2011    762375
2007    762073
2008    760927
Name: count, dtype: int64

In [25]:
# Class balance after downsampling: number of IS_FIRE == 0 vs IS_FIRE == 1 rows
fire_label['IS_FIRE'].value_counts()

IS_FIRE
0    15183400
1      151834
Name: count, dtype: int64

In [15]:
# for year in range(2001, 2021):
#     # read the data
#     fire_label = pd.read_parquet(f'../Clean_Data/Fire_Data/calfire_fod_fpa_{year}_fire_label_w_size.parquet')
#     # add month column
#     #fire_label['Month'] = fire_label['day'].dt.month
#     fire_0 = fire_label[fire_label['IS_FIRE'] == 0]
#     fire_1 = fire_label[fire_label['IS_FIRE'] == 1]
#     # downsample fire_0 by keeping only 100*len(fire_1) samples
#     fire_0 = fire_0.sample(n=100*len(fire_1), random_state=1)
#     # combine fire_0 and fire_1
#     fire_label = pd.concat([fire_0, fire_1])
#     # drop month column
#     #fire_label = fire_label.drop(columns=['Month'])
#     # save the data
#     fire_label.to_parquet(f'../Clean_Data/Model_Data/Downsample/Fire_Label/calfire_fod_fpa_{year}_fire_label_w_size_downsample.parquet')