In [1]:
import pandas as pd
import geopandas as gpd
import xarray as xr
import pyproj
from tqdm import tqdm
import numpy as np

In [2]:
import os
import matplotlib.pyplot as plt

In [3]:
# Report the interpreter version and the versions of selected packages
def check_python_version():
    """Print the Python, pandas and pyproj versions, each under its own heading."""
    import sys

    # (heading, version string) pairs, printed in this order
    versions = (
        ("Python version", sys.version),
        ("Pandas version", pd.__version__),
        ("Pyproj version", pyproj.__version__),
    )
    for heading, version in versions:
        print(heading)
        print(version)

check_python_version()

Python version
3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]
Pandas version
2.2.2
Pyproj version
3.6.1


In [4]:
# Force garbage collection
# gc.collect() runs a full collection and returns the number of unreachable
# objects it found; as the last expression, that count is the cell's output.
import gc
gc.collect()

0

In [5]:
# Read the full fire-label table from the cleaned fire data directory.
input_fire_dir = '../../Clean_Data/Fire_Data/Extended_Fire_Data/'
file_name = 'calfire_fod_fpa_1994_2020_fire_label.parquet'
fire_label_path = os.path.join(input_fire_dir, file_name)
fire_label = pd.read_parquet(fire_label_path)

In [6]:
# (rows, columns) of the loaded label table
fire_label.shape

(127833607, 6)

In [7]:
# dtype of each column in the label table
fire_label.dtypes

lon                     float64
lat                     float64
day              datetime64[ns]
IS_FIRE                   int32
min_FIRE_SIZE           float64
max_FIRE_SIZE           float64
dtype: object

In [8]:
# preview the first rows of the label table
fire_label.head()

Unnamed: 0,lon,lat,day,IS_FIRE,min_FIRE_SIZE,max_FIRE_SIZE
0,-124.391667,40.441667,1994-01-01,0,0.0,0.0
1,-124.391667,40.441667,1994-01-02,0,0.0,0.0
2,-124.391667,40.441667,1994-01-03,0,0.0,0.0
3,-124.391667,40.441667,1994-01-04,0,0.0,0.0
4,-124.391667,40.441667,1994-01-05,0,0.0,0.0


In [9]:
# date range covered by the `day` column, shown as (earliest, latest)
fire_label['day'].min(), fire_label['day'].max()

(Timestamp('1994-01-01 00:00:00'), Timestamp('2020-12-31 00:00:00'))

In [10]:
# percentage of rows with IS_FIRE set (mean of the 0/1 flag, times 100)
fire_label['IS_FIRE'].mean()*100

0.07196855518596139

## Save water year fire label

In [14]:
# Output directory for the per-water-year label files.
save_path = '../../Clean_Data/Model_Data/Evaluation/Fire_Label/Water_Year'
# exist_ok=True creates the directory (and any missing parents) only when it is
# absent, without a separate existence check that could race with another process.
os.makedirs(save_path, exist_ok=True)

In [15]:
# Split the label table into one parquet file per water year
# (Oct 1 of the previous calendar year through Sep 30 of the named year),
# collecting a log line per file along the way.
log_messages = [f"Log messages for processing water years: {pd.Timestamp.now()}"]

for year in tqdm(range(1995, 2021), desc="Processing Water Years"):
    # visual separator between water years in the log
    log_messages.append("-" * 50)

    # inclusive bounds of this water year
    wy_start = pd.Timestamp(year=year - 1, month=10, day=1)
    wy_end = pd.Timestamp(year=year, month=9, day=30)

    # rows whose day falls inside the water year (both ends included)
    in_water_year = fire_label['day'].between(wy_start, wy_end)
    fire_label_wy = fire_label[in_water_year]

    # write this water year's rows to its own parquet file
    file_path = os.path.join(save_path, f'calfire_fod_fpa_fire_label_wy_{year}.parquet')
    fire_label_wy.to_parquet(file_path)

    # record the row count, destination and observed date range
    log_messages.extend([
        f"Processed water year {year}: {len(fire_label_wy)} records saved to {file_path}",
        f"Water year {year} - Min day: {fire_label_wy['day'].min()}, Max day: {fire_label_wy['day'].max()}",
    ])

Processing Water Years: 100%|██████████| 26/26 [01:14<00:00,  2.88s/it]


In [16]:
# Write the accumulated water-year log messages to a text file.
log_save_path = '../../Logs/Clean_Extended_Data/'
# Create the log directory if needed so open() cannot fail on a missing folder.
os.makedirs(log_save_path, exist_ok=True)

# os.path.join avoids the doubled separator that f'{log_save_path}/...' produces
# when log_save_path already ends with '/'.
log_file_path = os.path.join(log_save_path, 'fire_label_save_to_each_water_year.txt')
with open(log_file_path, 'w') as log_file:
    log_file.write('\n'.join(log_messages))

## Downsample

In [17]:
# class balance before downsampling: row counts per IS_FIRE value
fire_label['IS_FIRE'].value_counts()

IS_FIRE
0    127741607
1        92000
Name: count, dtype: int64

In [18]:
# Partition the label table into non-fire (0) and fire (1) rows.
is_fire_flag = fire_label['IS_FIRE']
fire_0 = fire_label[is_fire_flag == 0]
fire_1 = fire_label[is_fire_flag == 1]
# Sanity check: the two subsets together must account for every row
n_partitioned = len(fire_0) + len(fire_1)
assert n_partitioned == len(fire_label), "Mismatch in total number of records"

In [19]:
# Keep every fire row and draw 100 non-fire rows per fire row, with a fixed
# seed so the draw is repeatable. Note: fire_0 and fire_label are rebound here,
# so from this point on fire_label refers to the downsampled table.
n_non_fire = 100 * len(fire_1)
fire_0 = fire_0.sample(n=n_non_fire, random_state=123)
fire_label = pd.concat([fire_0, fire_1])

In [20]:
# class balance after downsampling: row counts per IS_FIRE value
fire_label['IS_FIRE'].value_counts() # fire rate is ~1% (one fire row per 100 non-fire rows)

IS_FIRE
0    9200000
1      92000
Name: count, dtype: int64

In [28]:
# Output location for the downsampled label table.
save_path = '../../Clean_Data/Model_Data/Downsample/Fire_Label'
# NOTE(review): the name says 1994_2000, but the table was loaded from the
# 1994_2020 file — this looks like a typo. Left unchanged in case other code
# reads this exact file name; confirm before renaming.
file_name = 'calfire_fod_fpa_1994_2000_fire_label_downsampled.parquet'
# exist_ok=True creates the directory only when missing, with no separate check.
os.makedirs(save_path, exist_ok=True)
fire_label.to_parquet(os.path.join(save_path, file_name))