In [1]:
import pandas as pd
import geopandas as gpd
import xarray as xr
import pyproj
from tqdm import tqdm

In [2]:
import os

In [3]:
# Report the interpreter version and the versions of the key packages
def check_python_version():
    """Print the Python, pandas and pyproj versions, each under its own heading."""
    import sys

    # (heading, version string) pairs, printed in this order
    version_report = (
        ("Python version", sys.version),
        ("Pandas version", pd.__version__),
        ("Pyproj version", pyproj.__version__),
    )
    for heading, version in version_report:
        print(heading)
        print(version)

check_python_version()

Python version
3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]
Pandas version
2.2.2
Pyproj version
3.6.1


In [4]:
# Force a full garbage collection pass before loading the large feature table.
# gc.collect() returns the number of unreachable objects it found; that return
# value is what the notebook displays for this cell.
import gc
gc.collect()

0

In [5]:
# Folder and file name of the merged daily feature table
input_path = '../../Clean_Data/Extended_Feature_Data/'
file_name = 'Weather_Data_w_Veg_SubRegion_Filter_Merged_Add_population_lai_Completed.parquet'
# input_path already ends with a separator, so the two parts are joined as-is
features = pd.read_parquet(f"{input_path}{file_name}")

In [5]:
# Inspect the dtype of every column in the loaded feature table
features.dtypes

day                                          datetime64[ns]
lat                                                 float64
lon                                                 float64
SWE                                                 float32
year                                                  int32
dead_fuel_moisture_1000hr                           float32
dead_fuel_moisture_100hr                            float32
max_air_temperature                                 float64
max_relative_humidity                               float32
min_air_temperature                                 float64
min_relative_humidity                               float32
precipitation_amount                                float32
specific_humidity                                   float32
surface_downwelling_shortwave_flux_in_air           float32
wind_from_direction                                 float32
wind_speed                                          float32
population_density                      

In [6]:
# (rows, columns) of the feature table
features.shape

(127137467, 19)

In [7]:
# Missing rate per column: the mean of the null mask, i.e. the fraction of
# null entries in each column (0.0 means the column has no missing values)
features.isnull().mean()

day                                          0.0
lat                                          0.0
lon                                          0.0
SWE                                          0.0
year                                         0.0
dead_fuel_moisture_1000hr                    0.0
dead_fuel_moisture_100hr                     0.0
max_air_temperature                          0.0
max_relative_humidity                        0.0
min_air_temperature                          0.0
min_relative_humidity                        0.0
precipitation_amount                         0.0
specific_humidity                            0.0
surface_downwelling_shortwave_flux_in_air    0.0
wind_from_direction                          0.0
wind_speed                                   0.0
population_density                           0.0
LAI                                          0.0
wind_direction_category                      0.0
dtype: float64

In [6]:
# Date range covered by the features, shown as (earliest day, latest day)
features['day'].min(), features['day'].max()

(Timestamp('1994-01-01 00:00:00'), Timestamp('2020-09-30 00:00:00'))

## Merge static features

In [9]:
# Static (time-invariant) inputs, one parquet file per dataset.
veg_data = pd.read_parquet('../../Clean_Data/lon_lat_pair_weather_match_veg_v2.parquet')
slope_data = pd.read_parquet('../../Clean_Data/lon_lat_pair_weather_match_slope.parquet')
road_density_data = pd.read_parquet('../../Clean_Data/road_density_match_weather_grid.parquet')

# newly added in extended data
powerline_data = pd.read_parquet('../../Clean_Data/Extended_Data_w_Veg_Filter/Powerline/transmission_line_density.parquet')
subregion_data = pd.read_parquet('../../Clean_Data/Extended_Data_w_Veg_Filter/SubRegion/lon_lat_pair_weather_match_subregion.parquet')

In [10]:
# Report the dimensions of every static dataset, one sentence per dataset
static_shapes = {
    "Vegetation": veg_data,
    "Slope": slope_data,
    "Road density": road_density_data,
    "Powerline": powerline_data,
    "Subregion": subregion_data,
}
for label, frame in static_shapes.items():
    print(f"{label} data shape: {frame.shape}")

Vegetation data shape: (17703, 7)
Slope data shape: (62160, 5)
Road density data shape: (17703, 4)
Powerline data shape: (14383, 3)
Subregion data shape: (13048, 3)


In [21]:
# Keep only the join keys (lon, lat) plus the attributes used in the merge
veg_columns = ['lon', 'lat', 'fire_attribute', 'veg']
slope_columns = ['lon', 'lat', 'slope_avg', 'slope_max']
road_columns = ['lon', 'lat', 'road_density_km_km2']

veg_data = veg_data.loc[:, veg_columns]
slope_data = slope_data.loc[:, slope_columns]
road_density_data = road_density_data.loc[:, road_columns]

In [22]:
# Join the five static datasets on (lon, lat), one inner merge per dataset.
data_list = [veg_data, slope_data, road_density_data, powerline_data, subregion_data]
# Names parallel to data_list, used only to make the error message actionable
data_names = ['veg_data', 'slope_data', 'road_density_data', 'powerline_data', 'subregion_data']

# Start from the unique grid cells of the subregion table; every inner merge
# below can only keep or shrink this set of (lon, lat) pairs.
static_features = subregion_data[['lon', 'lat']].drop_duplicates()
for name, data in zip(data_names, data_list):
    # A repeated (lon, lat) key would silently multiply rows in the merge,
    # so refuse to continue and say which dataset is at fault.
    n_duplicates = int(data.duplicated(subset=['lon', 'lat']).sum())
    if n_duplicates:
        raise ValueError(
            f"{name} contains {n_duplicates} non-unique lon and lat pairs: {data.shape}"
        )
    static_features = static_features.merge(data, on=['lon', 'lat'], how='inner')

In [23]:
# (rows, columns) of the merged static feature table
static_features.shape

(13048, 9)

In [24]:
# Column names of the merged static feature table
static_features.columns

Index(['lon', 'lat', 'fire_attribute', 'veg', 'slope_avg', 'slope_max',
       'road_density_km_km2', 'line_density_km_per_cell', 'SubRegion'],
      dtype='object')

In [25]:
# Persist the merged static features to ../../Clean_Data/static_features.parquet;
# index=False keeps the DataFrame index out of the file.
static_features.to_parquet('../../Clean_Data/static_features.parquet', index=False)

## Merge daily features with static features and labels

### Downsample

In [26]:
# Folder and file name of the downsampled fire labels
save_path = '../../Clean_Data/Model_Data/Downsample/Fire_Label'
file_name = 'calfire_fod_fpa_1994_2000_fire_label_downsampled.parquet'
fire_label = pd.read_parquet(f"{save_path}/{file_name}")

In [27]:
# Column names of the fire label table
fire_label.columns

Index(['lon', 'lat', 'day', 'IS_FIRE', 'min_FIRE_SIZE', 'max_FIRE_SIZE'], dtype='object')

In [28]:
# Date range of the daily features, shown as (earliest day, latest day)
features['day'].min(), features['day'].max()

(Timestamp('1994-01-01 00:00:00'), Timestamp('2020-09-30 00:00:00'))

In [29]:
# Date range of the fire labels, shown as (earliest day, latest day)
fire_label['day'].min(), fire_label['day'].max()

(Timestamp('1994-01-01 00:00:00'), Timestamp('2020-12-31 00:00:00'))

In [31]:
# (rows, columns) of the feature table and of the label table
features.shape, fire_label.shape

((127137467, 19), (9292000, 6))

In [32]:
# Inner-join daily features to labels on (lon, lat, day), then attach the
# static features on (lon, lat); rows without a match on either side drop out.
mod_data = (
    features
    .merge(fire_label, on=['lon', 'lat', 'day'], how='inner')
    .merge(static_features, on=['lon', 'lat'], how='inner')
)

In [33]:
# (rows, columns) of the merged modelling table
mod_data.shape

(9180502, 29)

In [34]:
# Column names of the merged modelling table
mod_data.columns

Index(['day', 'lat', 'lon', 'SWE', 'year', 'dead_fuel_moisture_1000hr',
       'dead_fuel_moisture_100hr', 'max_air_temperature',
       'max_relative_humidity', 'min_air_temperature', 'min_relative_humidity',
       'precipitation_amount', 'specific_humidity',
       'surface_downwelling_shortwave_flux_in_air', 'wind_from_direction',
       'wind_speed', 'population_density', 'LAI', 'wind_direction_category',
       'IS_FIRE', 'min_FIRE_SIZE', 'max_FIRE_SIZE', 'fire_attribute', 'veg',
       'slope_avg', 'slope_max', 'road_density_km_km2',
       'line_density_km_per_cell', 'SubRegion'],
      dtype='object')

In [36]:
# Sanity check: no row should have a veg value matching Water, Urban or
# Agriculture, so the selection below is expected to have zero rows.
excluded_veg_mask = mod_data['veg'].str.contains('Water|Urban|Agriculture')
data_check = mod_data.loc[excluded_veg_mask]
data_check.shape

(0, 29)

In [37]:
# Distinct vegetation classes present in the modelling table
# (note: the recorded values carry a trailing space, e.g. 'Riparian ')
mod_data['veg'].unique()

array(['Native Coastal Sage Scrub ', 'Native Conifer Forest ',
       'Native Grassland ', 'Native Chapparal ', 'Native Desert ',
       'Riparian ', 'Native Inland Scrub ', 'Native Conifer Alpine ',
       'Non-native forest ', 'Non-native grassland ', 'Barren ',
       'Native Wetland ', 'Non-native shrub ', 'Native Oak Woodland '],
      dtype=object)

In [38]:
# Class balance of the label: normalize=True reports proportions, not counts
mod_data['IS_FIRE'].value_counts(normalize=True)

IS_FIRE
0    0.990047
1    0.009953
Name: proportion, dtype: float64

In [39]:
# Persist the merged downsampled modelling table (plain string: the path has no placeholders)
mod_data.to_parquet("../../Clean_Data/Model_Data/Downsample/Features_w_Label/features_w_label_downsample_1994_2020.parquet")

#### Remove riparian vegetation

In [5]:
# Read the downsampled modelling table from parquet (plain string: the path has no placeholders)
mod_data = pd.read_parquet('../../Clean_Data/Model_Data/Downsample/Features_w_Label/features_w_label_downsample_1994_2020.parquet')

In [15]:
# Drop every row whose veg value contains 'Riparian', then save the result
# next to the unfiltered file with a _no_riparian suffix.
save_path = '../../Clean_Data/Model_Data/Downsample/Features_w_Label'
is_riparian = mod_data['veg'].str.contains('Riparian')
mod_data = mod_data[~is_riparian]
mod_data.to_parquet(f"{save_path}/features_w_label_downsample_1994_2020_no_riparian.parquet")

In [16]:
# Drop the name so the filtered table can be garbage-collected
del mod_data

### Each Water Year

In [7]:
# Read the static features from ../../Clean_Data/static_features.parquet
static_features = pd.read_parquet('../../Clean_Data/static_features.parquet')

In [8]:
# Build one features+label parquet file per water year.
# Water year Y runs from Oct 1 of Y-1 through Sep 30 of Y (see start_date / end_date).
log_messages = []
# first log entry records when this run started
log_messages.append(f"Log messages for processing water years feature merge label: {pd.Timestamp.now()}")

file_path = '../../Clean_Data/Model_Data/Evaluation/Fire_Label/Water_Year'
save_path = '../../Clean_Data/Model_Data/Evaluation/Features_w_Label/Extended_Data_Water_Year'
# exist_ok=True creates the folder only when missing, without a separate
# existence check that could race with another process
os.makedirs(save_path, exist_ok=True)

for year in tqdm(range(1995, 2021), desc="Processing Water years"):
    # add separator line in log
    log_messages.append("="*50)
    log_messages.append(f"Processing water year: {year}")
    file_name = os.path.join(file_path, f'calfire_fod_fpa_fire_label_wy_{year}.parquet')
    fire_label = pd.read_parquet(file_name)

    # inner joins: keep only rows that have features, a label and static attributes
    # NOTE(review): the full feature table is merged on every iteration and only
    # trimmed to the water year afterwards; pre-filtering `features` by date would
    # likely be faster but changes the saved index, so it is left as is.
    mod_data = pd.merge(features, fire_label, on=['lon', 'lat', 'day'], how='inner')
    mod_data = pd.merge(mod_data, static_features, on=['lon', 'lat'], how='inner')

    # restrict to the water year window, both bounds inclusive
    start_date = pd.Timestamp(year=year -1, month=10, day=1)
    end_date = pd.Timestamp(year=year, month=9, day=30)
    mod_data = mod_data[(mod_data['day'] >= start_date) & (mod_data['day'] <= end_date)]

    log_message = f"Feature w label data saved, min day: {mod_data['day'].min()}, max day: {mod_data['day'].max()}"
    log_messages.append(log_message)

    # save to parquet
    mod_data.to_parquet(f"{save_path}/{year}_features_w_label.parquet")
    log_messages.append(f"Data saved for year {year} with shape: {mod_data.shape}")
    log_messages.append(f"Saved path: {save_path}/{year}_features_w_label.parquet")

    # free up memory: drop the references first, then collect, so this
    # iteration's frames are already unreferenced when the collector runs
    del mod_data, fire_label
    gc.collect()

Processing Water years: 100%|██████████| 26/26 [36:29<00:00, 84.23s/it]


In [10]:
# Save the log messages to a log file, one message per line
log_path = '../../Logs/Clean_Extended_Data/water_year_evaluation_data_log.txt'
# Create the log folder if it is missing; otherwise open() would raise
# FileNotFoundError and the collected messages would not be written
os.makedirs(os.path.dirname(log_path), exist_ok=True)
# Explicit encoding so the file does not depend on the platform default
with open(log_path, 'w', encoding='utf-8') as log_file:
    log_file.write('\n'.join(log_messages))

In [11]:
# Drop the name so the full feature table can be garbage-collected
del features

In [16]:
# One summary record per water year is collected in a plain list and turned
# into a DataFrame once, after the loop.
results = []
save_path = '../../Clean_Data/Model_Data/Evaluation/Features_w_Label/Extended_Data_Water_Year'
for year in tqdm(range(1995, 2021), desc="Summarizing Water Years"):

    fire_label_path = os.path.join(save_path, f'{year}_features_w_label.parquet')
    mod_data = pd.read_parquet(fire_label_path)

    # per-column missing rate, keeping only columns that actually have gaps
    missing_rates = mod_data.isnull().mean().loc[lambda rates: rates > 0]
    if len(missing_rates) > 0:
        print(f"Missing rates for year {year}:")
        print(missing_rates)

    # row count and number of fire rows (sum of the IS_FIRE column)
    total_rows = len(mod_data)
    num_fires = mod_data['IS_FIRE'].sum()

    # share of fire rows, expressed in percent
    fire_percentage = (num_fires / total_rows) * 100

    # first and last day present in this water year's file
    day_values = mod_data['day']
    min_day = day_values.min()
    max_day = day_values.max()

    summary_row = {
        'Water Year': year,
        'Start Datetime': min_day,
        'End Datetime': max_day,
        'Total Rows': total_rows,
        'Number of Fires': num_fires,
        'Fire Percentage': fire_percentage
    }
    results.append(summary_row)

result = pd.DataFrame(results)

Summarizing Water Years: 100%|██████████| 26/26 [00:53<00:00,  2.04s/it]


In [18]:
# Give the last column an explicit unit: 'Fire Percentage' -> 'Fire Percentage (%)'
result = result.rename(columns={'Fire Percentage': 'Fire Percentage (%)'})

In [19]:
# Display the per-water-year summary table
result

Unnamed: 0,Water Year,Start Datetime,End Datetime,Total Rows,Number of Fires,Fire Percentage (%)
0,1995,1994-10-01,1995-09-30,4703262,3331,0.070823
1,1996,1995-10-01,1996-09-30,4699385,4553,0.096885
2,1997,1996-10-01,1997-09-30,4693711,3958,0.084326
3,1998,1997-10-01,1998-09-30,4714847,2859,0.060638
4,1999,1998-10-01,1999-09-30,4703945,4183,0.088925
5,2000,1999-10-01,2000-09-30,4736858,3785,0.079905
6,2001,2000-10-01,2001-09-30,4733613,3774,0.079728
7,2002,2001-10-01,2002-09-30,4734891,3722,0.078608
8,2003,2002-10-01,2003-09-30,4737424,3344,0.070587
9,2004,2003-10-01,2004-09-30,4748570,3742,0.078803


In [17]:
import gc

In [18]:
# Write a riparian-free copy of every per-water-year file into a sibling folder.
input_path = '../../Clean_Data/Model_Data/Evaluation/Features_w_Label/Extended_Data_Water_Year'
save_path = '../../Clean_Data/Model_Data/Evaluation/Features_w_Label/Extended_Data_Water_Year_no_riparian'
# exist_ok=True creates the folder only when missing, without a separate
# existence check that could race with another process
os.makedirs(save_path, exist_ok=True)
# NOTE(review): the progress label below says "Summarizing" although this loop
# filters and re-saves; it is kept unchanged to match the recorded output.
for year in tqdm(range(1995, 2021), desc="Summarizing Water Years"):

    # despite its name, this is the path of the features+label file for the year
    fire_label_path = os.path.join(input_path, f'{year}_features_w_label.parquet')
    mod_data = pd.read_parquet(fire_label_path)

    # remove veg containing 'Riparian'
    mod_data = mod_data[~mod_data['veg'].str.contains('Riparian')]
    # save under the same file name, in the no-riparian folder
    mod_data.to_parquet(f"{save_path}/{year}_features_w_label.parquet")

    # free up memory: drop the reference, then collect
    del mod_data
    gc.collect()

Summarizing Water Years: 100%|██████████| 26/26 [05:07<00:00, 11.85s/it]
