In [111]:
import pandas as pd
import geopandas as gpd
import xarray as xr
import pyproj
from tqdm import tqdm
import numpy as np

In [112]:
import os
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature

In [113]:
# Report the interpreter version and the versions of the core data libraries
def check_python_version():
    """Print the Python version, then the pandas, geopandas, xarray and pyproj versions.

    Each entry is written as a label line followed by the version string on
    its own line.
    """
    import sys

    labelled_versions = [
        ("Python version", sys.version),
        ("Pandas version", pd.__version__),
        ("Geopandas version", gpd.__version__),
        ("Xarray version", xr.__version__),
        ("Pyproj version", pyproj.__version__),
    ]
    for label, version in labelled_versions:
        print(label)
        print(version)

check_python_version()

Python version
3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]
Pandas version
2.2.2
Geopandas version
0.14.2
Xarray version
2023.6.0
Pyproj version
3.6.1


In [114]:
# Load the filtered wind-speed table; only its (lon, lat, day) grid is used below.
weather_data = pd.read_parquet(
    '../Clean_Data/Weather_Data_w_Veg_Filter_Fire_12KM/wind_speed_filtered.parquet'
)

In [115]:
# Row/column count of the loaded table.
# Original note: this weather data is after inner join with vegetation data,
# within 12km of fire event.
weather_data.shape

(134727600, 4)

In [116]:
# Preview the first rows to confirm the available columns.
weather_data.head()

Unnamed: 0,lon,lat,day,wind_speed
0,-124.391667,40.441667,2001-01-01,4.8
1,-124.391667,40.441667,2001-01-02,2.1
2,-124.391667,40.441667,2001-01-03,3.3
3,-124.391667,40.441667,2001-01-04,2.0
4,-124.391667,40.441667,2001-01-05,4.2


In [117]:
# Reduce the table to its grid key: location (lon, lat) and day.
weather_data = weather_data.loc[:, ['lon', 'lat', 'day']]

In [120]:
# Keep only the first occurrence of each (lon, lat, day) row.
weather_data = weather_data[~weather_data.duplicated()]

In [121]:
# Shape after keeping only the key columns and dropping duplicate rows.
weather_data.shape

(134727600, 3)

In [122]:
# Count missing values per column (all zeros means the grid key is complete).
weather_data.isna().sum()

lon    0
lat    0
day    0
dtype: int64

In [123]:
# Load the fire records from ../Clean_Data/fire_data_map_weather_lon_lat.parquet
fire_data = pd.read_parquet(
    '../Clean_Data/fire_data_map_weather_lon_lat.parquet'
)

In [124]:
# Number of fire records and columns in the loaded table.
fire_data.shape

(7626, 22)

In [125]:
# Columns needed for labelling: the fire location and its alarm/containment dates.
columns_to_show = ['lon','lat','ALARM_DATE','CONT_DATE']
fire_data.loc[:, columns_to_show].head()

Unnamed: 0,lon,lat,ALARM_DATE,CONT_DATE
0,-122.891667,41.525,2023-06-17,2023-06-17
1,-121.975,39.816667,2023-06-02,2023-06-02
2,-120.891667,38.275,2023-07-01,2023-07-02
3,-120.975,38.441667,2023-07-11,2023-07-11
4,-121.016667,38.233333,2023-07-11,2023-07-12


In [126]:
# Parse both fire date columns as pandas datetimes.
for date_col in ('ALARM_DATE', 'CONT_DATE'):
    fire_data[date_col] = pd.to_datetime(fire_data[date_col])

In [127]:
# Part 1: fires whose containment date is earlier than their alarm date.
fire_data_part1_date_order_wrong = fire_data.loc[
    fire_data['CONT_DATE'].lt(fire_data['ALARM_DATE'])
]

In [128]:
# Part 2: fires where at least one of the two dates is missing.
has_missing_date = fire_data[['CONT_DATE', 'ALARM_DATE']].isna().any(axis=1)
fire_data_part2_date_missing = fire_data[has_missing_date]

In [129]:
# Part 3: fires with both dates present and in the expected order.
fire_data_part3_normal = fire_data.loc[
    fire_data['CONT_DATE'].ge(fire_data['ALARM_DATE'])
]

In [130]:
# The three date-based subsets must together account for every fire record.
# Enforce this with an assertion so a full re-run stops here if the split is
# ever incomplete, instead of only displaying True/False.
n_rows_in_parts = sum(
    part.shape[0]
    for part in (
        fire_data_part1_date_order_wrong,
        fire_data_part2_date_missing,
        fire_data_part3_normal,
    )
)
assert n_rows_in_parts == fire_data.shape[0], (
    f'date-based split covers {n_rows_in_parts} rows but fire_data has {fire_data.shape[0]}'
)

True

For fires that have CONT_DATE < ALARM_DATE (the two dates are recorded in the wrong order), swap the dates and remove every unit (lon, lat, day) in that span from the final grid.

In [131]:
# Function to expand each row
def expand_dates(row):
    """Turn one fire record into one row per day between its two dates.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'lon', 'lat', 'ALARM_DATE' and 'CONT_DATE'.

    Returns
    -------
    pandas.DataFrame
        Columns 'lon', 'lat' and 'DATE', with one row for every day from
        ALARM_DATE to CONT_DATE (both ends included).
    """
    first_day, last_day = row['ALARM_DATE'], row['CONT_DATE']
    columns = dict(
        lon=row['lon'],
        lat=row['lat'],
        DATE=pd.date_range(first_day, last_day),
    )
    return pd.DataFrame(columns)

In [132]:
# These records have the alarm and containment dates in the wrong order.
# Put the earlier date in ALARM_DATE and the later one in CONT_DATE.
# Using min/max (instead of swapping the two columns on a filtered slice)
# avoids chained assignment on a slice and makes the cell idempotent:
# re-running it does not swap the dates back.
fire_data_part1_date_order_wrong = (
    fire_data_part1_date_order_wrong[columns_to_show]
    .assign(
        ALARM_DATE=fire_data_part1_date_order_wrong[['ALARM_DATE', 'CONT_DATE']].min(axis=1),
        CONT_DATE=fire_data_part1_date_order_wrong[['ALARM_DATE', 'CONT_DATE']].max(axis=1),
    )
)

In [133]:
# Inspect the wrong-order fires after their dates have been put in order.
fire_data_part1_date_order_wrong

Unnamed: 0,lon,lat,ALARM_DATE,CONT_DATE
2642,-120.933333,39.983333,2017-08-29,2017-09-27
3312,-119.308333,34.316667,2015-03-13,2015-12-03
3327,-119.183333,34.233333,2015-08-13,2015-11-08
4195,-118.225,34.566667,2012-03-07,2012-07-03
4350,-119.35,37.4,2012-04-22,2012-09-04
6506,-116.891667,34.15,2005-07-07,2005-11-16
6509,-121.6,40.941667,2005-09-15,2005-09-16
6529,-120.016667,38.4,2005-06-11,2005-09-27
6537,-118.1,34.525,2005-04-14,2005-07-13
6608,-120.766667,39.275,2004-11-21,2004-11-24


In [134]:
# Confirm the column dtypes of the wrong-order subset.
fire_data_part1_date_order_wrong.dtypes

lon                  float64
lat                  float64
ALARM_DATE    datetime64[ns]
CONT_DATE     datetime64[ns]
dtype: object

In [136]:
# Function to expand each row
def expand_dates(row):
    """Expand a single fire record into daily rows.

    `row` must expose 'lon', 'lat', 'ALARM_DATE' and 'CONT_DATE'. The result
    is a DataFrame with columns 'lon', 'lat' and 'DATE' holding one row for
    each day from ALARM_DATE through CONT_DATE inclusive.
    """
    fire_days = pd.date_range(start=row['ALARM_DATE'], end=row['CONT_DATE'])
    daily = {'lon': row['lon'], 'lat': row['lat']}
    daily['DATE'] = fire_days
    return pd.DataFrame(daily)

# Expand every wrong-order fire into daily rows, then stack them into one frame.
per_fire_frames = fire_data_part1_date_order_wrong.apply(expand_dates, axis=1).to_list()
part1_expanded_fire_data = pd.concat(per_fire_frames, ignore_index=True)

In [137]:
# Use the same date column name as weather_data so the two can be joined.
part1_expanded_fire_data = part1_expanded_fire_data.rename({'DATE': 'day'}, axis='columns')

In [138]:
# Number of (lon, lat, day) units covered by the wrong-order fires.
part1_expanded_fire_data.shape

(978, 3)

In [139]:
# Join-key dtypes on the fire side; compare with weather_data.dtypes before merging.
part1_expanded_fire_data.dtypes

lon           float64
lat           float64
day    datetime64[ns]
dtype: object

In [140]:
# Join-key dtypes on the weather side; compare with part1_expanded_fire_data.dtypes.
weather_data.dtypes

lon           float64
lat           float64
day    datetime64[ns]
dtype: object

In [141]:
# Report the row count before the removal
print(f'weather_data has {len(weather_data)} rows before removing part1_expanded_fire_data')
# Anti-join: a left merge with an indicator marks every weather row that also
# appears in part1_expanded_fire_data; only rows found on the left side alone are kept.
weather_data = (
    weather_data
    .merge(part1_expanded_fire_data, on=['lon', 'lat', 'day'], how='left', indicator=True)
    .loc[lambda merged: merged['_merge'] == 'left_only']
    .drop(columns=['_merge'])
)
# Report the row count after the removal
print(f'weather_data has {len(weather_data)} rows after removing part1_expanded_fire_data')

weather_data has 134727600 rows before removing part1_expanded_fire_data
weather_data has 134726622 rows after removing part1_expanded_fire_data


For fires that have either CONT_DATE or ALARM_DATE missing, remove every unit (lon, lat, day) at that location within the fire's year (YEAR_) from the final grid.

In [142]:
# Number of fire records with a missing alarm or containment date.
fire_data_part2_date_missing.shape

(285, 22)

In [143]:
# Columns used for the missing-date fires: location plus fire year.
# NOTE(review): this rebinds columns_to_show, which held the date columns earlier;
# re-running earlier cells after this one will pick up this list instead.
columns_to_show = ['lon','lat','YEAR_']

In [144]:
# Keep only the location and year of each missing-date fire.
fire_data_part2_date_missing = fire_data_part2_date_missing.loc[:, columns_to_show]

In [145]:
# Preview the missing-date fires (location and year only).
fire_data_part2_date_missing.head()

Unnamed: 0,lon,lat,YEAR_
227,-123.516667,41.275,2023
928,-122.016667,38.025,2021
1225,-123.225,41.941667,2020
1230,-123.683333,41.15,2020
1237,-123.641667,41.025,2020


In [146]:
# Count missing values per column in the missing-date subset.
fire_data_part2_date_missing.isna().sum()

lon      0
lat      0
YEAR_    0
dtype: int64

In [147]:
# Row count before de-duplication
print(f'fire_data_part2_date_missing has {len(fire_data_part2_date_missing)} rows before dedup')
# Several fires can share the same (lon, lat, YEAR_); keep each combination once.
fire_data_part2_date_missing = fire_data_part2_date_missing[
    ~fire_data_part2_date_missing.duplicated()
]
# Row count after de-duplication
print(f'fire_data_part2_date_missing has {len(fire_data_part2_date_missing)} rows after dedup')

fire_data_part2_date_missing has 285 rows before dedup
fire_data_part2_date_missing has 269 rows after dedup


In [148]:
# Add a YEAR_ column to weather_data, derived from the calendar year of `day`,
# so it can be matched against the YEAR_ of the missing-date fires.
weather_data['YEAR_'] = weather_data['day'].dt.year

In [149]:
# Preview the grid with the derived YEAR_ column.
weather_data.head()

Unnamed: 0,lon,lat,day,YEAR_
0,-124.391667,40.441667,2001-01-01,2001
1,-124.391667,40.441667,2001-01-02,2001
2,-124.391667,40.441667,2001-01-03,2001
3,-124.391667,40.441667,2001-01-04,2001
4,-124.391667,40.441667,2001-01-05,2001


In [150]:
# Left anti-join of weather_data against fire_data_part2_date_missing on (lon, lat, YEAR_)
# Report the row count before the removal
print(f'weather_data has {len(weather_data)} rows before removing fire_data_part2_date_missing')
# A left merge with an indicator flags every weather row whose location and year
# match a missing-date fire; only rows found on the left side alone are kept.
weather_data = (
    weather_data
    .merge(fire_data_part2_date_missing, on=['lon', 'lat', 'YEAR_'], how='left', indicator=True)
    .loc[lambda merged: merged['_merge'] == 'left_only']
    .drop(columns=['_merge'])
)
# Report the row count after the removal
print(f'weather_data has {len(weather_data)} rows after removing fire_data_part2_date_missing')

weather_data has 134726622 rows before removing fire_data_part2_date_missing
weather_data has 134636764 rows after removing fire_data_part2_date_missing


For normal fire data (ALARM_DATE <= CONT_DATE), for each location and each day:
- alarm date - 4 through alarm date - 1: remove
- alarm date: label 1
- alarm date + 1 through cont date: remove

In [151]:
# Switch back to the location and date columns for the normally-ordered fires.
columns_to_show = ['lon','lat','ALARM_DATE','CONT_DATE']
fire_data_part3_normal = fire_data_part3_normal.loc[:, columns_to_show]
fire_data_part3_normal.head()

Unnamed: 0,lon,lat,ALARM_DATE,CONT_DATE
0,-122.891667,41.525,2023-06-17,2023-06-17
1,-121.975,39.816667,2023-06-02,2023-06-02
2,-120.891667,38.275,2023-07-01,2023-07-02
3,-120.975,38.441667,2023-07-11,2023-07-11
4,-121.016667,38.233333,2023-07-11,2023-07-12


In [152]:
# Count fully duplicated rows (same lon, lat, ALARM_DATE and CONT_DATE).
fire_data_part3_normal.duplicated().sum()

67

In [153]:
# Keep the first occurrence of each (lon, lat, ALARM_DATE, CONT_DATE) record
fire_data_part3_normal = fire_data_part3_normal[~fire_data_part3_normal.duplicated()]
# Row count after de-duplication
print(f'fire_data_part3_normal has {len(fire_data_part3_normal)} rows after dedup')

fire_data_part3_normal has 7264 rows after dedup


In [154]:
# Confirm both date columns are datetimes before doing date arithmetic on them.
fire_data_part3_normal.dtypes

lon                  float64
lat                  float64
ALARM_DATE    datetime64[ns]
CONT_DATE     datetime64[ns]
dtype: object

In [155]:
# Function to expand each row
def expand_dates(row, lead_days=4):
    """Expand one fire record into daily rows and flag the alarm day.

    The expanded window starts `lead_days` days before ALARM_DATE and ends on
    CONT_DATE (both ends included).

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'lon', 'lat', 'ALARM_DATE' and 'CONT_DATE'.
    lead_days : int, default 4
        Number of days before ALARM_DATE to include in the window. The
        default keeps the original hard-coded 4-day lead.

    Returns
    -------
    pandas.DataFrame
        Columns 'lon', 'lat', 'DATE' and 'IS_ALARM_DATE'; IS_ALARM_DATE is
        True only on the row whose DATE equals ALARM_DATE.

    Raises
    ------
    ValueError
        If `lead_days` is negative (the window would start after the alarm
        date and no row could be flagged).
    """
    if lead_days < 0:
        raise ValueError(f'lead_days must be non-negative, got {lead_days}')
    alarm_date = row['ALARM_DATE']
    start_date = alarm_date - pd.Timedelta(days=lead_days)
    end_date = row['CONT_DATE']
    date_range = pd.date_range(start=start_date, end=end_date)
    return pd.DataFrame({
        'lon': row['lon'],
        'lat': row['lat'],
        'DATE': date_range,
        # Element-wise comparison: True only for the alarm day itself
        'IS_ALARM_DATE': date_range == alarm_date
    })

# Expand every normally-ordered fire into daily rows, then stack them into one frame.
per_fire_frames = fire_data_part3_normal.apply(expand_dates, axis=1).to_list()
expanded_fire_data_part3_normal = pd.concat(per_fire_frames, ignore_index=True)

In [156]:
# Look at the first six rows: the full window of the first fire plus the start of the next.
expanded_fire_data_part3_normal.head(6)

Unnamed: 0,lon,lat,DATE,IS_ALARM_DATE
0,-122.891667,41.525,2023-06-13,False
1,-122.891667,41.525,2023-06-14,False
2,-122.891667,41.525,2023-06-15,False
3,-122.891667,41.525,2023-06-16,False
4,-122.891667,41.525,2023-06-17,True
5,-121.975,39.816667,2023-05-29,False


In [157]:
# Re-check the dtypes of the source fire table (unchanged by the expansion above).
fire_data_part3_normal.dtypes

lon                  float64
lat                  float64
ALARM_DATE    datetime64[ns]
CONT_DATE     datetime64[ns]
dtype: object

In [158]:
# Edge case: two fires at the same location whose date windows overlap.
lon_window = fire_data_part3_normal['lon'].between(-119.36, -119.34, inclusive='neither')
lat_window = fire_data_part3_normal['lat'].between(35, 35.1, inclusive='neither')
fire_data_part3_normal[lon_window & lat_window]

Unnamed: 0,lon,lat,ALARM_DATE,CONT_DATE
36,-119.35,35.025,2023-06-05,2023-06-06
240,-119.35,35.025,2023-06-04,2023-06-04


In [159]:
# For this kind of case the same (lon, lat, DATE) appears once per overlapping fire;
# the day is only kept as True if both rows are True.
in_lon_window = expanded_fire_data_part3_normal['lon'].between(-119.36, -119.34)
in_lat_window = expanded_fire_data_part3_normal['lat'].between(35, 35.1, inclusive='neither')
on_overlap_day = expanded_fire_data_part3_normal['DATE'].eq('2023-06-04')
expanded_fire_data_part3_normal[in_lon_window & in_lat_window & on_overlap_day]

Unnamed: 0,lon,lat,DATE,IS_ALARM_DATE
248,-119.35,35.025,2023-06-04,False
2359,-119.35,35.025,2023-06-04,True


In [160]:
# Number of expanded daily rows before collapsing overlapping fires.
expanded_fire_data_part3_normal.shape

(104114, 4)

In [161]:
# Confirm the dtypes of the expanded frame, in particular the boolean flag.
expanded_fire_data_part3_normal.dtypes

lon                     float64
lat                     float64
DATE             datetime64[ns]
IS_ALARM_DATE              bool
dtype: object

In [162]:
# Collapse to one row per (lon, lat, DATE); the flag stays True only when every
# contributing row for that unit is True.
expanded_fire_data_part3_normal = (
    expanded_fire_data_part3_normal
    .groupby(['lon', 'lat', 'DATE'])['IS_ALARM_DATE']
    .all()
    .reset_index()
)

In [163]:
# Re-check the overlapping-fire edge case after aggregation: the unit should now
# appear once, and only be True if both original rows were True.
in_lon_window = expanded_fire_data_part3_normal['lon'].between(-119.36, -119.34)
in_lat_window = expanded_fire_data_part3_normal['lat'].between(35, 35.1, inclusive='neither')
on_overlap_day = expanded_fire_data_part3_normal['DATE'].eq('2023-06-04')
expanded_fire_data_part3_normal[in_lon_window & in_lat_window & on_overlap_day]

Unnamed: 0,lon,lat,DATE,IS_ALARM_DATE
68494,-119.35,35.025,2023-06-04,False


In [164]:
# Count missing values per column in the aggregated fire frame.
expanded_fire_data_part3_normal.isna().sum()

lon              0
lat              0
DATE             0
IS_ALARM_DATE    0
dtype: int64

In [165]:
# Count fully duplicated rows in the aggregated fire frame.
expanded_fire_data_part3_normal.duplicated().sum()

0

In [166]:
# Use the same date column name as weather_data so the two can be joined.
expanded_fire_data_part3_normal = expanded_fire_data_part3_normal.rename(
    {'DATE': 'day'}, axis='columns'
)

In [167]:
# Frequency table of IS_ALARM_DATE: True marks an alarm day, False marks a day
# in the window around a fire that is not kept as an alarm day.
expanded_fire_data_part3_normal['IS_ALARM_DATE'].value_counts()

IS_ALARM_DATE
False    95523
True      7163
Name: count, dtype: int64

In [168]:
# Left join weather_data with expanded_fire_data_part3_normal on (lon, lat, day)
# Report the row count before merging
print(f'weather_data has {weather_data.shape[0]} rows before merging')
# Attach IS_ALARM_DATE to every grid unit; units with no fire record get a
# missing value. validate='many_to_one' makes pandas raise if the fire frame
# ever holds more than one row per (lon, lat, day), which would silently
# duplicate weather rows.
weather_data = weather_data.merge(
    expanded_fire_data_part3_normal,
    on=['lon', 'lat', 'day'],
    how='left',
    validate='many_to_one',
)
# Report the row count after merging (must be unchanged for a left join on unique keys)
print(f'weather_data has {weather_data.shape[0]} rows after merging')

weather_data has 134636764 rows before merging
weather_data has 134636764 rows after merging


In [170]:
# Drop the grid units whose IS_ALARM_DATE is False
# Report the row count before the removal
print(f'weather_data has {len(weather_data)} rows before removing IS_ALARM_DATE is False')
# Keep a unit when its flag is missing (no fire record) or True (alarm day)
weather_data = weather_data.loc[
    lambda frame: frame['IS_ALARM_DATE'].isna() | frame['IS_ALARM_DATE']
]
# Report the row count after the removal
print(f'weather_data has {len(weather_data)} rows after removing IS_ALARM_DATE is False')

weather_data has 134636764 rows before removing IS_ALARM_DATE is False
weather_data has 134548841 rows after removing IS_ALARM_DATE is False


In [171]:
# Frequency table of IS_ALARM_DATE after the filter; value_counts() skips
# missing values by default, so only the True rows are listed.
weather_data['IS_ALARM_DATE'].value_counts()

IS_ALARM_DATE
True    6428
Name: count, dtype: int64

In [172]:
# Turn the flag into an integer label: 1 where IS_ALARM_DATE is True, 0 otherwise
# (missing values, i.e. units with no fire record, become 0).
weather_data['IS_ALARM_DATE'] = weather_data['IS_ALARM_DATE'].eq(True).astype(int)

In [173]:
# The integer label is published under the name IS_FIRE.
weather_data = weather_data.rename({'IS_ALARM_DATE': 'IS_FIRE'}, axis='columns')

In [174]:
# Preview the labelled grid.
weather_data.head()

Unnamed: 0,lon,lat,day,YEAR_,IS_FIRE
0,-124.391667,40.441667,2001-01-01,2001,0
1,-124.391667,40.441667,2001-01-02,2001,0
2,-124.391667,40.441667,2001-01-03,2001,0
3,-124.391667,40.441667,2001-01-04,2001,0
4,-124.391667,40.441667,2001-01-05,2001,0


In [175]:
# YEAR_ was only needed for the missing-date anti-join; drop it before saving.
weather_data = weather_data.drop('YEAR_', axis='columns')

In [176]:
# Write the labelled grid (lon, lat, day, IS_FIRE) to parquet.
# NOTE(review): rows were filtered out after the last merge, so the index is no
# longer a contiguous range; with default settings to_parquet will store it in
# the file. Confirm whether downstream readers want it or index=False.
weather_data.to_parquet('../Clean_Data/fire_label_data.parquet')