In [20]:
import pandas as pd
import geopandas as gpd
import xarray as xr
import pyproj
from tqdm import tqdm
import numpy as np

In [21]:
import os
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature

In [22]:
# Report the interpreter version and the versions of the core data packages
def check_python_version():
    """Print the Python version, then the pandas, geopandas, xarray and pyproj versions.

    Each entry is emitted as a label line followed by the version string.
    """
    import sys

    versions = (
        ("Python version", sys.version),
        ("Pandas version", pd.__version__),
        ("Geopandas version", gpd.__version__),
        ("Xarray version", xr.__version__),
        ("Pyproj version", pyproj.__version__),
    )
    for label, version in versions:
        print(label)
        print(version)

check_python_version()

Python version
3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]
Pandas version
2.2.2
Geopandas version
0.14.2
Xarray version
2023.6.0
Pyproj version
3.6.1


In [23]:
# Disable column-width truncation so long cell values are displayed in full
pd.set_option('display.max_colwidth', None)

In [24]:
data_dir = '../Weather_Data/'
# os.listdir returns entries in arbitrary order, so sort before taking the
# last one; otherwise the reference file could differ between runs/machines.
files = sorted(os.listdir(data_dir))
if not files:
    raise FileNotFoundError(f'No files found in {data_dir}')
reference_ds = xr.open_dataset(os.path.join(data_dir, files[-1]))

# Extract lon and lat coordinates
lon = reference_ds.coords['lon'].values
lat = reference_ds.coords['lat'].values

# Create a DataFrame with all combinations of lon and lat
# (lon is the outer loop, so each lon value is paired with every lat in turn)
lon_lat_combinations = pd.DataFrame([(lo, la) for lo in lon for la in lat], columns=['lon', 'lat'])

# Number of grid points (rows) and the two coordinate columns
lon_lat_combinations.shape

(62160, 2)

In [25]:
# Open the daily-interpolated LAI NetCDF file
# (file name suggests GLASS LAI, 2001-2021, California — confirm against the data source)
lai_Dat = xr.open_dataset('../New_Data/glass.lai.2001.2021.CA.daily_interpolated.nc')

In [26]:
# Display the dataset summary (dimensions, coordinates, data variables)
lai_Dat

In [27]:
# Pull the longitude / latitude coordinate arrays out of the LAI dataset
lon = lai_Dat['lon'].values
lat = lai_Dat['lat'].values

# Pair every longitude with every latitude (longitude varies slowest)
lai_Dat_lon_lat = pd.DataFrame(
    [(x, y) for x in lon for y in lat],
    columns=['lon', 'lat'],
)

lai_Dat_lon_lat.shape

(62160, 2)

In [28]:
# Inner-join the weather grid with the LAI grid; only points present in both survive
test_merge = pd.merge(
    lon_lat_combinations,
    lai_Dat_lon_lat,
    how='inner',
    on=['lon', 'lat'],
)

In [29]:
# If the two grids are identical, the row count equals that of each input grid
test_merge.shape

(62160, 2)

Confirmed that the LAI grid matches the existing weather data grid: the inner join keeps all 62,160 (lon, lat) points.

In [30]:
# Flatten the dataset into a long table with the former index levels as ordinary columns
# NOTE(review): this rebinds lai_Dat from an xarray Dataset to a DataFrame, so the cell
# cannot be re-run (and earlier cells using lai_Dat.coords stop working) without
# re-opening the NetCDF file first.
lai_Dat = lai_Dat.to_dataframe().reset_index()

In [31]:
# Convert the numeric YYYYMMDD `time` column to datetime64.
# Guarded so that re-running the cell on an already-converted column is a no-op
# instead of an error.
if lai_Dat['time'].dtype.kind != 'M':
    day_codes = lai_Dat['time'].astype(int)
    # Parse each distinct day code once and map it back, rather than
    # string-formatting and parsing every row of the long table.
    unique_codes = pd.Series(day_codes.unique())
    parsed_days = pd.to_datetime(unique_codes.astype(str), format='%Y%m%d')
    lai_Dat['time'] = day_codes.map(dict(zip(unique_codes, parsed_days)))

In [32]:
# Preview the first rows of the flattened LAI table
lai_Dat.head()

Unnamed: 0,time,lon,lat,LAI
0,2001-01-01,-124.766667,41.983333,
1,2001-01-01,-124.766667,41.941667,
2,2001-01-01,-124.766667,41.9,
3,2001-01-01,-124.766667,41.858333,
4,2001-01-01,-124.766667,41.816667,


In [33]:
# check unique values of time (distinct dates present after the conversion)
lai_Dat['time'].unique()

<DatetimeArray>
['2001-01-01 00:00:00', '2001-01-02 00:00:00', '2001-01-03 00:00:00',
 '2001-01-04 00:00:00', '2001-01-05 00:00:00', '2001-01-06 00:00:00',
 '2001-01-07 00:00:00', '2001-01-08 00:00:00', '2001-01-09 00:00:00',
 '2001-01-10 00:00:00',
 ...
 '2021-12-18 00:00:00', '2021-12-19 00:00:00', '2021-12-20 00:00:00',
 '2021-12-21 00:00:00', '2021-12-22 00:00:00', '2021-12-23 00:00:00',
 '2021-12-24 00:00:00', '2021-12-25 00:00:00', '2021-12-26 00:00:00',
 '2021-12-27 00:00:00']
Length: 7666, dtype: datetime64[ns]

In [37]:
# number of calendar days from 2001-01-01 through 2021-12-27, both ends included
len(pd.date_range('2001-01-01', '2021-12-27'))

7666

In [38]:
# Fraction of rows with a missing LAI value over the full grid
lai_Dat['LAI'].isna().sum() / lai_Dat.shape[0]

0.6730774452940511

## Filter with Fire 12KM (FOD_FPA)

In [39]:
# Only the lon/lat columns of this table are used afterwards (to build the set of
# retained grid points), so read just those two columns instead of the whole file.
reference_ds = pd.read_parquet(
    '../Clean_Data/Weather_Data_w_Veg_Filter_FOD_FPA_Fire_12KM/dead_fuel_moisture_1000hr_filtered.parquet',
    columns=['lon', 'lat'],
)

In [40]:
# keep one row per distinct (lon, lat) pair found in the reference table
reference_ds_lon_lat = reference_ds.loc[:, ['lon', 'lat']].drop_duplicates()

In [41]:
# Number of distinct grid points in the reference table
reference_ds_lon_lat.shape

(17530, 2)

In [43]:
# Size of the full LAI table before filtering to the reference points
lai_Dat.shape

(476518560, 4)

In [44]:
# Keep only the LAI rows whose (lon, lat) appears among the reference grid points
lai_Dat_filtered = reference_ds_lon_lat.merge(
    lai_Dat,
    how='inner',
    on=['lon', 'lat'],
)

In [45]:
# Expected row count if every reference point is found in the LAI grid:
# (number of reference points) x (number of distinct days)
lai_Dat_filtered.shape

(134384980, 4)

In [46]:
# Fraction of missing LAI values after restricting to the reference grid points
lai_Dat_filtered['LAI'].isna().sum() / lai_Dat_filtered.shape[0]

0.03394447057997107

In [47]:
# write the filtered LAI table to a parquet file
# (plain string literal: the path has no placeholders, so no f-prefix is needed)
output_file = '../Clean_Data/Weather_Data_w_Veg_Filter_FOD_FPA_Fire_12KM/LAI_200101_202112_filtered.parquet'
lai_Dat_filtered.to_parquet(output_file)

## Merge with the final cleaned weather data

In [48]:
# Load the merged weather feature table that LAI will be joined onto
all_features = pd.read_parquet('../Clean_Data/Weather_Data_w_Veg_Filter_FOD_FPA_Fire_12KM_Merged_Cleaned_2001_2020.parquet')

In [49]:
# Reload the filtered LAI table from disk so this section can start from the saved file
lai_Dat_filtered = pd.read_parquet('../Clean_Data/Weather_Data_w_Veg_Filter_FOD_FPA_Fire_12KM/LAI_200101_202112_filtered.parquet')

In [50]:
# Compare table sizes before merging (feature table vs. filtered LAI table)
all_features.shape, lai_Dat_filtered.shape

((128056650, 20), (134384980, 4))

In [51]:
# Inspect column dtypes of the LAI table ahead of the join on lon / lat / date
lai_Dat_filtered.dtypes

lon            float64
lat            float64
time    datetime64[ns]
LAI            float32
dtype: object

In [52]:
# Relabel the `time` column as `day`; every other column label is left as is
lai_Dat_filtered.columns = [
    'day' if label == 'time' else label
    for label in lai_Dat_filtered.columns
]

In [54]:
# merge all_features with the filtered LAI data on lon, lat, day
# The merged table is later written back over the same parquet file that
# all_features was read from, so on a re-run LAI may already be present.
# Merging again would produce LAI_x / LAI_y columns, hence the guard.
if 'LAI' not in all_features.columns:
    all_features = pd.merge(all_features, lai_Dat_filtered, on=['lon', 'lat', 'day'], how='left')

In [55]:
# A left join on unique right-hand keys should keep the row count and add one column (LAI)
all_features.shape

(128056650, 21)

In [56]:
# check missing rate of LAI in the merged feature table
all_features['LAI'].isna().sum() / all_features.shape[0]

0.034028775545822884

In [57]:
# write it back to parquet file
# This overwrites the file all_features was loaded from. Write to a temporary
# sibling first and swap it in afterwards, so an interrupted write cannot leave
# the only copy of the merged table truncated.
output_file = '../Clean_Data/Weather_Data_w_Veg_Filter_FOD_FPA_Fire_12KM_Merged_Cleaned_2001_2020.parquet'
tmp_output_file = output_file + '.tmp'
all_features.to_parquet(tmp_output_file)
os.replace(tmp_output_file, output_file)