In [1]:
import pandas as pd
import geopandas as gpd
import xarray as xr
import pyproj
from tqdm import tqdm
import numpy as np

In [2]:
import os
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature

In [4]:
# Report the interpreter version and the versions of the core packages.
def check_python_version():
    """Print the Python version followed by the pandas, geopandas, xarray and pyproj versions.

    Each label is printed on its own line, with the matching version string
    on the line after it.
    """
    import sys

    print("Python version")
    print(sys.version)

    # (label, module) pairs in the order they are reported.
    reported_modules = (
        ("Pandas version", pd),
        ("Geopandas version", gpd),
        ("Xarray version", xr),
        ("Pyproj version", pyproj),
    )
    for label, module in reported_modules:
        print(label)
        print(module.__version__)

# Print the versions so the environment is recorded next to the notebook's outputs.
check_python_version()

Python version
3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]
Pandas version
2.2.2
Geopandas version
0.14.2
Xarray version
2023.6.0
Pyproj version
3.6.1


In [5]:
# Disable column-width truncation so long cell values are displayed in full.
pd.set_option('display.max_colwidth', None)

In [6]:
data_dir = '../Weather_Data/'
# os.listdir() returns entries in arbitrary order, so sort the names to make
# the choice of reference file (the last one alphabetically) reproducible.
# NOTE(review): assumes every entry in data_dir is a dataset on the same grid — confirm.
files = sorted(os.listdir(data_dir))
reference_ds = xr.open_dataset(os.path.join(data_dir, files[-1]))

# Extract lon and lat coordinates
lon = reference_ds.coords['lon'].values
lat = reference_ds.coords['lat'].values

# Create a DataFrame with all combinations of lon and lat (lon varies slowest)
lon_lat_combinations = pd.DataFrame([(lo, la) for lo in lon for la in lat], columns=['lon', 'lat'])

# Number of grid points and columns in the reference grid
lon_lat_combinations.shape

(62160, 2)

In [7]:
# Open the PDSI netCDF file (the file name suggests daily interpolated values for 2001-2023).
pdsi_Dat = xr.open_dataset('../New_Data/pdsi.2001.2023.CA.daily_interpolated.nc')

In [8]:
# Inspect the opened dataset.
pdsi_Dat

In [9]:
# Pull the coordinate axes of the PDSI dataset out as arrays
lon = pdsi_Dat.coords['lon'].values
lat = pdsi_Dat.coords['lat'].values

# Enumerate every (lon, lat) grid point, with lon as the outer loop
grid_points = [(x, y) for x in lon for y in lat]
pdsi_Dat_lon_lat = pd.DataFrame(grid_points, columns=['lon', 'lat'])

# Number of grid points and columns in the PDSI grid
pdsi_Dat_lon_lat.shape

(62160, 2)

In [10]:
# Inner-join the two grids; only (lon, lat) pairs present in both survive the merge.
test_merge = lon_lat_combinations.merge(pdsi_Dat_lon_lat, on=['lon', 'lat'], how='inner')

In [11]:
# If the row count equals that of both input grids, every grid point matched.
test_merge.shape

(62160, 2)

Confirmed that the PDSI grid matches the grid of the existing weather data.

In [12]:
# Flatten the dataset into a long DataFrame with the coordinates as ordinary columns.
# The name `pdsi_Dat` is rebound from an xarray Dataset to a DataFrame here, so
# only convert while it is still a Dataset; this keeps the cell safe to re-run.
if isinstance(pdsi_Dat, xr.Dataset):
    pdsi_Dat = pdsi_Dat.to_dataframe().reset_index()

In [13]:
# Check the column dtypes of the flattened frame.
pdsi_Dat.dtypes

day         datetime64[ns]
lon                float64
lat                float64
pdsi               float32
category           float32
dtype: object

In [14]:
# Fraction of rows with a missing pdsi value (mean of the boolean NaN mask)
pdsi_Dat['pdsi'].isna().mean()

0.01151866151866152

In [15]:
# Fraction of rows with a missing category value (mean of the boolean NaN mask)
pdsi_Dat['category'].isna().mean()

0.6141327315836245

In [16]:
# Drop the category column.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a
# second execution would otherwise raise a KeyError.
pdsi_Dat = pdsi_Dat.drop(columns=['category'], errors='ignore')

## Filter with Fire 12KM (FOD_FPA)

In [17]:
# Only the grid coordinates are used from this file, so read just those two
# columns instead of loading every column of the parquet file into memory.
reference_ds = pd.read_parquet(
    '../Clean_Data/Weather_Data_w_Veg_Filter_FOD_FPA_Fire_12KM/dead_fuel_moisture_1000hr_filtered.parquet',
    columns=['lon', 'lat'],
)
# Keep one row per distinct (lon, lat) grid point
reference_ds_lon_lat = reference_ds[['lon', 'lat']].drop_duplicates()

In [18]:
# Number of distinct (lon, lat) grid points in the reference data.
reference_ds_lon_lat.shape

(17530, 2)

In [19]:
# Size of the full PDSI frame before filtering to the reference grid points.
pdsi_Dat.shape

(522144000, 4)

In [20]:
# Keep only the PDSI rows whose (lon, lat) appears among the reference grid points.
pdsi_Dat_filtered = reference_ds_lon_lat.merge(pdsi_Dat, how='inner', on=['lon', 'lat'])

In [21]:
# Size of the PDSI frame after filtering to the reference grid points.
pdsi_Dat_filtered.shape

(147252000, 4)

In [22]:
# Fraction of filtered rows with a missing pdsi value (mean of the boolean NaN mask)
pdsi_Dat_filtered['pdsi'].isna().mean()

0.0025099828864803193

In [23]:
# Write the filtered PDSI frame to a parquet file.
# NOTE(review): the file name says 200101_202112, but the source netCDF is named
# pdsi.2001.2023 — confirm which date range this file actually holds. The path is
# left unchanged because a later cell reads it back by this exact name.
output_file = '../Clean_Data/Weather_Data_w_Veg_Filter_FOD_FPA_Fire_12KM/pdsi_200101_202112_filtered.parquet'
pdsi_Dat_filtered.to_parquet(output_file)

## Merge with the final cleaned weather data

In [30]:
# Load the feature table (the file name suggests merged, cleaned weather data for 2001-2020).
all_features = pd.read_parquet('../Clean_Data/Weather_Data_w_Veg_Filter_FOD_FPA_Fire_12KM_Merged_Cleaned_2001_2020.parquet')

In [25]:
# Reload the filtered PDSI data from the parquet file written earlier in this notebook.
pdsi_Dat_filtered = pd.read_parquet('../Clean_Data/Weather_Data_w_Veg_Filter_FOD_FPA_Fire_12KM/pdsi_200101_202112_filtered.parquet')

In [26]:
# Compare the shapes of the two frames before merging them.
all_features.shape, pdsi_Dat_filtered.shape

((128056650, 21), (147252000, 4))

In [31]:
# Inspect the feature table's columns and dtypes.
all_features.dtypes

lon                                                 float64
lat                                                 float64
day                                          datetime64[ns]
dead_fuel_moisture_1000hr                           float32
dead_fuel_moisture_100hr                            float32
pdsi                                                float32
pdsi_class                                          float32
max_air_temperature                                 float64
max_relative_humidity                               float32
max_wind_speed                                      float32
min_air_temperature                                 float64
min_relative_humidity                               float32
precipitation_amount                                float32
specific_humidity                                   float32
surface_downwelling_shortwave_flux_in_air           float32
wind_from_direction                                 float32
wind_speed                              

In [32]:
# Rename the existing 'pdsi' column to 'pdsi_pre_interpolated' so that it does
# not share a name with the 'pdsi' column of the interpolated PDSI frame.
all_features.rename(columns={'pdsi': 'pdsi_pre_interpolated'}, inplace=True)

In [33]:
# Confirm the column names after the rename.
all_features.dtypes

lon                                                 float64
lat                                                 float64
day                                          datetime64[ns]
dead_fuel_moisture_1000hr                           float32
dead_fuel_moisture_100hr                            float32
pdsi_pre_interpolated                               float32
pdsi_class                                          float32
max_air_temperature                                 float64
max_relative_humidity                               float32
max_wind_speed                                      float32
min_air_temperature                                 float64
min_relative_humidity                               float32
precipitation_amount                                float32
specific_humidity                                   float32
surface_downwelling_shortwave_flux_in_air           float32
wind_from_direction                                 float32
wind_speed                              

In [27]:
# Columns and dtypes of the filtered PDSI frame.
pdsi_Dat_filtered.dtypes

lon            float64
lat            float64
day     datetime64[ns]
pdsi           float32
dtype: object

In [34]:
# Left-merge the interpolated PDSI values into all_features on lon, lat, day.
# A left merge silently duplicates rows when the right-hand keys are not unique,
# so check that the row count is unchanged afterwards.
n_rows_before_merge = len(all_features)
all_features = pd.merge(all_features, pdsi_Dat_filtered, on=['lon', 'lat', 'day'], how='left')
assert len(all_features) == n_rows_before_merge, (
    "merge changed the row count: (lon, lat, day) is not unique in pdsi_Dat_filtered"
)

In [35]:
# Check the shape of the feature table after the merge.
all_features.shape

(128056650, 22)

In [37]:
# Inspect the columns and dtypes after the merge.
all_features.dtypes

lon                                                 float64
lat                                                 float64
day                                          datetime64[ns]
dead_fuel_moisture_1000hr                           float32
dead_fuel_moisture_100hr                            float32
pdsi_pre_interpolated                               float32
pdsi_class                                          float32
max_air_temperature                                 float64
max_relative_humidity                               float32
max_wind_speed                                      float32
min_air_temperature                                 float64
min_relative_humidity                               float32
precipitation_amount                                float32
specific_humidity                                   float32
surface_downwelling_shortwave_flux_in_air           float32
wind_from_direction                                 float32
wind_speed                              

In [36]:
# Fraction of merged rows with a missing pdsi value (mean of the boolean NaN mask)
all_features['pdsi'].isna().mean()

0.0025099828864803193

In [38]:
# Write the merged feature table back to the parquet file.
# NOTE(review): this is the same path all_features was loaded from, so the input
# file is overwritten in place. Running the notebook again would start from a
# file that already holds both 'pdsi_pre_interpolated' and 'pdsi' — consider
# writing to a new file name instead.
output_file = '../Clean_Data/Weather_Data_w_Veg_Filter_FOD_FPA_Fire_12KM_Merged_Cleaned_2001_2020.parquet'
all_features.to_parquet(output_file)