This script is to map the snow data w weather data

In [1]:
import pandas as pd
import geopandas as gpd
import xarray as xr
import pyproj
from tqdm import tqdm
import numpy as np

In [2]:
import os
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature

In [3]:
# check python version and all packages version
def check_python_version():
    import sys
    print("Python version")
    print (sys.version)
    print("Pandas version")
    print(pd.__version__)
    print("Geopandas version")
    print(gpd.__version__)
    print("Xarray version")
    print(xr.__version__)
    print("Pyproj version")
    print(pyproj.__version__)

check_python_version()

Python version
3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]
Pandas version
2.2.2
Geopandas version
0.14.2
Xarray version
2023.6.0
Pyproj version
3.6.1


In [4]:
pd.set_option('display.max_colwidth', None)

In [5]:
files = os.listdir('../Weather_Data/')
files

['dead_fuel_moisture_1000hr.2001.2023.CA.nc',
 'dead_fuel_moisture_100hr.2001.2023.CA.nc',
 'expanded_Palmer_Drought_Severity_Index.2001.2023.CA.nc',
 'max_air_temperature.2001.2023.CA-008.nc',
 'max_relative_humidity.2001.2023.CA.nc',
 'max_wind_speed.2001.2023.CA.nc',
 'min_air_temperature.2001.2023.CA-009.nc',
 'min_relative_humidity.2001.2023.CA.nc',
 'Palmer_Drought_Severity_Index.2001.2023.CA.nc',
 'precipitation_amount.2001.2023.CA.nc',
 'specific_humidity.2001.2023.CA.nc',
 'surface_downwelling_shortwave_flux.2001.2023.CA.nc',
 'wind_from_direction.2001.2023.CA.nc',
 'wind_speed.2001.2023.CA.nc']

In [None]:
data_dir = '../Weather_Data/'
# Load the first dataset to use as a reference
reference_ds = xr.open_dataset(os.path.join(data_dir, files[-1]))

In [7]:
print(f"Dimensions: {reference_ds.dims}")
print(f"Coordinates: {reference_ds.coords}")
print(f"Data variables: {reference_ds.data_vars}")

Dimensions: Frozen({'day': 8400, 'lon': 259, 'lat': 240})
Coordinates: Coordinates:
  * day      (day) datetime64[ns] 2001-01-01 2001-01-02 ... 2023-12-31
  * lon      (lon) float64 -124.8 -124.7 -124.7 -124.6 ... -114.1 -114.1 -114.0
  * lat      (lat) float64 41.98 41.94 41.9 41.86 ... 32.15 32.11 32.07 32.03
Data variables: Data variables:
    wind_speed  (day, lat, lon) float32 ...


In [8]:
# Extract lon and lat coordinates
lon = reference_ds.coords['lon'].values
lat = reference_ds.coords['lat'].values

# Create a DataFrame with all combinations of lon and lat
lon_lat_combinations = pd.DataFrame([(lo, la) for lo in lon for la in lat], columns=['lon', 'lat'])

In [9]:
lon_lat_combinations.shape

(62160, 2)

In [11]:
SNOW_Dat = xr.open_dataset('../New_Data/SWE.200101.202309.CA.nc')

In [12]:
SNOW_Dat

In [13]:
SNOW_Dat = SNOW_Dat.to_dataframe().reset_index()

In [14]:
SNOW_Dat.head()

Unnamed: 0,time,lon,lat,SWE
0,2001-01-01,-124.766667,41.983333,0.0
1,2001-01-01,-124.766667,41.941667,0.0
2,2001-01-01,-124.766667,41.9,0.0
3,2001-01-01,-124.766667,41.858333,0.0
4,2001-01-01,-124.766667,41.816667,0.0


In [15]:
SNOW_Dat['SWE'].isna().sum() / SNOW_Dat.shape[0]

0.026801801801801802

In [16]:
SNOW_Dat_lon_lat = SNOW_Dat[['lon', 'lat']].drop_duplicates()
SNOW_Dat_lon_lat.shape

(62160, 2)

In [17]:
lon_lat_combinations.shape

(62160, 2)

In [19]:
# merge the slope data with the lon_lat_combinations
test_merge = lon_lat_combinations.merge(SNOW_Dat_lon_lat, on=['lon', 'lat'], how='inner')

In [20]:
test_merge.shape

(62160, 2)

follow the similar code in old code 02 03 Data Clean

In [22]:
SNOW_Dat.shape

(516425280, 4)

In [23]:
# save data to parquet file
SNOW_Dat.to_parquet('../Clean_Data/Weather_Data/SWE_200101_202309.parquet')

## Filter w Veg

In [4]:
# read SNOW data
SNOW_Dat = pd.read_parquet('../Clean_Data/Weather_Data/SWE_200101_202309.parquet')

In [26]:
SNOW_Dat.shape

(516425280, 4)

In [6]:
SNOW_Dat.dtypes

time    datetime64[ns]
lon            float64
lat            float64
SWE            float32
dtype: object

In [7]:
SNOW_Dat['time'].unique()

<DatetimeArray>
['2001-01-01 00:00:00', '2001-01-02 00:00:00', '2001-01-03 00:00:00',
 '2001-01-04 00:00:00', '2001-01-05 00:00:00', '2001-01-06 00:00:00',
 '2001-01-07 00:00:00', '2001-01-08 00:00:00', '2001-01-09 00:00:00',
 '2001-01-10 00:00:00',
 ...
 '2023-09-21 00:00:00', '2023-09-22 00:00:00', '2023-09-23 00:00:00',
 '2023-09-24 00:00:00', '2023-09-25 00:00:00', '2023-09-26 00:00:00',
 '2023-09-27 00:00:00', '2023-09-28 00:00:00', '2023-09-29 00:00:00',
 '2023-09-30 00:00:00']
Length: 8308, dtype: datetime64[ns]

In [24]:
lon_lat_pair_weather_match_veg = pd.read_parquet('../Clean_Data/lon_lat_pair_weather_match_veg.parquet')

In [27]:
SNOW_Dat_filtered = pd.merge(lon_lat_pair_weather_match_veg[['lon', 'lat']], SNOW_Dat, on=['lon', 'lat'], how='inner')
output_file = f'../Clean_Data/Weather_Data_w_Veg_Filter/SWE_200101_202309_filtered.parquet'
SNOW_Dat_filtered.to_parquet(output_file)

In [28]:
SNOW_Dat_filtered.shape

(147076524, 4)

## Filter w Fire 12KM (FOD_FPA)

In [30]:
# check files in Weather_Data_w_Veg_Filter_FOD_FPA_Fire_12KM
files = os.listdir('../Clean_Data/Weather_Data_w_Veg_Filter_FOD_FPA_Fire_12KM/')
files

['dead_fuel_moisture_1000hr_filtered.parquet',
 'dead_fuel_moisture_100hr_filtered.parquet',
 'expanded_Palmer_Drought_Severity_Index_filtered.parquet',
 'max_air_temperature_filtered.parquet',
 'max_relative_humidity_filtered.parquet',
 'max_wind_speed_filtered.parquet',
 'min_air_temperature_filtered.parquet',
 'min_relative_humidity_filtered.parquet',
 'precipitation_amount_filtered.parquet',
 'specific_humidity_filtered.parquet',
 'surface_downwelling_shortwave_flux_filtered.parquet',
 'wind_from_direction_filtered.parquet',
 'wind_speed_filtered.parquet']

In [31]:
# read first one
reference_ds = pd.read_parquet('../Clean_Data/Weather_Data_w_Veg_Filter_FOD_FPA_Fire_12KM/dead_fuel_moisture_1000hr_filtered.parquet')

In [33]:
# extract lon and lat and dedup
reference_ds_lon_lat = reference_ds[['lon', 'lat']].drop_duplicates()

In [34]:
reference_ds_lon_lat.shape

(17530, 2)

In [35]:
# read ../Clean_Data/Weather_Data_w_Veg_Filter/SWE_200101_202309_filtered.parquet
SNOW_Dat = pd.read_parquet('../Clean_Data/Weather_Data_w_Veg_Filter/SWE_200101_202309_filtered.parquet')
# inner join with reference_ds_lon_lat on lon and lat
SNOW_Dat_filtered = pd.merge(reference_ds_lon_lat, SNOW_Dat, on=['lon', 'lat'], how='inner')

In [36]:
SNOW_Dat_filtered.shape

(145639240, 4)

In [39]:
145639240/147076524

0.9902276450319155

In [38]:
SNOW_Dat_filtered['SWE'].isna().sum() / SNOW_Dat_filtered.shape[0]

0.0419281232173417

In [40]:
# write to parquet file
output_file = f'../Clean_Data/Weather_Data_w_Veg_Filter_FOD_FPA_Fire_12KM/SWE_200101_202309_filtered.parquet'
SNOW_Dat_filtered.to_parquet(output_file)

## MERGE w the FINAL WEATHER CLEANED DATA

In [41]:
all_features = pd.read_parquet('../Clean_Data/Weather_Data_w_Veg_Filter_FOD_FPA_Fire_12KM_Merged_Cleaned_2001_2020.parquet')

In [42]:
SNOW_dat = pd.read_parquet('../Clean_Data/Weather_Data_w_Veg_Filter_FOD_FPA_Fire_12KM/SWE_200101_202309_filtered.parquet')

In [43]:
all_features.shape, SNOW_dat.shape

((128056650, 18), (145639240, 4))

In [46]:
# in SNOW_dat, rename time to day
SNOW_dat.rename(columns={'time': 'day'}, inplace=True)

In [50]:
SNOW_dat.dtypes

lon           float64
lat           float64
day    datetime64[ns]
SWE           float32
dtype: object

In [47]:
# merge all_features with SNOW_dat on lon, lat, day
all_features = pd.merge(all_features, SNOW_dat, on=['lon', 'lat', 'day'], how='left')

In [48]:
all_features.shape

(128056650, 19)

In [51]:
# check missing rate of SWE
all_features['SWE'].isna().sum() / all_features.shape[0]

0.0419281232173417

In [52]:
# fill missing value with 0 for SWE
all_features['SWE'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  all_features['SWE'].fillna(0, inplace=True)


In [53]:
all_features['SWE'].isna().sum() / all_features.shape[0]

0.0

In [54]:
# write it back to parquet file
output_file = f'../Clean_Data/Weather_Data_w_Veg_Filter_FOD_FPA_Fire_12KM_Merged_Cleaned_2001_2020.parquet'
all_features.to_parquet(output_file)