The Palmer_Drought_Severity_Index dataset only contains a value every 5 days. This notebook fills the gaps so that every calendar day has a value.

In [1]:
import pandas as pd
import geopandas as gpd
import xarray as xr
import pyproj
from tqdm import tqdm
import numpy as np

In [2]:
import os
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature

In [3]:
# Report the interpreter version and the versions of the core packages in use.
def check_python_version():
    """Print the Python version followed by each core package's version.

    Each entry is written as two lines: a "<Name> version" heading and then
    the version string itself.
    """
    import sys

    reported_versions = (
        ("Python", sys.version),
        ("Pandas", pd.__version__),
        ("Geopandas", gpd.__version__),
        ("Xarray", xr.__version__),
        ("Pyproj", pyproj.__version__),
    )
    for label, version in reported_versions:
        print(f"{label} version")
        print(version)

check_python_version()

Python version
3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]
Pandas version
2.2.2
Geopandas version
0.14.2
Xarray version
2023.6.0
Pyproj version
3.6.1


In [4]:
# Never truncate long cell contents when pandas renders a column.
pd.options.display.max_colwidth = None

In [5]:
# os.listdir() returns entries in arbitrary, platform-dependent order.
# Sort case-insensitively so the listing is deterministic on every system.
files = sorted(os.listdir('../Weather_Data/'), key=str.casefold)
files

['dead_fuel_moisture_1000hr.2001.2023.CA.nc',
 'dead_fuel_moisture_100hr.2001.2023.CA.nc',
 'expanded_Palmer_Drought_Severity_Index.2001.2023.CA.nc',
 'max_air_temperature.2001.2023.CA-008.nc',
 'max_relative_humidity.2001.2023.CA.nc',
 'max_wind_speed.2001.2023.CA.nc',
 'min_air_temperature.2001.2023.CA-009.nc',
 'min_relative_humidity.2001.2023.CA.nc',
 'Palmer_Drought_Severity_Index.2001.2023.CA.nc',
 'precipitation_amount.2001.2023.CA.nc',
 'specific_humidity.2001.2023.CA.nc',
 'surface_downwelling_shortwave_flux.2001.2023.CA.nc',
 'wind_from_direction.2001.2023.CA.nc',
 'wind_speed.2001.2023.CA.nc']

In [8]:
data_dir = '../Weather_Data/'
# Open the PDSI file by its explicit name. A positional index into the
# directory listing changes whenever a file is added to the folder
# (for example, the expanded output this notebook writes).
pdsi_file = 'Palmer_Drought_Severity_Index.2001.2023.CA.nc'
pdsi_dat = xr.open_dataset(os.path.join(data_dir, pdsi_file))

In [9]:
# Show the loaded dataset's summary as the cell output.
pdsi_dat

In [7]:
# Label the summary with the file the dataset was actually opened from
# (xarray records that path in `encoding['source']`) instead of a positional
# index into the directory listing, which can point at a different file.
source_name = os.path.basename(pdsi_dat.encoding.get('source', 'unknown'))
print(f"File: {source_name}")
# `.sizes` is the mapping from dimension name to length.
print(f"Dimensions: {pdsi_dat.sizes}")
print(f"Coordinates: {pdsi_dat.coords}")
print(f"Data variables: {pdsi_dat.data_vars}")

File: Palmer_Drought_Severity_Index.2001.2023.CA.nc
Dimensions: Frozen({'day': 1679, 'lon': 259, 'lat': 240})
Coordinates: Coordinates:
  * day      (day) datetime64[ns] 2001-01-05 2001-01-10 ... 2023-12-31
  * lon      (lon) float64 -124.8 -124.7 -124.7 -124.6 ... -114.1 -114.1 -114.0
  * lat      (lat) float64 41.98 41.94 41.9 41.86 ... 32.15 32.11 32.07 32.03
Data variables: Data variables:
    pdsi      (day, lat, lon) float32 ...
    category  (day, lat, lon) float32 ...


In [37]:
# Daily calendar spanning the whole study period, 2001-01-01 through 2023-12-31.
new_date_range = pd.date_range('2001-01-01', '2023-12-31', freq='D')

# Place the data on the daily calendar (days without an observation become NaN),
# then back-fill along 'day' so each gap takes the next available value.
expanded_ds = pdsi_dat.reindex(day=new_date_range).bfill(dim='day')

# Report the shape and coordinates of the expanded dataset.
print(f"New Dimensions: {expanded_ds.dims}")
print(f"New Coordinates: {expanded_ds.coords}")

# Confirm the first and last day on the new time axis.
min_day, max_day = (
    expanded_ds['day'].min().values,
    expanded_ds['day'].max().values,
)

print(f"Min day: {min_day}")
print(f"Max day: {max_day}")

New Dimensions: Frozen({'day': 8400, 'lon': 259, 'lat': 240})
New Coordinates: Coordinates:
  * day      (day) datetime64[ns] 2001-01-01 2001-01-02 ... 2023-12-31
  * lon      (lon) float64 -124.8 -124.7 -124.7 -124.6 ... -114.1 -114.1 -114.0
  * lat      (lat) float64 41.98 41.94 41.9 41.86 ... 32.15 32.11 32.07 32.03
Min day: 2001-01-01T00:00:00.000000000
Max day: 2023-12-31T00:00:00.000000000


In [35]:
# Before filling: original PDSI values at the grid cell with lat index 100 and
# lon index 100, for 2023-09-17 through 2023-10-07.
(
    pdsi_dat['pdsi']
    .isel(lat=100, lon=100)
    .sel(day=slice('2023-09-17', '2023-10-07'))
    .values
)

array([2.5699997, 2.5499992, 2.5100002, 2.4799995, 2.4400005],
      dtype=float32)

In [39]:
# After filling: daily PDSI values at the same grid cell (lat index 100,
# lon index 100), for 2023-09-13 through 2023-10-07.
(
    expanded_ds['pdsi']
    .isel(lat=100, lon=100)
    .sel(day=slice('2023-09-13', '2023-10-07'))
    .values
)

array([2.5699997, 2.5699997, 2.5699997, 2.5699997, 2.5699997, 2.5499992,
       2.5499992, 2.5499992, 2.5499992, 2.5499992, 2.5100002, 2.5100002,
       2.5100002, 2.5100002, 2.5100002, 2.4799995, 2.4799995, 2.4799995,
       2.4799995, 2.4799995, 2.4400005, 2.4400005, 2.4400005, 2.4400005,
       2.4400005], dtype=float32)

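Looks good! Each original 5-day value is now repeated across the days leading up to it, so every day in the range has a PDSI value.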

In [42]:
# Save the expanded dataset under an explicit name: 'expanded_' + the PDSI
# source file name. A positional index into the directory listing is not used
# here because it can resolve to a different file once the folder changes.
expanded_file = 'expanded_Palmer_Drought_Severity_Index.2001.2023.CA.nc'
expanded_ds.to_netcdf(os.path.join(data_dir, expanded_file))