-
Notifications
You must be signed in to change notification settings - Fork 18
/
pdsi.py
145 lines (115 loc) · 5.67 KB
/
pdsi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
"""Code for downloading the Palmer Drought Severity Index (PDSI) from gridMET data. Use the CLI
to download, for example:
python -m wsfr_download pdsi 2005 2007 2009
or use the `bulk` command to download many sources at once based on a config file.
python -m wsfr_download bulk data_download/hindcast_test_config.yml
You can also import this module and use it as a library.
See the challenge website for more about this approved data source:
https://www.drivendata.org/competitions/254/reclamation-water-supply-forecast-dev/page/801/#palmer-drought-severity-index-pdsi-from-gridmet
"""
from datetime import datetime, timedelta
from pathlib import Path
from typing import Annotated
from loguru import logger
import netCDF4 as nc
import requests
import typer
from wsfr_download.config import DATA_ROOT
from wsfr_download.utils import DownloadResult, log_download_results
def build_url(start_date: str, end_date: str) -> str:
    """Construct the THREDDS NetcdfSubset URL requesting PDSI for a date range.

    Args:
        start_date (str): Start date in the format %Y-%m-%d
        end_date (str): End date in the format %Y-%m-%d

    Returns:
        str: Fully-qualified NetCDF download URL for the requested range.
    """
    # Assemble the fixed query pieces around the caller-supplied dates;
    # colons in the timestamps are pre-encoded as %3A.
    pieces = (
        "https://thredds.northwestknowledge.net/thredds/ncss/agg_met_pdsi_",
        "1979_CurrentYear_CONUS.nc?var=daily_mean_palmer_drought_severity_",
        "index&disableLLSubset=on&disableProjSubset=on&horizStride=1&",
        f"time_start={start_date}T00%3A00%3A00Z&time_end={end_date}",
        "T00%3A00%3A00Z&timeStride=1&accept=netcdf",
    )
    return "".join(pieces)
def download_time_range(
    start: str, end: str, out_file: Path, timeout: float = 120.0
) -> DownloadResult:
    """Download PDSI data for the continental US in the specified time
    frame in NetCDF4 format.

    Args:
        start (str): Start date in the format %Y-%m-%d
        end (str): End date in the format %Y-%m-%d
        out_file (Path): Path to save the downloaded file
        timeout (float): Seconds to wait for the server before aborting the
            request; passed through to ``requests.get``. Defaults to 120.

    Returns:
        DownloadResult: SUCCESS on an HTTP 200 response, otherwise
        SKIPPED_NO_DATA (a warning is logged with the status and URL).
    """
    url = build_url(start, end)
    # requests has no default timeout; without one a stalled THREDDS server
    # would hang the entire bulk download indefinitely.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        logger.warning(
            f"Could not download file. "
            f"{response.status_code}: {response.reason} error for {url}"
        )
        return DownloadResult.SKIPPED_NO_DATA
    with out_file.open("wb") as fp:
        fp.write(response.content)
    return DownloadResult.SUCCESS
def validate_netcdf4(file_path: Path, fy_start_month: int, fy_end_month: int) -> bool:
    """Validate a NetCDF4 file for PDSI. Returns True if the file is valid
    and False if not; a file that cannot be opened as NetCDF4 is deleted.

    Args:
        file_path (Path): Path to file
        fy_start_month (int): Forecast year start month (currently unused)
        fy_end_month (int): Forecast year end month (currently unused)
    """
    # An unreadable file is assumed to be a corrupt/partial download:
    # remove it so a later run re-downloads it.
    try:
        dataset = nc.Dataset(file_path)
    except OSError:
        logger.warning(f"Deleting file that is not a valid NetCDF4: {file_path}")
        file_path.unlink()
        return False

    expected_vars = {"daily_mean_palmer_drought_severity_index", "day", "lat", "lon"}
    actual_vars = set(dataset.variables.keys())
    if actual_vars != expected_vars:
        # Unexpected schema is only warned about; the file is still accepted.
        logger.warning(f"Data at {file_path} has unexpected variable keys: {actual_vars}")

    # The "day" variable counts days since the 1900-01-01 gridMET epoch.
    epoch = datetime(1900, 1, 1)
    day_values = dataset.variables["day"][:]
    earliest = epoch + timedelta(days=min(day_values))
    latest = epoch + timedelta(days=max(day_values))
    logger.info(
        "Downloaded data for {} dates with date range {} to {}.",
        len(dataset.variables["day"]),
        str(earliest),
        str(latest),
    )
    dataset.close()
    return True
def download_pdsi(
    forecast_years: Annotated[list[int], typer.Argument(help="Forecast years to download for.")],
    fy_start_month: Annotated[int, typer.Option(help="Forecast year start month.")] = 10,
    fy_start_day: Annotated[int, typer.Option(help="Forecast year start day.")] = 1,
    fy_end_month: Annotated[int, typer.Option(help="Forecast year end month.")] = 7,
    fy_end_day: Annotated[int, typer.Option(help="Forecast year end day.")] = 22,
    skip_existing: Annotated[bool, typer.Option(help="Whether to skip an existing file.")] = True,
):
    """Download Palmer Drought Severity Index data from the
    THREDDS server (NetcdfSubset):
    https://www.drought.gov/data-maps-tools/us-gridded-palmer-drought-severity-index-pdsi-gridmet
    Each forecast year begins on the specified date of the previous calendar
    year, and ends on the specified date of the current calendar year. By
    default, each forecast year starts on October 1 and ends July 21; e.g.,
    by default, FY2021 starts on 2020-10-01 and ends on 2021-07-21.
    When provided, the end date defined by `fy_end_month` and `fy_end_day` is exclusive,
    i.e. for July 22, the most recent data will be for July 21.
    """
    logger.info("Downloading Palmer Drought Severity Index data...")
    results: list[DownloadResult] = []
    for fy in forecast_years:
        # FY window: start in the previous calendar year; the configured end
        # date is exclusive, so the last requested day is one day earlier.
        fy_begin = datetime(fy - 1, fy_start_month, fy_start_day)
        fy_last = datetime(fy, fy_end_month, fy_end_day) - timedelta(days=1)
        start = fy_begin.strftime("%Y-%m-%d")
        end = fy_last.strftime("%Y-%m-%d")
        logger.info(f"Downloading PDSI for forecast year {fy} ({start} to {end})")
        out_file = DATA_ROOT / f"pdsi/FY{str(fy)}/pdsi_{start}_{end}.nc"

        # Reuse an existing file only if it passes validation; validation
        # deletes corrupt files, so a failed check falls through to re-download.
        if skip_existing and out_file.exists():
            if validate_netcdf4(out_file, fy_start_month, fy_end_month):
                results.append(DownloadResult.SKIPPED_EXISTING)
                continue

        out_file.parent.mkdir(exist_ok=True, parents=True)
        outcome = download_time_range(start, end, out_file)
        if outcome == DownloadResult.SUCCESS:
            # Validate downloaded files if new file was downloaded
            validate_netcdf4(out_file, fy_start_month, fy_end_month)
        results.append(outcome)
    log_download_results(results)
    logger.success("PDSI download complete.")