In [1]:
# MMF
#
# Script will map GHCND stations to the CONUS grid.
# Also, for each station, will list availability of daily temperature
# (min/mean/max), precipitation, snow, and wind speed observations.
#
# Required inputs:
# - Must download all GHCND data - see readme.txt in GHCND folder
# - GHCND inventory - a list of observations and years available for each station
# - GHCND stations list - just lists all GHCND stations with location, elevation, etc.
# - Years of CONUS simulation (used to report temperature, snow, precip, wind speed
#   availability for each GHCND gauge)
# - Latitude and longitude for each CONUS grid cell center
#
# Outputs:
# - csv file with each row as a station. Columns report station metadata
#   (name, site ID, latitude, longitude, elevation) along with CONUS
#   mapping (CONUS pfb index, X index, Y index) and data availability
#   (whether temperature, precip, wind speed, and snow are available for
#   each GHCND station during the CONUS simulation period)

###################################

# GHCND inventory file: lists which observations are available for each
# station and over which years.
ghcnd_inventory_path <- "/glade/p/univ/ucsm0002/CONUS_modern/Analysis_Validation/Validation/GHCND_MetStations/ghcnd-inventory.txt"

# GHCND station list: station metadata (location, elevation, name, ...).
ghcnd_stations_path <- "/glade/p/univ/ucsm0002/CONUS_modern/Analysis_Validation/Validation/GHCND_MetStations/ghcnd-stations.txt"

# Water years to consider when selecting stations.
# NOTE: the GHCND inventory is organized by calendar year, not water year.
yrs <- 2002:2003

# CONUS grid-cell coordinates, read with the first line of the file skipped.
# Later code treats column 1 as latitude and column 2 as longitude.
CONUSlatlon <- read.table(
  file = "/glade/p/univ/ucsm0002/CONUS2/domain_files/CONUS2.0.Final.LatLong.sa",
  skip = 1
)

# Output csv: one row per station kept for comparison, holding
#   - station ID, name, latitude/longitude and elevation,
#   - availability of the core variables
#     TMIN/TAVG/TMAX/PRCP/WESD/SNOW/SNWD/AWND,
#   - the CONUS index plus x-index and y-index of the comparison cell.
outfile <- "./GHCND_mapped.csv"



In [None]:
###################################
# Read in station data

# Both GHCND files are fixed-width text. Negative entries in `widths`
# skip the single separator column between fields.
#
# NOTE: the stations file contains "#" characters, which read.table
# would otherwise treat as the start of a comment and silently truncate
# the record. comment.char = "" turns comment handling off entirely, so
# no character in the file is special (a stand-in such as "$" would
# still truncate any record that happened to contain it).
#
# stringsAsFactors = FALSE keeps ID/NAME/ELEMENT as character vectors
# regardless of R version (the default was TRUE before R 4.0.0), so
# later element-wise copies store the text rather than factor codes.
ghcnd_stations = read.fwf(ghcnd_stations_path,
                          widths = c(11,-1,8,-1,9,-1,
                                     6,-1,2,-1,30,-1,
                                     3,-1,3,-1,5),
                          header = FALSE,
                          comment.char = "",
                          stringsAsFactors = FALSE)
names(ghcnd_stations) = c("ID",
                          "LATITUDE",
                          "LONGITUDE",
                          "ELEVATION_m",
                          "STATE",
                          "NAME",
                          "GSN_FLAG",
                          "HCN_FLAG",
                          "WMO_ID")

# Inventory: one row per station/element with first and last year.
# Read with the same comment/factor settings as the stations file so
# both tables are parsed consistently.
ghcnd_inventory = read.fwf(ghcnd_inventory_path,
                           widths = c(11,-1,8,-1,
                                      9,-1,4,-1,
                                      4,-1,4),
                           header = FALSE,
                           comment.char = "",
                           stringsAsFactors = FALSE)
names(ghcnd_inventory) = c("ID",
                           "LATITUDE",
                           "LONGITUDE",
                           "ELEMENT",
                           "FIRSTYEAR",
                           "LASTYEAR")

###################################

In [None]:
###################################
# Dataframe to fill

# N Rows = number of unique station IDs
# N Columns = 16:
# - Station ID
# - Station Name
# - Station Lat
# - Station Lon
# - Station Elevation
# - CONUS index (PF index)
# - CONUS x index
# - CONUS y index
# - TMIN availability
# - TAVG availability
# - TMAX availability 
# - PRCP availability
# - WESD availability
# - SNOW availability
# - SNWD availability
# - AWND availability
# First, to make this a bit more efficient, do a cursory look at gauges
# and remove all that are outside of a bounding box containing CONUS.
# The two range() calls display the extent of the grid coordinates
# (column 1 is used as latitude and column 2 as longitude further down)
# so the hard-coded box can be compared against them.
range(CONUSlatlon[,1])
range(CONUSlatlon[,2])
# NOTE(review): these limits are hard-coded rather than derived from the
# ranges printed above -- confirm they enclose the full grid, otherwise
# valid stations are discarded here before the mapping step.
maxlat = 51
minlat = 30
maxlon = -75
minlon = -122
out_domain1 = which((ghcnd_stations$LATITUDE > maxlat) | (ghcnd_stations$LATITUDE < minlat))
out_domain2 = which((ghcnd_stations$LONGITUDE > maxlon) | (ghcnd_stations$LONGITUDE < minlon))
out_domain = union(out_domain1,out_domain2)
# Only drop rows when there is something to drop: negative indexing with
# a zero-length vector selects NO rows, which would silently discard
# every station when all of them fall inside the box.
if(length(out_domain) > 0){
  ghcnd_stations = ghcnd_stations[-out_domain,]
}


# Start with full list of these bounding-box stations
# Later we will remove the ones we won't use
# (one all-NA row per remaining station, filled by position in the loop)
dat = matrix(NA,nrow=nrow(ghcnd_stations),ncol=16)
dat = data.frame(dat)
names(dat) = c("ID",
               "NAME",
               "LATITUDE",
               "LONGITUDE",
               "ELEVATION_m",
               "CON_id",
               "CON_x",
               "CON_y",
               "TMIN","TAVG","TMAX",
               "PRCP",
               "WESD","SNOW","SNWD",
               "AWND")

###################################
# Map and fill dataframe


# Loop through stations
# For stations not within domain or containing no available data
# in our desired date range, the corresponding row will be empty.
# NOTE: This loop takes too long; it should be rewritten with a map
# function or parallelized.
for(s in 1:nrow(ghcnd_stations)){

  # Station id
  id = as.character(ghcnd_stations$ID[s])

  # Subset station inventory data
  subs = ghcnd_inventory[which(as.character(ghcnd_inventory$ID)==id),]

  # Check to see if ANY data exist within our timeframe.
  # If no core variables contain data for desired water years,
  # skip this station.
  if(length(subs$ID)==0){next}
  avail_years = min(subs$FIRSTYEAR):max(subs$LASTYEAR) # Calendar years
  my_yrs = (min(yrs)-1):max(yrs) # Calendar years that I want
  if(length(intersect(avail_years,my_yrs))==0){
    print(paste(id,": Skipping, no data available for this timeframe.",sep=""))
    next
  }

  # Map station and check to see if it is within the CONUS.
  # If outside domain, skip this station.
  slat = ghcnd_stations$LATITUDE[s]
  slon = ghcnd_stations$LONGITUDE[s]
  dists = sqrt((CONUSlatlon[,1]-slat)^2 + (CONUSlatlon[,2]-slon)^2)
  CONUS_id = which.min(dists)
  if(earth.dist(slon,slat,CONUSlatlon[CONUS_id,2],CONUSlatlon[CONUS_id,1]) > 1){
    print(paste(id,": Skipping, outside of domain.",sep=""))
    next
  }

  # If we have available data and the station is within our
  # domain, then we start saving information.
  print(paste(id,": Available data, saving information!",sep=""))
  dat$ID[s] = id
  dat$NAME[s] = ghcnd_stations$NAME[s]
  dat$LATITUDE[s] = slat
  dat$LONGITUDE[s] = slon
  dat$ELEVATION_m[s] = ghcnd_stations$ELEVATION_m[s]

                                                                                  