# About
Retrieve data for California weather stations.

NOTE: Within this folder, I'm prefixing files I create with "melio-" to distinguish them from original GHCN-Daily files.

## Setup:
Imports, environment, constants.

In [None]:
!pip install boto3



In [None]:
import csv
import boto3
import botocore
from tqdm import tqdm    # For nice loading graphic

In [None]:
# Mount Google Drive at /content/drive so the project folder can be read and
# written like a local directory. This relies on the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Folder on the mounted Drive that holds the GHCN-Daily files and every file
# this notebook writes. Keep the trailing "/": other paths are built from this
# value by plain string concatenation.
BASE_FOLDER_PATH = "/content/drive/My Drive/ML6140  - Project/Raw Data/NOAA_GHCN-Daily/"

In [None]:
# Confirm the path is correct by listing the contents of the base folder.
# NOTE: If the folder is in "Shared with me", not "My Drive," manually add a
# shortcut to link from "My Drive" to the shared folder.
!ls "{BASE_FOLDER_PATH}"

ca-2013-2023-station-data  melio-ca-stations-2013-2023.csv  retrieveData.ipynb
ghcnd-inventory.txt	   melio-ca-stations.csv
ghcnd-stations.txt	   readme.txt


In [None]:
# Input: the GHCN-Daily fixed-width listing of every station.
ALL_STATIONS_TXT_FILENAME = "ghcnd-stations.txt"

# Output: the same listing narrowed to California stations, keeping only the
# columns named in MELIO_CA_STATIONS_CSV_FIELDS (in that order).
MELIO_CA_STATIONS_CSV_FILENAME = "melio-ca-stations.csv"
MELIO_CA_STATIONS_CSV_FIELDS = [
    "id",
    "latitude",
    "longitude",
    "elevation",
]

In [None]:
# GHCND file listing elements measured by station and year.
INVENTORY_TXT_FILENAME = "ghcnd-inventory.txt"
# Output csv: the California stations that remain after filtering on the years
# and elements recorded in the inventory file.
MELIO_CA_STATIONS_2013_2023_CSV_FILENAME = "melio-ca-stations-2013-2023.csv"

In [None]:
# Sub-folder (inside the base folder) that receives one measurements csv per
# station.
MEASUREMENTS_BY_STATION_FOLDER = "ca-2013-2023-station-data/"

# Where the per-station files live in S3: bucket name and object-key pattern.
S3_BUCKET_NAME = "noaa-ghcn-pds"
S3_OBJECT_KEY_TEMPLATE = "csv/by_station/{station_id}.csv"  # Key

# Local path pattern for each downloaded file. The doubled braces keep a
# literal "{station_id}" placeholder in the result for a later .format() call.
DEST_FILENAME_TEMPLATE = (
    f"{BASE_FOLDER_PATH}{MEASUREMENTS_BY_STATION_FOLDER}{{station_id}}.csv"
)

## Get California stations

The dataset includes a file, ghcnd-stations.txt with the following information:

| Variable | Columns | Type |
| -------- | ------- | ---- |
| ID | 1-11 | Character |
| LATITUDE | 13-20 | Real |
| LONGITUDE | 22-30 | Real |
| ELEVATION | 32-37 | Real |
| STATE | 39-40 | Character |
| NAME | 42-71 | Character |
| GSN FLAG | 73-75 | Character |
| HCN/CRN FLAG | 77-79 | Character |
| WMO ID | 81-85 | Character |

NOTE: The column numbers above are 1-indexed; the Python slices used below are 0-indexed.

<br>

**Task: Extract only the rows with California as the state. Create a csv file with each California station's ID, latitude, longitude and elevation.**


In [None]:
# Two-letter code compared against the STATE column of ghcnd-stations.txt to
# pick out California stations.
california_state_id = "CA"

In [None]:
# Collect the first five whitespace-separated fields (id, lat, lon, elevation,
# state) of every station line whose STATE column (1-based columns 39-40)
# holds the California code. The remaining fields are unnecessary and/or may
# be blank, so they are dropped.
california_stations = []
all_stations_path = BASE_FOLDER_PATH + ALL_STATIONS_TXT_FILENAME
with open(all_stations_path, "r") as all_stations_infile:
  for line in all_stations_infile:
    if line[38:40] != california_state_id:
      continue
    fields = line.split()
    california_stations.append(fields[:5])

In [None]:
# Confirm that id, lat, lon and elevation exist. Eliminate state column.
# Each row is expected to end with the state code as its fifth field; a row
# that does not is printed so the malformed station entry can be inspected.
# NOTE: rows are trimmed in place, so run this cell only once after the
# station file has been (re)loaded.
for i, station in enumerate(california_stations):
  if station[-1] != california_state_id:
    # Show the offending row itself rather than whatever line the file-reading
    # loop happened to leave behind.
    print(station)
  california_stations[i] = station[:-1]

In [None]:
# Save the trimmed California station rows (id, latitude, longitude,
# elevation) as csv. No header row is written; code that reads this file
# treats every row as a station.
ca_stations_csv_path = BASE_FOLDER_PATH + MELIO_CA_STATIONS_CSV_FILENAME
with open(ca_stations_csv_path, "w", newline='') as ca_stations_outfile:
  station_writer = csv.writer(ca_stations_outfile)
  for station_fields in california_stations:
    station_writer.writerow(station_fields)

## Get info about which elements are tracked and narrow down stations to ones measuring PRCP between 2012 and 2023.

The ghcnd-inventory.txt file records the start and stop years of each station’s records of each measurement. Gather data on that using station IDs.

| Variable | Columns | Type |
| -------- | ------- | ---- |
| ID | 1-11 | Character |
| LATITUDE | 13-20 | Real |
| LONGITUDE | 22-30 | Real |
| ELEMENT | 32-35 | Character |
| FIRSTYEAR | 37-40 | Integer |
| LASTYEAR | 42-45 | Integer |

<br>

Takeaways:
- Of the 3k+ stations, only 641 track precipitation (PRCP) during the time window we're studying (2012-2023).
- Even fewer (411) track snow.
- 141 stations track multi-day precipitation, which is so few as to probably not be useful.
- 324 track temperature and all but one of these also track PRCP.
- Only 85 track thunder (WT03). That still might be an interesting feature to look at, but there aren't enough stations around the state to make that useful.

<br>

**Task: Write a new csv with just the stations whose precipitation (PRCP) records start in or before 2012 and run through 2023, so that they cover the whole 2013-2023 study window. These stations cover most of the other element measurements made during this period.**

In [None]:
# Maps an element code to the list of station ids recording that element;
# populated from ghcnd-inventory.txt by the cell that follows.
elements_tracked = {}

In [None]:
# Map each element code (e.g. "PRCP") to the list of California station ids
# whose inventory record for that element starts in or before 2012 and ends in
# 2023. The dict is rebuilt from scratch here so that re-running this cell does
# not append the same station ids a second time and inflate the counts.
elements_tracked = {}

# Ids of the California stations: first column of the csv written earlier.
with open(
    BASE_FOLDER_PATH + MELIO_CA_STATIONS_CSV_FILENAME, newline=""
    ) as california_stations_infile:
  station_ids = {row[0] for row in csv.reader(california_stations_infile)}
print(len(station_ids))

# ghcnd-inventory.txt is fixed-width. The slices below are the 0-based form of
# the documented 1-based columns:
#   ID 1-11, ELEMENT 32-35, FIRSTYEAR 37-40, LASTYEAR 42-45.
with open(BASE_FOLDER_PATH + INVENTORY_TXT_FILENAME) as inventory_infile:
  for row in inventory_infile:
    station_id = row[0:11]
    if station_id not in station_ids:
      continue
    if int(row[36:40]) <= 2012 and int(row[41:45]) == 2023:
      elements_tracked.setdefault(row[31:35], []).append(station_id)

3065


In [None]:
# Measurements taken: the element codes collected as keys of elements_tracked.
elements_tracked.keys()

dict_keys(['PRCP', 'SNOW', 'DAPR', 'MDPR', 'SNWD', 'WESD', 'WESF', 'WT03', 'WT05', 'WT11', 'TMAX', 'TMIN', 'TOBS', 'WT01', 'WDMV', 'EVAP', 'MNPN', 'MXPN', 'SN32', 'SN33', 'SN35', 'SX32', 'SX33', 'SX35', 'WT04', 'WT06', 'WDFG', 'WSFG', 'TAVG', 'AWDR', 'AWND', 'WSFI', 'PGTM', 'WDF2', 'WDF5', 'WSF2', 'WSF5', 'WT02', 'WT08', 'ADPT', 'ASLP', 'ASTP', 'AWBT', 'RHAV', 'RHMN', 'RHMX', 'WT07', 'WT10', 'WT09'])

In [None]:
# One line per element code: how many stations were collected for it.
for element_code, tracking_station_ids in elements_tracked.items():
  station_count = len(tracking_station_ids)
  print(f"{element_code}: {station_count}")

PRCP: 641
SNOW: 411
DAPR: 141
MDPR: 143
SNWD: 219
WESD: 48
WESF: 18
WT03: 85
WT05: 17
WT11: 13
TMAX: 324
TMIN: 324
TOBS: 203
WT01: 93
WDMV: 11
EVAP: 11
MNPN: 10
MXPN: 10
SN32: 1
SN33: 1
SN35: 1
SX32: 1
SX33: 1
SX35: 1
WT04: 5
WT06: 3
WDFG: 1
WSFG: 1
TAVG: 48
AWDR: 2
AWND: 80
WSFI: 2
PGTM: 71
WDF2: 78
WDF5: 78
WSF2: 78
WSF5: 78
WT02: 67
WT08: 73
ADPT: 32
ASLP: 32
ASTP: 32
AWBT: 32
RHAV: 32
RHMN: 32
RHMX: 32
WT07: 5
WT10: 1
WT09: 1


In [None]:
# How many of the stations recording WT03 also record TMAX?
# The loop variable is named station_id so the built-in id() is not shadowed
# in the kernel namespace, and TMAX ids go into a set so each membership test
# is a hash lookup instead of a list scan.
tmax_station_ids = set(elements_tracked["TMAX"])
overlap_count = sum(
    1 for station_id in elements_tracked["WT03"]
    if station_id in tmax_station_ids
)
print(f"{overlap_count} of the {len(elements_tracked['WT03'])} in WT03 are in TMAX.")

84 of the 85 in WT03 are in TMAX.


In [None]:
# Re-read the California station csv and keep only the rows whose station id
# (first column) appears in the PRCP list of elements_tracked.
ca_stations_2013to2023 = []
ca_stations_csv_path = BASE_FOLDER_PATH + MELIO_CA_STATIONS_CSV_FILENAME
with open(ca_stations_csv_path, newline="") as california_stations_infile:
  ca_stations_2013to2023.extend(
      station_row
      for station_row in csv.reader(california_stations_infile)
      if station_row[0] in elements_tracked["PRCP"]
  )

In [None]:
# Save the PRCP-filtered station rows as csv. As with the unfiltered station
# csv, no header row is written.
filtered_stations_csv_path = (
    BASE_FOLDER_PATH + MELIO_CA_STATIONS_2013_2023_CSV_FILENAME
)
with open(
    filtered_stations_csv_path, "w", newline=""
    ) as ca_stations_2013to2023_outfile:
  csv.writer(ca_stations_2013to2023_outfile).writerows(ca_stations_2013to2023)

## Download files from S3
Use boto3 to download the per-station files from the NOAA AWS S3 bucket:
https://noaa-ghcn-pds.s3.amazonaws.com/index.html

- bucket: "noaa-ghcn-pds"
- object: "csv/by_station/\<stationID\>.csv"

Setup Destination

In [None]:
# WARNING: this removes the whole measurements folder, including any station
# files downloaded by an earlier run, and then recreates it empty.
# The two commands run independently, so mkdir still runs when the folder did
# not exist beforehand.
!rm -r "{BASE_FOLDER_PATH + MEASUREMENTS_BY_STATION_FOLDER}"    # Replace existing (if exists).
!mkdir "{BASE_FOLDER_PATH + MEASUREMENTS_BY_STATION_FOLDER}"

In [None]:
# List the measurements folder to verify that it exists and holds no files yet.
!ls "{BASE_FOLDER_PATH + MEASUREMENTS_BY_STATION_FOLDER}"    # Should be empty.

Setup S3

In [None]:
# Build an S3 resource that sends unsigned (anonymous) requests, so no AWS
# credentials are needed.
# Background: https://stackoverflow.com/questions/34865927/can-i-use-boto3-anonymously
config=botocore.client.Config(signature_version=botocore.UNSIGNED)
s3 = boto3.resource("s3", config=config)

Download files

In [None]:
# Download one measurements csv per selected station from the NOAA bucket into
# the destination folder.
with open(
    BASE_FOLDER_PATH + MELIO_CA_STATIONS_2013_2023_CSV_FILENAME, newline=""
    ) as stations_infile:
  # The station csv has no header row, so the field names are supplied here.
  # Reading every row up front gives tqdm a known length, so it can show a
  # real progress bar with an ETA instead of a bare iteration counter.
  station_rows = list(csv.DictReader(stations_infile,
                                     MELIO_CA_STATIONS_CSV_FIELDS))

# The bucket handle is the same for every station; create it once.
bucket = s3.Bucket(S3_BUCKET_NAME)

for row in tqdm(station_rows, desc="Downloading files from S3"):
  s3_key = S3_OBJECT_KEY_TEMPLATE.format(station_id=row["id"])
  dest_filename = DEST_FILENAME_TEMPLATE.format(station_id=row["id"])

  try:
    bucket.download_file(s3_key, dest_filename)
  except botocore.exceptions.ClientError as err:
    # A missing object is reported and skipped; any other S3 client error
    # is re-raised and stops the download.
    if err.response["Error"]["Code"] == "404":
      print("Object {} does not exist.".format(s3_key))
    else:
      raise
print("Done!")

Downloading files from S3: 641it [03:44,  2.86it/s]

Done!



