# Reformat the data from the original station files

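Convert each originally downloaded station file (one row per measurement) into a reformatted CSV with one row per date: YEAR, MONTH and DAY columns followed by one column per weather element. The reformatted files are stored in a separate folder under the same filenames.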

## Imports and env

In [None]:
import csv
import os
from tqdm import tqdm    # For nice loading graphic

In [None]:
# Mount Google Drive at /content/drive; BASE_FOLDER_PATH (defined under
# "Globals") points inside this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Globals

In [None]:
# Root folder (inside the mounted Drive) that holds the station data folders.
# It ends with "/" because sub-folder names are appended to it by plain string
# concatenation.
BASE_FOLDER_PATH = "/content/drive/My Drive/ML6140  - Project/Raw Data/NOAA_GHCN-Daily/"
# Notebook shell escape: list the folder contents as a quick sanity check.
!ls "{BASE_FOLDER_PATH}"

ca-2013-2023-station-data	       ghcnd-stations.txt		readme.txt
ca-2013-2023-station-data-reformatted  melio-ca-stations-2013-2023.csv	reformatFiles.ipynb
ghcnd-inventory.txt		       melio-ca-stations.csv		retrieveData.ipynb


In [None]:
# Sub-folders of BASE_FOLDER_PATH: the originally downloaded per-station files
# (read by this notebook) and their reformatted counterparts (written by it).
# Both end with "/" because filenames are appended by string concatenation.
MEASUREMENTS_BY_STATION_FOLDER_ORIGINAL = "ca-2013-2023-station-data/"
MEASUREMENTS_BY_STATION_FOLDER_REFORMATTED = "ca-2013-2023-station-data-reformatted/"

In [None]:
# Element codes kept in the reformatted files. Rows whose ELEMENT is not listed
# here are not copied. The list order is also the column order (after YEAR,
# MONTH, DAY) of the reformatted csv files.
# NOTE(review): the units in the trailing comments are as noted by the original
# author -- verify against the GHCN-Daily readme before relying on them.
WEATHER_ELEMENTS = [
    "PRCP", # precipitation (.1 mm)
    "SNOW", # snowfall (mm)
    "SNWD", # snow depth (mm)
    "TMAX", # max temp (1/10 C)
    "TMIN", # min temp (1/10 C)
    "ADPT", # avg dew point temp (1/10 C)
    "AWBT", # avg wet bulb temp (1/10 C)
    "AWND", # avg wind speed (.1 m/s)
    "EVAP", # evap from pan (.1 mm)
    "FMTM", # time of fastest wind (HHMM)
    "FRGB", # frozen ground bottom (cm)
    "FRGT", # frozen ground top (cm)
    "FRTH", # frozen ground height (cm)
    "RHAV", # avg rel humid (%)
    "RHMN", # min rel humid (%)
    "RHMX", # max rel humid (%)
    "TAVG", # avg temp (1/10 C)
]

In [None]:
# Column order of every reformatted csv: the three date parts first, then one
# column per weather element.
REFORMATTED_FILE_HEADERS = ["YEAR", "MONTH", "DAY"] + WEATHER_ELEMENTS

## Helpers

In [None]:
def new_weather_elements_dict():
  """
  Build a fresh measurements dict for a single date.

  The result has one key per entry of WEATHER_ELEMENTS (in list order), each
  mapped to the empty string as a "no measurement" placeholder. A new dict is
  created on every call, so callers can fill it in freely.
  """
  return {element_code: "" for element_code in WEATHER_ELEMENTS}

In [None]:
def get_date_to_examples_reformatted_element_measurements(filename):
  """
  Get a dictionary mapping yyyymmdd dates to the element measurement dicts.

  Reads the station file `filename` from the
  BASE_FOLDER_PATH + MEASUREMENTS_BY_STATION_FOLDER_ORIGINAL folder. The file
  must have a header row containing the ID, DATE, ELEMENT and DATA_VALUE
  columns.

  Every date that appears in the file gets an entry holding all of the
  WEATHER_ELEMENTS keys. Elements without a measurement on that date keep the
  "" placeholder. Rows whose ELEMENT is not in WEATHER_ELEMENTS contribute no
  value, but their date still gets an entry. If an element occurs more than
  once for a date, the last row wins.

  Returns:
    {
      "<yyyymmdd>":
      {
        "PRCP": <value>,
        "TMAX": <value>,
        ...
      }
    }
    where each <value> is the raw DATA_VALUE string read from the file.

  Raises:
    ValueError: if a row's ID differs from the filename without its extension.
  """
  # The station ID is the filename without its extension; compute it once
  # instead of re-slicing the filename for every row.
  station_id = os.path.splitext(filename)[0]

  # Group all measurements by date
  measurements_by_date = {}

  with open(
      BASE_FOLDER_PATH + MEASUREMENTS_BY_STATION_FOLDER_ORIGINAL + filename,
      newline=""
  ) as station_file:
    reader = csv.DictReader(station_file)    # Headers = first row

    for row in reader:
      if row["ID"] != station_id:
        # Unexpected: a station file should only contain its own station's rows
        raise ValueError("Row ID and filename do not match.")

      date = row["DATE"]
      if date not in measurements_by_date:
        measurements_by_date[date] = new_weather_elements_dict()

      element = row["ELEMENT"]
      if element in WEATHER_ELEMENTS:
        measurements_by_date[date][element] = row["DATA_VALUE"]

  return measurements_by_date

In [None]:
def add_reformatted_date_columns(date_to_examples_reformatted_element_measurements):
  """
  Split each yyyymmdd key into integer YEAR, MONTH and DAY entries.

  The entries are stored in the dict that the key maps to, so the argument is
  modified in place and nothing is returned.
  """
  for date_key, measurements in date_to_examples_reformatted_element_measurements.items():
    # yyyymmdd layout: 4 year digits, 2 month digits, the rest is the day
    year_part, month_part, day_part = date_key[:4], date_key[4:6], date_key[6:]
    measurements["YEAR"] = int(year_part)
    measurements["MONTH"] = int(month_part)
    measurements["DAY"] = int(day_part)

In [None]:
def write_reformatted_date(filename, examples_dict):
  """
  Save the values of examples_dict as the rows of a csv file.

  The file is created (or overwritten) as `filename` inside the
  BASE_FOLDER_PATH + MEASUREMENTS_BY_STATION_FOLDER_REFORMATTED folder. It
  starts with a REFORMATTED_FILE_HEADERS header row, followed by one row per
  dictionary value in the dictionary's iteration order. The dictionary keys
  themselves are not written.
  """
  output_path = BASE_FOLDER_PATH + MEASUREMENTS_BY_STATION_FOLDER_REFORMATTED + filename
  with open(output_path, "w", newline="") as reformatted_file:
    csv_writer = csv.DictWriter(reformatted_file, fieldnames=REFORMATTED_FILE_HEADERS)
    csv_writer.writeheader()
    csv_writer.writerows(examples_dict.values())

## Main

In [None]:
# Every entry of the original station-data folder is treated as a station file.
filenames = os.listdir(BASE_FOLDER_PATH + MEASUREMENTS_BY_STATION_FOLDER_ORIGINAL)
len(filenames)  # shown as the cell output: number of files to reformat

641

In [None]:
# Reformat every station file: group its measurements by date, add the YEAR /
# MONTH / DAY columns, then write the result under the same filename in the
# reformatted folder. tqdm only adds the progress bar.
for filename in tqdm(filenames, total=len(filenames)):
  data = get_date_to_examples_reformatted_element_measurements(filename)
  add_reformatted_date_columns(data)
  write_reformatted_date(filename, data)

100%|██████████| 641/641 [05:57<00:00,  1.79it/s]
