In [2]:
import requests
import time
import os

# Station identifiers to fetch; each one is combined with a year to form the
# "<station>-<year>.gz" file name requested by the download loop.
stations = ["720538-00164", "999999-04222", "725244-99999"]
# First year (inclusive) of the per-station download range.
start_year = 2010

def download(url, outfile):
    """Fetch ``url`` and save the response body to ``outfile``.

    A request that times out, or that returns a non-OK HTTP status, is
    reported with ``print`` and skipped; no file is written in either case.
    Missing parent directories of ``outfile`` are created on success.

    Args:
        url: Address to request (5 second timeout).
        outfile: Destination path for the downloaded bytes.

    Returns:
        None.
    """
    try:
        response = requests.get(url, timeout=5)
    except requests.exceptions.Timeout:
        print(f"Timeout when trying to read {url}, skipping...")
        return
    if response.status_code != requests.codes.ok:
        print(f"{response.status_code}: {url}, skipping...")
        # Brief pause after a failed request before returning to the caller.
        time.sleep(0.5)
        return
    print(f"{response.status_code}: {url}, downloading...")
    # A bare file name has no directory component, and os.makedirs("") raises.
    parent = os.path.dirname(outfile)
    if parent:
        os.makedirs(parent, exist_ok=True)
    # Write to the destination that was passed in, not to a notebook-level
    # variable that merely happens to hold the same value for some callers.
    with open(outfile, "wb") as f:
        for chunk in response.iter_content(chunk_size=128):
            f.write(chunk)

# Fetch the yearly ISD-Lite archive of every configured station, skipping any
# archive that is already on disk. The range end is exclusive, so 2021 is the
# last year requested.
for station in stations:
    for year in range(start_year, 2022):
        file_name = f"{station}-{year}.gz"
        directory = os.path.join("isd-lite", str(year))
        path = os.path.join(directory, file_name)
        if not os.path.exists(path):
            url = f"https://www.ncei.noaa.gov/pub/data/noaa/isd-lite/{year}/{file_name}"
            download(url, path)

# Fetch the 2021 global-summary-of-the-day archive once.
summary = "isd-summary/2021.tar.gz"
if not os.path.exists(summary):
    download(
        "https://www.ncei.noaa.gov/data/global-summary-of-the-day/archive/2021.tar.gz",
        summary,
    )


200: https://www.ncei.noaa.gov/pub/data/noaa/isd-lite/2010/720538-00164-2010.gz, downloading...
Timeout when trying to read https://www.ncei.noaa.gov/pub/data/noaa/isd-lite/2012/720538-00164-2012.gz, skipping...
200: https://www.ncei.noaa.gov/pub/data/noaa/isd-lite/2016/720538-00164-2016.gz, downloading...
Timeout when trying to read https://www.ncei.noaa.gov/pub/data/noaa/isd-lite/2020/720538-00164-2020.gz, skipping...
200: https://www.ncei.noaa.gov/pub/data/noaa/isd-lite/2019/999999-04222-2019.gz, downloading...
404: https://www.ncei.noaa.gov/pub/data/noaa/isd-lite/2010/725244-99999-2010.gz, skipping...
404: https://www.ncei.noaa.gov/pub/data/noaa/isd-lite/2011/725244-99999-2011.gz, skipping...
404: https://www.ncei.noaa.gov/pub/data/noaa/isd-lite/2012/725244-99999-2012.gz, skipping...
404: https://www.ncei.noaa.gov/pub/data/noaa/isd-lite/2013/725244-99999-2013.gz, skipping...
404: https://www.ncei.noaa.gov/pub/data/noaa/isd-lite/2014/725244-99999-2014.gz, skipping...
Timeout when tr

In [None]:
import subprocess

# Decompress every ".gz" file under "isd-lite" whose extension-less
# counterpart does not exist yet.
# NOTE(review): gunzip by default removes the ".gz" once it has been unpacked,
# so the download cell's existence check would no longer find it on a re-run --
# confirm this is intended.
for root, directories, file_names in os.walk("isd-lite"):
    for file_name in file_names:
        stem, extension = os.path.splitext(file_name)
        if extension != ".gz":
            continue
        source = os.path.join(root, file_name)
        target = os.path.join(root, stem)
        if os.path.exists(target):
            continue
        # subprocess.call returns the exit status; report failures instead of
        # dropping them so a corrupt or truncated archive does not go unnoticed.
        return_code = subprocess.call(["gunzip", source])
        if return_code != 0:
            print(f"gunzip exited with status {return_code} for {source}, skipping...")

In [7]:
import datetime
import csv
import numpy
from dataclasses import dataclass
import pandas
import pyarrow
from pyarrow import parquet


@dataclass
class Observation:
    """One parsed line of a decompressed ISD-Lite file.

    Instances are built positionally from the whitespace-split integers of a
    line, so the field order below is the column order expected in the file.
    Every field holds the integer exactly as parsed; the loading loop later
    replaces -9999 with NaN and divides the temperature, pressure, wind speed
    and precipitation values by 10.
    """
    year: int
    month: int
    day: int
    hour: int
    air_temperature: int
    dew_point_temperature: int
    sea_level_pressure: int
    wind_direction: int
    wind_speed_rate: int
    sky_condition_total_coverage: int
    # NOTE(review): "precipitiation" is a misspelling of "precipitation"; the
    # loading code refers to these two fields by this exact spelling, so any
    # rename has to be done in both places together.
    liquid_precipitiation_one_hour: int
    liquid_precipitiation_six_hours: int


# Assemble one DataFrame from every decompressed ISD-Lite file below "isd-lite",
# tagging each row with station metadata taken from the first row of
# "isd-summary/<station id>.csv".
data_frames = []
# First summary row per summary file name, so each station's CSV is opened
# once instead of once per yearly observation file.
summary_rows = {}
# The Observation dataclass spells these two fields "precipitiation". Rename
# them up front so the scaled values replace the raw ones instead of being
# added next to them as a second pair of columns.
precipitation_renames = {
    "liquid_precipitiation_one_hour": "liquid_precipitation_one_hour",
    "liquid_precipitiation_six_hours": "liquid_precipitation_six_hours",
}
# Columns that the loop divides by 10 after casting to float32.
# NOTE(review): presumably ISD-Lite stores these scaled by 10 -- confirm
# against the format description.
tenths_columns = [
    "air_temperature",
    "dew_point_temperature",
    "sea_level_pressure",
    "wind_speed_rate",
    "liquid_precipitation_one_hour",
    "liquid_precipitation_six_hours",
]
for root, directories, file_names in os.walk("isd-lite"):
    for file_name in file_names:
        # Only extension-less files are parsed; ".gz" archives and any other
        # file with an extension are skipped.
        if os.path.splitext(file_name)[1]:
            continue
        source = os.path.join(root, file_name)
        parts = file_name.split("-")
        assert len(parts) == 3, f"unexpected ISD-Lite file name: {source}"
        summary_file_name = f"{parts[0]}{parts[1]}.csv"
        if summary_file_name not in summary_rows:
            with open(os.path.join("isd-summary", summary_file_name), newline="") as f:
                summary_rows[summary_file_name] = next(csv.DictReader(f))
        first = summary_rows[summary_file_name]
        station = first["STATION"]
        latitude = float(first["LATITUDE"])
        longitude = float(first["LONGITUDE"])
        elevation = float(first["ELEVATION"])
        name = first["NAME"]
        with open(source) as f:
            # Blank lines carry no fields and would fail the positional
            # Observation constructor, so they are ignored.
            observations = [
                Observation(*(int(value) for value in line.split()))
                for line in f
                if line.strip()
            ]
        if not observations:
            # An empty file yields a frame without columns; nothing to add.
            print(f"No observations in {source}, skipping...")
            continue
        data_frame = pandas.DataFrame(observations).rename(columns=precipitation_renames)
        # Vectorised assembly of UTC timestamps from the four date/time columns.
        datetimes = pandas.to_datetime(data_frame[["year", "month", "day", "hour"]], utc=True)
        # "year" is kept as its own column; the later parquet write partitions on it.
        data_frame = data_frame.drop(columns=["month", "day", "hour"])
        # Each insert goes to position 0, so the final leading order is
        # station, latitude, longitude, elevation, name, datetime.
        data_frame.insert(0, "datetime", datetimes)
        data_frame.insert(0, "name", name)
        data_frame.insert(0, "elevation", elevation)
        data_frame.insert(0, "longitude", longitude)
        data_frame.insert(0, "latitude", latitude)
        data_frame.insert(0, "station", station)
        # -9999 is treated as the missing-value marker.
        data_frame = data_frame.replace(-9999, numpy.nan)
        scaled = {column: data_frame[column].astype("float32") / 10 for column in tenths_columns}
        data_frame = data_frame.assign(
            **scaled,
            sky_condition_total_coverage=pandas.Categorical(
                data_frame.sky_condition_total_coverage, categories=range(0, 20)
            ),
        )
        data_frames.append(data_frame)
# Every per-file frame starts its index at 0; renumber so the combined frame
# does not carry duplicate index labels.
data_frame = pandas.concat(data_frames, ignore_index=True)

In [9]:
# Convert the combined frame to an Arrow table and write it as a parquet
# dataset partitioned by the "year" column.
# preserve_index=False: the frame's index is only a row counter, so it is not
# written out as an extra column.
# NOTE(review): the recorded "KeyError: 'year'" suggests the frame held by the
# kernel at that time had no "year" column, although the loading cell as shown
# keeps it -- presumably stale kernel state; re-run from a fresh kernel to confirm.
table = pyarrow.Table.from_pandas(data_frame, preserve_index=False)
parquet.write_to_dataset(table, root_path="isd-lite/parquet", partition_cols=["year"])

KeyError: 'year'