In [None]:
import requests
import time
import os

stations = ["720538-00164", "999999-04222", "725244-99999"]
start_year = 2010

for station in stations:
    for year in range(start_year, 2022):
        file_name = f"{station}-{year}.gz"
        directory = os.path.join("isd-lite", str(year))
        path = os.path.join(directory, file_name)
        if os.path.exists(path):
            continue
        url = f"https://www.ncei.noaa.gov/pub/data/noaa/isd-lite/{year}/{file_name}"
        try:
            response = requests.get(url, timeout=5)
        except requests.exceptions.Timeout:
            print(f"Timeout when trying to read {url}, skipping...")
            continue
        if response.status_code != requests.codes.ok:
            print(f"{response.status_code}: {url}, skipping...")
            time.sleep(0.5)
            continue
        else:
            print(f"{response.status_code}: {url}, downloading...")
        os.makedirs(directory, exist_ok=True)
        with open(path, "wb") as f:
            for chunk in response.iter_content(chunk_size=128):
                f.write(chunk)


In [None]:
import subprocess

for root, directories, file_names in os.walk("isd-lite"):
    for file_name in (file_name for file_name in file_names if os.path.splitext(file_name)[1] == ".gz"):
        source = os.path.join(root, file_name)
        target = os.path.join(root, os.path.splitext(file_name)[0])
        if not os.path.exists(target):
            subprocess.call(["gunzip", source])

In [1]:
import datetime
import numpy
from dataclasses import dataclass
import pandas
import pyarrow
from pyarrow import parquet


@dataclass
class Observation:
    year: int
    month: int
    day: int
    hour: int
    air_temperature: int
    dew_point_temperature: int
    sea_level_pressure: int
    wind_direction: int
    wind_speed_rate: int
    sky_condition_total_coverage: int
    liquid_precipitiation_one_hour: int
    liquid_precipitiation_six_hours: int


for root, directories, file_names in os.walk("isd-lite"):
    for file_name in (file_name for file_name in file_names if not os.path.splitext(file_name)[1]):
        source = os.path.join(root, file_name)
        destination = os.path.join(root, os.path.splitext(file_name)[0] + ".parquet")
        with open(source) as f:
            observations = []
            for line in f:
                values = [int(v) for v in line.split()]
                observations.append(Observation(*values))
        data_frame = pandas.DataFrame(observations)
        datetimes = data_frame.apply(lambda x: datetime.datetime(x.year, x.month, x.day, x.hour, tzinfo=datetime.timezone.utc), axis=1)
        del data_frame["year"]
        del data_frame["month"]
        del data_frame["day"]
        del data_frame["hour"]
        data_frame.insert(0, "datetime", datetimes)
        data_frame = data_frame.replace(-9999, numpy.nan)
        data_frame = data_frame.assign(
            air_temperature=data_frame.air_temperature.astype("float32") / 10,
            dew_point_temperature=data_frame.dew_point_temperature.astype("float32") / 10,
            sea_level_pressure=data_frame.sea_level_pressure.astype("float32") / 10,
            wind_speed_rate=data_frame.wind_speed_rate.astype("float32") / 10,
            sky_condition_total_coverage=pandas.Categorical(data_frame.sky_condition_total_coverage, categories=range(0, 20)),
            liquid_precipitation_one_hour=data_frame.liquid_precipitiation_one_hour.astype("float32") / 10,
            liquid_precipitation_six_hours=data_frame.liquid_precipitiation_six_hours.astype("float32") / 10,
        )
        table = pyarrow.Table.from_pandas(data_frame)
        parquet.write_table(table, destination)

In [3]:
for root, directories, file_names in os.walk("isd-lite"):
    for file_name in (file_name for file_name in file_names if os.path.splitext(file_name)[1] == ".parquet"):
        path = os.path.join(root, file_name)
        table = parquet.read_table(path)
        data_frame = table.to_pandas()
        print(data_frame.head())

                   datetime  air_temperature  dew_point_temperature  \
0 2013-01-01 01:00:00+00:00             -8.0                  -10.0   
1 2013-01-01 02:00:00+00:00            -12.0                  -14.0   
2 2013-01-01 03:00:00+00:00            -12.0                  -14.0   
3 2013-01-01 04:00:00+00:00            -11.0                  -12.0   
4 2013-01-01 05:00:00+00:00            -10.0                  -12.0   

   sea_level_pressure  wind_direction  wind_speed_rate  \
0                 NaN               0              0.0   
1                 NaN               0              0.0   
2                 NaN             260              1.5   
3                 NaN             210              1.5   
4                 NaN               0              0.0   

   sky_condition_total_coverage  liquid_precipitiation_one_hour  \
0                           4.0                             NaN   
1                           0.0                             NaN   
2                      