In [1]:
#Library imports
import re
import os
import sys
import shutil
import time
import math
import gzip
import fnmatch
import random
import warnings
import numpy as np
import pandas as pd
import scipy.stats as scs
import urllib.request
import seaborn as sns
import matplotlib.pyplot as plt

from collections import OrderedDict

import scipy.stats as scs
from sklearn.neighbors import BallTree, KDTree

import fiona
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry import Point, Polygon, LineString
from pyproj import Proj
import geoplot as gplt
import geoplot.crs as gcrs

# Adds the "functions" directory under the current working directory to the
# import path so the project-specific helper modules imported below resolve.
# NOTE: this depends on the kernel's working directory containing "functions".
parent = os.getcwd()
sys.path.append(os.path.join(parent, "functions"))

# Project specific user driven functions
from cleaning_functions import *

# My open source reusable user driven function repository.
from random_lumberjacks.src.random_lumberjacks.cleaning.cleaning_functions import *
from random_lumberjacks.src.random_lumberjacks.model.model_classes import *
from random_lumberjacks.src.random_lumberjacks.visualization.visualization_functions import *
from random_lumberjacks.src.random_lumberjacks.parsing.parse_noaa import *

#Notebook arguments
%matplotlib inline

In [2]:
# Load the NOAA station history table that is filtered and searched below.
# USAF is read explicitly as text: the identifiers are not purely numeric
# (e.g. "A06773") and are concatenated as strings when the station file names
# are built, so they must not be left to dtype inference.
station_key = pd.read_csv("data/noaa/isd-history.csv", dtype={"USAF": str})

In [3]:
# Every non-null state code that appears on a US station row.
all_states = station_key.loc[station_key["CTRY"] == "US", "STATE"].dropna().unique()

# State codes treated as part of the airshed (each listed once).
airshed = ['DE','IN','KY','MD','MI','NC','NJ','NY','OH','PA','SC','TN','VA','VT','WV']

# US state codes that are explicitly outside of the airshed.
not_airshed = np.setdiff1d(all_states, airshed)

In [4]:
# Parse END (YYYYMMDD) into datetimes so stations can be filtered by date.
station_key["END"] = pd.to_datetime(station_key["END"], format="%Y%m%d")

# A station is kept when it is in the US, is not in a state known to lie
# outside of the airshed (rows without a state code are therefore kept),
# has an END date later than the start of 2001, and has a latitude.
in_us = station_key["CTRY"] == "US"
not_excluded = ~station_key["STATE"].isin(not_airshed)
ends_after_2001 = station_key["END"] > "2001"
has_latitude = station_key["LAT"].notna()

relevant_stations = station_key[in_us & not_excluded & ends_after_2001 & has_latitude].reset_index(drop=True)


In [5]:
# Preview the filtered station table.
relevant_stations

Unnamed: 0,USAF,WBAN,STATION NAME,CTRY,STATE,ICAO,LAT,LON,ELEV(M),BEGIN,END
0,621010,99999,MOORED BUOY,US,,,50.600,-2.933,-999.0,20080721,2008-07-21
1,621110,99999,MOORED BUOY,US,,,58.900,-0.200,-999.0,20041118,2004-11-18
2,621130,99999,MOORED BUOY,US,,,58.400,0.300,-999.0,20040726,2004-07-26
3,621160,99999,MOORED BUOY,US,,,58.100,1.800,-999.0,20040829,2004-08-29
4,621170,99999,MOORED BUOY,US,,,57.900,0.100,-999.0,20040726,2004-07-26
...,...,...,...,...,...,...,...,...,...,...,...
1533,A06773,334,TUCKER GUTHRIE MEMORIAL AIRPORT,US,KY,KI35,36.859,-83.358,473.1,20140731,2020-08-24
1534,A06800,120,TAZEWELL COUNTY AIRPORT,US,VA,KJFZ,37.067,-81.800,808.6,20140731,2020-08-24
1535,A06884,416,LURAY CAVERNS AIRPORT,US,VA,KLUA,38.667,-78.501,275.2,20140731,2020-08-24
1536,A07086,468,CARL R KELLER FIELD AIRPORT,US,OH,KPCW,41.516,-82.869,179.8,20140731,2020-08-24


In [6]:
# Load the pickled cbp_cmc table; the cells below read its Station, Latitude
# and Longitude columns.
# NOTE(review): unpickling can execute arbitrary code, so this file must only
# ever come from a trusted source.
cbp_cmc = pd.read_pickle("data/cbp_cmc.pickle")

In [7]:
# One row per Station, taking the first non-null value of each column.
first_per_station = cbp_cmc.groupby(["Station"]).first().reset_index()
locations_a = first_per_station[["Station", "Latitude", "Longitude"]]

# Identifier and coordinates of every candidate NOAA station.
locations_b = relevant_stations.loc[:, ["USAF", "LAT", "LON"]]

# Working copy that the nearest-station columns are added to.
location_key = locations_a.copy()

In [8]:
# Build a haversine ball tree over the NOAA station coordinates, converted
# from degrees to radians first.
noaa_coords_rad = np.deg2rad(locations_b[["LAT", "LON"]].to_numpy())
tree = BallTree(noaa_coords_rad, metric='haversine')

In [9]:
# Look up the three nearest NOAA stations for every row of locations_a.
query_points = np.deg2rad(locations_a[["Latitude", "Longitude"]])
distances, indices = tree.query(query_points, k = 3)

# Row positions of the matches within the station table, one column per rank.
id_columns = [f"id{rank}" for rank in range(1, 4)]
indices = pd.DataFrame(indices, columns = id_columns)

# The tree returns distances in radians; 3959 (approximately the Earth's
# radius in miles) scales them to miles.
dist_columns = [f"noaa_dist_mi{rank}" for rank in range(1, 4)]
distances = pd.DataFrame(distances*3959, columns = dist_columns)

In [10]:
# For each neighbour rank, translate the matched row positions into USAF and
# WBAN identifiers and copy the matching distance column onto location_key.
for rank, column in enumerate(indices.columns, start=1):
    dist_col = f"noaa_dist_mi{rank}"
    location_key["USAF_"+column] = indices[column].map(relevant_stations["USAF"])
    location_key["WBAN_"+column] = indices[column].map(relevant_stations["WBAN"])
    location_key[dist_col] = distances[dist_col]

In [11]:
# Collect the distinct "USAF-WBAN" identifiers (WBAN zero-padded to five
# digits) across all three neighbour ranks.
stations_to_query = pd.Index([])
for rank in range(1, 4):
    usaf_ids = location_key[f"USAF_id{rank}"]
    wban_ids = location_key[f"WBAN_id{rank}"].map("{:05d}".format)
    stations_to_query = stations_to_query.union(usaf_ids + "-" + wban_ids).unique()

In [12]:
# Resolve the station identifiers to file paths under "data/noaa".
# NOTE(review): select_noaa_files is brought in by a star import; 2001 and
# 2020 are presumably the first and last year to include — confirm against
# its definition.
station_filepaths = select_noaa_files(stations_to_query, "data/noaa", 2001, 2020)

In [13]:
def raw_noaa_to_dataframe(data, fixed_locs, optional_locs=None):
    """Parse raw NOAA records into a DataFrame with one row per record.

    Parameters
    ----------
    data : iterable
        Raw records. Each one is passed to ``parse_fixed_noaa_data`` and,
        when optional blocks are requested, ``parse_optional_noaa_data``.
    fixed_locs : list
        Field specifications for the fixed section; the first item of each
        specification is used as the column name.
    optional_locs : mapping, optional
        Maps a block identifier to a two-item sequence whose second item is
        a list of field specifications (first item = column name). When
        omitted or empty, only the fixed columns are produced.

    Returns
    -------
    pandas.DataFrame
        Fixed columns followed by optional columns, after
        ``fix_noaa_df_dtypes`` has been applied to the frame.
    """
    columns = [spec[0] for spec in fixed_locs]
    # Adds optional column names if they exist.
    if optional_locs:
        for block in optional_locs:
            columns.extend(spec[0] for spec in optional_locs[block][1])

    rows = []
    for line in data:
        row = parse_fixed_noaa_data(line, fixed_locs)
        # Only call the optional parser when optional blocks were requested;
        # otherwise there are no optional columns for its values to land in.
        if optional_locs:
            row = row + parse_optional_noaa_data(line, optional_locs)
        rows.append(row)

    # Everything starts out as object; fix_noaa_df_dtypes is called to apply
    # the per-field dtypes afterwards.
    df = pd.DataFrame(rows, columns=columns, dtype=object)
    fix_noaa_df_dtypes(df, fixed_locs, optional_locs)
    return df

In [14]:
# Meaning of each position inside a field specification.
key = ["column_name", "start", "end", "dtype", "nan_value", "conversion factor"]

# Fields of the fixed section of a record, one specification per line.
fixed_locs = [
    ["USAF_ID", 4, 10, "str", None, None],
    ["NCEI_WBAN_ID", 10, 15, "str", None, None],
    ["Date", 15, 27, "datetime", None, None],
    ["Data Source", 27, 28, "str", "9", None],
    ["Latitude", 28, 34, "float64", "+99999", 1000],
    ["Longitude", 34, 41, "float64", "+999999", 1000],
    ["Code", 41, 46, "str", "99999", None],
    ["Elevation", 46, 51, "float64", "+9999", None],
    ["Call_Letter", 51, 56, "str", "99999", None],
    ["Quality_Control", 56, 60, "str", "99999", None],
    ["Wind_Dir", 60, 63, "float64", "999", None],
    ["Wind_Dir_Q", 63, 64, "str", None, None],
    ["Wind_Type", 64, 65, "str", "9", None],
    ["Wind_Speed", 65, 69, "float64", "9999", 10],
    ["Wind_Speed_Q", 69, 70, "str", None, None],
    ["Air Temperature", 87, 92, "float64", "+9999", 10],
    ["Air Temperature_Q", 92, 93, "str", None, None],
    ["Air_Pressure", 99, 104, "float64", "99999", 10],
    ["Air_Pressure_Q", 104, 105, "str", None, None],
]

# Optional blocks AA1..AA4 share one layout and differ only in the numeric
# suffix of their column names. Each entry is [11, field specifications],
# with offsets measured from the start of the block.
optional_locs = OrderedDict(
    (f"AA{n}", [11, [[f"Rain_Period{n}", 3, 5, "str", "99", None],
                     [f"Rain_Depth{n}", 5, 9, "float64", "9999", 10],
                     [f"Rain_Condition{n}", 9, 10, "str", "9", None],
                     [f"Rain_Qual{n}", 10, 11, "str", None, None]]])
    for n in range(1, 5)
)


In [15]:
# Show the first selected file path.
station_filepaths[0]

'data/noaa/2001/691174-99999-2001.gz'

In [16]:
# Show the raw contents read from the first selected file (a list of bytes
# records, per the saved output).
noaa_gzip_to_raw(station_filepaths[0])

[b'0161691174999992001020917004+99999+999999FM-15+9999KQAH V02099999999992200019N0060001N1+00701+00001101671ADDGD12991+0300099GF102991999999999999999999MA1101861999999MW1051REMMET068KQAH 091656Z VRB03KT 6000 HZ FEW100 07/00 A3008INS RMK SLP167 FIRST;EQDQ01    003PRSWM1']

In [17]:
# Parse one station-year file end to end and preview columns 10-19 (wind,
# air temperature, air pressure and the first rain-period field).
raw_data = noaa_gzip_to_raw("data/noaa/2001/723075-13769-2001.gz")
raw_noaa_to_dataframe(raw_data, fixed_locs, optional_locs).iloc[:, 10:20]

Unnamed: 0,Wind_Dir,Wind_Dir_Q,Wind_Type,Wind_Speed,Wind_Speed_Q,Air Temperature,Air Temperature_Q,Air_Pressure,Air_Pressure_Q,Rain_Period1
0,220.0,1,N,4.1,1,0.0,1,1019.6,1,
1,230.0,1,N,3.6,1,-3.0,1,1020.1,1,
2,220.0,1,N,4.1,1,-2.0,1,1020.6,1,
3,250.0,1,N,3.6,1,-2.0,1,1020.6,1,
4,260.0,1,N,3.1,1,-3.0,1,1022.1,1,
...,...,...,...,...,...,...,...,...,...,...
9337,240.0,5,N,3.6,5,5.0,5,1018.9,5,D0
9338,260.0,5,N,2.6,5,5.0,5,1018.8,5,D0
9339,320.0,5,N,2.1,5,4.4,5,1018.6,5,D0
9340,,9,C,0.0,5,3.0,5,1019.4,5,D0


In [18]:
# Parse every selected station file and combine the results into one frame.
# The per-file frames are collected in a list and concatenated once, because
# calling pd.concat inside the loop re-copies all accumulated rows on every
# iteration.
station_frames = []
for filepath in station_filepaths:
    print(filepath)
    raw_data = noaa_gzip_to_raw(filepath)
    station = raw_noaa_to_dataframe(raw_data, fixed_locs, optional_locs)
    station_frames.append(station)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# files were selected.
all_dfs = pd.concat(station_frames) if station_frames else pd.DataFrame()

data/noaa/2001/691174-99999-2001.gz
data/noaa/2001/723075-13769-2001.gz
data/noaa/2001/723080-13737-2001.gz
data/noaa/2001/723084-99999-2001.gz
data/noaa/2001/723085-13750-2001.gz
data/noaa/2001/723086-93741-2001.gz
data/noaa/2001/723087-99999-2001.gz
data/noaa/2001/723098-99999-2001.gz
data/noaa/2001/723114-99999-2001.gz
data/noaa/2001/723180-99999-2001.gz
data/noaa/2001/724006-99999-2001.gz
data/noaa/2001/724007-99999-2001.gz
data/noaa/2001/724010-13740-2001.gz
data/noaa/2001/724014-99999-2001.gz
data/noaa/2001/724016-93736-2001.gz
data/noaa/2001/724016-99999-2001.gz
data/noaa/2001/724017-99999-2001.gz
data/noaa/2001/724020-93739-2001.gz
data/noaa/2001/724026-99999-2001.gz
data/noaa/2001/724030-93738-2001.gz
data/noaa/2001/724033-99999-2001.gz
data/noaa/2001/724035-13773-2001.gz
data/noaa/2001/724036-99999-2001.gz
data/noaa/2001/724037-99999-2001.gz
data/noaa/2001/724040-13721-2001.gz
data/noaa/2001/724045-93720-2001.gz
data/noaa/2001/724050-13743-2001.gz
data/noaa/2001/724053-99999-

KeyboardInterrupt: 

In [111]:
# Show the field specifications of the AA1 block.
# NOTE(review): the saved output lists Rain_Period1 as 'float64' while the
# definition cell declares it as "str"; the output looks stale — re-run.
optional_locs["AA1"][1]

[['Rain_Period1', 3, 5, 'float64', '99', None],
 ['Rain_Depth1', 5, 9, 'float64', '9999', 10],
 ['Rain_Condition1', 9, 10, 'str', '9', None],
 ['Rain_Qual1', 10, 11, 'str', None, None]]

In [106]:
# NOTE(review): this extends fixed_locs in place, so every run of this cell
# appends the AA1 field specifications again, and any later call that passes
# both fixed_locs and optional_locs will see the Rain_*1 column names twice.
# Exploratory only — do not run as part of the main pipeline.
fixed_locs.extend(optional_locs["AA1"][1])

In [107]:
# Show fixed_locs after the in-place extension above.
# NOTE(review): the saved output differs from the definition cell (e.g.
# Elevation is shown as 'int64'), so it appears to predate the current code.
fixed_locs

[['USAF_ID', 4, 10, 'str', None, None],
 ['NCEI_WBAN_ID', 10, 15, 'str', None, None],
 ['Date', 15, 27, 'datetime', None, None],
 ['Data Source', 27, 28, 'str', '9', None],
 ['Latitude', 28, 34, 'float64', '+99999', 1000],
 ['Longitude', 34, 41, 'float64', '+99999', 1000],
 ['Code', 41, 46, 'str', '99999', None],
 ['Elevation', 46, 51, 'int64', '+9999', None],
 ['Call_Letter', 51, 56, 'str', '99999', None],
 ['Quality_Control', 56, 60, 'str', '99999', None],
 ['Wind_Dir', 60, 63, 'float64', '999', None],
 ['Wind_Dir_Q', 63, 64, 'str', None, None],
 ['Wind_Type', 64, 65, 'str', '9', None],
 ['Wind_Speed', 65, 69, 'float64', '9999', 10],
 ['Wind_Speed_Q', 69, 70, 'str', None, None],
 ['Air Temperature', 87, 92, 'float64', '+9999', 10],
 ['Air Temperature_Q', 92, 93, 'str', None, None],
 ['Air_Pressure', 99, 104, 'float64', '99999', 10],
 ['Air_Pressure_Q', 104, 105, 'str', None, None],
 ['Rain_Period1', 3, 5, 'float64', '99', None],
 ['Rain_Depth1', 5, 9, 'float64', '9999', 10],
 ['Rain_

In [104]:
# Show the field specifications of the AA1 block (same check as an earlier
# cell; the saved output again shows 'float64' for Rain_Period1).
optional_locs["AA1"][1]

[['Rain_Period1', 3, 5, 'float64', '99', None],
 ['Rain_Depth1', 5, 9, 'float64', '9999', 10],
 ['Rain_Condition1', 9, 10, 'str', '9', None],
 ['Rain_Qual1', 10, 11, 'str', None, None]]

In [85]:
# Try the optional-block parser on a single record.
# NOTE(review): `data` is not assigned in any visible cell, so this depends
# on leftover kernel state.
parse_optional_noaa_data(data[0], optional_locs)

[b'01', b'0005', b'9', b'5']
[nan, nan, nan, nan]


In [90]:

    print(x)

['Rain_Period1', 'Rain_Depth1', 'Rain_Condition1', 'Rain_Qual1']
['Rain_Period2', 'Rain_Depth2', 'Rain_Condition2', 'Rain_Qual2']


In [49]:
# Try the fixed-width parser on an extracted AA1 block.
# NOTE(review): `ext` is not assigned in any visible cell. Also,
# optional_locs["AA1"] is currently [11, field specifications], whereas
# raw_noaa_to_dataframe reads the specifications from index [1] — this call
# probably predates that layout; confirm before reusing.
parse_fixed_noaa_data(ext, optional_locs["AA1"])

[b'01', b'0005', b'9', b'5']

In [None]:
def 

In [None]:
# Print the optional block extracted from every record.
# NOTE(review): neither `data` nor `extract_noaa_optional_str` is defined in
# a visible cell (the latter presumably comes from a star import). Other
# cells search for the bytes marker b"AA1", so the str "A01" here may be a
# typo — confirm.
for line in data:
    block = extract_noaa_optional_str(line, "A01", 11)
    print(block)

In [26]:
# Show the extracted AA1 block.
# NOTE(review): `ext` is not assigned in any visible cell.
ext

b'AA101000595'

In [None]:
# Print the tail of each record starting at its first b"AA1" marker found at
# or after byte 108. The search starts at offset 108 directly so that the
# found/not-found test and the slice offset come from the same lookup; a
# marker that only occurs earlier in the record is ignored instead of
# yielding a wrong offset.
# NOTE(review): `data` is not assigned in any visible cell.
term = b"AA1"
for line in data:
    idx = line.find(term, 108)
    if idx >= 0:
        print(line[idx:])

In [93]:
# Show the first raw record.
# NOTE(review): `data` is not assigned in any visible cell.
data[0]

b'0184A07359002402019010100156+42938-085061FM-15+0249KY70 V0203505N00315000915MN0020125N5+00105+00055999999ADDAA101000595AU120090015AW1415GA1085+000915999GD14991+0009159MA1099865096945REMMET09812/31/18 19:15:01 METAR KY70 010015Z 35006KT 1 1/4SM UP OVC003 01/01 A2949 RMK AO2 P0002 T00100005'