In [1]:
#Library imports
import re
import os
import sys
import shutil
import time
import math
import gzip
import fnmatch
import random
import warnings
import numpy as np
import pandas as pd
import scipy.stats as scs
import urllib.request
import seaborn as sns
import matplotlib.pyplot as plt

from pandas.tseries.offsets import DateOffset
from collections import OrderedDict

import scipy.stats as scs
from sklearn.neighbors import BallTree

import fiona
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry import Point, Polygon, LineString
from pyproj import Proj
import geoplot as gplt
import geoplot.crs as gcrs

# Allows access to scripts and modules relative to the parent directory.
parent = os.getcwd()
sys.path.append(os.path.join(parent, "../functions"))
root_dir = "../"

# Project specific user driven functions
from cleaning_functions import *

# My open source reusable user driven function repository.
from random_lumberjacks.src.random_lumberjacks.cleaning.cleaning_functions import *
from random_lumberjacks.src.random_lumberjacks.model.model_classes import *
from random_lumberjacks.src.random_lumberjacks.visualization.visualization_functions import *
from random_lumberjacks.src.random_lumberjacks.parsing.parse_noaa import *

#Notebook arguments
%matplotlib inline

In [2]:
station_key = pd.read_csv(root_dir+"data/noaa/isd-history.csv")

In [3]:
all_states = station_key[(station_key["CTRY"]=="US")]["STATE"].dropna().unique()
airshed = ['DE','IN','KY','MD','MI','MI','NC','NJ','NY','OH','PA','SC','TN','VA','VT','WV']
not_airshed = np.setdiff1d(all_states, airshed)

In [4]:
#Converting the end date to datetime to be able to select relevant years.
station_key["END"] = pd.to_datetime(station_key["END"], format="%Y%m%d")

#All data must have coordinates, be within the US, not be explicitly outside of the airshed, and be from 2001 or later.
relevant_stations = station_key[(station_key["CTRY"]=="US") & (station_key["STATE"].isin(not_airshed) == False) &
                                (station_key["END"] > "2001") & (station_key["LAT"].isna()==False)].reset_index().drop(columns = "index")


In [5]:
relevant_stations

Unnamed: 0,USAF,WBAN,STATION NAME,CTRY,STATE,ICAO,LAT,LON,ELEV(M),BEGIN,END
0,621010,99999,MOORED BUOY,US,,,50.600,-2.933,-999.0,20080721,2008-07-21
1,621110,99999,MOORED BUOY,US,,,58.900,-0.200,-999.0,20041118,2004-11-18
2,621130,99999,MOORED BUOY,US,,,58.400,0.300,-999.0,20040726,2004-07-26
3,621160,99999,MOORED BUOY,US,,,58.100,1.800,-999.0,20040829,2004-08-29
4,621170,99999,MOORED BUOY,US,,,57.900,0.100,-999.0,20040726,2004-07-26
...,...,...,...,...,...,...,...,...,...,...,...
1533,A06773,334,TUCKER GUTHRIE MEMORIAL AIRPORT,US,KY,KI35,36.859,-83.358,473.1,20140731,2020-08-24
1534,A06800,120,TAZEWELL COUNTY AIRPORT,US,VA,KJFZ,37.067,-81.800,808.6,20140731,2020-08-24
1535,A06884,416,LURAY CAVERNS AIRPORT,US,VA,KLUA,38.667,-78.501,275.2,20140731,2020-08-24
1536,A07086,468,CARL R KELLER FIELD AIRPORT,US,OH,KPCW,41.516,-82.869,179.8,20140731,2020-08-24


In [6]:
cbp_cmc = pd.read_pickle(root_dir+"data/cbp_cmc.pickle")

# Restricting the query to observations with the target variable.
cbp_cmc = cbp_cmc[cbp_cmc['SPECIFIC CONDUCTIVITY'].isna()==False]

In [7]:
locations_a = cbp_cmc.groupby(["Station"]).first().reset_index()[["Station", "Latitude", "Longitude"]]
locations_b = relevant_stations[["USAF", "LAT", "LON"]]
location_key = locations_a.copy()

In [8]:
tree = BallTree(np.deg2rad(locations_b[["LAT", "LON"]].values), metric='haversine')

In [9]:
k = 5
distances, indices = tree.query(np.deg2rad(locations_a[["Latitude", "Longitude"]]), k = k)
indices = pd.DataFrame(indices, columns = [f"id{i}" for i in np.arange(1,k+1)])
distances = pd.DataFrame(distances*3959, columns = [f"noaa_dist_mi{i}" for i in np.arange(1,k+1)])

In [12]:
for i, column in enumerate(indices.columns):
    location_key["USAF_"+column] = indices[column].map(lambda x: relevant_stations["USAF"][x])
    location_key["WBAN_"+column] = indices[column].map(lambda x: relevant_stations["WBAN"][x])
    location_key[f"noaa_dist_mi{i+1}"] = distances[f"noaa_dist_mi{i+1}"]

In [13]:
def raw_noaa_to_dataframe(data, fixed_locs, optional_locs=None):
    columns = [item[0] for item in [*fixed_locs]]
    #Adds optional column names if they exist.
    if optional_locs:
        for block in optional_locs:
            columns.extend([item[0] for item in optional_locs[block][1]])
    noaa_list = []
    for line in data:
        var = line[108:].split(b" ")
        fixed_data = parse_fixed_noaa_data(line, fixed_locs)
        optional_data = parse_optional_noaa_data(line, optional_locs)
        data = fixed_data + optional_data
        noaa_list.append(data)
    df = pd.DataFrame(noaa_list, columns = columns, dtype=object)
    fix_noaa_df_dtypes(df, fixed_locs, optional_locs)
    return df

In [14]:
key = ["column_name", "start", "end", "dtype", "nan_value", "conversion factor"]

fixed_locs = [["USAF_ID", 4, 10, "str", None, None], ["NCEI_WBAN_ID", 10, 15, "str", None, None],
              ["Date", 15, 27, "datetime", None, None], ["Data Source", 27, 28, "str", "9", None],
              ["Latitude", 28, 34, "float64", "+99999", 1000], ["Longitude", 34, 41, "float64", "+999999", 1000],
              ["Code", 41, 46, "str", "99999", None], ["Elevation", 46, 51, "float64", "+9999", None],
              ["Call_Letter", 51, 56, "str", "99999", None], ["Quality_Control", 56, 60, "str", "99999", None],
              ["Wind_Dir", 60, 63, "float64", "999", None], ["Wind_Dir_Q", 63, 64, "str", None, None],
              ["Wind_Type", 64, 65, "str", "9", None], ["Wind_Speed", 65, 69, "float64", "9999", 10],
              ["Wind_Speed_Q", 69, 70, "str", None, None], ["Air Temperature", 87, 92, "float64", "+9999", 10],
              ["Air Temperature_Q", 92, 93, "str", None, None], ["Air_Pressure", 99, 104, "float64", "99999", 10],
              ["Air_Pressure_Q", 104, 105, "str", None, None]]

optional_locs = OrderedDict([("AA1",[11, [["Rain_Period1", 3, 5, "str", "99", None],
                                          ["Rain_Depth1", 5, 9, "str", "9999", 10],
                                          ["Rain_Condition1", 9, 10, "str", "9", None],
                                          ["Rain_Qual1", 10, 11, "str", None, None]]]),
                             ("AA2",[11, [["Rain_Period2", 3, 5, "str", "99", None],
                                          ["Rain_Depth2", 5, 9, "str", "9999", 10],
                                          ["Rain_Condition2", 9, 10, "str", "9", None],
                                          ["Rain_Qual2", 10, 11, "str", None, None]]]),
                             ("AA3",[11, [["Rain_Period3", 3, 5, "str", "99", None],
                                          ["Rain_Depth3", 5, 9, "str", "9999", 10],
                                          ["Rain_Condition3", 9, 10, "str", "9", None],
                                          ["Rain_Qual3", 10, 11, "str", None, None]]]),
                             ("AA4",[11, [["Rain_Period4", 3, 5, "str", "99", None],
                                          ["Rain_Depth4", 5, 9, "str", "9999", 10],
                                          ["Rain_Condition4", 9, 10, "str", "9", None],
                                          ["Rain_Qual4", 10, 11, "str", None, None]]])])


In [None]:
all_dfs = pd.DataFrame()
stations_to_query = pd.Index([])
for i in np.arange(1,k+1):
    group = location_key[f"USAF_id{i}"] + "-" + location_key[f"WBAN_id{i}"].map(lambda x: f"{x:05d}")
    stations_to_query = stations_to_query.union(group).unique()
print(f"Adding {stations_to_query.size} stations")

station_filepaths = select_noaa_files(stations_to_query, root_dir+"data/noaa", 2001, 2020)

for station_query in stations_to_query:
    station_df = pd.DataFrame()
    
    cut = station_query.split("-")
    mask = pd.Series(False, index=location_key.index)
    for i in np.arange(1,k+1):
        partial_mask = (location_key[f"USAF_id{i}"] == cut[0]) & (location_key[f"WBAN_id{i}"] == int(cut[1]))
        mask = mask | partial_mask
    station_matches = location_key[mask]
    
    # Creates an index of dates for the current station to restrict the size of the Data imported from NOAA
    station_filter = cbp_cmc[cbp_cmc["Station"].isin(station_matches["Station"])]
    dates = station_filter.set_index("Date").groupby(pd.Grouper(freq='D')).first().dropna(how="all").index

    # Allows for collection of stats for previous two days.
    for i in np.arange(1,3):
        dates = dates.union(dates-DateOffset(days=i))
        

    
    reduced_filepaths = station_filepaths[station_filepaths.str.contains(station_query)]
    for filepath in reduced_filepaths:
        #print(filepath)
        raw_data = noaa_gzip_to_raw(filepath)
        station = raw_noaa_to_dataframe(raw_data, fixed_locs, optional_locs)
        #Trims out unneccary dates
        station = station[station["Date"].dt.date.isin(pd.Series(dates).dt.date)]
        station_df = pd.concat([station_df, station])
    all_dfs = pd.concat([all_dfs, station_df])

In [21]:
df = all_dfs.drop(columns = ["Rain_Period4", "Rain_Depth4", "Rain_Condition4", "Rain_Qual4"]).reset_index().drop(columns="index").copy()

In [23]:
df["Rain_Depth2"] = np.where(df["Rain_Depth2"]== '3;', np.nan, df["Rain_Depth2"])
df["Rain_Depth3"] = np.where(df["Rain_Depth3"]== '7;', np.nan, df["Rain_Depth3"])
for i in np.arange(1,4):
    df[f"Rain_Depth{i}"] = df[f"Rain_Depth{i}"].astype("float64")/10

In [24]:
df

Unnamed: 0,USAF_ID,NCEI_WBAN_ID,Date,Data Source,Latitude,Longitude,Code,Elevation,Call_Letter,Quality_Control,...,Rain_Condition1,Rain_Qual1,Rain_Period2,Rain_Depth2,Rain_Condition2,Rain_Qual2,Rain_Period3,Rain_Depth3,Rain_Condition3,Rain_Qual3
0,720297,03730,2018-05-21 00:15:00,7,37.239,-76.716,FM-15,15.0,KJGG,V020,...,,,,,,,,,,
1,720297,03730,2018-05-21 00:35:00,7,37.239,-76.716,FM-15,15.0,KJGG,V020,...,,,,,,,,,,
2,720297,03730,2018-05-21 00:55:00,7,37.239,-76.716,FM-15,15.0,KJGG,V020,...,,,,,,,,,,
3,720297,03730,2018-05-21 01:15:00,7,37.239,-76.716,FM-15,15.0,KJGG,V020,...,,,,,,,,,,
4,720297,03730,2018-05-21 01:35:00,7,37.239,-76.716,FM-15,15.0,KJGG,V020,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2923840,A06884,00416,2020-01-03 22:35:00,6,38.667,-78.501,FM-15,275.0,KLUA,V020,...,,,,,,,,,,
2923841,A06884,00416,2020-01-03 22:55:00,6,38.667,-78.501,FM-15,275.0,KLUA,V020,...,,,,,,,,,,
2923842,A06884,00416,2020-01-03 23:15:00,6,38.667,-78.501,FM-15,275.0,KLUA,V020,...,,,,,,,,,,
2923843,A06884,00416,2020-01-03 23:35:00,6,38.667,-78.501,FM-15,275.0,KLUA,V020,...,,5,,,,,,,,


In [25]:
df.to_pickle(root_dir+"data/noaa_match.pickle")
location_key.to_pickle(root_dir+"data/noaa_location_key.pickle")