# Functions provided by Austin Schmidt via E-Mail on Nov. 22

Download NOAA buoy data (standard meteorological files) from the National Data Buoy Center (ndbc.noaa.gov)

In [None]:
import numpy as np
import os
import pandas as pd




In [None]:
def download_noaa(buoy, date):
    """Download one historical standard-meteorological file and parse it.

    Args:
        buoy: Station identifier inserted into the requested file name.
        date: Year string inserted into the requested file name.

    Returns:
        A ``(df, values)`` pair. ``values`` is a NumPy array with one row of
        floats per non-blank data line; ``df`` holds the same values in a
        DataFrame whose column names are the whitespace-split first line of
        the response. When the response has no data lines, both elements are
        the same empty array.

    Raises:
        ValueError: If a data line contains a token that is not a float.
    """
    # NOTE(review): `requests` is not imported in the visible part of this
    # file; it must be imported elsewhere for this function to run.
    myurl = (
        "http://www.ndbc.noaa.gov/view_text_file.php?filename="
        + buoy + "h" + date
        + ".txt.gz&dir=data/historical/stdmet/"
    )
    data = requests.get(myurl).text.split("\n")

    # Line 0 supplies the column names; line 1 is skipped (presumably a units
    # row -- confirm). Data starts on line 2.
    rows = []
    for line in data[2:]:
        values = [float(a) for a in line.strip().split(" ") if a.strip() != '']
        # Skip blank lines (such as the one produced by a trailing newline)
        # rather than unconditionally discarding the last row, which would
        # lose a real record when the text has no trailing newline.
        if values:
            rows.append(values)

    return_val = np.array(rows)

    if return_val.size == 0:
        return return_val, return_val

    df = pd.DataFrame(return_val, columns=data[0].split())

    return df, return_val


Load Files:

In [None]:
def load_files(file_path):
    """Return the joined paths of all entries in ``file_path`` ending in '.nc'.

    Entries are returned in the order ``os.listdir`` yields them; no check is
    made that an entry is a regular file.
    """
    suffix = '.nc'
    candidates = (
        os.path.join(file_path, entry) for entry in os.listdir(file_path)
    )
    return [path for path in candidates if path.endswith(suffix)]

In [None]:
def load_numpy_files(file_path):
    """Return the joined paths of all entries in ``file_path`` ending in '.npy'.

    The result follows ``os.listdir`` order and may include directories whose
    name carries the suffix.
    """
    joined = [os.path.join(file_path, name) for name in os.listdir(file_path)]
    return [path for path in joined if path.endswith('.npy')]

In [None]:
# Read the buoy list into station_dict: station id -> [lat, lon] (as strings).
station_dict = {}
with open('stations_list.txt', "r") as station_file:
    station_rows = station_file.readlines()

# The final line of the file is never parsed.
for row in station_rows[:-1]:
    # Split on single spaces and drop tokens that are empty once stripped.
    tokens = [tok for tok in (piece.strip() for piece in row.split(" ")) if tok != '']
    if len(tokens) <= 1:
        continue

    # NOTE(review): a row with exactly two tokens passes the guard above and
    # then fails on tokens[2] -- confirm the file never contains such rows.
    station_id = tokens[0].replace("*", "")  # e.g. "41001*" -> "41001"
    # The last character of each coordinate is dropped (presumably a
    # hemisphere letter -- confirm); longitude is always made negative.
    latitude = tokens[1][:-1]
    longitude = '-' + tokens[2][:-1]
    station_dict[station_id] = [latitude, longitude]

In [None]:
# Walk every station and convert its stored coordinate strings to floats.
# Nothing else happens per station in this cell; after the loop, buoy/lat/lon
# hold the values of the last station.
for buoy, coords in station_dict.items():
    lat, lon = (float(value) for value in coords)

In [None]:

######## data processing code
# Open the first HYCOM file and the first climate file; both stay open so the
# per-buoy code can read their lat/lon grids.
files = load_files(DATA_PATH)
c_files = load_files(CLIMATE_DATA_PATH)
t = 0  # only the first file of each list is opened
data = nc.Dataset(files[t], 'r')
c_data = nc.Dataset(c_files[t], 'r')

buoy_year = '2011'

multiple_buoy_data = []
columns = [
    'buoy_water_temp', 'buoy_gust', 'buoy_pressure',
    'water_temp', 'salinity', 'water_u', 'water_v', 'surf_el',
]

# Empty accumulators for each split; the test table also carries the buoy id.
test_columns = np.append('buoy', columns)
buoy_data_train, buoy_data_val = (pd.DataFrame(columns=columns) for _ in range(2))
buoy_data_test = pd.DataFrame(columns=test_columns)


from datetime import date, timedelta

# Rebuild station_dict (station id -> [lat, lon] strings) from the buoy list.
station_dict = {}
with open('stations_list.txt', "r") as f:
    # Every line except the last one is parsed.
    for line in f.readlines()[:-1]:
        # Space-separated tokens, stripped, with empty ones discarded.
        v = list(filter(None, map(str.strip, line.split(" "))))

        if len(v) > 1:
            # "41001*" -> "41001"; the last character of each coordinate is
            # cut off and the longitude is prefixed with a minus sign.
            station, lat, lon = v[0].replace("*", ""), v[1][:-1], '-' + v[2][:-1]
            station_dict[station] = [lat, lon]


# (output column, buoy dataframe column) pairs copied from the buoy records.
_BUOY_COLUMNS = (
    ('buoy_water_temp', 'WTMP'),
    ('buoy_gust', 'GST'),
    ('buoy_pressure', 'PRES'),
)
# Output names for inter_features[:, k, 0, 0], listed in order of k.
_HYCOM_COLUMNS = ('water_temp', 'salinity', 'water_u', 'water_v', 'surf_el')
# Output names for c_features[k, 0, :], listed in order of k.
_CLIMATE_COLUMNS = ('u10', 'v10', 'e', 'i10fg', 'mer', 'mror', 'siconc', 'sp', 'tcc', 'tp')

# Positions in date_generated where the validation and test periods begin.
# NOTE(review): presumably 70% and 80% of a 2920-step (3-hourly) year --
# confirm against generate_time().
_VAL_START_INDEX = 2044
_TEST_START_INDEX = 2336


def _build_split_columns(index_match, buoy_df, inter_features, c_features):
    """Build the column dict for one split from matched index pairs.

    ``index_match`` is a 2-D integer array whose column 0 indexes the model
    time axis and column 1 indexes rows of ``buoy_df``. Output row k takes its
    plain columns from ``index_match[k]`` and its ``next_``-prefixed columns
    from ``index_match[k + 1]``, so the result has ``len(index_match) - 1``
    rows.
    """
    split_columns = {}
    for prefix, rows in (('', index_match[:-1]), ('next_', index_match[1:])):
        model_rows = rows[:, 0]
        buoy_rows = rows[:, 1]
        for out_name, buoy_col in _BUOY_COLUMNS:
            split_columns[prefix + out_name] = buoy_df[buoy_col].to_numpy()[buoy_rows]
        for channel, out_name in enumerate(_HYCOM_COLUMNS):
            split_columns[prefix + out_name] = inter_features[model_rows, channel, 0, 0]
        for channel, out_name in enumerate(_CLIMATE_COLUMNS):
            split_columns[prefix + out_name] = c_features[channel, 0, model_rows]
    return split_columns


#load and process data for each buoy
for buoy, (lat, lon) in station_dict.items():

    lat = float(lat)
    lon = float(lon)

    #download and collect buoy data
    print(f"Beginning processing for buoy {buoy} in", buoy_year)
    df, val = download_noaa(buoy, buoy_year)

    if val.size == 0:
        print("No data for year", buoy_year, "\n")
        continue

    # Keep only records whose hour is a multiple of 3 and whose minute is 50.
    df_third_hour = df[df.hh % 3 == 0]
    buoy_df = df_third_hour[df_third_hour.mm == 50].reset_index(drop=True)

    # Treat the buoy fill values as missing, then impute each column's mean.
    # NOTE(review): the replacement is applied to every column, so a genuine
    # reading equal to 99.0, 999.0 or 9999.0 is also treated as missing --
    # confirm this is acceptable for the columns used below.
    buoy_df = buoy_df.replace([99.0, 999.0, 9999.0], np.nan)
    buoy_df = buoy_df.fillna(buoy_df.mean())

    # A column that was entirely fill values has no mean and stays NaN.
    if buoy_df[['WTMP', 'GST', 'PRES']].isnull().values.any():
        print("Buoy has unresolved NaNs for year", buoy_year, "\n")
        continue

    if buoy_df.empty:
        print("Buoy is empty somehow...\n")
        continue

    #Given a set of date times and the buoy dataframe, we can align each recorded timestep with its HYCOM counterpart.
    buoy_datetimes = buoy_df[['#YY', 'MM', 'DD', 'hh', 'mm']].to_numpy().astype('int')

    # calculate the difference array to get lats/lons index in the HYCOM data.
    difference_lat = np.absolute(data['lat'][:] - lat)
    difference_lon = np.absolute(data['lon'][:] - lon)

    c_difference_lat = np.absolute(c_data['latitude'][:] - lat)
    c_difference_lon = np.absolute(c_data['longitude'][:] - lon)

    # find the index of minimum element from the array (the closest lat and lon)
    lat_index = difference_lat.argmin()
    lon_index = difference_lon.argmin()

    c_lat_index = c_difference_lat.argmin()
    c_lon_index = c_difference_lon.argmin()

    #adjust so matrix corner always includes the buoy
    # NOTE(review): when the closest index is 0 this steps to -1, which
    # indexes from the end of the axis -- confirm buoys never sit on the edge.
    if data['lat'][lat_index] > lat:
        lat_index = lat_index - 1

    if data['lon'][lon_index] > lon:
        lon_index = lon_index - 1

    if c_data['latitude'][c_lat_index] > lat:
        c_lat_index = c_lat_index - 1

    if c_data['longitude'][c_lon_index] > lon:
        c_lon_index = c_lon_index - 1

    print(lat, lon, lat_index, lon_index, c_lat_index, c_lon_index)

    print("Loading HYCOM features ")
    #Get features using the desired lat and lon for each buoy
    inter_features, feature = load_interp_features(0, 1, lat_index, lat_index + 1, lon_index, lon_index + 1)

    # Values at or below -30000 are overwritten with the mean of the rest.
    mask = inter_features > -30000
    inter_features[inter_features <= -30000] = inter_features[mask].mean()

    print("Loading Climate features ")
    #Load climate features with same technique
    c_features, c_feature = load_climate_features(0, 1, c_lat_index, c_lat_index + 1, c_lon_index, c_lon_index + 1)

    # For the climate data the same sentinel range is zeroed instead.
    c_features[c_features <= -30000] = 0.0

    #pressure variable same units as buoys
    c_features[7] = c_features[7] / 100

    #match up buoy data with HYCOM data temporally. Holes still remain in the data, but
    #HYCOM at least matches 1 to 1
    train_index_match = []
    val_index_match = []
    test_index_match = []

    date_generated = generate_time()
    val_start = date_generated[_VAL_START_INDEX]
    test_start = date_generated[_TEST_START_INDEX]

    #Match HYCOM instances with buoy instances timewise. We break up data into train/test/val by date
    print("Matching HYCOM and Climate instances with buoys")

    for j, buoy_time in enumerate(buoy_datetimes):
        # Scan forward from position j until the generated timestamp equals
        # the buoy timestamp in every field. A buoy timestamp that never
        # appears runs off the end of date_generated and raises.
        i = j
        while not np.all((date_generated[i] - buoy_time) == 0):
            i = i + 1

        matched_time = date_generated[i]
        if matched_time < val_start:
            train_index_match.append([i, j])
        elif (matched_time >= val_start) and (matched_time < test_start):
            val_index_match.append([i, j])
        elif matched_time >= test_start:
            test_index_match.append([i, j])

    # Converted once after matching; previously rebuilt on every iteration.
    im_train = np.array(train_index_match)
    im_val = np.array(val_index_match)
    im_test = np.array(test_index_match)

    #Use selected indices to build datasets by date
    print("Building Train, Test, Val sets")
    dct_train = {}
    dct_val = {}
    dct_test = {}

    if im_train.size != 0:
        dct_train = _build_split_columns(im_train, buoy_df, inter_features, c_features)

    if im_val.size != 0:
        dct_val = _build_split_columns(im_val, buoy_df, inter_features, c_features)

    # The test split needs at least 4 matched pairs (size counts both
    # columns), while train/val only need to be non-empty.
    if im_test.size >= 8:
        dct_test = {'buoy': [buoy] * (len(im_test) - 1)}
        dct_test.update(_build_split_columns(im_test, buoy_df, inter_features, c_features))

    single_buoy_data_train = pd.DataFrame(data=dct_train)
    single_buoy_data_val = pd.DataFrame(data=dct_val)
    single_buoy_data_test = pd.DataFrame(data=dct_test)

    print("Train size: ", len(single_buoy_data_train))
    print("Val size: ", len(single_buoy_data_val))
    print("Test size: ", len(single_buoy_data_test), '\n')

    buoy_data_train = pd.concat([buoy_data_train, single_buoy_data_train])
    buoy_data_val = pd.concat([buoy_data_val, single_buoy_data_val])
    buoy_data_test = pd.concat([buoy_data_test, single_buoy_data_test])

    # Rewritten after every buoy, so the pickles always hold all buoys
    # processed so far.
    buoy_data_train.to_pickle("./ProcessedData/all_buoys_data_train.pkl")
    buoy_data_val.to_pickle("./ProcessedData/all_buoys_data_val.pkl")
    buoy_data_test.to_pickle("./ProcessedData/all_buoys_data_test.pkl")








