# Base Model Dataset

## This dataset does not use any filters.
## It stores all available values from every station in the Gulf of Mexico at hourly (1 h) timestamps for the ten years 2013–2022.

In [1]:
import sys
import os
import numpy as np
# Put the parent of the current working directory on sys.path so that the
# project-local module `myLibrary` can be imported below. This must run
# before the `import myLibrary` line.
dir_root =os.path.abspath(os.path.join(os.getcwd(), os.path.pardir))
sys.path.append(dir_root)
import myLibrary as mL
import pandas as pd
import time

In [2]:
# Complete data in Gulf of Mexico: all cleaned stations, one yearly file each.
STATION_LIST = mL.cleaned_stations_GOM
YEARS = list(map(str, range(2013, 2023)))  # "2013" ... "2022"

# Summarise how many station/year files are going to be requested.
print("Number of stations: ", len(STATION_LIST))
print("Number of years: ", len(YEARS))
print("Number of files: ", len(STATION_LIST) * len(YEARS))

Number of stations:  123
Number of years:  10
Number of files:  1230


In [3]:
# Station metadata table, keyed by station ID.
metadata = (
    pd.read_csv('../data/metadata/metadata_2023_03_14.csv')
    .set_index("StationID")
)

---

In [4]:
def get_buoy_data(station_id, year):
    """Load one station-year of NDBC buoy data on a complete timestamp grid.

    The timestamp grid comes from ``mL.create_timestamp_list2(year)``. The
    returned frame has exactly one row per grid timestamp: rows outside the
    grid are discarded and grid timestamps without data are filled with NaN.

    Parameters
    ----------
    station_id : str
        Station identifier; it is used as the suffix of every column name
        (e.g. ``WDIR_<station_id>``).
    year : str
        Year to load; passed through to the ``mL`` helpers.

    Returns
    -------
    pandas.DataFrame
        Float-typed frame indexed by the grid timestamps (sorted). If no file
        is available (``mL.get_data_file`` returned ``None``) the nine columns
        WDIR, WSPD, WVHT, APD, MWD, PRES, ATMP, WTMP and DEWP are present and
        entirely NaN. Otherwise the columns are whatever the loaded file
        provides minus GST, DPD, VIS and TIDE.

    Raises
    ------
    KeyError
        If a loaded file lacks one of the GST/DPD/VIS/TIDE columns that are
        dropped here.
    """
    timestamp_filter_list = mL.create_timestamp_list2(year)
    df_NDBC = mL.get_data_file(station_id, year, True)

    if df_NDBC is None:
        # No file for this station/year: start from an empty frame that already
        # carries the expected columns, so the merge downstream stays aligned.
        df_NDBC = pd.DataFrame(columns=[
            f'WDIR_{station_id}',
            f'WSPD_{station_id}',
            f'WVHT_{station_id}',
            f'APD_{station_id}',
            f'MWD_{station_id}',
            f'PRES_{station_id}',
            f'ATMP_{station_id}',
            f'WTMP_{station_id}',
            f'DEWP_{station_id}',
        ])
    else:
        # NOTE(review): presumably these helpers normalise the raw file and map
        # placeholder values to NaN -- confirm against myLibrary.
        df_NDBC = mL.df_modification(df_NDBC)[1]
        df_NDBC = mL.replace_with_NaN(df_NDBC)

        # Handling duplicated index: keep the first occurrence of each label.
        # reindex() below requires a unique index.
        duplicate_mask = df_NDBC.index.duplicated(keep='first')
        num_of_duplicates = duplicate_mask.sum()
        if num_of_duplicates > 0:
            print(f"Found {num_of_duplicates} duplicates is {station_id}h{year} and removed them!")
        df_NDBC = df_NDBC.loc[~duplicate_mask]

        # These measurements are not part of the dataset.
        df_NDBC = df_NDBC.drop(columns=[
            f'GST_{station_id}',
            f'DPD_{station_id}',
            f'VIS_{station_id}',
            f'TIDE_{station_id}',
        ])

    # One reindex keeps only the grid timestamps and inserts an all-NaN row for
    # every grid timestamp that has no data. It avoids growing the frame row by
    # row and does not depend on a hard-coded column count or on the `np.NAN`
    # alias (removed in NumPy 2.0).
    df_NDBC = df_NDBC.reindex(timestamp_filter_list)
    df_NDBC = df_NDBC.sort_index()

    # Values may still be strings at this point; convert everything to float.
    return df_NDBC.astype(float)

In [5]:
def build_NDBC_dataset(STATION_LIST, YEARS):
    """Build the merged dataset for all given stations and years.

    For every year the per-station frames from ``get_buoy_data`` are joined
    side by side (columns), and the yearly frames are then stacked on top of
    each other (rows).

    Parameters
    ----------
    STATION_LIST : list
        Station identifiers to include.
    YEARS : list
        Years to include, in the order they should appear in the result.

    Returns
    -------
    dataset_NDBC : pandas.DataFrame
        All stations' columns for all requested years.
    file_nan_count : pandas.DataFrame
        Station-by-year table holding, for each file, the fraction of NaN
        cells (0.0 to 1.0). Despite its name this is a rate, not a count.
    """
    time_ref = time.time()

    # Statistics table pre-filled with the sentinel -1.0 ("not processed").
    # It is created as float because the NaN rates written into it are floats;
    # writing floats into an integer column is deprecated in recent pandas.
    file_nan_count = pd.DataFrame(
        -1.0,
        index=STATION_LIST,
        columns=YEARS,
        dtype=float,
    )

    data_list_annual = list()    # one frame per year, containing all stations
    for year in YEARS:

        print("Started with ", year, ". Previous year took:  ", time.time() - time_ref , "seconds")
        time_ref = time.time()

        buoy_data_list = list()  # one frame per station for the current year
        for station in STATION_LIST:
            buoy_data = get_buoy_data(station, year)  # load file
            buoy_data_list.append(buoy_data)

            # NaN statistic: share of missing cells in this station/year frame.
            nan_rate = buoy_data.isna().sum().sum() / (buoy_data.shape[0] * buoy_data.shape[1])
            file_nan_count.loc[station, year] = nan_rate

        # Outer join keeps every timestamp that occurs in any station's frame;
        # an inner join would drop timestamps missing from one of them.
        merged_buoy_data = pd.concat(buoy_data_list, axis=1, join="outer")
        data_list_annual.append(merged_buoy_data)

    print("Finished downloading - now merging it together!")

    dataset_NDBC = pd.concat(data_list_annual, axis=0)

    return dataset_NDBC, file_nan_count

---

In [6]:
# Build the full dataset for every station and year configured above.
# `statistic` holds the per-station/per-year NaN rate returned alongside it.
dataset, statistic = build_NDBC_dataset(STATION_LIST, YEARS)

Started with  2013 . Previous year took:   0.0005748271942138672 seconds
Failed to get file: HTTP Error 404: Not Found
Failed to get file: HTTP Error 404: Not Found
Failed to get file: HTTP Error 404: Not Found
Failed to get file: HTTP Error 404: Not Found
Failed to get file: HTTP Error 404: Not Found
Failed to get file: HTTP Error 404: Not Found
Failed to get file: HTTP Error 404: Not Found
Failed to get file: HTTP Error 404: Not Found
Failed to get file: HTTP Error 404: Not Found
Failed to get file: HTTP Error 404: Not Found
Failed to get file: HTTP Error 404: Not Found
Failed to get file: HTTP Error 404: Not Found
Failed to get file: HTTP Error 404: Not Found
Failed to get file: HTTP Error 404: Not Found
Failed to get file: HTTP Error 404: Not Found
Failed to get file: HTTP Error 404: Not Found
Failed to get file: HTTP Error 404: Not Found
Failed to get file: HTTP Error 404: Not Found
Failed to get file: HTTP Error 404: Not Found
Failed to get file: HTTP Error 404: Not Found
Failed 

In [7]:
# Show the merged dataset as the cell output.
dataset

Unnamed: 0,WDIR_41117,WSPD_41117,WVHT_41117,APD_41117,MWD_41117,PRES_41117,ATMP_41117,WTMP_41117,DEWP_41117,WDIR_41112,...,DEWP_WPLF1,WDIR_WYCM6,WSPD_WYCM6,WVHT_WYCM6,APD_WYCM6,MWD_WYCM6,PRES_WYCM6,ATMP_WYCM6,WTMP_WYCM6,DEWP_WYCM6
2013-01-01 00:00:00,,,,,,,,,,,...,,,,,,,,,,
2013-01-01 01:00:00,,,,,,,,,,,...,,,,,,,,,,
2013-01-01 02:00:00,,,,,,,,,,,...,,,,,,,,,,
2013-01-01 03:00:00,,,,,,,,,,,...,,,,,,,,,,
2013-01-01 04:00:00,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-31 19:00:00,,,0.97,5.79,88.0,,18.1,17.4,,,...,,,,,,,,,,
2022-12-31 20:00:00,,,0.93,5.93,88.0,,18.3,17.4,,,...,,,,,,,,,,
2022-12-31 21:00:00,,,0.95,6.17,91.0,,18.7,17.4,,,...,,,,,,,,,,
2022-12-31 22:00:00,,,0.88,5.85,78.0,,,17.4,,,...,,,,,,,,,,


In [9]:
# Write the merged dataset to CSV, keeping the timestamp index as a column.
# NOTE(review): this path goes up two directories ('../../data'), while the
# metadata above is read from '../data' -- confirm which location is intended.
filename = "dataset_GOM_baseline"
dataset.to_csv(f'../../data/datasets/{filename}_NDBC.csv', index=True)

done
