In [None]:
import netCDF4
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import re

import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Input locations, resolved relative to the current user's home directory.
sat_dir = os.path.expanduser(
    '~/sea-level-seekers/iharp_training_dataset/Copernicus_ENA_Satelite_Maps_Training_Data'
)
flood_dir = os.path.expanduser(
    '~/sea-level-seekers/iharp_training_dataset/Flooding_Data'
)

# Bare entry names found in each directory (no directory prefix attached).
sat_names = os.listdir(sat_dir)
flood_names = os.listdir(flood_dir)

In [None]:
# Build full-path lists for the satellite maps (.nc) and the flooding records (.csv).
# os.listdir returns entries in arbitrary order, so both lists are sorted to give
# the downstream rows a stable order (chronological when the names follow the
# dt_ena_<YYYYMMDD>_ pattern that date_extractor parses).
try:
    sat_names = sorted(
        os.path.join(sat_dir, f) for f in os.listdir(sat_dir) if f.endswith('.nc')
    )
    flood_names = sorted(
        os.path.join(flood_dir, f) for f in os.listdir(flood_dir) if f.endswith('.csv')
    )
except FileNotFoundError as err:
    # Say which path is missing, then re-raise: the following cells cannot run
    # without these lists, so continuing would only fail later and less clearly.
    print(f"Directory not found: {err.filename}")
    raise


In [None]:
def date_extractor(filename):
    """Return the date embedded in a satellite-map file name as 'YYYY-MM-DD'.

    Parameters
    ----------
    filename : str or path-like
        File name or path containing the token ``dt_ena_<YYYYMMDD>_vDT``.

    Returns
    -------
    str
        The eight-digit date reformatted as ``'YYYY-MM-DD'``.

    Raises
    ------
    ValueError
        If the name does not contain an eight-digit date in that token.
    """
    # Capture year, month and day separately so a token with the wrong number
    # of digits is rejected instead of being sliced into a malformed date.
    match = re.search(r'dt_ena_(\d{4})(\d{2})(\d{2})_vDT', str(filename))
    if match is None:
        raise ValueError(
            f"Cannot extract date from {filename!r}: "
            "expected 'dt_ena_<YYYYMMDD>_vDT' in the name"
        )

    year, month, day = match.groups()
    return f"{year}-{month}-{day}"

In [None]:
# Collect one row per satellite file: [date, mean SLA, twelve identical flags].
# The list is created here so the cell runs on a fresh kernel and re-running it
# never appends duplicate rows to a previous run's results.
results = []

for filename in sat_names:
    # Date embedded in the file name, formatted as 'YYYY-MM-DD'
    formatted_date = date_extractor(filename)

    # Read the 'sla' variable; the context manager closes the file even if
    # the variable is missing or the read fails.
    with netCDF4.Dataset(filename, mode="r") as dataset:
        sla = dataset.variables["sla"][:]

    # Mean over every value stored in 'sla'
    average_sla = np.mean(sla)

    # Flag is 1 when the mean is non-negative, 0 otherwise
    value = 1 if average_sla >= 0 else 0

    # The same flag is repeated 12 times, one per station column of the
    # DataFrame built from these rows.
    results.append([formatted_date, average_sla] + [value] * 12)

In [None]:
# Column layout of each collected row: the date, the mean SLA, then one
# column per station.
columns = ["Date", "Average_SLA"] + [
    "Atlantic City",
    "Baltimore",
    "Eastport",
    "Fort Pulaski",
    "Lewes",
    "New London",
    "Newport",
    "Portland",
    "Sandy Hook",
    "Sewells Point",
    "The Battery",
    "Washington",
]

# Turn the accumulated rows into a table with those column labels.
results_df = pd.DataFrame(data=results, columns=columns)

In [None]:
# Remove the Average_SLA column.
# errors="ignore" keeps this cell idempotent: re-running it after the column
# has already been dropped is a no-op instead of raising KeyError.
results_df = results_df.drop(columns=["Average_SLA"], errors="ignore")

In [None]:
# Destination of the baseline CSV, resolved from the user's home directory in
# the same way as the input directories rather than tied to one user account.
output_csv_path = os.path.expanduser("~/sea-level-seekers/simple_models/baseline.csv")

# to_csv does not create missing folders, so make sure the target one exists.
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)

# Save the DataFrame to a CSV file (without the index column)
results_df.to_csv(output_csv_path, index=False)