In [1]:
import netCDF4
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import re

In [3]:
# Input folders, resolved relative to the current user's home directory.
# Satellite maps:
sat_dir = os.path.expanduser(
    '~/sea-level-seekers/iharp_training_dataset/Copernicus_ENA_Satelite_Maps_Training_Data'
)
sat_names = os.listdir(sat_dir)  # bare entry names, no directory prefix

# Flooding records:
flood_dir = os.path.expanduser(
    '~/sea-level-seekers/iharp_training_dataset/Flooding_Data'
)
flood_names = os.listdir(flood_dir)  # bare entry names, no directory prefix

In [4]:
# Build lists of full paths for the satellite maps and the flooding CSV files.
# os.listdir() returns entries in arbitrary order, so both lists are sorted to
# keep positional lookups such as flood_names[0] stable between runs.
try:
    sat_names = sorted(os.path.join(sat_dir, f) for f in os.listdir(sat_dir))
    flood_names = sorted(
        os.path.join(flood_dir, f)
        for f in os.listdir(flood_dir)
        if f.endswith('.csv')  # ignore anything that is not a CSV file
    )
except FileNotFoundError as err:
    # Best-effort: keep going, but say which directory could not be found.
    print(f"Directory not found: {err.filename}")

In [10]:
def date_extractor(filename):
    """Return the date embedded in a satellite-map file name as 'YYYY-MM-DD'.

    Parameters
    ----------
    filename : str
        File name or full path containing ``dt_ena_<YYYYMMDD>_vDT``.

    Returns
    -------
    str
        The eight digits reformatted as 'YYYY-MM-DD', e.g. '1993-01-01'.

    Raises
    ------
    ValueError
        If ``filename`` does not contain an 8-digit date in that pattern
        (for example a stray non-data entry in the directory listing).
    """
    # Exactly eight digits are required so the slicing below always yields
    # a well-formed YYYY / MM / DD triple.
    match = re.search(r'dt_ena_(\d{8})_vDT', filename)
    if match is None:
        raise ValueError(
            f"Cannot extract a date from {filename!r}: "
            "expected 'dt_ena_<YYYYMMDD>_vDT' in the name"
        )

    date_str = match.group(1)  # e.g. '19930101'
    return f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:]}"

In [33]:
# Dates for which a satellite file exists: kept as a list (file order) and as
# a set for fast membership tests.
sat_dates = list(map(date_extractor, sat_names))
sat_dates_set = set(sat_dates)

# Every calendar day from 1993-01-01 through 2013-12-31, first as date objects
# and then as 'YYYY-MM-DD' strings.
date_range = pd.date_range(start='1993-01-01', end='2013-12-31').date
date_range_str = list(map(lambda day: day.strftime('%Y-%m-%d'), date_range))

In [36]:
# Days in the full range that have no matching satellite file.
missing_dates = list(filter(lambda day: day not in sat_dates_set, date_range_str))

In [None]:
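# NOTE: disabled draft, kept for reference only (this cell is fully commented out).
# It loops over every file in `flood_names`, whereas the cell that follows
# processes `flood_names[0]` only.
# import pandas as pd
# import numpy as np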


# for file in flood_names:
#     # Read the CSV file
#     df = pd.read_csv(file)
#     location = df['location'][0]
#     modified_location = location.lower().replace(" ", "_")
#     anomaly = 0
#     latitude = df['latitude'][0]
#     longitude = df['longitude'][0]
    
#     # Convert 't' (date) column to datetime format
#     df['t'] = pd.to_datetime(df['t'])
    
#     # Set the date range from 1993-01-01 to 2013-12-31
#     date_range = pd.date_range(start='1993-01-01', end='2013-12-31')
    
#     # Ensure there is only one row per day
#     df = df.drop_duplicates(subset=['t'])
    
#     # Reindex the dataframe with the complete date range
#     df = df.set_index('t').reindex(date_range, fill_value=np.nan)
    
#     # Fill missing values for 'anomaly', 'location', 'latitude', and 'longitude'
#     df['anomaly'].fillna(anomaly, inplace=True)
#     df['location'].fillna(location, inplace=True)
#     df['latitude'].fillna(latitude, inplace=True)
#     df['longitude'].fillna(longitude, inplace=True)
    
#     # Reset the index to get the date column back
#     df.reset_index(inplace=True)
#     df.rename(columns={'index': 't'}, inplace=True)
    
#     # Save the cleaned data to a new CSV file
#     df.to_csv('/home/jovyan/sea-level-seekers/iharp_training_dataset/cleaned_flooding_' + modified_location, index=False)

In [38]:
# Clean one flooding CSV: expand it to one row per day over 1993-2013, fill the
# gaps with the station's constant metadata, keep only days that also have a
# satellite map, and write the result to disk.

# Read the CSV file
df = pd.read_csv(flood_names[0])

# Values taken from the first row; used below to fill the rows added by reindexing.
location = df['location'].iloc[0]
modified_location = location.lower().replace(" ", "_")
anomaly = 0  # fill value for days that are absent from the file
latitude = df['latitude'].iloc[0]
longitude = df['longitude'].iloc[0]

# Convert 't' (date) column to datetime format
df['t'] = pd.to_datetime(df['t'])

# Set the date range from 1993-01-01 to 2013-12-31
date_range = pd.date_range(start='1993-01-01', end='2013-12-31')

# Keep one row per 't' value, then reindex onto the complete date range;
# days with no record become rows of NaN.
df = df.drop_duplicates(subset=['t']).set_index('t').reindex(date_range)

# Fill missing values for 'anomaly', 'location', 'latitude', and 'longitude'.
# fillna() returns a new object, so the result has to be assigned back --
# calling it without keeping the result leaves the NaNs in place.
# NOTE(review): the NaNs introduced by reindexing may have turned an integer
# 'anomaly' column into floats; cast back if integer output is required.
df = df.fillna({
    'anomaly': anomaly,
    'location': location,
    'latitude': latitude,
    'longitude': longitude,
})

# Reset the index to get the date column back
df = df.reset_index().rename(columns={'index': 't'})

# Drop the days for which no satellite file exists.
missing_dates = pd.to_datetime(missing_dates)
df = df[~df['t'].isin(missing_dates)]

# Save the cleaned data to a new CSV file (create the target folder if needed).
out_dir = '/home/jovyan/sea-level-seekers/iharp_training_dataset/cleaned_flooding/'
os.makedirs(out_dir, exist_ok=True)
df.to_csv(out_dir + modified_location, index=False)

In [39]:
# Print a fixed marker once execution reaches this cell.
print("done")

done
