### ERA5 Extraction

In [None]:
import ee 
import pandas as pd
import os
from datetime import datetime
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
%cd ..


In [3]:
# Authenticate with Earth Engine, then initialise the client for a specific project.
# Replace the project id below with your own before running.
ee.Authenticate()
ee.Initialize(project='my-project-410920') ### YOUR PROJECT HERE

In [4]:
# Bands selected from the ERA5 image collection; the same list is used as the
# value columns when each extracted time series is melted to long format.
variables_to_extract = [
    'temperature_2m',
    'total_precipitation_sum',
    'u_component_of_wind_10m',
    'v_component_of_wind_10m',
    'surface_pressure',
    'snowfall_sum',
    'snowmelt_sum',
    'dewpoint_temperature_2m',
]

In [11]:
# Load the table of grid points. The cells below read its 'grid_id', 'latitude',
# 'longitude' and 'year' columns.
xy_df_sub1 = pd.read_csv('data/xy_df/xy_df_sub1.csv')

In [None]:
# Map each grid_id to a (latitude, longitude) pair taken from the table.
# Rows where both coordinates are 0 are skipped; when a grid_id occurs on
# several rows, the last row that has coordinates wins.
grid_to_coord = {}

for _, record in xy_df_sub1.iterrows():
    lat = record['latitude']
    lon = record['longitude']
    if lat == 0 and lon == 0:
        continue
    grid_to_coord[record['grid_id']] = (lat, lon)

In [None]:
# Back-fill rows whose coordinates are both 0 from the grid_to_coord lookup.
# If a grid_id has no entry in the lookup, print 'err' and stop at that row;
# rows after it are left untouched.
for row_label, record in xy_df_sub1.iterrows():
    if record['latitude'] != 0 or record['longitude'] != 0:
        continue

    latitude, longitude = grid_to_coord.get(record['grid_id'], (None, None))
    if latitude is None or longitude is None:
        print('err')
        break

    xy_df_sub1.at[row_label, 'latitude'] = latitude
    xy_df_sub1.at[row_label, 'longitude'] = longitude

In [None]:
# Load the Earth Engine ImageCollection, keeping only the bands listed in
# variables_to_extract. Built once here and reused by every process_grid_point call.
IC = ee.ImageCollection('ECMWF/ERA5_LAND/MONTHLY_AGGR').select(variables_to_extract)

def process_grid_point(args, n_pred=1):
    '''
    Extract the 5-year ERA5 time series for one row of xy_df_sub1 and save it
    as 'era5_data.csv' in a per-row output directory.

    Parameters
    ----------
    args : tuple
        (index, row) pair as yielded by DataFrame.iterrows(). `row` must
        provide 'latitude', 'longitude', 'year', 'grid_id' and the column
        f'target_flood_{n_pred}'.
    n_pred : int, optional
        Suffix of the flood-target column that selects the output directory.
        Defaults to 1, which matches the 'target_1' directories read later in
        this notebook. NOTE(review): confirm this is the intended value; the
        name was previously read from an undefined global.

    Returns
    -------
    None
        The result is written to disk. Failures during extraction are
        printed and swallowed so one bad row does not stop the whole run.
    '''
    index, row = args
    grid_id = row['grid_id']
    year_to_process = int(row['year'])
    flood_target = row[f'target_flood_{n_pred}']
    point = ee.Geometry.Point(row['longitude'], row['latitude'])

    # Flood and no-flood rows are written to separate directory trees
    target_dir = 'target_flood' if flood_target == 1 else 'no_flood_target'
    output_dir = f'era5_new/{target_dir}/extracted_{index}'

    # Date range for the entire period (5 calendar years ending at the row's year).
    # NOTE(review): filterDate's end bound is exclusive; Dec 31 only keeps the
    # December image if monthly images are stamped at the start of the month - confirm.
    start_date = datetime(year_to_process - 4, 1, 1)
    end_date = datetime(year_to_process, 12, 31)
    era5_tp = IC.filterDate(start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d'))

    try:
        # getRegion returns a list whose first element is the header row
        time_series = era5_tp.getRegion(point, scale=100000).getInfo()
        time_series_df = pd.DataFrame(time_series[1:], columns=time_series[0])

        # Convert the time column from epoch milliseconds to datetimes
        time_series_df['time'] = pd.to_datetime(time_series_df['time'], unit='ms')

        # Long format: one (time, id, value) record per variable and timestamp,
        # ordered by variable name and then by time
        melted_df = time_series_df.melt(
            id_vars=['time'],
            value_vars=variables_to_extract,
            var_name='id',
            value_name='value'
        ).sort_values(by=['id', 'time'])

        # Create the directory only once there is data to write, so a failed
        # extraction does not leave an empty directory behind for later readers
        os.makedirs(output_dir, exist_ok=True)
        melted_df.to_csv(os.path.join(output_dir, 'era5_data.csv'), index=False)

    except Exception as e:
        print(f"Failed to process grid point {grid_id, row.name}: {e}")


In [None]:
# Run process_grid_point for every row of xy_df_sub1 on 8 worker threads.
# executor.map yields results in input order; draining it through tqdm
# drives the progress bar and surfaces any exception raised by a worker.
tasks = list(xy_df_sub1.iterrows())

with ThreadPoolExecutor(max_workers=8) as executor:
    results = executor.map(process_grid_point, tasks)
    for _ in tqdm(results, total=len(xy_df_sub1)):
        pass

In [None]:
def populate_dir(xy_df_sub1, path, num_features=480):
    '''
    Read every extraction directory under `path` and append its flattened
    weather values to the matching row of `xy_df_sub1`.

    Parameters
    ----------
    xy_df_sub1 : pandas.DataFrame
        Source table. Each directory is matched to a row by index label.
    path : str
        Directory whose non-hidden entries are named '<prefix>_<index>' and
        contain an 'era5_data.csv' file with a 'value' column.
    num_features : int, optional
        Number of values expected per directory; one 'feature_<i>' column is
        created for each. Defaults to 480.

    Returns
    -------
    pandas.DataFrame
        One row per directory that was read successfully: all original
        columns followed by 'feature_0' ... 'feature_<num_features - 1>'.

    Notes
    -----
    Entries without 'era5_data.csv', or whose number of values differs from
    `num_features`, are reported with print() and skipped. A short series
    would otherwise be padded at the end, shifting later variables into the
    wrong feature columns.
    '''
    # Column names for the new features
    column_names = [f'feature_{i}' for i in range(num_features)]

    # Rows of the output table: original columns + flattened values
    data = []

    entries = sorted(f for f in os.listdir(path) if not f.startswith('.'))
    for directory in tqdm(entries):
        # The row's index label is encoded after the first underscore
        idx = int(directory.split('_')[1])

        csv_path = os.path.join(path, directory, 'era5_data.csv')
        if not os.path.isfile(csv_path):
            print(f'Skipping {directory}: era5_data.csv not found')
            continue

        values = pd.read_csv(csv_path)['value'].tolist()
        if len(values) != num_features:
            print(f'Skipping {directory}: expected {num_features} values, found {len(values)}')
            continue

        # Keep all existing columns of the matching row, then the features
        data.append(list(xy_df_sub1.loc[idx].values) + values)

    return pd.DataFrame(data, columns=xy_df_sub1.columns.tolist() + column_names)

In [None]:
# Build one feature table per class from the extracted directories.
# NOTE(review): these paths ('data/era5/target_1/...') differ from the
# 'era5_new/...' directories that process_grid_point writes to - presumably the
# output was moved or renamed between steps; confirm.
xy_df_sub_new_no_flood = populate_dir(xy_df_sub1, 'data/era5/target_1/no_flood_target')
xy_df_sub_new_flood = populate_dir(xy_df_sub1, 'data/era5/target_1/target_flood')

In [None]:
# Stack the no-flood and flood tables row-wise and save them to a single CSV,
# writing the index as the first column.
# NOTE(review): each table returned by populate_dir has its own default 0-based
# index, so index values repeat across the two halves - confirm this is intended.
xy_df_sub1_new_combined = pd.concat([xy_df_sub_new_no_flood, xy_df_sub_new_flood], axis=0)
xy_df_sub1_new_combined.to_csv('xy_df_sub_1_combined.csv', index=True)