In [3]:
# Import packages

import pandas as pd
import math
import numpy as np
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import pyart
import glob
import matplotlib.patheffects as path_effects
from matplotlib.colors import BoundaryNorm, LinearSegmentedColormap
from pyproj import Transformer
import radlib
import os
import h5py
import matplotlib.dates as mdates
from scipy.ndimage import gaussian_filter1d
from cartopy.feature import NaturalEarthFeature
import xmltodict, geopandas, geojson, xml, json #xml and json do not exist
from datetime import datetime, timedelta, timezone
import geopy.distance
import numpy.matlib as npm
import copy
from scipy.signal import convolve2d
from astropy.convolution import convolve
import scipy.ndimage as ndi
import re
from skimage.draw import polygon


from pprint import pprint
from pysteps import io, motion, rcparams
from pysteps.utils import conversion, transformation
from pysteps.visualization import plot_precip_field, quiver

import json

from mpl_toolkits.axes_grid1.anchored_artists import AnchoredSizeBar
import matplotlib.font_manager as fm

import polars as pl

import rasterio
from rasterio.warp import calculate_default_transform, reproject, Resampling

import geopandas as gpd
from shapely.geometry import Point

from mpl_toolkits.mplot3d import Axes3D

from shapely.geometry import Polygon


from geopandas import GeoDataFrame
import datetime


os.environ["library_metranet_path"] = "/store_new/mch/msrad/idl/lib/radlib4/" # needed for pyradlib
os.environ["METRANETLIB_PATH"] = "/store_new/mch/msrad/idl/lib/radlib4/" # needed for pyart_mch


# Calculate Swiss grid coordinates into composite raster points
def swiss_to_grid_index(swiss_x, swiss_y, clons, clats, zh_shape):
    # Initialize transformers
    transformer_swiss_to_3035 = Transformer.from_crs(21781, 3035, always_xy=True)
    
    # Transform Swiss coordinates to EPSG:3035
    x_3035, y_3035 = transformer_swiss_to_3035.transform(swiss_x, swiss_y)
    
    # Calculate distances
    distances = np.sqrt((clons - x_3035)**2 + (clats - y_3035)**2)
    
    # Find the index of the minimum distance
    y_idx, x_idx = np.unravel_index(np.argmin(distances), distances.shape)
    
    # Create a function to get values at specific vertical levels
    def get_value_at_level(zh_array, level):
        if 0 <= level < zh_shape[2]:
            return zh_array[y_idx, x_idx, level]
        else:
            raise ValueError(f"Level must be between 0 and {zh_shape[2]-1}")
    
    return y_idx, x_idx, get_value_at_level

# Function to retrieve cross sections for all variables of the wind composite
def load_and_create_cross_sections(year, month, day, valid_time):
    # Load the .npz file
    data = np.load(f'/scratch/mch/fackerma/orders/full_composite_npz/{year}{month}{day}{valid_time}00_conv_wind_composite_data.npz')
    
    # Access the specific arrays
    ZH = data['ZH_max']
    rad_shear = data['RAD_SHEAR_LLSD_max']
    az_shear = data['AZ_SHEAR_LLSD_abs_max']
    RVEL = data['RVEL_DE_abs_max']
    KDP = data['KDP_max']
    ZDR = data['ZH_max']
    
    # Create cross-sections and 2D projections for each array
    arrays = [ZH, rad_shear, az_shear, RVEL, KDP, ZDR]
    names = ['ZH', 'rad_shear', 'az_shear', 'RVEL', 'KDP', 'ZDR']
    
    results = {}
    
    for arr, name in zip(arrays, names):
        # Create cross-sections
        results[f'{name}_x_cross'] = arr[y_idx, x_start:x_end, :]
        results[f'{name}_y_cross'] = arr[y_start:y_end, x_idx, :]
        
        # Create 2D max projection
        results[f'{name}_2d'] = np.nanmax(arr, axis=2)
    
    return results
# All results are stored in a dictionary, with keys formatted as '{name}_x_cross', '{name}_y_cross', and '{name}_2d'


# Define the Swiss grid (adjusted to match data dimensions)
chx = np.arange(255000, 255000 + 710 * 1000, 1000)  # Easting values (710 points)
chy = sorted(np.arange(-160000, -160000 + 640 * 1000, 1000), reverse=True)  # Northing values (640 points)
X, Y = np.meshgrid(chx, chy)

# Initialize transformer for Swiss grid to WGS84 (EPSG:21781 to EPSG:4326 PlateCarree)
transformer = Transformer.from_crs(21781, 4326, always_xy=True)
clons, clats = transformer.transform(X, Y)





## You are using the Python ARM Radar Toolkit (Py-ART), an open source
## library for working with weather radar data. Py-ART is partly
## supported by the U.S. Department of Energy as part of the Atmospheric
## Radiation Measurement (ARM) Climate Research Facility, an Office of
## Science user facility.
##
## If you use this software to prepare a publication, please cite:
##
##     JJ Helmus and SM Collis, JORS 2016, doi: 10.5334/jors.119

Pysteps configuration file found at: /scratch/mch/fackerma/miniforge3/envs/testenv/lib/python3.12/site-packages/pysteps/pystepsrc



In [31]:
# Import the data
import os
import pandas as pd
import numpy as np

# Define the base directory and file names
base_dir = "/scratch/mch/fackerma/orders/TRT_processing_2/"
yearly_files = [
    "TRT_2019_05-10.pkl",
    "TRT_2020_05-10.pkl",
    "TRT_2021_05-10.pkl",
    "TRT_2022_05-10.pkl",
    "TRT_2023_05-10.pkl",
]

# Load and merge dataframes
dfs = []
for file_name in yearly_files:
    file_path = os.path.join(base_dir, file_name)
    if os.path.exists(file_path):
        print(f"Loading {file_name}...")
        df = pd.read_pickle(file_path)
        dfs.append(df)
    else:
        print(f"⚠️ File not found: {file_name}")

if not dfs:
    print("No data loaded. Exiting.")
    exit()

merged_df = pd.concat(dfs, ignore_index=True)
print(f"\nMerged dataframe shape: {merged_df.shape}")



Loading TRT_2019_05-10.pkl...
Loading TRT_2020_05-10.pkl...


Loading TRT_2021_05-10.pkl...
Loading TRT_2022_05-10.pkl...
Loading TRT_2023_05-10.pkl...


  return GeometryArray(data, crs=_get_common_crs(to_concat))



Merged dataframe shape: (2240266, 91)


## Best approach

In [32]:
import pandas as pd
import numpy as np
import numpy as np
import matplotlib.pyplot as plt
from pyproj import Transformer

from pyproj import Transformer
import numpy as np


# Define the Swiss grid (adjusted to match data dimensions)
chx = np.arange(255000, 255000 + 710 * 1000, 1000)  # Easting values (710 points)
chy = sorted(np.arange(-160000, -160000 + 640 * 1000, 1000), reverse=True)  # Northing values (640 points)
X, Y = np.meshgrid(chx, chy)

# Initialize transformer for Swiss grid to WGS84 (EPSG:21781 to EPSG:4326 PlateCarree)
transformer = Transformer.from_crs(21781, 4326, always_xy=True)
clons, clats = transformer.transform(X, Y)



def filter_rows_by_datetime(merged_df, datetime_str):
    # Ensure the 'timestamp' column is in datetime format
    merged_df['timestamp'] = pd.to_datetime(merged_df['timestamp'])
    
    # Convert input datetime string to a datetime object
    target_datetime = pd.to_datetime(datetime_str)
    
    # Filter rows for the exact datetime
    filtered_df = merged_df[merged_df['timestamp'] == target_datetime]
    return filtered_df

def load_npz_file(datetime_obj):
    # Extract components from datetime
    year = datetime_obj.strftime('%Y')
    month = datetime_obj.strftime('%m')
    day = datetime_obj.strftime('%d')
    valid_time = datetime_obj.strftime('%H%M')  # Format: '1535' for 15:35
    
    # Load .npz file
    file_path = f'/scratch/mch/maregger/hailclass/convective_wind/full_composite_npz/{year}{month}{day}{valid_time}00_conv_wind_composite_data.npz'
    return np.load(file_path)

# Example usage
if __name__ == "__main__":
    # Define your datetime (including time)
    datetime_str = '2019-06-15 15:35:00'
    converted = datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S').strftime('%Y%m%d%H%M')
    
    # Filter rows by exact datetime
    filtered_df = filter_rows_by_datetime(merged_df, datetime_str)
    print("Filtered DataFrame:")
    print(filtered_df)
    
    # Load corresponding .npz file
    if not filtered_df.empty:
        data = load_npz_file(pd.to_datetime(datetime_str))
        ZH, rad_shear, KDP = data['ZH_max'], data['RAD_SHEAR_LLSD_max'], data['KDP_max']
        print("\nLoaded Arrays:")
        print("ZH:", ZH.shape)  # Check array dimensions instead of printing NaNs

    # Anchor the fields in the Swiss grid
    anchored_data = {
        'ZH': ZH,
        'rad_shear': rad_shear,
        'KDP': KDP,
        'easting': X,
        'northing': Y,
        'longitude': clons,
        'latitude': clats
    }



Filtered DataFrame:
                timestamp                                           geometry  \
65788 2019-06-15 15:35:00  POLYGON ((6.8781 45.9107, 6.8653 45.9016, 6.86...   
65789 2019-06-15 15:35:00  POLYGON ((8.1332 46.3148, 8.1201 46.3059, 8.11...   
65790 2019-06-15 15:35:00  POLYGON ((6.4614 46.1507, 6.4616 46.1418, 6.44...   
65791 2019-06-15 15:35:00  POLYGON ((8.1177 46.117, 8.1047 46.108, 8.0918...   
65792 2019-06-15 15:35:00  POLYGON ((8.1562 46.0897, 8.1432 46.0808, 8.14...   
65793 2019-06-15 15:35:00  POLYGON ((8.4001 45.989, 8.387 45.9801, 8.3866...   
65794 2019-06-15 15:35:00  POLYGON ((6.9303 45.839, 6.9304 45.83, 6.9176 ...   
65795 2019-06-15 15:35:00  POLYGON ((8.2774 46.4038, 8.2642 46.3949, 8.25...   
65796 2019-06-15 15:35:00  POLYGON ((7.5479 45.7501, 7.5478 45.7142, 7.56...   
65797 2019-06-15 15:35:00  POLYGON ((5.7504 45.7015, 5.7506 45.6925, 5.73...   
65798 2019-06-15 15:35:00  POLYGON ((6.1868 45.7072, 6.1874 45.6802, 6.16...   
65799 2019-06-15 15:

In [19]:
import numpy as np
from shapely.prepared import prep
from shapely.geometry import Point
from scipy.ndimage import center_of_mass

def calculate_metrics(filtered_df, clons, clats, ZH, rad_shear, KDP):
    # Precompute grid properties
    ny, nx = clons.shape
    nz = ZH.shape[2]
    clons_flat = clons.ravel()
    clats_flat = clats.ravel()
    
    # Initialize results storage
    results = {
        'ZH_com_height': [], 'ZH_percent_above_45': [],
        'KDP_com_height': [], 'KDP_percent_above_2': [],
        'rad_shear_max': [], 'rad_shear_percent_above_2': []
    }

    for _, row in filtered_df.iterrows():
        poly = row['geometry']
        prep_poly = prep(poly)
        minx, miny, maxx, maxy = poly.bounds
        
        # Bounding box optimization
        bbox_mask = (clons_flat >= minx) & (clons_flat <= maxx) & \
                    (clats_flat >= miny) & (clats_flat <= maxy)
        if not bbox_mask.any():
            results = _append_defaults(results)
            continue
            
        # Vectorized containment check
        contained = np.array([prep_poly.contains(Point(p)) 
                               for p in zip(clons_flat[bbox_mask], clats_flat[bbox_mask])])
        if not contained.any():
            results = _append_defaults(results)
            continue
            
        # Create 2D mask
        mask_2d = np.zeros((ny, nx), bool)
        yx_indices = np.unravel_index(np.where(bbox_mask)[0][contained], (ny, nx))
        mask_2d[yx_indices] = True
        
        # Extend the mask to 3D by repeating along the vertical dimension
        mask_3d = np.broadcast_to(mask_2d[..., None], (ny, nx, nz))
        
        # Extract values within the polygon for ZH, KDP, rad_shear
        ZH_masked = np.where(mask_3d, ZH, np.nan)
        KDP_masked = np.where(mask_3d, KDP, np.nan)
        rad_shear_masked = np.where(mask_3d, rad_shear, np.nan)

        # Replace NaN values with 0 for calculations
        ZH_masked[np.isnan(ZH_masked)] = 0
        KDP_masked[np.isnan(KDP_masked)] = 0
        rad_shear_masked[np.isnan(rad_shear_masked)] = 0
        
        # Calculate metrics for ZH
        if np.any(ZH_masked > 0):  # Check if there are valid values
            com_ZH = center_of_mass(ZH_masked)  # Center of mass height
            results['ZH_com_height'].append(com_ZH[2])  # Use the vertical dimension (z-axis)
            results['ZH_percent_above_45'].append(np.sum(ZH_masked > 45) / np.size(ZH_masked) * 100)
        else:
            results['ZH_com_height'].append(np.nan)
            results['ZH_percent_above_45'].append(0)
        
        # Calculate metrics for KDP
        if np.any(KDP_masked > 0):  # Check if there are valid values
            com_KDP = center_of_mass(KDP_masked)  # Center of mass height
            results['KDP_com_height'].append(com_KDP[2])  # Use the vertical dimension (z-axis)
            results['KDP_percent_above_2'].append(np.sum(KDP_masked > 2) / np.size(KDP_masked) * 100)
        else:
            results['KDP_com_height'].append(np.nan)
            results['KDP_percent_above_2'].append(0)
        
        # Calculate metrics for rad_shear
        if np.any(rad_shear_masked > 0):  # Check if there are valid values
            results['rad_shear_max'].append(np.nanmax(rad_shear_masked))
            results['rad_shear_percent_above_2'].append(np.sum(rad_shear_masked > 2) / np.size(rad_shear_masked) * 100)
        else:
            results['rad_shear_max'].append(np.nan)
            results['rad_shear_percent_above_2'].append(0)

    # Add results to DataFrame
    for col in results:
        filtered_df[col] = results[col]
        
    return filtered_df

def _append_defaults(results):
    """Append default values to result lists when no valid data is found."""
    for k in results:
        default = 0 if 'percent' in k or 'max' in k else np.nan
        results[k].append(default)
    return results


In [33]:
import numpy as np
from shapely.prepared import prep
from shapely.geometry import Point
from scipy.ndimage import center_of_mass

def calculate_metrics(filtered_df, clons, clats, ZH, rad_shear, KDP):
    # Precompute grid properties
    ny, nx = clons.shape
    nz = ZH.shape[2]
    clons_flat = clons.ravel()
    clats_flat = clats.ravel()
    
    # Initialize results storage
    results = {
        'ZH_com_height': [], 'ZH_percent_above_45': [],
        'KDP_com_height': [], 'KDP_percent_above_2': [],
        'rad_shear_max': [], 'rad_shear_percent_above_2': [],
        'area_p': []  # New column for polygon area
    }

    for _, row in filtered_df.iterrows():
        poly = row['geometry']
        
        # Calculate polygon area in square meters
        poly_area = poly.area  # Shapely's area property calculates area in native CRS units (meters for Swiss grid)
        results['area_p'].append(poly_area)
        
        prep_poly = prep(poly)
        minx, miny, maxx, maxy = poly.bounds
        
        # Bounding box optimization
        bbox_mask = (clons_flat >= minx) & (clons_flat <= maxx) & \
                    (clats_flat >= miny) & (clats_flat <= maxy)
        if not bbox_mask.any():
            results = _append_defaults(results)
            continue
            
        # Vectorized containment check
        contained = np.array([prep_poly.contains(Point(p)) 
                               for p in zip(clons_flat[bbox_mask], clats_flat[bbox_mask])])
        if not contained.any():
            results = _append_defaults(results)
            continue
            
        # Create 2D mask
        mask_2d = np.zeros((ny, nx), bool)
        yx_indices = np.unravel_index(np.where(bbox_mask)[0][contained], (ny, nx))
        mask_2d[yx_indices] = True
        
        # Extend the mask to 3D by repeating along the vertical dimension
        mask_3d = np.broadcast_to(mask_2d[..., None], (ny, nx, nz))
        
        # Extract values within the polygon for ZH, KDP, rad_shear
        ZH_masked = np.where(mask_3d, ZH, np.nan)
        KDP_masked = np.where(mask_3d, KDP, np.nan)
        rad_shear_masked = np.where(mask_3d, rad_shear, np.nan)

        # Replace NaN values with 0 for calculations
        ZH_masked[np.isnan(ZH_masked)] = 0
        KDP_masked[np.isnan(KDP_masked)] = 0
        rad_shear_masked[np.isnan(rad_shear_masked)] = 0
        
        # Calculate metrics for ZH
        if np.any(ZH_masked > 0):  # Check if there are valid values
            com_ZH = center_of_mass(ZH_masked)  # Center of mass height
            results['ZH_com_height'].append(com_ZH[2])  # Use the vertical dimension (z-axis)
            results['ZH_percent_above_45'].append(np.sum(ZH_masked > 45) / np.size(ZH_masked) * 100)
        else:
            results['ZH_com_height'].append(np.nan)
            results['ZH_percent_above_45'].append(0)
        
        # Calculate metrics for KDP
        if np.any(KDP_masked > 0):  # Check if there are valid values
            com_KDP = center_of_mass(KDP_masked)  # Center of mass height
            results['KDP_com_height'].append(com_KDP[2])  # Use the vertical dimension (z-axis)
            results['KDP_percent_above_2'].append(np.sum(KDP_masked > 2) / np.size(KDP_masked) * 100)
        else:
            results['KDP_com_height'].append(np.nan)
            results['KDP_percent_above_2'].append(0)
        
        # Calculate metrics for rad_shear
        if np.any(rad_shear_masked > 0):  # Check if there are valid values
            results['rad_shear_max'].append(np.nanmax(rad_shear_masked))
            results['rad_shear_percent_above_2'].append(np.sum(rad_shear_masked > 2) / np.size(rad_shear_masked) * 100)
        else:
            results['rad_shear_max'].append(np.nan)
            results['rad_shear_percent_above_2'].append(0)

    # Add results to DataFrame
    for col in results:
        filtered_df[col] = results[col]
        
    return filtered_df

def _append_defaults(results):
    """Append default values to result lists when no valid data is found."""
    for k in results:
        default = 0 if 'percent' in k or 'max' in k else np.nan
        results[k].append(default)
    return results 


In [34]:
# Example usage:
# Assuming clons, clats, ZH, rad_shear, KDP are already defined and filtered_df contains 'geometry' column.
new_df = calculate_metrics(filtered_df, clons, clats, ZH, rad_shear, KDP)

# Display updated DataFrame with new columns
print(new_df.head())

#new_df.to_pickle(f'/scratch/mch/fackerma/orders/TRT_modelsetup_testdir/TRT_{converted}_3.pkl')

                timestamp                                           geometry  \
65788 2019-06-15 15:35:00  POLYGON ((6.8781 45.9107, 6.8653 45.9016, 6.86...   
65789 2019-06-15 15:35:00  POLYGON ((8.1332 46.3148, 8.1201 46.3059, 8.11...   
65790 2019-06-15 15:35:00  POLYGON ((6.4614 46.1507, 6.4616 46.1418, 6.44...   
65791 2019-06-15 15:35:00  POLYGON ((8.1177 46.117, 8.1047 46.108, 8.0918...   
65792 2019-06-15 15:35:00  POLYGON ((8.1562 46.0897, 8.1432 46.0808, 8.14...   

      CS Marker STA Marker ESWD Marker Gust_Flag             traj_ID  \
65788         0          0           0         -  2019061515350115.0   
65789         0          0           0         -  2019061515200086.0   
65790         0          0           0         -  2019061515150101.0   
65791         0          0           0         -  2019061515250100.0   
65792         0          0           0         -  2019061515200090.0   

               time     lon      lat  ... nrPOHthr080 nrPOHthr090 nrPOHthr100  \
65788

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[col] = results[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[col] = results[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[col] = results[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

## Job troubleshooting

In [None]:
import os
from datetime import datetime
import numpy as np
from shapely.geometry import Point
from scipy.ndimage import center_of_mass
import pandas as pd
import matplotlib.pyplot as plt
from pyproj import Transformer
from shapely.prepared import prep





# Define the Swiss grid (adjusted to match data dimensions)
chx = np.arange(255000, 255000 + 710 * 1000, 1000)  # Easting values (710 points)
chy = sorted(np.arange(-160000, -160000 + 640 * 1000, 1000), reverse=True)  # Northing values (640 points)
X, Y = np.meshgrid(chx, chy)

# Initialize transformer for Swiss grid to WGS84 (EPSG:21781 to EPSG:4326 PlateCarree)
transformer = Transformer.from_crs(21781, 4326, always_xy=True)
clons, clats = transformer.transform(X, Y)



def calculate_metrics(filtered_df, clons, clats, ZH, rad_shear, KDP):
    # Precompute grid properties
    ny, nx = clons.shape
    nz = ZH.shape[2]
    clons_flat = clons.ravel()
    clats_flat = clats.ravel()
    
    # Initialize results storage
    results = {
        'ZH_com_height': [], 'ZH_percent_above_45': [], 'ZH_percent_above_50': [], 'ZH_percent_above_55': [],
        'KDP_com_height': [], 'KDP_percent_above_2': [], 'KDP_percent_above_1.5': [], 'KDP_percent_above_1': [],
        'rad_shear_max': [], 'rad_shear_percent_above_2.5': [], 'rad_shear_percent_above_2': [], 'rad_shear_percent_above_1.5': [],
        'area_p': []
    }

    for _, row in filtered_df.iterrows():
        poly = row['geometry']
        prep_poly = prep(poly)
        minx, miny, maxx, maxy = poly.bounds
        
        # Bounding box optimization
        bbox_mask = (clons_flat >= minx) & (clons_flat <= maxx) & \
                    (clats_flat >= miny) & (clats_flat <= maxy)
        if not bbox_mask.any():
            results = _append_defaults(results)
            continue
            
        # Vectorized containment check
        contained = np.array([prep_poly.contains(Point(p)) 
                               for p in zip(clons_flat[bbox_mask], clats_flat[bbox_mask])])
        if not contained.any():
            results = _append_defaults(results)
            continue
            
        # Create 2D mask
        mask_2d = np.zeros((ny, nx), bool)
        yx_indices = np.unravel_index(np.where(bbox_mask)[0][contained], (ny, nx))
        mask_2d[yx_indices] = True
        
        # Extend the mask to 3D by repeating along the vertical dimension
        mask_3d = np.broadcast_to(mask_2d[..., None], (ny, nx, nz))
        
        # Extract values within the polygon for ZH, KDP, rad_shear
        ZH_masked = np.where(mask_3d, ZH, np.nan)
        KDP_masked = np.where(mask_3d, KDP, np.nan)
        rad_shear_masked = np.where(mask_3d, rad_shear, np.nan)

        # Replace NaN values with 0 for calculations
        ZH_masked[np.isnan(ZH_masked)] = 0
        KDP_masked[np.isnan(KDP_masked)] = 0
        rad_shear_masked[np.isnan(rad_shear_masked)] = 0
        
        # Calculate metrics for ZH
        if np.any(ZH_masked > 0):  # Check if there are valid values
            com_ZH = center_of_mass(ZH_masked)  # Center of mass height
            results['ZH_com_height'].append(com_ZH[2])  # Use the vertical dimension (z-axis)
            results['ZH_percent_above_45'].append(np.sum(ZH_masked > 45) / np.size(ZH_masked) * 100)
            results['ZH_percent_above_50'].append(np.sum(ZH_masked > 50) / np.size(ZH_masked) * 100)
            results['ZH_percent_above_55'].append(np.sum(ZH_masked > 55) / np.size(ZH_masked) * 100)
        else:
            results['ZH_com_height'].append(np.nan)
            results['ZH_percent_above_45'].append(0)
            results['ZH_percent_above_50'].append(0)
            results['ZH_percent_above_55'].append(0)
        
        # Calculate metrics for KDP
        if np.any(KDP_masked > 0):  # Check if there are valid values
            com_KDP = center_of_mass(KDP_masked)  # Center of mass height
            results['KDP_com_height'].append(com_KDP[2])  # Use the vertical dimension (z-axis)
            results['KDP_percent_above_2'].append(np.sum(KDP_masked > 2) / np.size(KDP_masked) * 100)
            results['KDP_percent_above_1.5'].append(np.sum(KDP_masked > 1.5) / np.size(KDP_masked) * 100)
            results['KDP_percent_above_1'].append(np.sum(KDP_masked > 1) / np.size(KDP_masked) * 100)
        else:
            results['KDP_com_height'].append(np.nan)
            results['KDP_percent_above_2'].append(0)
            results['KDP_percent_above_1.5'].append(0)
            results['KDP_percent_above_1'].append(0)
        
        # Calculate metrics for rad_shear
        if np.any(rad_shear_masked > 0):  # Check if there are valid values
            results['rad_shear_max'].append(np.nanmax(rad_shear_masked))
            results['rad_shear_percent_above_2.5'].append(np.sum(rad_shear_masked > 2.5) / np.size(rad_shear_masked) * 100)
            results['rad_shear_percent_above_2'].append(np.sum(rad_shear_masked > 2) / np.size(rad_shear_masked) * 100)
            results['rad_shear_percent_above_1.5'].append(np.sum(rad_shear_masked > 1.5) / np.size(rad_shear_masked) * 100)
        else:
            results['rad_shear_max'].append(np.nan)
            results['rad_shear_percent_above_2.5'].append(0)
            results['rad_shear_percent_above_2'].append(0)
            results['rad_shear_percent_above_1.5'].append(0)

    # Add results to DataFrame
    for col in results:
        filtered_df[col] = results[col]
        
    return filtered_df

def _append_defaults(results):
    """Append default values to result lists when no valid data is found."""
    for k in results:
        default = 0 if 'percent' in k or 'max' in k else np.nan
        results[k].append(default)
    return results




import os
import pandas as pd
import numpy as np
from datetime import datetime

# 1. Load Extraction Dates
extraction_file = "/scratch/mch/fackerma/orders/Reworked_gust_extraction_dates.txt"

# Read the file into a DataFrame
extraction_dates = pd.read_csv(extraction_file)

# Parse 'Valid_Time' column into datetime format
extraction_dates['Valid_Time'] = pd.to_datetime(extraction_dates['Valid_Time'], format='%Y%m%d%H%M%S')

# Extract valid times as Python datetime objects
#valid_times = extraction_dates['Valid_Time'].dt.to_pydatetime()

# Convert valid_times to timezone-aware datetime objects (UTC)
valid_times = pd.to_datetime(extraction_dates['Valid_Time'], format='%Y%m%d%H%M%S').dt.tz_localize('UTC')



# 2. Load Merged DataFrame
base_dir = "/scratch/mch/fackerma/orders/TRT_processing_3/"
yearly_files = [
    "TRT_2019_05-10.pkl",
    "TRT_2020_05-10.pkl",
    "TRT_2021_05-10.pkl",
    "TRT_2022_05-10.pkl",
    "TRT_2023_05-10.pkl",
]

dfs = []
for file_name in yearly_files:
    file_path = os.path.join(base_dir, file_name)
    if os.path.exists(file_path):
        print(f"Loading {file_name}...")
        dfs.append(pd.read_pickle(file_path))

merged_df = pd.concat(dfs, ignore_index=True)

# 3. Filter by Valid Times and Gust Flags
# Convert merged_df timestamp to match extraction format
merged_df['timestamp'] = pd.to_datetime(merged_df['yyyymmddHHMM'], utc=True)


# Time filtering
time_filter = merged_df['timestamp'].isin(valid_times)
filtered_by_time = merged_df[time_filter].copy()
print(f"Number of rows after time filtering: {filtered_by_time.shape[0]}")


# Find traj_IDs with at least one Yes/No in Gust_Flag
valid_traj_ids = merged_df[merged_df['Gust_Flag'].isin(['Yes', 'No'])]['traj_ID'].unique()
traj_filter = filtered_by_time['traj_ID'].isin(valid_traj_ids)


final_df = filtered_by_time[traj_filter].copy()
print(f"Number of rows after traj_ID filtering: {final_df.shape[0]}")

# 4. Process Data with calculate_metrics
npz_base = '/scratch/mch/fackerma/orders/npz_GT2/full_composite_npz/'
output_path = "/scratch/mch/fackerma/orders/TRT_modelsetup_3/Model_Setup_3.pkl"

# Group by timestamp for NPZ loading
grouped = final_df.groupby('timestamp')

all_results = []
for timestamp, group in grouped:
    try:
        # Load corresponding NPZ file
        npz_time = timestamp.strftime('%Y%m%d%H%M00')
        npz_path = f"{npz_base}{npz_time}_conv_wind_composite_data_pl.npz"
        
        if not os.path.exists(npz_path):
            print(f"⚠️ NPZ file not found: {npz_path}")
            continue
            
        with np.load(npz_path) as data:
            ZH = data['ZH_max']
            rad_shear = data['RAD_SHEAR_LLSD_max']
            KDP = data['KDP_max']
            
        # Process the group
        processed_group = calculate_metrics(
            filtered_df=group,
            clons=clons,
            clats=clats,
            ZH=ZH,
            rad_shear=rad_shear,
            KDP=KDP
        )
        
        all_results.append(processed_group)
        
    except Exception as e:
        print(f"Error processing {timestamp}: {str(e)}")

# 5. Save Final Output
if all_results:
    final_output = pd.concat(all_results, ignore_index=True)
    final_output.to_pickle(output_path)
    print(f"Saved final output to {output_path}")
else:
    print("No data processed - output file not created")

Loading TRT_2019_05-10.pkl...
Loading TRT_2020_05-10.pkl...
Loading TRT_2021_05-10.pkl...
Loading TRT_2022_05-10.pkl...
Loading TRT_2023_05-10.pkl...


  return GeometryArray(data, crs=_get_common_crs(to_concat))


Number of rows after time filtering: 416914
Number of rows after traj_ID filtering: 33066
Error processing 2019-05-11 12:25:00+00:00: Length of values (0) does not match length of index (1)
Error processing 2019-05-11 12:30:00+00:00: Length of values (0) does not match length of index (2)


KeyboardInterrupt: 

In [12]:
print(merged_df['timestamp'].dtype)
print(valid_times.dtype)

datetime64[ns, UTC]
datetime64[ns, UTC]


In [14]:
npz_path

'/scratch/mch/fackerma/orders/npz_GT2/full_composite_npz/20190511123500_conv_wind_composite_data_pl.npz'

In [22]:
print(merged_df['Gust_Flag'].unique())


['-' 'Yes' 'No']


In [7]:
valid_traj_ids = merged_df[merged_df['Gust_Flag'].isin(['Yes', 'No'])]['traj_ID'].unique()
print(f"Number of valid traj_IDs: {len(valid_traj_ids)}")
print(valid_traj_ids[:10])  # Print first 10 valid traj_IDs


Number of valid traj_IDs: 3554
['2019051112250040.0' '2019051112350035.0' '2019051112300050.0'
 '2019051113300051.0' '2019061509450011.0' '2019061510350021.0'
 '2019061515350046.0' '2019061518300167.0' '2019061519300003.0'
 '2019061518400144.0']


In [8]:
traj_filter = filtered_by_time['traj_ID'].isin(valid_traj_ids)
filtered_by_traj = filtered_by_time[traj_filter].copy()
print(f"Number of rows after traj_ID filtering: {filtered_by_traj.shape[0]}")
print(filtered_by_traj[['traj_ID', 'Gust_Flag']].head())  # Inspect a few rows


Number of rows after traj_ID filtering: 33066
                 traj_ID Gust_Flag
8125  2019051112250040.0         -
8151  2019051112250040.0       Yes
8153  2019051112300050.0         -
8171  2019051112300050.0         -
8172  2019051112350035.0         -


In [9]:
print(f"Number of valid timestamps: {len(valid_times)}")
print(valid_times[:5])  # Print the first 5 valid timestamps


Number of valid timestamps: 14231
0   2019-05-11 12:15:00+00:00
1   2019-05-11 12:20:00+00:00
2   2019-05-11 12:25:00+00:00
3   2019-05-11 12:30:00+00:00
4   2019-05-11 12:35:00+00:00
Name: Valid_Time, dtype: datetime64[ns, UTC]


In [10]:
print(f"Number of timestamps in merged_df: {merged_df['timestamp'].nunique()}")
print(merged_df['timestamp'].head())  # Print the first 5 timestamps


Number of timestamps in merged_df: 150333
0   2019-05-01 09:55:00+00:00
1   2019-05-01 10:00:00+00:00
2   2019-05-01 10:05:00+00:00
3   2019-05-01 10:05:00+00:00
4   2019-05-01 10:10:00+00:00
Name: timestamp, dtype: datetime64[ns, UTC]


## File troubleshooting

In [None]:
import pandas as pd

file_path = '/scratch/mch/fackerma/orders/TRT_modelsetup_2/Model_Setup_1.pkl'

data = pd.read_pickle(file_path)
print("File loaded successfully!")
print(data.isna().sum())  # Display NaN summary



File loaded successfully!
timestamp                       0
geometry                        0
CS Marker                       0
STA Marker                      0
ESWD Marker                     0
                               ..
rad_shear_max                  37
rad_shear_percent_above_2.5     0
rad_shear_percent_above_2       0
rad_shear_percent_above_1.5     0
area_p                          0
Length: 104, dtype: int64


In [16]:
import os
from datetime import datetime
import numpy as np
from shapely.geometry import Point
from scipy.ndimage import center_of_mass
import pandas as pd
import matplotlib.pyplot as plt
from pyproj import Transformer
from shapely.prepared import prep





# Define the Swiss grid (adjusted to match data dimensions)
chx = np.arange(255000, 255000 + 710 * 1000, 1000)  # Easting values (710 points)
chy = sorted(np.arange(-160000, -160000 + 640 * 1000, 1000), reverse=True)  # Northing values (640 points)
X, Y = np.meshgrid(chx, chy)

# Initialize transformer for Swiss grid to WGS84 (EPSG:21781 to EPSG:4326 PlateCarree)
transformer = Transformer.from_crs(21781, 4326, always_xy=True)
clons, clats = transformer.transform(X, Y)



def calculate_metrics(filtered_df, clons, clats, ZH, rad_shear, KDP):
    # Precompute grid properties
    ny, nx = clons.shape
    nz = ZH.shape[2]
    clons_flat = clons.ravel()
    clats_flat = clats.ravel()
    
    # Initialize results storage
    results = {
        'ZH_com_height': [], 'ZH_percent_above_45': [], 'ZH_percent_above_50': [], 'ZH_percent_above_55': [],
        'KDP_com_height': [], 'KDP_percent_above_2': [], 'KDP_percent_above_1.5': [], 'KDP_percent_above_1': [],
        'rad_shear_max': [], 'rad_shear_percent_above_2.5': [], 'rad_shear_percent_above_2': [], 'rad_shear_percent_above_1.5': [],
        'area_p': []   # New column for polygon area
    }

    for _, row in filtered_df.iterrows():
        poly = row['geometry']
        
        # Calculate polygon area in square meters
        poly_area = poly.area  # Shapely's area property calculates area in native CRS units (meters for Swiss grid)
        results['area_p'].append(poly_area)
        
        prep_poly = prep(poly)
        minx, miny, maxx, maxy = poly.bounds
        
        # Bounding box optimization
        bbox_mask = (clons_flat >= minx) & (clons_flat <= maxx) & \
                    (clats_flat >= miny) & (clats_flat <= maxy)
        if not bbox_mask.any():
            results = _append_defaults(results)
            continue
            
        # Vectorized containment check
        contained = np.array([prep_poly.contains(Point(p)) 
                               for p in zip(clons_flat[bbox_mask], clats_flat[bbox_mask])])
        if not contained.any():
            results = _append_defaults(results)
            continue
            
        # Create 2D mask
        mask_2d = np.zeros((ny, nx), bool)
        yx_indices = np.unravel_index(np.where(bbox_mask)[0][contained], (ny, nx))
        mask_2d[yx_indices] = True
        
        # Extend the mask to 3D by repeating along the vertical dimension
        mask_3d = np.broadcast_to(mask_2d[..., None], (ny, nx, nz))
        
        # Extract values within the polygon for ZH, KDP, rad_shear
        ZH_masked = np.where(mask_3d, ZH, np.nan)
        KDP_masked = np.where(mask_3d, KDP, np.nan)
        rad_shear_masked = np.where(mask_3d, rad_shear, np.nan)

        # Replace NaN values with 0 for calculations
        ZH_masked[np.isnan(ZH_masked)] = 0
        KDP_masked[np.isnan(KDP_masked)] = 0
        rad_shear_masked[np.isnan(rad_shear_masked)] = 0
        
        # Calculate metrics for ZH
        if np.any(ZH_masked > 0):  # Check if there are valid values
            com_ZH = center_of_mass(ZH_masked)  # Center of mass height
            results['ZH_com_height'].append(com_ZH[2])  # Use the vertical dimension (z-axis)
            results['ZH_percent_above_45'].append(np.sum(ZH_masked > 45) / np.size(ZH_masked) * 100)
            results['ZH_percent_above_50'].append(np.sum(ZH_masked > 50) / np.size(ZH_masked) * 100)
            results['ZH_percent_above_55'].append(np.sum(ZH_masked > 55) / np.size(ZH_masked) * 100)
        else:
            results['ZH_com_height'].append(np.nan)
            results['ZH_percent_above_45'].append(0)
            results['ZH_percent_above_50'].append(0)
            results['ZH_percent_above_55'].append(0)
        
        # Calculate metrics for KDP
        if np.any(KDP_masked > 0):  # Check if there are valid values
            com_KDP = center_of_mass(KDP_masked)  # Center of mass height
            results['KDP_com_height'].append(com_KDP[2])  # Use the vertical dimension (z-axis)
            results['KDP_percent_above_2'].append(np.sum(KDP_masked > 2) / np.size(KDP_masked) * 100)
            results['KDP_percent_above_1.5'].append(np.sum(KDP_masked > 1.5) / np.size(KDP_masked) * 100)
            results['KDP_percent_above_1'].append(np.sum(KDP_masked > 1) / np.size(KDP_masked) * 100)
        else:
            results['KDP_com_height'].append(np.nan)
            results['KDP_percent_above_2'].append(0)
            results['KDP_percent_above_1.5'].append(0)
            results['KDP_percent_above_1'].append(0)
        
        # Calculate metrics for rad_shear
        if np.any(rad_shear_masked > 0):  # Check if there are valid values
            results['rad_shear_max'].append(np.nanmax(rad_shear_masked))
            results['rad_shear_percent_above_2.5'].append(np.sum(rad_shear_masked > 2.5) / np.size(rad_shear_masked) * 100)
            results['rad_shear_percent_above_2'].append(np.sum(rad_shear_masked > 2) / np.size(rad_shear_masked) * 100)
            results['rad_shear_percent_above_1.5'].append(np.sum(rad_shear_masked > 1.5) / np.size(rad_shear_masked) * 100)
        else:
            results['rad_shear_max'].append(np.nan)
            results['rad_shear_percent_above_2.5'].append(0)
            results['rad_shear_percent_above_2'].append(0)
            results['rad_shear_percent_above_1.5'].append(0)

    # Add results to DataFrame
    for col in results:
        filtered_df[col] = results[col]
        
    return filtered_df

def _append_defaults(results):
    """Append default values to result lists when no valid data is found."""
    for k in results:
        default = 0 if 'percent' in k or 'max' in k else np.nan
        results[k].append(default)
    return results 



import os
import pandas as pd
import numpy as np
from datetime import datetime

# 1. Load Extraction Dates
extraction_file = "/scratch/mch/fackerma/orders/Extraction_dates_20250325130000.txt"

# Read the file into a DataFrame
extraction_dates = pd.read_csv(extraction_file)

# Parse 'Valid_Time' column into datetime format
extraction_dates['Valid_Time'] = pd.to_datetime(extraction_dates['Valid_Time'], format='%Y%m%d%H%M%S')

# Extract valid times as Python datetime objects
#valid_times = extraction_dates['Valid_Time'].dt.to_pydatetime()

# Convert valid_times to timezone-aware datetime objects (UTC)
valid_times = pd.to_datetime(extraction_dates['Valid_Time'], format='%Y%m%d%H%M%S').dt.tz_localize('UTC')



# 2. Load Merged DataFrame
base_dir = "/scratch/mch/fackerma/orders/TRT_processing_2/"
yearly_files = [
    "TRT_2019_05-10.pkl",
    "TRT_2020_05-10.pkl",
    "TRT_2021_05-10.pkl",
    "TRT_2022_05-10.pkl",
    "TRT_2023_05-10.pkl",
]

dfs = []
for file_name in yearly_files:
    file_path = os.path.join(base_dir, file_name)
    if os.path.exists(file_path):
        print(f"Loading {file_name}...")
        dfs.append(pd.read_pickle(file_path))

merged_df = pd.concat(dfs, ignore_index=True)

# 3. Filter by Valid Times and Gust Flags
# Convert merged_df timestamp to match extraction format
merged_df['timestamp'] = pd.to_datetime(merged_df['timestamp'], utc=True)


# Time filtering
time_filter = merged_df['timestamp'].isin(valid_times)
filtered_by_time = merged_df[time_filter].copy()
print(f"Number of rows after time filtering: {filtered_by_time.shape[0]}")


# Find traj_IDs with at least one Yes/No in Gust_Flag
valid_traj_ids = merged_df[merged_df['Gust_Flag'].isin(['Yes', 'No'])]['traj_ID'].unique()
traj_filter = filtered_by_time['traj_ID'].isin(valid_traj_ids)


final_df = filtered_by_time[traj_filter].copy()
print(f"Number of rows after traj_ID filtering: {final_df.shape[0]}")

# 4. Process Data with calculate_metrics
npz_base = '/scratch/mch/maregger/hailclass/convective_wind/full_composite_npz/'
output_path = "/scratch/mch/fackerma/orders/TRT_modelsetup_2/Model_Setup_XX.pkl"

# Group by timestamp for NPZ loading
grouped = final_df.groupby('timestamp')

all_results = []
for timestamp, group in grouped:
    try:
        # Load corresponding NPZ file
        npz_time = timestamp.strftime('%Y%m%d%H%M00')
        npz_path = f"{npz_base}{npz_time}_conv_wind_composite_data.npz"
        
        if not os.path.exists(npz_path):
            print(f"⚠️ NPZ file not found: {npz_path}")
            continue
            
        with np.load(npz_path) as data:
            ZH = data['ZH_max']
            rad_shear = data['RAD_SHEAR_LLSD_max']
            KDP = data['KDP_max']
            
        # Process the group
        processed_group = calculate_metrics(
            filtered_df=group,
            clons=clons,
            clats=clats,
            ZH=ZH,
            rad_shear=rad_shear,
            KDP=KDP
        )
        
        all_results.append(processed_group)
        print(f"Processed {timestamp}")
        
    except Exception as e:
        print(f"Error processing {timestamp}: {str(e)}")

# 5. Save Final Output
if all_results:
    final_output = pd.concat(all_results, ignore_index=True)
    final_output.to_pickle(output_path)
    print(f"Saved final output to {output_path}")
else:
    print("No data processed - output file not created")


Loading TRT_2019_05-10.pkl...
Loading TRT_2020_05-10.pkl...
Loading TRT_2021_05-10.pkl...
Loading TRT_2022_05-10.pkl...
Loading TRT_2023_05-10.pkl...


  return GeometryArray(data, crs=_get_common_crs(to_concat))


Number of rows after time filtering: 297918
Number of rows after traj_ID filtering: 18625
Processed 2019-05-11 12:25:00+00:00
Processed 2019-05-11 12:30:00+00:00
Processed 2019-05-11 12:35:00+00:00
Processed 2019-05-11 12:40:00+00:00
Processed 2019-05-11 12:45:00+00:00


KeyboardInterrupt: 

## Adjusted Code

In [5]:
import os
from datetime import datetime
import numpy as np
from shapely.geometry import Point
from scipy.ndimage import center_of_mass
import pandas as pd
import matplotlib.pyplot as plt
from pyproj import Transformer
from shapely.prepared import prep





# Define the Swiss grid (adjusted to match data dimensions)
chx = np.arange(255000, 255000 + 710 * 1000, 1000)  # Easting values (710 points)
chy = sorted(np.arange(-160000, -160000 + 640 * 1000, 1000), reverse=True)  # Northing values (640 points)
X, Y = np.meshgrid(chx, chy)

# Initialize transformer for Swiss grid to WGS84 (EPSG:21781 to EPSG:4326 PlateCarree)
transformer = Transformer.from_crs(21781, 4326, always_xy=True)
clons, clats = transformer.transform(X, Y)



def calculate_metrics(filtered_df, clons, clats, ZH, rad_shear, KDP):
    # Precompute grid properties
    ny, nx = clons.shape
    nz = ZH.shape[2]
    clons_flat = clons.ravel()
    clats_flat = clats.ravel()
    
    # Initialize results storage
    results = {
        'ZH_com_height': [], 'ZH_percent_above_30': [], 'ZH_percent_above_35': [], 'ZH_percent_above_40': [], 'ZH_percent_above_45': [], 'ZH_percent_above_50': [], 'ZH_percent_above_55': [],
        'KDP_com_height': [], 'KDP_percent_above_2': [], 'KDP_percent_above_1.5': [], 'KDP_percent_above_1': [], 'KDP_percent_above_0.5': [],
        'rad_shear_max': [], 'rad_shear_percent_above_2.5': [], 'rad_shear_percent_above_2': [], 'rad_shear_percent_above_1.5': [], 'rad_shear_percent_above_1': [], 'rad_shear_percent_above_0.5': [],
        'area_p': [],   
        
        # New ZH metrics
        'ZH_45_height': [], 'ZH_20_height': [],
        'ZH_95th_percentile': [], 'ZH_95th_percentile_height': [],
        'ZH_max': [], 'ZH_max_height': [],
    
        # New KDP metrics
        'KDP_95th_percentile': [], 'KDP_95th_percentile_height': [],
        'KDP_max': [], 'KDP_max_height': [],
    
        # New rad_shear metrics
        'rad_shear_95th_percentile': [], 'rad_shear_95th_percentile_height': [],
        'rad_shear_max_height': []# New column for polygon area
    }

    for _, row in filtered_df.iterrows():
        poly = row['geometry']
        
        # Calculate polygon area in square meters
        poly_area = poly.area  # Shapely's area property calculates area in native CRS units (meters for Swiss grid)
        results['area_p'].append(poly_area)
        
        prep_poly = prep(poly)
        minx, miny, maxx, maxy = poly.bounds
        
        # Bounding box optimization
        bbox_mask = (clons_flat >= minx) & (clons_flat <= maxx) & \
                    (clats_flat >= miny) & (clats_flat <= maxy)
        if not bbox_mask.any():
            results = _append_defaults(results)
            continue
            
        # Vectorized containment check
        contained = np.array([prep_poly.contains(Point(p)) 
                               for p in zip(clons_flat[bbox_mask], clats_flat[bbox_mask])])
        if not contained.any():
            results = _append_defaults(results)
            continue
            
        # Create 2D mask
        mask_2d = np.zeros((ny, nx), bool)
        yx_indices = np.unravel_index(np.where(bbox_mask)[0][contained], (ny, nx))
        mask_2d[yx_indices] = True
        
        # Extend the mask to 3D by repeating along the vertical dimension
        mask_3d = np.broadcast_to(mask_2d[..., None], (ny, nx, nz))
        
        # Extract values within the polygon for ZH, KDP, rad_shear
        ZH_masked = np.where(mask_3d, ZH, np.nan)
        KDP_masked = np.where(mask_3d, KDP, np.nan)
        rad_shear_masked = np.where(mask_3d, rad_shear, np.nan)

        # Replace NaN values with 0 for calculations
        ZH_masked[np.isnan(ZH_masked)] = 0
        KDP_masked[np.isnan(KDP_masked)] = 0
        rad_shear_masked[np.isnan(rad_shear_masked)] = 0
        
        # Calculate metrics for ZH
        if np.any(ZH_masked > 0):  # Check if there are valid values
            com_ZH = center_of_mass(ZH_masked)  # Center of mass height
            results['ZH_com_height'].append(com_ZH[2])  # Use the vertical dimension (z-axis)
            results['ZH_percent_above_30'].append(np.sum(ZH_masked > 30) / np.size(ZH_masked) * 100)
            results['ZH_percent_above_35'].append(np.sum(ZH_masked > 35) / np.size(ZH_masked) * 100)
            results['ZH_percent_above_40'].append(np.sum(ZH_masked > 40) / np.size(ZH_masked) * 100)
            results['ZH_percent_above_45'].append(np.sum(ZH_masked > 45) / np.size(ZH_masked) * 100)
            results['ZH_percent_above_50'].append(np.sum(ZH_masked > 50) / np.size(ZH_masked) * 100)
            results['ZH_percent_above_55'].append(np.sum(ZH_masked > 55) / np.size(ZH_masked) * 100)
            results['ZH_45_height'].append(np.max(np.where(ZH_masked >= 45)[2]) if np.any(ZH_masked >= 45) else np.nan)
            results['ZH_20_height'].append(np.max(np.where(ZH_masked >= 20)[2]) if np.any(ZH_masked >= 20) else np.nan)
            zh_95_val = np.percentile(ZH_masked[ZH_masked > 0], 95)
            results['ZH_95th_percentile'].append(zh_95_val)
            results['ZH_95th_percentile_height'].append(np.max(np.where(ZH_masked >= zh_95_val)[2]) if np.any(ZH_masked >= zh_95_val) else np.nan)
            zh_max = np.nanmax(ZH_masked)
            results['ZH_max'].append(zh_max)
            results['ZH_max_height'].append(np.max(np.where(ZH_masked == zh_max)[2]) if np.any(ZH_masked == zh_max) else np.nan)

        else:
            results['ZH_com_height'].append(np.nan)
            results['ZH_percent_above_30'].append(0)
            results['ZH_percent_above_35'].append(0)
            results['ZH_percent_above_40'].append(0)
            results['ZH_percent_above_45'].append(0)
            results['ZH_percent_above_50'].append(0)
            results['ZH_percent_above_55'].append(0)
            results['ZH_45_height'].append(np.nan)
            results['ZH_20_height'].append(np.nan)
            results['ZH_95th_percentile'].append(0)
            results['ZH_95th_percentile_height'].append(np.nan)
            results['ZH_max'].append(0)
            results['ZH_max_height'].append(np.nan)    
        
        # Calculate metrics for KDP
        if np.any(KDP_masked > 0):  # Check if there are valid values
            com_KDP = center_of_mass(KDP_masked)  # Center of mass height
            results['KDP_com_height'].append(com_KDP[2])  # Use the vertical dimension (z-axis)
            results['KDP_percent_above_2'].append(np.sum(KDP_masked > 2) / np.size(KDP_masked) * 100)
            results['KDP_percent_above_1.5'].append(np.sum(KDP_masked > 1.5) / np.size(KDP_masked) * 100)
            results['KDP_percent_above_1'].append(np.sum(KDP_masked > 1) / np.size(KDP_masked) * 100)
            results['KDP_percent_above_0.5'].append(np.sum(KDP_masked > 0.5) / np.size(KDP_masked) * 100)
            kdp_95_val = np.percentile(KDP_masked[KDP_masked > 0], 95)
            results['KDP_95th_percentile'].append(kdp_95_val)
            results['KDP_95th_percentile_height'].append(np.max(np.where(KDP_masked >= kdp_95_val)[2]) if np.any(KDP_masked >= kdp_95_val) else np.nan)
            kdp_max = np.nanmax(KDP_masked)
            results['KDP_max'].append(kdp_max)
            results['KDP_max_height'].append(np.max(np.where(KDP_masked == kdp_max)[2]) if np.any(KDP_masked == kdp_max) else np.nan)

        else:
            results['KDP_com_height'].append(np.nan)
            results['KDP_percent_above_2'].append(0)
            results['KDP_percent_above_1.5'].append(0)
            results['KDP_percent_above_1'].append(0)
            results['KDP_percent_above_0.5'].append(0)
            results['KDP_95th_percentile'].append(0)
            results['KDP_95th_percentile_height'].append(np.nan)
            results['KDP_max'].append(0)
            results['KDP_max_height'].append(np.nan)
        
        # Calculate metrics for rad_shear
        if np.any(rad_shear_masked > 0):  # Check if there are valid values
            results['rad_shear_max'].append(np.nanmax(rad_shear_masked))
            results['rad_shear_percent_above_2.5'].append(np.sum(rad_shear_masked > 2.5) / np.size(rad_shear_masked) * 100)
            results['rad_shear_percent_above_2'].append(np.sum(rad_shear_masked > 2) / np.size(rad_shear_masked) * 100)
            results['rad_shear_percent_above_1.5'].append(np.sum(rad_shear_masked > 1.5) / np.size(rad_shear_masked) * 100)
            results['rad_shear_percent_above_1'].append(np.sum(rad_shear_masked > 1) / np.size(rad_shear_masked) * 100)
            results['rad_shear_percent_above_0.5'].append(np.sum(rad_shear_masked > 0.5) / np.size(rad_shear_masked) * 100)
            rs_95_val = np.percentile(rad_shear_masked[rad_shear_masked > 0], 95)
            results['rad_shear_95th_percentile'].append(rs_95_val)
            results['rad_shear_95th_percentile_height'].append(np.max(np.where(rad_shear_masked >= rs_95_val)[2]) if np.any(rad_shear_masked >= rs_95_val) else np.nan)
            rs_max = results['rad_shear_max'][-1]  # From existing max calculation
            results['rad_shear_max_height'].append(np.max(np.where(rad_shear_masked == rs_max)[2]) if not np.isnan(rs_max) else np.nan)

        else:
            results['rad_shear_max'].append(np.nan)
            results['rad_shear_percent_above_2.5'].append(0)
            results['rad_shear_percent_above_2'].append(0)
            results['rad_shear_percent_above_1.5'].append(0)
            results['rad_shear_percent_above_1'].append(0)
            results['rad_shear_percent_above_0.5'].append(0)
            results['rad_shear_95th_percentile'].append(0)
            results['rad_shear_95th_percentile_height'].append(np.nan)
            results['rad_shear_max_height'].append(np.nan)

    # Add results to DataFrame
    for col in results:
        filtered_df[col] = results[col]
        
    return filtered_df

def _append_defaults(results):
    """Append default values to result lists when no valid data is found."""
    for k in results:
        default = 0 if 'percent' in k or 'max' in k else np.nan
        results[k].append(default)
    return results 



import os
import pandas as pd
import numpy as np
from datetime import datetime

# 1. Load Extraction Dates
extraction_file = "/scratch/mch/fackerma/orders/Reworked_gust_extraction_dates.txt"

# Read the file into a DataFrame
extraction_dates = pd.read_csv(extraction_file)

# Parse 'Valid_Time' column into datetime format
extraction_dates['Valid_Time'] = pd.to_datetime(extraction_dates['Valid_Time'], format='%Y%m%d%H%M%S')

# Extract valid times as Python datetime objects
#valid_times = extraction_dates['Valid_Time'].dt.to_pydatetime()

# Convert valid_times to timezone-aware datetime objects (UTC)
valid_times = pd.to_datetime(extraction_dates['Valid_Time'], format='%Y%m%d%H%M%S').dt.tz_localize('UTC')



# 2. Load Merged DataFrame
base_dir = "/scratch/mch/fackerma/orders/TRT_processing_3/"
yearly_files = [
    "TRT_2019_05-10.pkl",
    "TRT_2020_05-10.pkl",
    "TRT_2021_05-10.pkl",
    "TRT_2022_05-10.pkl",
    "TRT_2023_05-10.pkl",
]

dfs = []
for file_name in yearly_files:
    file_path = os.path.join(base_dir, file_name)
    if os.path.exists(file_path):
        print(f"Loading {file_name}...")
        dfs.append(pd.read_pickle(file_path))

merged_df = pd.concat(dfs, ignore_index=True)

# 3. Filter by Valid Times and Gust Flags
# Convert merged_df timestamp to match extraction format
merged_df['timestamp'] = pd.to_datetime(merged_df['yyyymmddHHMM'], utc=True)


# Time filtering
time_filter = merged_df['timestamp'].isin(valid_times)
filtered_by_time = merged_df[time_filter].copy()
print(f"Number of rows after time filtering: {filtered_by_time.shape[0]}")


# Find traj_IDs with at least one Yes/No in Gust_Flag
valid_traj_ids = merged_df[merged_df['Gust_Flag'].isin(['Yes', 'No'])]['traj_ID'].unique()
traj_filter = filtered_by_time['traj_ID'].isin(valid_traj_ids)


final_df = filtered_by_time[traj_filter].copy()
print(f"Number of rows after traj_ID filtering: {final_df.shape[0]}")

# 4. Process Data with calculate_metrics
npz_base = '/scratch/mch/maregger/hailclass/convective_wind/full_composite_npz/'
output_path = "/scratch/mch/fackerma/orders/TRT_modelsetup_3/Model_Setup_XX.pkl"

# Group by timestamp for NPZ loading
grouped = final_df.groupby('timestamp')

all_results = []
for timestamp, group in grouped:
    try:
        # Load corresponding NPZ file
        npz_time = timestamp.strftime('%Y%m%d%H%M00')
        npz_path = f"{npz_base}{npz_time}_conv_wind_composite_data_pl.npz"
        
        if not os.path.exists(npz_path):
            print(f"⚠️ NPZ file not found: {npz_path}")
            continue
            
        with np.load(npz_path) as data:
            ZH = data['ZH_max']
            rad_shear = data['RAD_SHEAR_LLSD_max']
            KDP = data['KDP_max']
            
        # Process the group
        processed_group = calculate_metrics(
            filtered_df=group,
            clons=clons,
            clats=clats,
            ZH=ZH,
            rad_shear=rad_shear,
            KDP=KDP
        )
        
        all_results.append(processed_group)
        print(f"Processed {timestamp}")
        
    except Exception as e:
        print(f"Error processing {timestamp}: {str(e)}")

# 5. Save Final Output
if all_results:
    final_output = pd.concat(all_results, ignore_index=True)
    final_output.to_pickle(output_path)
    print(f"Saved final output to {output_path}")
else:
    print("No data processed - output file not created")


Loading TRT_2019_05-10.pkl...
Loading TRT_2020_05-10.pkl...
Loading TRT_2021_05-10.pkl...
Loading TRT_2022_05-10.pkl...
Loading TRT_2023_05-10.pkl...


  return GeometryArray(data, crs=_get_common_crs(to_concat))


Number of rows after time filtering: 416914
Number of rows after traj_ID filtering: 33066
Processed 2019-05-11 12:25:00+00:00
Processed 2019-05-11 12:30:00+00:00
Processed 2019-05-11 12:35:00+00:00


KeyboardInterrupt: 