In [130]:
import pandas as pd
import numpy as np
import os

In [115]:
# Read the cosmo-e.csv export, drop its first two data rows (presumably
# unit/description rows that follow the header -- TODO confirm) and renumber
# the remaining rows from 0.
# NOTE(review): this absolute, user-specific path only resolves on the original
# author's machine; consider a configurable data directory.
df = (
    pd.read_csv("/home/furkan-dev/Furkan/PCPP/data/cosmo-e.csv")
    .iloc[2:]
    .reset_index(drop=True)
)

In [116]:
# Parse the 'time' strings (layout '%Y%m%d %H:%M') into timestamps.
# NOTE(review): in the printed preview of `result` this column advances together
# with 't' (12:00 at t=0, 18:00 at t=6, 00:00 at t=12), which looks like a valid
# time rather than a model initialisation time -- confirm the naming is intended.
df['forecast_reference_time'] = pd.to_datetime(df['time'], format='%Y%m%d %H:%M')

# Lead time as a float: only the text before the first ':' of 'leadtime' is
# kept (presumably the hours of an 'HH:MM' string -- anything after it is ignored).
df['t'] = df['leadtime'].str.split(':').str.get(0).astype(float)

In [117]:
# Start the feature table from the three identifying columns of `df`;
# the source column 'stn' is exposed under the name 'station'.
result = pd.DataFrame({
    target: df[source]
    for target, source in (
        ('station', 'stn'),
        ('forecast_reference_time', 'forecast_reference_time'),
        ('t', 't'),
    )
})

In [118]:
def process_variable(df, variable_prefix):
    """Return the row-wise mean of all columns whose name starts with a prefix.

    Each matching column is converted to float64 (entries that cannot be
    parsed as numbers become NaN) and the missing-value sentinel -999 is
    replaced by NaN. The cleaned columns are written back into ``df``, so the
    caller's frame is modified in place, as before.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the columns to average (presumably one column per
        ensemble member -- verify against the input file).
    variable_prefix : str
        Prefix selecting the columns, e.g. ``'T_2M'``.

    Returns
    -------
    pandas.Series
        Mean over the matching columns for every row of ``df``. NaN entries
        are skipped by ``DataFrame.mean``; a row whose matching entries are
        all NaN yields NaN.
    """
    member_cols = [col for col in df.columns if col.startswith(variable_prefix)]

    for col in member_cols:
        # Cast to float *before* removing the sentinel: a column that parses
        # as pure integers would otherwise have NaN written into an int64
        # column, an upcast-on-setitem that pandas deprecates.
        values = pd.to_numeric(df[col], errors='coerce').astype(float)
        df[col] = values.mask(values == -999.0)

    return df[member_cols].mean(axis=1)

In [119]:
# Ensemble-mean features: average every input column that shares the given
# name prefix and store the result under the corresponding feature name.
for feature_column, prefix in (
    ('coe:air_temperature_ensavg', 'T_2M'),
    ('coe:relative_humidity_ensavg', 'RELHUM_2M'),
):
    result[feature_column] = process_variable(df, prefix)

In [120]:
# Placeholder: every row receives the same constant 1013.25 (presumably hPa,
# i.e. standard sea-level pressure -- confirm) instead of a value from the data.
result['coe:surface_air_pressure_ensavg'] = 1013.25 # TODO: don't forget to change this

In [121]:
def calculate_dewpoint(T, RH):
    """Compute a dew point from a scalar temperature and relative humidity.

    Uses a Magnus-type expression with coefficients 17.27 and 237.7.

    Parameters
    ----------
    T : float
        Air temperature (presumably degrees Celsius, given the coefficients --
        confirm).
    RH : float
        Relative humidity in percent; it is divided by 100 before the log.

    Returns
    -------
    float
        The dew point, or NaN when either input is missing or ``RH <= 0``
        (the logarithm would be undefined).
    """
    magnus_a, magnus_b = 17.27, 237.7

    missing = pd.isna(T) or pd.isna(RH)
    if missing or RH <= 0:
        return np.nan

    gamma = np.log(RH / 100.0) + (magnus_a * T) / (magnus_b + T)
    return magnus_b * gamma / (magnus_a - gamma)

In [122]:
# Dew point for every row, computed one (temperature, humidity) pair at a time
# because calculate_dewpoint works on scalars.
result['coe:dew_point_temperature_ensavg'] = [
    calculate_dewpoint(temperature, humidity)
    for temperature, humidity in zip(
        result['coe:air_temperature_ensavg'],
        result['coe:relative_humidity_ensavg'],
    )
]

In [123]:
# Dew point depression: air temperature minus dew point temperature.
result['coe:dew_point_depression_ensavg'] = result['coe:air_temperature_ensavg'].sub(
    result['coe:dew_point_temperature_ensavg']
)

In [124]:
def calculate_mixing_ratio(T_d, P):
    """Compute a water-vapour mixing ratio from a scalar dew point and pressure.

    The vapour pressure is ``6.107 * exp(17.368 * T_d / (238.83 + T_d))`` and
    the result is ``622 * e / (P - e)``.

    Parameters
    ----------
    T_d : float
        Dew point temperature (presumably degrees Celsius -- confirm).
    P : float
        Pressure, in the same unit as the vapour pressure above (presumably
        hPa -- confirm).

    Returns
    -------
    float
        The mixing ratio (presumably g/kg given the factor 622 -- confirm),
        or NaN when either input is missing.
    """
    if any(pd.isna(value) for value in (T_d, P)):
        return np.nan

    # NOTE(review): the same coefficients are applied to every dew point,
    # including sub-zero values -- confirm this is intended.
    coeff_a, coeff_b, base_pressure = 17.368, 238.83, 6.107

    exponent = (coeff_a * T_d) / (coeff_b + T_d)
    vapor_pressure = base_pressure * np.exp(exponent)
    return 622.0 * (vapor_pressure / (P - vapor_pressure))

In [125]:
# Mixing ratio for every row, computed one (dew point, pressure) pair at a time
# because calculate_mixing_ratio works on scalars.
result['coe:water_vapor_mixing_ratio_ensavg'] = [
    calculate_mixing_ratio(dew_point, pressure)
    for dew_point, pressure in zip(
        result['coe:dew_point_temperature_ensavg'],
        result['coe:surface_air_pressure_ensavg'],
    )
]

In [126]:
# Cyclical encodings of the 'forecast_reference_time' column: hour of day on a
# 24-hour circle and day of year on a fixed 365-day circle (so day 366 of a
# leap year lands slightly past one full turn).
hour_angle = 2 * np.pi * result['forecast_reference_time'].dt.hour / 24
day_angle = 2 * np.pi * result['forecast_reference_time'].dt.dayofyear / 365

result['time:cos_hourofday'] = np.cos(hour_angle)
result['time:sin_hourofday'] = np.sin(hour_angle)
result['time:cos_dayofyear'] = np.cos(day_angle)
result['time:sin_dayofyear'] = np.sin(day_angle)


In [127]:
# Expose the lead time 't' a second time as a feature column; 't' itself is
# used as an index level when the table is converted to xarray.
result['coe:leadtime'] = result['t']

In [128]:
# Index the table by (station, forecast_reference_time, t) and convert it to an
# xarray object; the remaining columns become its variables.
# NOTE(review): presumably every (station, forecast_reference_time, t)
# combination occurs only once -- verify, since the conversion relies on it.
ds = result.set_index(['station', 'forecast_reference_time', 't']).to_xarray()

In [131]:
# Make sure the output directory exists. exist_ok=True replaces the separate
# existence check, so the cell is safe to re-run and cannot fail if the
# directory appears between the check and the creation.
os.makedirs('data', exist_ok=True)

In [133]:
# Write the feature dataset to a Zarr store. mode='w' overwrites a store left
# by a previous run; with the default mode the call raises when
# 'data/features.zarr' already exists, so the notebook could not be re-run.
ds.to_zarr('data/features.zarr', mode='w')

  return cls(**configuration_parsed)
  meta = AsyncArray._create_metadata_v3(
  return cls(**configuration_parsed)


<xarray.backends.zarr.ZarrStore at 0x720d30031240>

In [134]:
# Plain-text preview of the first rows of the assembled feature table.
print(result.head())

  station forecast_reference_time     t  coe:air_temperature_ensavg  \
0     ARO     2025-04-15 12:00:00   0.0                    7.066667   
1     ARO     2025-04-15 18:00:00   6.0                    6.766667   
2     ARO     2025-04-16 00:00:00  12.0                    5.171429   
3     ARO     2025-04-16 06:00:00  18.0                    6.261905   
4     ARO     2025-04-16 12:00:00  24.0                   11.200000   

   coe:relative_humidity_ensavg  coe:surface_air_pressure_ensavg  \
0                     63.323810                          1013.25   
1                     59.123810                          1013.25   
2                     62.285714                          1013.25   
3                     57.923810                          1013.25   
4                     41.466667                          1013.25   

   coe:dew_point_temperature_ensavg  coe:dew_point_depression_ensavg  \
0                          0.575254                         6.491413   
1                   