In [16]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.signal import savgol_filter
import json

In [17]:
start_year = 1960
end_year = 2024

# kecamatan = {'warungkondang': ["era5"], 'gekbrong': ['era5'], 'cugenang': ['era5'],  'cianjur': ['era5'],
#              'campaka': ['era5'],  'cilaku': ['era5'],  'cibeber': ['era5'],} # , "OpenWeather", "era5"
kecamatan = {'warungkondang': ["chirts"], 'gekbrong': ['chirts'], 'cugenang': ['chirts'],  'cianjur': ['chirts'],
             'campaka': ['chirts'],  'cilaku': ['chirts'],  'cibeber': ['chirts'],} # , "OpenWeather", "era5"
climate_dir = 'climate_timeseries'
output_dir = 'cleaned'

In [20]:
def power_climate_preprocessing(df):
    power_columns = [
        'T2M',           # MERRA-2 Temperature at 2 Meters (C)
        'T2MDEW',        # MERRA-2 Dew/Frost Point at 2 Meters (C)
        'T2MWET',        # MERRA-2 Wet Bulb Temperature at 2 Meters (C)
        'TS',            # MERRA-2 Earth Skin Temperature (C)
        'T2M_RANGE',     # MERRA-2 Temperature at 2 Meters Range (C)
        'T2M_MAX',       # MERRA-2 Temperature at 2 Meters Maximum (C)
        'T2M_MIN',       # MERRA-2 Temperature at 2 Meters Minimum (C)
        'PS',            # MERRA-2 Surface Pressure (kPa)
        'WS2M',          # MERRA-2 Wind Speed at 2 Meters (m/s)
        'WS2M_MAX',      # MERRA-2 Wind Speed at 2 Meters Maximum (m/s)
        'WS2M_MIN',      # MERRA-2 Wind Speed at 2 Meters Minimum (m/s)
        'GWETTOP',       # MERRA-2 Surface Soil Wetness (1)
        'GWETROOT'      # MERRA-2 Root Zone Soil Wetness (1)
    ]
    
    df['datetime'] = pd.to_datetime(df['YEAR'].astype(str) + 
                                  df['DOY'].astype(str), 
                                  format='%Y%j')
    df.set_index('datetime', inplace=True) # set kolom DATE sebagai index terbaru
    df = df.drop(columns=['YEAR', 'DOY'], axis=0)
    df = df[power_columns]
    df = df.replace(-999, np.nan)
    # print(df.describe())
    df = df.interpolate(limit_direction='both')

    '''
    Columns description:
    T2M           MERRA-2 Temperature at 2 Meters (C)
    T2MDEW        MERRA-2 Dew/Frost Point at 2 Meters (C)
    T2MWET        MERRA-2 Wet Bulb Temperature at 2 Meters (C)
    TS            MERRA-2 Earth Skin Temperature (C)
    T2M_RANGE     MERRA-2 Temperature at 2 Meters Range (C)
    T2M_MAX       MERRA-2 Temperature at 2 Meters Maximum (C)
    T2M_MIN       MERRA-2 Temperature at 2 Meters Minimum (C)
    PS            MERRA-2 Surface Pressure (kPa)
    WS2M          MERRA-2 Wind Speed at 2 Meters (m/s)
    WS2M_MAX      MERRA-2 Wind Speed at 2 Meters Maximum (m/s)
    WS2M_MIN      MERRA-2 Wind Speed at 2 Meters Minimum (m/s)
    GWETTOP       MERRA-2 Surface Soil Wetness (1)
    GWETROOT      MERRA-2 Root Zone Soil Wetness (1)
    '''
    return df[(df.index.year >= start_year) & (df.index.year <= end_year)]

def era5_climate_preprocessing(df):
    df['datetime'] = pd.to_datetime(df['date'], format='%Y%m%d')
    df.set_index('datetime', inplace=True) # set kolom DATE sebagai index terbaru

    era5_features = [
        # Temperature variables
        'temperature_2m',                       # Air temperature
        'temperature_2m_min',                   # Daily minimum air temperature
        'temperature_2m_max',                   # Daily maximum air temperature
        'soil_temperature_level_1',             # Topsoil temperature (0-7 cm)
        'soil_temperature_level_2',             # Soil temperature (7-28 cm)
        
        # Moisture variables
        'volumetric_soil_water_layer_1',        # Topsoil moisture content
        'volumetric_soil_water_layer_2',        # Soil moisture (7-28 cm)
        'volumetric_soil_water_layer_3',        # Soil moisture (28-100 cm)
        'total_precipitation_sum',              # Total rainfall and snow
        'dewpoint_temperature_2m',              # Air humidity indicator
        
        # Radiation and energy variables
        'surface_solar_radiation_downwards_sum', # Solar radiation at surface
        'surface_net_solar_radiation_sum',       # Net solar radiation at surface
        
        # Evaporation and water cycle
        'total_evaporation_sum',                 # Actual evaporation
        
        # Wind variables
        'u_component_of_wind_10m',               # East-west wind component
        'v_component_of_wind_10m'                # North-south wind component
    ]

    df = df[era5_features]
    print(df.describe())
    for col in era5_features:
        if 'temperature' in col:
            df[col] = df[col] - 273.15
    df.loc[:, 'temperature_range'] = df['temperature_2m_max'] - df['temperature_2m_min']
    return df[(df.index.year >= start_year) & (df.index.year <= end_year)]

def openweather_preprocessing(df):
    openweather_features = [
        'temp',         # Current temperature (C or K depending on units)
        'feels_like',   # Perceived temperature considering humidity and wind (C or K)
        'temp_min',     # Minimum temperature at the moment (C or K)
        'temp_max',     # Maximum temperature at the moment (C or K)
        'pressure',     # Atmospheric pressure at sea level (hPa)
        'humidity',     # Humidity percentage (%)
        'wind_speed',   # Wind speed (meter/sec)
        'wind_deg',     # Wind direction in degrees (0–360)
        'rain_1h',      # Rain volume for the last 1 hour (mm)
        'rain_3h',      # Rain volume for the last 3 hours (mm)
        'clouds_all'    # Cloudiness percentage (%)
    ]
    
    print(df.describe())
    df['dt_iso'] = df['dt_iso'].str.replace(' UTC', '', regex=False)
    df['datetime'] = pd.to_datetime(df['dt_iso'])
    df.drop(columns='dt_iso', inplace=True)
    
    df.set_index('datetime', inplace=True)
    df = df[openweather_features]
    df.loc[:,'rain_1h'] = df['rain_1h'].fillna(0)
    df.loc[:,'rain_3h'] = df['rain_3h'].fillna(0)
    df.loc[:,'temp_range'] = df['temp_max'] - df['temp_min']
    df.loc[:, 'rain_sum'] = df['rain_1h'] + df['rain_3h']
    df = df.interpolate(limit_direction='both')
    df = df.resample('D').mean()
    
    return df[(df.index.year >= start_year) & (df.index.year <= end_year)]

def chirts_preprocessing(df):
    df['datetime'] = pd.to_datetime(df['date'], format='%Y%m%d')
    df.set_index('datetime', inplace=True) # set kolom DATE sebagai index terbaru

    chirts_features = [
        'heat_index',
        'maximum_temperature',
        'minimum_temperature',
        'relative_humidity',
        'saturation_vapor_pressure',
        'vapor_pressure_deficit',
    ]
    
    df = df[chirts_features]
    df['heat_index'] = (df['heat_index'] - 32) * (5/9)
    df.loc[:, 'temperature_range'] = df['maximum_temperature'] - df['minimum_temperature']
    print(df.describe())

    return df[(df.index.year >= start_year) & (df.index.year <= end_year)]

In [21]:
for key, value in kecamatan.items():
    for folder in value:
        df = pd.read_csv(f"{climate_dir}/{folder}/{key}.csv")
        # print(df.head())
        # print(df.describe())
        if folder == "POWER":
            df = power_climate_preprocessing(df)
        elif folder == "OpenWeather":
            df = openweather_preprocessing(df)
        elif folder == "chirts":
            df = chirts_preprocessing(df)
        else:
            df = era5_climate_preprocessing(df)

        print(folder)
        # print(df.head())
        # print(df.describe())
        df.to_csv(f"{climate_dir}/{output_dir}/{folder}_{key}.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['heat_index'] = (df['heat_index'] - 32) * (5/9)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'temperature_range'] = df['maximum_temperature'] - df['minimum_temperature']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['heat_index'] = (df['heat_index'] - 32) * (5/9)
A value is tryin

         heat_index  maximum_temperature  minimum_temperature  \
count  12419.000000         12419.000000         12419.000000   
mean      23.036485            26.600792            18.798293   
std        0.897901             1.193567             1.104859   
min       18.645152            20.768885            11.939768   
25%       22.464672            25.931827            18.186519   
50%       23.032207            26.706540            18.934836   
75%       23.582184            27.386673            19.555083   
max       27.077967            31.357861            21.829409   

       relative_humidity  saturation_vapor_pressure  vapor_pressure_deficit  \
count       12419.000000               12419.000000            12419.000000   
mean           88.191428                   2.863136                0.368166   
std             4.753592                   0.145378                0.148752   
min            58.659813                   2.276981                0.025226   
25%            86.4

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['heat_index'] = (df['heat_index'] - 32) * (5/9)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'temperature_range'] = df['maximum_temperature'] - df['minimum_temperature']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['heat_index'] = (df['heat_index'] - 32) * (5/9)
A value is tryin

         heat_index  maximum_temperature  minimum_temperature  \
count  12419.000000         12419.000000         12419.000000   
mean      21.700158            25.324030            17.425632   
std        0.851639             1.205321             1.097884   
min       17.328889            19.401272            10.506824   
25%       21.152508            24.651670            16.820539   
50%       21.714936            25.428308            17.542783   
75%       22.257763            26.107701            18.170792   
max       25.285136            30.023364            20.548880   

       relative_humidity  saturation_vapor_pressure  vapor_pressure_deficit  \
count       12419.000000               12419.000000            12419.000000   
mean           93.988794                   2.643929                0.178380   
std             4.048982                   0.135498                0.118308   
min            63.811349                   2.085426                0.001356   
25%            92.6

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['heat_index'] = (df['heat_index'] - 32) * (5/9)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'temperature_range'] = df['maximum_temperature'] - df['minimum_temperature']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['heat_index'] = (df['heat_index'] - 32) * (5/9)
A value is tryin