In [2]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.signal import savgol_filter
import json

In [10]:
start_year = 1960
end_year = 2024

kecamatan = {'warungkondang': ["era5"], 'gekbrong': ['era5'], 'cugenang': ['era5'],  'cianjur': ['era5'],
             'campaka': ['era5'],  'cilaku': ['era5'],  'cibeber': ['era5'],} # , "OpenWeather", "era5"
# kecamatan = {'warungkondang': ["chirts"], 'gekbrong': ['chirts'], 'cugenang': ['chirts'],  'cianjur': ['chirts'],
#              'campaka': ['chirts'],  'cilaku': ['chirts'],  'cibeber': ['chirts'],} # , "OpenWeather", "era5"
climate_dir = 'climate_timeseries'
output_dir = 'cleaned'

In [13]:
def power_climate_preprocessing(df):
    power_columns = [
        'T2M',           # MERRA-2 Temperature at 2 Meters (C)
        'T2MDEW',        # MERRA-2 Dew/Frost Point at 2 Meters (C)
        'T2MWET',        # MERRA-2 Wet Bulb Temperature at 2 Meters (C)
        'TS',            # MERRA-2 Earth Skin Temperature (C)
        'T2M_RANGE',     # MERRA-2 Temperature at 2 Meters Range (C)
        'T2M_MAX',       # MERRA-2 Temperature at 2 Meters Maximum (C)
        'T2M_MIN',       # MERRA-2 Temperature at 2 Meters Minimum (C)
        'PS',            # MERRA-2 Surface Pressure (kPa)
        'WS2M',          # MERRA-2 Wind Speed at 2 Meters (m/s)
        'WS2M_MAX',      # MERRA-2 Wind Speed at 2 Meters Maximum (m/s)
        'WS2M_MIN',      # MERRA-2 Wind Speed at 2 Meters Minimum (m/s)
        'GWETTOP',       # MERRA-2 Surface Soil Wetness (1)
        'GWETROOT'      # MERRA-2 Root Zone Soil Wetness (1)
    ]
    
    df['datetime'] = pd.to_datetime(df['YEAR'].astype(str) + 
                                  df['DOY'].astype(str), 
                                  format='%Y%j')
    df.set_index('datetime', inplace=True) # set kolom DATE sebagai index terbaru
    df = df.drop(columns=['YEAR', 'DOY'], axis=0)
    df = df[power_columns]
    df = df.replace(-999, np.nan)
    # print(df.describe())
    df = df.interpolate(limit_direction='both')

    '''
    Columns description:
    T2M           MERRA-2 Temperature at 2 Meters (C)
    T2MDEW        MERRA-2 Dew/Frost Point at 2 Meters (C)
    T2MWET        MERRA-2 Wet Bulb Temperature at 2 Meters (C)
    TS            MERRA-2 Earth Skin Temperature (C)
    T2M_RANGE     MERRA-2 Temperature at 2 Meters Range (C)
    T2M_MAX       MERRA-2 Temperature at 2 Meters Maximum (C)
    T2M_MIN       MERRA-2 Temperature at 2 Meters Minimum (C)
    PS            MERRA-2 Surface Pressure (kPa)
    WS2M          MERRA-2 Wind Speed at 2 Meters (m/s)
    WS2M_MAX      MERRA-2 Wind Speed at 2 Meters Maximum (m/s)
    WS2M_MIN      MERRA-2 Wind Speed at 2 Meters Minimum (m/s)
    GWETTOP       MERRA-2 Surface Soil Wetness (1)
    GWETROOT      MERRA-2 Root Zone Soil Wetness (1)
    '''
    return df[(df.index.year >= start_year) & (df.index.year <= end_year)]

def era5_climate_preprocessing(df):
    df['datetime'] = pd.to_datetime(df['date'], format='%Y%m%d')
    df.set_index('datetime', inplace=True) # set kolom DATE sebagai index terbaru

    era5_features = [
        # Temperature variables
        'temperature_2m',                       # Air temperature
        'temperature_2m_min',                   # Daily minimum air temperature
        'temperature_2m_max',                   # Daily maximum air temperature
        'soil_temperature_level_1',             # Topsoil temperature (0-7 cm)
        'soil_temperature_level_2',             # Soil temperature (7-28 cm)
        
        # Moisture variables
        'volumetric_soil_water_layer_1',        # Topsoil moisture content
        'volumetric_soil_water_layer_2',        # Soil moisture (7-28 cm)
        'volumetric_soil_water_layer_3',        # Soil moisture (28-100 cm)
        'total_precipitation_sum',              # Total rainfall and snow
        'dewpoint_temperature_2m',              # Air humidity indicator
        
        # Radiation and energy variables
        'surface_solar_radiation_downwards_sum', # Solar radiation at surface
        'surface_net_solar_radiation_sum',       # Net solar radiation at surface
        
        # Evaporation and water cycle
        'total_evaporation_sum',                 # Actual evaporation
        
        # Wind variables
        'u_component_of_wind_10m',               # East-west wind component
        'v_component_of_wind_10m'                # North-south wind component
    ]

    df = df[era5_features]
    print(df.describe())
    for col in era5_features:
        if 'temperature' in col:
            df[col] = df[col] - 273.15
    df.loc[:, 'temperature_range'] = df['temperature_2m_max'] - df['temperature_2m_min']
    return df[(df.index.year >= start_year) & (df.index.year <= end_year)]

def openweather_preprocessing(df):
    openweather_features = [
        'temp',         # Current temperature (C or K depending on units)
        'feels_like',   # Perceived temperature considering humidity and wind (C or K)
        'temp_min',     # Minimum temperature at the moment (C or K)
        'temp_max',     # Maximum temperature at the moment (C or K)
        'pressure',     # Atmospheric pressure at sea level (hPa)
        'humidity',     # Humidity percentage (%)
        'wind_speed',   # Wind speed (meter/sec)
        'wind_deg',     # Wind direction in degrees (0–360)
        'rain_1h',      # Rain volume for the last 1 hour (mm)
        'rain_3h',      # Rain volume for the last 3 hours (mm)
        'clouds_all'    # Cloudiness percentage (%)
    ]
    
    print(df.describe())
    df['dt_iso'] = df['dt_iso'].str.replace(' UTC', '', regex=False)
    df['datetime'] = pd.to_datetime(df['dt_iso'])
    df.drop(columns='dt_iso', inplace=True)
    
    df.set_index('datetime', inplace=True)
    df = df[openweather_features]
    df.loc[:,'rain_1h'] = df['rain_1h'].fillna(0)
    df.loc[:,'rain_3h'] = df['rain_3h'].fillna(0)
    df.loc[:,'temp_range'] = df['temp_max'] - df['temp_min']
    df.loc[:, 'rain_sum'] = df['rain_1h'] + df['rain_3h']
    df = df.interpolate(limit_direction='both')
    df = df.resample('D').mean()
    
    return df[(df.index.year >= start_year) & (df.index.year <= end_year)]

def chirts_preprocessing(df):
    df['datetime'] = pd.to_datetime(df['date'], format='%Y%m%d')
    df.set_index('datetime', inplace=True) # set kolom DATE sebagai index terbaru

    chirts_features = [
        'heat_index',
        'maximum_temperature',
        'minimum_temperature',
        'relative_humidity',
        'saturation_vapor_pressure',
        'vapor_pressure_deficit',
    ]
    
    df = df[chirts_features]
    df.loc[:, 'temperature_range'] = df['maximum_temperature'] - df['minimum_temperature']
    print(df.describe())

    return df[(df.index.year >= start_year) & (df.index.year <= end_year)]

In [14]:
for key, value in kecamatan.items():
    for folder in value:
        df = pd.read_csv(f"{climate_dir}/{folder}/{key}.csv")
        # print(df.head())
        # print(df.describe())
        if folder == "POWER":
            df = power_climate_preprocessing(df)
        elif folder == "OpenWeather":
            df = openweather_preprocessing(df)
        elif folder == "chirts":
            df = chirts_preprocessing(df)
        else:
            df = era5_climate_preprocessing(df)

        print(folder)
        # print(df.head())
        # print(df.describe())
        df.to_csv(f"{climate_dir}/{output_dir}/{folder}_{key}.csv")

       temperature_2m  temperature_2m_min  temperature_2m_max  \
count    23741.000000        23741.000000        23741.000000   
mean       294.938035          291.853317          299.095881   
std          0.862821            0.973268            1.413609   
min        291.143317          286.010435          293.402400   
25%        294.372998          291.350045          298.334540   
50%        294.855264          291.963716          299.017614   
75%        295.412949          292.492036          299.707293   
max        298.994208          294.709224          305.269981   

       soil_temperature_level_1  soil_temperature_level_2  \
count              23741.000000              23741.000000   
mean                 296.885014                296.873046   
std                    1.248802                  1.116553   
min                  293.368295                294.001362   
25%                  296.148175                296.204687   
50%                  296.684527                2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col] - 273.15
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'temperature_range'] = df['temperature_2m_max'] - df['temperature_2m_min']


       temperature_2m  temperature_2m_min  temperature_2m_max  \
count    23741.000000        23741.000000        23741.000000   
mean       294.911426          291.859306          299.063244   
std          0.861602            0.960390            1.406224   
min        291.083182          285.993947          293.369562   
25%        294.346831          291.360819          298.302749   
50%        294.830837          291.963201          298.988491   
75%        295.387251          292.488194          299.676085   
max        298.996661          294.753682          305.099797   

       soil_temperature_level_1  soil_temperature_level_2  \
count              23741.000000              23741.000000   
mean                 296.881434                296.869567   
std                    1.248382                  1.115610   
min                  293.334702                293.972753   
25%                  296.142667                296.200228   
50%                  296.686412                2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col] - 273.15
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'temperature_range'] = df['temperature_2m_max'] - df['temperature_2m_min']


       temperature_2m  temperature_2m_min  temperature_2m_max  \
count    23741.000000        23741.000000        23741.000000   
mean       294.384969          291.262103          298.605652   
std          0.831452            0.964585            1.391339   
min        290.916183          285.544134          292.995742   
25%        293.841263          290.759342          297.864601   
50%        294.312200          291.373317          298.533682   
75%        294.848646          291.900958          299.218834   
max        298.207155          294.130297          304.827402   

       soil_temperature_level_1  soil_temperature_level_2  \
count              23741.000000              23741.000000   
mean                 296.330608                296.313051   
std                    1.217778                  1.089878   
min                  292.845349                293.394036   
25%                  295.612985                295.659968   
50%                  296.139095                2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col] - 273.15
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'temperature_range'] = df['temperature_2m_max'] - df['temperature_2m_min']


       temperature_2m  temperature_2m_min  temperature_2m_max  \
count    23741.000000        23741.000000        23741.000000   
mean       295.657225          292.460242          299.860891   
std          0.912974            0.991268            1.499174   
min        291.963321          286.672111          294.096391   
25%        295.060243          291.943178          299.050201   
50%        295.554058          292.565352          299.743720   
75%        296.139971          293.110017          300.484795   
max        299.922249          295.390944          306.651425   

       soil_temperature_level_1  soil_temperature_level_2  \
count              23741.000000              23741.000000   
mean                 297.284176                297.266933   
std                    1.285200                  1.151352   
min                  293.794456                294.421322   
25%                  296.535856                296.582447   
50%                  297.056670                2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col] - 273.15
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'temperature_range'] = df['temperature_2m_max'] - df['temperature_2m_min']


       temperature_2m  temperature_2m_min  temperature_2m_max  \
count    23741.000000        23741.000000        23741.000000   
mean       294.863054          291.924173          298.923404   
std          0.767172            0.983355            1.268817   
min        291.274615          286.290031          293.446268   
25%        294.355649          291.424528          298.230468   
50%        294.823514          292.048330          298.914754   
75%        295.325979          292.579136          299.561170   
max        298.338847          294.755050          304.537350   

       soil_temperature_level_1  soil_temperature_level_2  \
count              23741.000000              23741.000000   
mean                 296.814757                296.806751   
std                    1.108935                  0.979618   
min                  293.306400                294.021386   
25%                  296.158927                296.213883   
50%                  296.682403                2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col] - 273.15
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'temperature_range'] = df['temperature_2m_max'] - df['temperature_2m_min']


       temperature_2m  temperature_2m_min  temperature_2m_max  \
count    23741.000000        23741.000000        23741.000000   
mean       296.290583          293.115033          300.379523   
std          0.942804            1.037158            1.521917   
min        292.296473          287.009787          294.509256   
25%        295.676919          292.580910          299.534340   
50%        296.185043          293.225219          300.260668   
75%        296.791119          293.794685          301.019272   
max        300.737746          296.460932          307.217549   

       soil_temperature_level_1  soil_temperature_level_2  \
count              23741.000000              23741.000000   
mean                 297.819605                297.809094   
std                    1.275478                  1.139111   
min                  294.205192                295.094504   
25%                  297.076320                297.134905   
50%                  297.599540                2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col] - 273.15
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'temperature_range'] = df['temperature_2m_max'] - df['temperature_2m_min']


       temperature_2m  temperature_2m_min  temperature_2m_max  \
count    23741.000000        23741.000000        23741.000000   
mean       296.039115          292.897776          300.133304   
std          0.916591            1.022009            1.485767   
min        292.089728          286.894993          294.334558   
25%        295.439058          292.372210          299.314294   
50%        295.941622          293.010072          300.025747   
75%        296.532208          293.570032          300.769446   
max        300.341127          296.114429          306.796292   

       soil_temperature_level_1  soil_temperature_level_2  \
count              23741.000000              23741.000000   
mean                 297.656997                297.646624   
std                    1.262445                  1.127128   
min                  294.046959                294.906134   
25%                  296.920968                296.977761   
50%                  297.443553                2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col] - 273.15
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'temperature_range'] = df['temperature_2m_max'] - df['temperature_2m_min']
