In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the prepared NASA POWER table; the path is relative to the notebook's directory.
df = pd.read_csv(filepath_or_buffer='../Dataset/nasa_power_prepared_data.csv')

In [3]:
# Peek at the first three rows to confirm the load and the column layout.
df.head(n=3)

Unnamed: 0,date,evland,evptrns,gwetprof,gwetroot,gwettop,hdd0,hdd10,hdd18_3,pbltop,...,soil_d5_forecast,soil_d6_forecast,soil_d7_forecast,wind_d1_forecast,wind_d2_forecast,wind_d3_forecast,wind_d4_forecast,wind_d5_forecast,wind_d6_forecast,wind_d7_forecast
0,1981-01-01,1.88,0.34,0.61,0.61,0.62,0.0,0.0,0.0,93.03,...,0.59,0.58,0.56,3.09,3.83,3.09,1.75,2.36,2.64,1.95
1,1981-01-02,1.77,0.28,0.61,0.6,0.62,0.0,0.0,0.0,91.99,...,0.58,0.56,0.55,3.83,3.09,1.75,2.36,2.64,1.95,2.52
2,1981-01-03,1.72,0.24,0.6,0.6,0.62,0.0,0.0,0.0,91.67,...,0.56,0.55,0.54,3.09,1.75,2.36,2.64,1.95,2.52,3.58


## Feature selection

In [4]:
# Order rows by date: the shift()/rolling() features built further down are
# positional, so they only represent "previous days" when rows are chronological.
df = df.sort_values(by="date", ascending=True)

In [5]:
# Calendar helper columns; presumably removed right before training (not used in this cell).
COLS_TO_DROP_PRE_TRAIN = ["date", "day", "week", "weekday"]

# Number of daily horizons; one t2m target column is generated per horizon (d1..dH).
H = 7
t2m_targets = [f"t2m_d{h}_forecast" for h in range(1, H + 1)]

# Every numeric column of df that is not one of the t2m targets.
# NOTE(review): df also carries soil_d*/wind_d* "*_forecast" columns; they are
# numeric and not in t2m_targets, so they remain in feature_cols. Confirm that is
# intended before training on this list.
num_cols = list(df.select_dtypes(include=[np.number]).columns)
feature_cols = [name for name in num_cols if name not in t2m_targets]

In [6]:
# Hand-picked predictor columns, grouped by theme. The groups are concatenated
# in order, so the resulting list keeps the same column order as before.
selected_features = (
    # Atmospheric moisture
    ["tqv", "qv2m", "qv10m", "rh2m", "t2mdew", "t2mwet"]
    # Temperature variability
    + ["t2m_range", "t10m_range", "ts_range"]
    # Pressure
    + ["ps", "slp"]
    # Soil moisture
    + ["gwettop", "gwetroot", "gwetprof"]
    # Evapotranspiration / evaporation
    + ["evland", "evptrns", "et_total"]
    # Wind
    + ["u10m", "v10m", "wd10m"]
    # Physical / upper atmosphere
    + ["to3"]
    # Rain history
    + ["prectotcorr"]
    # Seasonal
    + ["month", "season_num"]
)

In [7]:
# Keep only the requested features that actually exist in df, preserving the
# order of selected_features.
base_features = [c for c in selected_features if c in df.columns]

# Report anything that was filtered out: a misspelled or absent column would
# otherwise drop out of the feature set without any trace.
missing_features = [c for c in selected_features if c not in df.columns]
if missing_features:
    print("Skipping selected features not found in df:", missing_features)

# Working frame: date, the available base features and the t2m target columns.
# .copy() so the columns added later do not write back into df.
df_sel = df[["date"] + base_features + t2m_targets].copy()

In [8]:
# Display the selected frame (date + base features + t2m targets) for a visual check.
df_sel

Unnamed: 0,date,tqv,qv2m,qv10m,rh2m,t2mdew,t2mwet,t2m_range,t10m_range,ts_range,...,prectotcorr,month,season_num,t2m_d1_forecast,t2m_d2_forecast,t2m_d3_forecast,t2m_d4_forecast,t2m_d5_forecast,t2m_d6_forecast,t2m_d7_forecast
0,1981-01-01,30.93,12.74,12.17,66.26,17.86,21.54,11.36,10.27,13.89,...,0.00,1,0,25.53,25.43,25.61,26.59,26.70,26.66,27.44
1,1981-01-02,37.93,13.47,12.90,68.24,18.74,22.14,11.10,9.96,13.54,...,0.00,1,0,25.43,25.61,26.59,26.70,26.66,27.44,26.87
2,1981-01-03,35.02,13.45,12.98,68.62,18.74,22.08,10.14,9.10,12.70,...,0.00,1,0,25.61,26.59,26.70,26.66,27.44,26.87,26.14
3,1981-01-04,35.38,13.57,12.99,68.48,18.88,22.25,10.77,9.92,13.60,...,0.00,1,0,26.59,26.70,26.66,27.44,26.87,26.14,23.96
4,1981-01-05,35.04,13.93,13.18,66.41,19.27,22.93,10.25,8.87,14.27,...,0.00,1,0,26.70,26.66,27.44,26.87,26.14,23.96,21.98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16380,2025-11-06,58.03,18.52,17.99,87.61,23.89,25.02,6.01,5.82,6.73,...,6.95,11,2,26.40,27.16,27.78,27.96,27.73,28.32,27.32
16381,2025-11-07,53.62,19.20,18.76,89.28,24.47,25.44,3.25,3.17,3.24,...,3.06,11,2,27.16,27.78,27.96,27.73,28.32,27.32,25.75
16382,2025-11-08,61.09,19.00,18.45,84.64,24.34,25.75,4.24,3.75,5.77,...,6.95,11,2,27.78,27.96,27.73,28.32,27.32,25.75,24.49
16383,2025-11-09,59.55,19.60,18.95,84.44,24.87,26.33,5.34,4.88,7.37,...,4.94,11,2,27.96,27.73,28.32,27.32,25.75,24.49,25.14


### Seasonal Features

In [9]:
# Seasonal features: cyclical (sin/cos) encodings of the month and the day of year,
# so that the end of one year sits next to the start of the following one.
df_sel["date"] = pd.to_datetime(df_sel["date"])
df_sel["dayofyear"] = df_sel["date"].dt.dayofyear

# Divide the day of year by the real length of its year. With a fixed 365, day 366
# of a leap year maps past a full turn and becomes indistinguishable from Jan 1.
days_in_year = np.where(df_sel["date"].dt.is_leap_year, 366, 365)
doy_angle = 2 * np.pi * df_sel["dayofyear"] / days_in_year
month_angle = 2 * np.pi * df_sel["month"] / 12

# Kept as a separate frame (same index as df_sel) and joined column-wise later.
seasonal_df = pd.DataFrame({
    "month_sin": np.sin(month_angle),
    "month_cos": np.cos(month_angle),
    "doy_sin": np.sin(doy_angle),
    "doy_cos": np.cos(doy_angle),
})

In [10]:
# Check the columns of df_sel after the seasonal cell (it adds "dayofyear" to df_sel).
df_sel.columns

Index(['date', 'tqv', 'qv2m', 'qv10m', 'rh2m', 't2mdew', 't2mwet', 't2m_range',
       't10m_range', 'ts_range', 'ps', 'slp', 'gwettop', 'gwetroot',
       'gwetprof', 'evland', 'evptrns', 'et_total', 'u10m', 'v10m', 'wd10m',
       'to3', 'prectotcorr', 'month', 'season_num', 't2m_d1_forecast',
       't2m_d2_forecast', 't2m_d3_forecast', 't2m_d4_forecast',
       't2m_d5_forecast', 't2m_d6_forecast', 't2m_d7_forecast', 'dayofyear'],
      dtype='object')

In [11]:
# First three rows of the working frame, including the new dayofyear column.
df_sel.head(n=3)

Unnamed: 0,date,tqv,qv2m,qv10m,rh2m,t2mdew,t2mwet,t2m_range,t10m_range,ts_range,...,month,season_num,t2m_d1_forecast,t2m_d2_forecast,t2m_d3_forecast,t2m_d4_forecast,t2m_d5_forecast,t2m_d6_forecast,t2m_d7_forecast,dayofyear
0,1981-01-01,30.93,12.74,12.17,66.26,17.86,21.54,11.36,10.27,13.89,...,1,0,25.53,25.43,25.61,26.59,26.7,26.66,27.44,1
1,1981-01-02,37.93,13.47,12.9,68.24,18.74,22.14,11.1,9.96,13.54,...,1,0,25.43,25.61,26.59,26.7,26.66,27.44,26.87,2
2,1981-01-03,35.02,13.45,12.98,68.62,18.74,22.08,10.14,9.1,12.7,...,1,0,25.61,26.59,26.7,26.66,27.44,26.87,26.14,3


### Lag Features

In [12]:
# Lag features: for each base feature, its value 1, 3 and 7 rows earlier.
lags = [1, 3, 7]
lag_frames = []

for col in base_features:
    # "month" is the only base feature that gets no lagged copies.
    if col != "month":
        for lag in lags:
            lagged = df_sel[col].shift(periods=lag)
            lagged.name = f"{col}_lag{lag}"
            lag_frames.append(lagged)

# One column per (feature, lag) pair, aligned on df_sel's index.
lag_df = pd.concat(lag_frames, axis="columns")

In [13]:
# Re-check df_sel's columns: the lag columns were collected in lag_df, not written into df_sel.
df_sel.columns

Index(['date', 'tqv', 'qv2m', 'qv10m', 'rh2m', 't2mdew', 't2mwet', 't2m_range',
       't10m_range', 'ts_range', 'ps', 'slp', 'gwettop', 'gwetroot',
       'gwetprof', 'evland', 'evptrns', 'et_total', 'u10m', 'v10m', 'wd10m',
       'to3', 'prectotcorr', 'month', 'season_num', 't2m_d1_forecast',
       't2m_d2_forecast', 't2m_d3_forecast', 't2m_d4_forecast',
       't2m_d5_forecast', 't2m_d6_forecast', 't2m_d7_forecast', 'dayofyear'],
      dtype='object')

### Rolling Features

In [14]:
# Rolling-window features: trailing mean and standard deviation per column and window.
# NOTE(review): this cell was originally labelled "for Rain Forecasting", while the
# target columns selected in this notebook are the t2m_d*_forecast ones; confirm the
# intended target.
rolling_cols = [
    "qv2m", "qv10m", "tqv",
    "rh2m", "t2mdew", "t2mwet",
    "ps", "slp",
    "gwettop", "gwetroot",
]

# Window lengths in rows.
windows = [3, 7]

rolling_frames = []
for col in rolling_cols:
    series = df_sel[col]
    for w in windows:
        # Build the window once and derive both statistics from it.
        roller = series.rolling(window=w)
        rolling_frames.append(roller.mean().rename(f"{col}_roll{w}_mean"))
        rolling_frames.append(roller.std().rename(f"{col}_roll{w}_std"))

rolling_df = pd.concat(rolling_frames, axis="columns")

In [15]:
# Join the base frame with the seasonal, lag and rolling blocks column-wise, then
# drop every row containing a NaN (for example the first rows, where the lags and
# rolling windows have no history yet) and renumber the index from 0.
df_fe = (
    pd.concat([df_sel, seasonal_df, lag_df, rolling_df], axis="columns")
    .dropna()
    .reset_index(drop=True)
)

print("Final FE shape:", df_fe.shape)

Final FE shape: (16378, 146)


In [16]:
# Persist the engineered table; index=False leaves the row index out of the file.
# NOTE(review): the file name says "rain" while the targets in this frame are the
# t2m_d*_forecast columns; confirm the name is intended.
df_fe.to_csv(path_or_buf='../Dataset/FeatureEngineering_rain_data.csv', index=False)

In [17]:
# Full list of the engineered columns, in file order.
df_fe.columns.to_list()

['date',
 'tqv',
 'qv2m',
 'qv10m',
 'rh2m',
 't2mdew',
 't2mwet',
 't2m_range',
 't10m_range',
 'ts_range',
 'ps',
 'slp',
 'gwettop',
 'gwetroot',
 'gwetprof',
 'evland',
 'evptrns',
 'et_total',
 'u10m',
 'v10m',
 'wd10m',
 'to3',
 'prectotcorr',
 'month',
 'season_num',
 't2m_d1_forecast',
 't2m_d2_forecast',
 't2m_d3_forecast',
 't2m_d4_forecast',
 't2m_d5_forecast',
 't2m_d6_forecast',
 't2m_d7_forecast',
 'dayofyear',
 'month_sin',
 'month_cos',
 'doy_sin',
 'doy_cos',
 'tqv_lag1',
 'tqv_lag3',
 'tqv_lag7',
 'qv2m_lag1',
 'qv2m_lag3',
 'qv2m_lag7',
 'qv10m_lag1',
 'qv10m_lag3',
 'qv10m_lag7',
 'rh2m_lag1',
 'rh2m_lag3',
 'rh2m_lag7',
 't2mdew_lag1',
 't2mdew_lag3',
 't2mdew_lag7',
 't2mwet_lag1',
 't2mwet_lag3',
 't2mwet_lag7',
 't2m_range_lag1',
 't2m_range_lag3',
 't2m_range_lag7',
 't10m_range_lag1',
 't10m_range_lag3',
 't10m_range_lag7',
 'ts_range_lag1',
 'ts_range_lag3',
 'ts_range_lag7',
 'ps_lag1',
 'ps_lag3',
 'ps_lag7',
 'slp_lag1',
 'slp_lag3',
 'slp_lag7',
 'gwettop