In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the prepared NASA POWER table from the sibling Dataset directory.
df = pd.read_csv(filepath_or_buffer='../Dataset/nasa_power_prepared_data.csv')

In [3]:
# Peek at the first three rows to sanity-check the load.
df.head(n=3)

Unnamed: 0,date,evland,evptrns,gwetprof,gwetroot,gwettop,hdd0,hdd10,hdd18_3,pbltop,...,soil_d5_forecast,soil_d6_forecast,soil_d7_forecast,wind_d1_forecast,wind_d2_forecast,wind_d3_forecast,wind_d4_forecast,wind_d5_forecast,wind_d6_forecast,wind_d7_forecast
0,1981-01-01,1.88,0.34,0.61,0.61,0.62,0.0,0.0,0.0,93.03,...,0.59,0.58,0.56,3.09,3.83,3.09,1.75,2.36,2.64,1.95
1,1981-01-02,1.77,0.28,0.61,0.6,0.62,0.0,0.0,0.0,91.99,...,0.58,0.56,0.55,3.83,3.09,1.75,2.36,2.64,1.95,2.52
2,1981-01-03,1.72,0.24,0.6,0.6,0.62,0.0,0.0,0.0,91.67,...,0.56,0.55,0.54,3.09,1.75,2.36,2.64,1.95,2.52,3.58


## Feature selection

In [4]:
# Order rows by date. The shift()/rolling() features built later are purely
# positional, so they are only meaningful when rows are in time order.
# NOTE(review): "date" is still whatever dtype read_csv produced here; the
# sample rows show ISO "YYYY-MM-DD" strings, which sort chronologically.
df = df.sort_values(by="date", ascending=True)

In [5]:
# Column names set aside for removal before training.
# NOTE(review): not referenced in the cells visible here — presumably used by
# a later training step; confirm.
COLS_TO_DROP_PRE_TRAIN = ["date", "day", "week", "weekday"]

# Forecast horizon: one target column per step 1..H.
H = 7

# Target column names: t2m_d1_forecast ... t2m_d{H}_forecast.
t2m_targets = [f"t2m_d{h}_forecast" for h in range(1, H+1)]

# Every numeric column of the loaded frame, in frame order.
numeric_frame = df.select_dtypes(include=[np.number])
num_cols = numeric_frame.columns.tolist()

# Numeric columns that are not forecast targets.
target_lookup = set(t2m_targets)
feature_cols = [name for name in num_cols if name not in target_lookup]

In [6]:
# Hand-picked predictor columns, grouped by category. The flattened list keeps
# the groups (and the names inside each group) in the order written here.
_feature_groups = (
    # Soil Temperature
    ["tsoil1", "tsoil2", "tsoil3", "tsoil4"],
    # Surface / Air Temperature
    ["t2m", "t2m_max", "t2m_min"],
    ["t10m", "t10m_max", "t10m_min"],
    ["ts", "ts_min", "ts_max", "tsurf"],
    ["t2m_range", "t10m_range", "ts_range"],
    # Wind (V component)
    ["v2m", "v10m", "v50m"],
    # Humidity / Air Density
    ["rhoa", "t2mwet"],
    # Soil Moisture
    ["gwettop", "gwetroot", "gwetprof"],
    # ET / Evapotranspiration
    ["evptrns", "et_total", "evland"],
    # Physical properties
    ["z0m", "to3"],
    # Seasonal (for sin/cos)
    ["month"],
)

selected_features = [name for group in _feature_groups for name in group]

In [7]:
# Keep only the requested predictors that the loaded frame actually provides
# (order of selected_features is preserved; absent names are dropped silently).
present_columns = set(df.columns)
base_features = [c for c in selected_features if c in present_columns]

# Working frame: date, the available predictors, then the forecast targets.
# .copy() detaches it from df so later column assignments do not touch df.
df_sel = df.loc[:, ["date", *base_features, *t2m_targets]].copy()

### Seasonal Features

In [8]:
# Seasonal Features (from date/month/dayofyear)
# Month and day-of-year are cyclic, so each is projected onto the unit circle
# as a (sin, cos) pair.
df_sel["date"] = pd.to_datetime(df_sel["date"])
df_sel["dayofyear"] = df_sel["date"].dt.dayofyear

# Divide by the real length of each row's year. With a fixed 365, day 366 of a
# leap year gets the same angle as day 1, so Dec 31 is encoded as Jan 1.
# Note: this changes doy_sin/doy_cos for every row that falls in a leap year.
days_in_year = np.where(df_sel["date"].dt.is_leap_year, 366, 365)

month_angle = 2 * np.pi * df_sel["month"] / 12
doy_angle = 2 * np.pi * df_sel["dayofyear"] / days_in_year

# Separate frame sharing df_sel's index; it is joined column-wise later.
seasonal_df = pd.DataFrame({
    "month_sin": np.sin(month_angle),
    "month_cos": np.cos(month_angle),
    "doy_sin": np.sin(doy_angle),
    "doy_cos": np.cos(doy_angle),
})

In [9]:
# Inspect the working frame's columns; `dayofyear` was just appended above.
df_sel.columns

Index(['date', 'tsoil1', 'tsoil2', 'tsoil3', 'tsoil4', 't2m', 't2m_max',
       't2m_min', 't10m', 't10m_max', 't10m_min', 'ts', 'ts_min', 'ts_max',
       'tsurf', 't2m_range', 't10m_range', 'ts_range', 'v2m', 'v10m', 'v50m',
       'rhoa', 't2mwet', 'gwettop', 'gwetroot', 'gwetprof', 'evptrns',
       'et_total', 'evland', 'z0m', 'to3', 'month', 't2m_d1_forecast',
       't2m_d2_forecast', 't2m_d3_forecast', 't2m_d4_forecast',
       't2m_d5_forecast', 't2m_d6_forecast', 't2m_d7_forecast', 'dayofyear'],
      dtype='object')

In [10]:
# Preview the first three rows of the working frame.
df_sel.head(n=3)

Unnamed: 0,date,tsoil1,tsoil2,tsoil3,tsoil4,t2m,t2m_max,t2m_min,t10m,t10m_max,...,to3,month,t2m_d1_forecast,t2m_d2_forecast,t2m_d3_forecast,t2m_d4_forecast,t2m_d5_forecast,t2m_d6_forecast,t2m_d7_forecast,dayofyear
0,1981-01-01,25.45,25.33,25.53,26.07,25.21,31.06,19.7,25.46,30.33,...,246.0,1,25.53,25.43,25.61,26.59,26.7,26.66,27.44,1
1,1981-01-02,25.78,25.58,25.58,26.05,25.53,31.07,19.97,25.72,30.31,...,244.56,1,25.43,25.61,26.59,26.7,26.66,27.44,26.87,2
2,1981-01-03,26.05,25.85,25.67,26.04,25.43,30.87,20.73,25.44,29.92,...,245.96,1,25.61,26.59,26.7,26.66,27.44,26.87,26.14,3


### Lag Features

In [11]:
# Lag Features
# For every base feature except "month", add copies shifted down by 1, 3 and 7
# rows. shift() is positional, so these are day lags only if the frame holds
# one row per consecutive day — TODO confirm there are no date gaps.
lags = [1, 3, 7]

lag_frames = [
    df_sel[col].shift(lag).rename(f"{col}_lag{lag}")
    for col in base_features
    if col != "month"
    for lag in lags
]

# One concat instead of repeated column inserts; the index matches df_sel.
lag_df = pd.concat(lag_frames, axis=1)

In [12]:
# Re-inspect df_sel's columns. The lag series were collected into the separate
# lag_df, so df_sel itself is unchanged by the lag step.
df_sel.columns

Index(['date', 'tsoil1', 'tsoil2', 'tsoil3', 'tsoil4', 't2m', 't2m_max',
       't2m_min', 't10m', 't10m_max', 't10m_min', 'ts', 'ts_min', 'ts_max',
       'tsurf', 't2m_range', 't10m_range', 'ts_range', 'v2m', 'v10m', 'v50m',
       'rhoa', 't2mwet', 'gwettop', 'gwetroot', 'gwetprof', 'evptrns',
       'et_total', 'evland', 'z0m', 'to3', 'month', 't2m_d1_forecast',
       't2m_d2_forecast', 't2m_d3_forecast', 't2m_d4_forecast',
       't2m_d5_forecast', 't2m_d6_forecast', 't2m_d7_forecast', 'dayofyear'],
      dtype='object')

### Rolling Features

In [13]:
# Rolling Window Features
# Columns that receive trailing-window statistics.
rolling_cols = [
    "t2m", "t2m_max", "t2m_min",
    "t10m", "t10m_max", "t10m_min",
    "ts", "ts_max", "ts_min", "tsurf",
    "tsoil1", "tsoil2", "tsoil3", "tsoil4",
    "t2mwet", "rhoa",
    "gwettop", "gwetroot", "gwetprof",
    "v2m", "v10m", "v50m"
]

# Window lengths in rows; each window includes the current row.
windows = [3, 7]

rolling_frames = []
for col in rolling_cols:
    source = df_sel[col]
    for w in windows:
        # Build the window object once and derive both statistics from it.
        # The first w-1 rows are NaN because the window is not yet full.
        roller = source.rolling(w)
        rolling_frames.extend([
            roller.mean().rename(f"{col}_roll{w}_mean"),
            roller.std().rename(f"{col}_roll{w}_std"),
        ])

rolling_df = pd.concat(rolling_frames, axis=1)

In [14]:
# Join base columns, seasonal encodings, lags and rolling statistics side by
# side (aligned on the shared index), then drop every row that has a NaN in
# any column — this includes the warm-up rows created by shift()/rolling() —
# and renumber the index from zero.
df_fe = (
    pd.concat([df_sel, seasonal_df, lag_df, rolling_df], axis="columns")
    .dropna()
    .reset_index(drop=True)
)

print("Final FE shape:", df_fe.shape)

Final FE shape: (16378, 222)


In [15]:
# Persist the engineered table; index=False omits the row index, which was
# reset to a plain 0..n-1 range before saving.
df_fe.to_csv(path_or_buf='../Dataset/FeatureEngineering_T2M_data.csv', index=False)

In [16]:
# List every column of the final feature table as a plain Python list.
list(df_fe.columns)

['date',
 'tsoil1',
 'tsoil2',
 'tsoil3',
 'tsoil4',
 't2m',
 't2m_max',
 't2m_min',
 't10m',
 't10m_max',
 't10m_min',
 'ts',
 'ts_min',
 'ts_max',
 'tsurf',
 't2m_range',
 't10m_range',
 'ts_range',
 'v2m',
 'v10m',
 'v50m',
 'rhoa',
 't2mwet',
 'gwettop',
 'gwetroot',
 'gwetprof',
 'evptrns',
 'et_total',
 'evland',
 'z0m',
 'to3',
 'month',
 't2m_d1_forecast',
 't2m_d2_forecast',
 't2m_d3_forecast',
 't2m_d4_forecast',
 't2m_d5_forecast',
 't2m_d6_forecast',
 't2m_d7_forecast',
 'dayofyear',
 'month_sin',
 'month_cos',
 'doy_sin',
 'doy_cos',
 'tsoil1_lag1',
 'tsoil1_lag3',
 'tsoil1_lag7',
 'tsoil2_lag1',
 'tsoil2_lag3',
 'tsoil2_lag7',
 'tsoil3_lag1',
 'tsoil3_lag3',
 'tsoil3_lag7',
 'tsoil4_lag1',
 'tsoil4_lag3',
 'tsoil4_lag7',
 't2m_lag1',
 't2m_lag3',
 't2m_lag7',
 't2m_max_lag1',
 't2m_max_lag3',
 't2m_max_lag7',
 't2m_min_lag1',
 't2m_min_lag3',
 't2m_min_lag7',
 't10m_lag1',
 't10m_lag3',
 't10m_lag7',
 't10m_max_lag1',
 't10m_max_lag3',
 't10m_max_lag7',
 't10m_min_lag1',
