In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the prepared NASA POWER table; the path is relative to the notebook's
# working directory.
df = pd.read_csv('../Dataset/nasa_power_prepared_data.csv')

In [3]:
# Peek at the last three rows of the loaded table.
df.tail(3)

Unnamed: 0,date,evland,evptrns,gwetprof,gwetroot,gwettop,hdd0,hdd10,hdd18_3,pbltop,...,soil_d5_forecast,soil_d6_forecast,soil_d7_forecast,wind_d1_forecast,wind_d2_forecast,wind_d3_forecast,wind_d4_forecast,wind_d5_forecast,wind_d6_forecast,wind_d7_forecast
16386,2025-11-12,3.96,2.46,0.92,0.94,0.92,0.0,0.0,0.0,92.68754,...,0.91,0.92,0.91,1.8,3.7,4.17,4.63,4.17,4.89,6.46
16387,2025-11-13,2.83,1.06,0.92,0.94,0.92,0.0,0.0,0.0,92.68754,...,0.92,0.91,0.9,3.7,4.17,4.63,4.17,4.89,6.46,5.8
16388,2025-11-14,4.9,1.66,0.92,0.94,0.91,0.0,0.0,0.0,92.68754,...,0.91,0.9,0.89,4.17,4.63,4.17,4.89,6.46,5.8,5.23


## Feature selection

In [4]:
# Order rows by date: the shift()- and rolling()-based features built below are
# positional, so they only mean "previous days" when rows are in date order.
# NOTE(review): "date" is still a string here (it is parsed with pd.to_datetime
# later), so this is a lexicographic sort; presumably fine for ISO YYYY-MM-DD
# dates, but confirm.
df = df.sort_values("date")

In [5]:
# Calendar/bookkeeping column names; not referenced in the cells shown here.
COLS_TO_DROP_PRE_TRAIN = ["date", "day", "week", "weekday"]

# Number of forecast steps: one t2m target column per step, d1 .. dH.
H = 7

t2m_targets = ["t2m_d{}_forecast".format(step) for step in range(1, H + 1)]

# Every numeric column of the raw table, then the same list minus the targets.
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
feature_cols = [name for name in num_cols if name not in t2m_targets]

In [6]:
# Hand-picked base features, grouped by theme. The groups are concatenated in
# this order, which is also the column order used downstream.
selected_features = (
    # Soil temperature
    ["tsoil1", "tsoil2", "tsoil3", "tsoil4"]
    # Surface / air temperature
    + ["t2m", "t2m_max", "t2m_min"]
    + ["t10m", "t10m_max", "t10m_min"]
    + ["ts", "ts_min", "ts_max", "tsurf"]
    + ["t2m_range", "t10m_range", "ts_range"]
    # Wind (V component)
    + ["v2m", "v10m", "v50m"]
    # Humidity / air density
    + ["rhoa", "t2mwet"]
    # Soil moisture
    + ["gwettop", "gwetroot", "gwetprof"]
    # ET / evapotranspiration
    + ["evptrns", "et_total", "evland"]
    # Physical properties
    + ["z0m", "to3"]
    # Seasonal (input to the sin/cos encoding)
    + ["month"]
)

In [7]:
# Keep only the requested features that exist in the loaded table (absent names
# are skipped without warning), then take an independent copy holding the date,
# those features and the t2m target columns.
base_features = [name for name in selected_features if name in df.columns]
df_sel = df.loc[:, ["date", *base_features, *t2m_targets]].copy()

### Seasonal Features

In [8]:
# Seasonal features: encode the calendar position as sin/cos pairs so that the
# end of one cycle sits next to the start of the next.
# Side effects on df_sel: "date" is parsed to datetime and "dayofyear" is added.
df_sel["date"] = pd.to_datetime(df_sel["date"])
df_sel["dayofyear"] = df_sel["date"].dt.dayofyear

# Leap years have 366 days. Dividing by a fixed 365 would push Dec 31 of a leap
# year past a full turn (366/365) and make it collide with Jan 1 (1/365), so the
# day-of-year angle uses the actual length of each row's year.
days_in_year = np.where(df_sel["date"].dt.is_leap_year, 366, 365)

month_angle = 2 * np.pi * df_sel["month"] / 12
doy_angle = 2 * np.pi * df_sel["dayofyear"] / days_in_year

# Built as a separate frame (same index as df_sel); it is joined when df_fe is
# assembled.
seasonal_df = pd.DataFrame({
    "month_sin": np.sin(month_angle),
    "month_cos": np.cos(month_angle),
    "doy_sin": np.sin(doy_angle),
    "doy_cos": np.cos(doy_angle),
})

In [9]:
# Confirm the selected columns; "dayofyear" was appended by the seasonal cell.
df_sel.columns

Index(['date', 'tsoil1', 'tsoil2', 'tsoil3', 'tsoil4', 't2m', 't2m_max',
       't2m_min', 't10m', 't10m_max', 't10m_min', 'ts', 'ts_min', 'ts_max',
       'tsurf', 't2m_range', 't10m_range', 'ts_range', 'v2m', 'v10m', 'v50m',
       'rhoa', 't2mwet', 'gwettop', 'gwetroot', 'gwetprof', 'evptrns',
       'et_total', 'evland', 'z0m', 'to3', 'month', 't2m_d1_forecast',
       't2m_d2_forecast', 't2m_d3_forecast', 't2m_d4_forecast',
       't2m_d5_forecast', 't2m_d6_forecast', 't2m_d7_forecast', 'dayofyear'],
      dtype='object')

In [10]:
# Preview the first three rows of the selected frame.
df_sel.head(3)

Unnamed: 0,date,tsoil1,tsoil2,tsoil3,tsoil4,t2m,t2m_max,t2m_min,t10m,t10m_max,...,to3,month,t2m_d1_forecast,t2m_d2_forecast,t2m_d3_forecast,t2m_d4_forecast,t2m_d5_forecast,t2m_d6_forecast,t2m_d7_forecast,dayofyear
0,1981-01-01,25.45,25.33,25.53,26.07,25.21,31.06,19.7,25.46,30.33,...,246.0,1,25.53,25.43,25.61,26.59,26.7,26.66,27.44,1
1,1981-01-02,25.78,25.58,25.58,26.05,25.53,31.07,19.97,25.72,30.31,...,244.56,1,25.43,25.61,26.59,26.7,26.66,27.44,26.87,2
2,1981-01-03,26.05,25.85,25.67,26.04,25.43,30.87,20.73,25.44,29.92,...,245.96,1,25.61,26.59,26.7,26.66,27.44,26.87,26.14,3


### Lag Features

In [11]:
# Lag features: for every base feature except "month", add the value from each
# of the previous 1..7 rows as "<feature>_lag<k>".
lags = list(range(1, 8))

# One shifted Series per (feature, lag) pair, feature-major so the resulting
# columns are grouped by feature.
lag_frames = [
    df_sel[name].shift(k).rename(f"{name}_lag{k}")
    for name in base_features
    if name != "month"
    for k in lags
]

lag_df = pd.concat(lag_frames, axis=1)

In [12]:
# df_sel is unchanged by the lag cell: the lagged series are collected in lag_df
# and only joined to df_sel when df_fe is assembled.
df_sel.columns

Index(['date', 'tsoil1', 'tsoil2', 'tsoil3', 'tsoil4', 't2m', 't2m_max',
       't2m_min', 't10m', 't10m_max', 't10m_min', 'ts', 'ts_min', 'ts_max',
       'tsurf', 't2m_range', 't10m_range', 'ts_range', 'v2m', 'v10m', 'v50m',
       'rhoa', 't2mwet', 'gwettop', 'gwetroot', 'gwetprof', 'evptrns',
       'et_total', 'evland', 'z0m', 'to3', 'month', 't2m_d1_forecast',
       't2m_d2_forecast', 't2m_d3_forecast', 't2m_d4_forecast',
       't2m_d5_forecast', 't2m_d6_forecast', 't2m_d7_forecast', 'dayofyear'],
      dtype='object')

### Rolling Features

In [13]:
# Rolling-window statistics: trailing mean and standard deviation over 3 and 7
# rows for the temperature, soil, moisture and wind columns listed below.
rolling_cols = [
    "t2m", "t2m_max", "t2m_min",
    "t10m", "t10m_max", "t10m_min",
    "ts", "ts_max", "ts_min", "tsurf",
    "tsoil1", "tsoil2", "tsoil3", "tsoil4",
    "t2mwet", "rhoa",
    "gwettop", "gwetroot", "gwetprof",
    "v2m", "v10m", "v50m",
]

windows = [3, 7]

rolling_frames = []
for col in rolling_cols:
    source = df_sel[col]
    for w in windows:
        # Build the window once and take both statistics from it; mean is
        # appended before std so the column order stays mean, std per window.
        window_view = source.rolling(w)
        rolling_frames.extend([
            window_view.mean().rename(f"{col}_roll{w}_mean"),
            window_view.std().rename(f"{col}_roll{w}_std"),
        ])

rolling_df = pd.concat(rolling_frames, axis=1)

In [14]:
# Join the base columns with the seasonal, lag and rolling blocks column-wise,
# then drop every row that has a NaN anywhere (this includes the warm-up rows
# created by shift/rolling) and renumber the index from 0.
df_fe = (
    pd.concat([df_sel, seasonal_df, lag_df, rolling_df], axis=1)
    .dropna()
    .reset_index(drop=True)
)

print("Final FE shape:", df_fe.shape)

Final FE shape: (16382, 342)


In [15]:
# Persist the base engineered features (index omitted). The "Save enhanced
# dataset" cell further down writes df_advanced to this same path.
df_fe.to_csv('../Dataset/FeatureEngineering_T2M_data.csv', index=False)

In [16]:
# Number of columns in the engineered frame.
len(df_fe.columns)

342

In [17]:
# List every column name in df_fe.
df_fe.columns.tolist()

['date',
 'tsoil1',
 'tsoil2',
 'tsoil3',
 'tsoil4',
 't2m',
 't2m_max',
 't2m_min',
 't10m',
 't10m_max',
 't10m_min',
 'ts',
 'ts_min',
 'ts_max',
 'tsurf',
 't2m_range',
 't10m_range',
 'ts_range',
 'v2m',
 'v10m',
 'v50m',
 'rhoa',
 't2mwet',
 'gwettop',
 'gwetroot',
 'gwetprof',
 'evptrns',
 'et_total',
 'evland',
 'z0m',
 'to3',
 'month',
 't2m_d1_forecast',
 't2m_d2_forecast',
 't2m_d3_forecast',
 't2m_d4_forecast',
 't2m_d5_forecast',
 't2m_d6_forecast',
 't2m_d7_forecast',
 'dayofyear',
 'month_sin',
 'month_cos',
 'doy_sin',
 'doy_cos',
 'tsoil1_lag1',
 'tsoil1_lag2',
 'tsoil1_lag3',
 'tsoil1_lag4',
 'tsoil1_lag5',
 'tsoil1_lag6',
 'tsoil1_lag7',
 'tsoil2_lag1',
 'tsoil2_lag2',
 'tsoil2_lag3',
 'tsoil2_lag4',
 'tsoil2_lag5',
 'tsoil2_lag6',
 'tsoil2_lag7',
 'tsoil3_lag1',
 'tsoil3_lag2',
 'tsoil3_lag3',
 'tsoil3_lag4',
 'tsoil3_lag5',
 'tsoil3_lag6',
 'tsoil3_lag7',
 'tsoil4_lag1',
 'tsoil4_lag2',
 'tsoil4_lag3',
 'tsoil4_lag4',
 'tsoil4_lag5',
 'tsoil4_lag6',
 'tsoil4_lag7',

In [None]:
# Advanced feature engineering, part 1: volatility, weather-pattern and
# cross-variable features derived from df_fe. Each group is built as its own
# frame (same index as df_fe); the groups are concatenated in a later cell,
# where rows left with NaN by the rolling/diff warm-up are dropped.
print("Adding advanced features...")

# Base temperature for the degree-day features (Thai comfort temperature ~26°C).
COMFORT_TEMP_C = 26


def _rolling_cv(series, window):
    """Return the rolling coefficient of variation: rolling std / rolling mean."""
    roller = series.rolling(window)
    return roller.std() / roller.mean()


# 1. Temperature Volatility Features
# Labels are numbered 1-3 here to match the 4-6 labels of the following cell.
print("1. Temperature Volatility Features")
t2m_daily_change = df_fe['t2m'].diff()
volatility_df = pd.DataFrame({
    # Temperature volatility (coefficient of variation)
    't2m_volatility_3d': _rolling_cv(df_fe['t2m'], 3),
    't2m_volatility_7d': _rolling_cv(df_fe['t2m'], 7),

    # Daily change and its own day-to-day change (second difference)
    't2m_daily_change': t2m_daily_change,
    't2m_acceleration': t2m_daily_change.diff(),

    # Same 3-row volatility for the other temperature variables
    't10m_volatility_3d': _rolling_cv(df_fe['t10m'], 3),
    'ts_volatility_3d': _rolling_cv(df_fe['ts'], 3),
    'tsurf_volatility_3d': _rolling_cv(df_fe['tsurf'], 3),
})

print(f"   - Added {len(volatility_df.columns)} volatility features")

# 2. Weather Pattern Features
print("2. Weather Pattern Features")

# Per-calendar-month statistics, computed once and reused below.
# NOTE(review): these statistics and the 5%/95% quantiles are taken over every
# row of df_fe, so each row sees values from later dates. If the data is later
# split chronologically for training, this is look-ahead; confirm it is
# acceptable or recompute on the training span only.
t2m_by_month = df_fe.groupby('month')['t2m']
t2m_monthly_mean = t2m_by_month.transform('mean')
t2m_minus_monthly_mean = df_fe['t2m'] - t2m_monthly_mean

weather_df = pd.DataFrame({
    # Heating/Cooling degree days relative to COMFORT_TEMP_C
    'hdd': np.maximum(COMFORT_TEMP_C - df_fe['t2m'], 0),  # Heating Degree Days
    'cdd': np.maximum(df_fe['t2m'] - COMFORT_TEMP_C, 0),  # Cooling Degree Days

    # Temperature anomalies (deviation from the calendar-month mean)
    't2m_monthly_avg': t2m_monthly_mean,
    't2m_anomaly': t2m_minus_monthly_mean,

    # Within-month spread, and the deviation from the monthly mean again.
    # 't2m_vs_seasonal_avg' is identical to 't2m_anomaly'; both names are kept
    # so the output columns stay the same.
    't2m_seasonal_trend': t2m_by_month.transform('std'),
    't2m_vs_seasonal_avg': t2m_minus_monthly_mean,

    # Extreme temperature indicators (above the 95th / below the 5th percentile)
    't2m_is_extreme_hot': (df_fe['t2m'] > df_fe['t2m'].quantile(0.95)).astype(int),
    't2m_is_extreme_cold': (df_fe['t2m'] < df_fe['t2m'].quantile(0.05)).astype(int),
})

print(f"   - Added {len(weather_df.columns)} weather pattern features")

# 3. Cross-variable Interactions
print("3. Cross-variable Interactions")
interaction_df = pd.DataFrame({
    # Temperature gradients
    'temp_gradient_surface': df_fe['t2m'] - df_fe['ts'],
    'temp_gradient_soil': df_fe['tsoil1'] - df_fe['tsoil4'],
    'temp_gradient_altitude': df_fe['t10m'] - df_fe['t2m'],

    # Soil temperature gradients
    'soil_temp_gradient_shallow': df_fe['tsoil1'] - df_fe['tsoil2'],
    'soil_temp_gradient_deep': df_fe['tsoil2'] - df_fe['tsoil4'],

    # Wind-temperature interactions
    # NOTE(review): v10m is listed under "Wind (V component)" in the feature
    # list, so it is presumably signed; a wind-chill term would normally use
    # wind speed magnitude. Confirm.
    'wind_chill_factor': df_fe['t2m'] - (df_fe['v10m'] * 0.1),
    # NOTE(review): rhoa is grouped under "Humidity / Air Density" in the
    # feature list; if it is air density rather than humidity, the "heat index"
    # and "humidity ratio" names are misleading. Confirm.
    'heat_index_simple': df_fe['t2m'] + (df_fe['rhoa'] * 0.01),

    # Moisture-temperature interactions (+1 keeps the denominator away from 0)
    'temp_humidity_ratio': df_fe['t2m'] / (df_fe['rhoa'] + 1),
    'evaporation_potential': df_fe['t2m'] * df_fe['v10m'] * 0.01,

    # Soil moisture vs temperature (+0.1 keeps the denominator away from 0)
    'soil_temp_moisture': df_fe['tsoil1'] * df_fe['gwettop'],
    'surface_efficiency': df_fe['ts'] / (df_fe['evptrns'] + 0.1),
})

print(f"   - Added {len(interaction_df.columns)} interaction features")

print("\nCombining all features...")
print(f"Original features: {len(df_fe.columns)}")

Adding advanced features...
1. Temperature Volatility Features
   - Added 7 volatility features
2. Weather Pattern Features
   - Added 8 weather pattern features
3. Cross-variable Interactions
   - Added 11 interaction features

Combining all features...
Original features: 342


In [None]:
# Advanced feature engineering, part 2: rolling-window statistics of t2m,
# extra calendar encodings, and short lags of selected derived features.
# Each group is its own frame (same index as df_fe), concatenated in a later
# cell where rows with NaN are dropped.

# 4. Statistical Pattern Features
print("4. Statistical Pattern Features")

# Build the 7-row window once and reuse its aggregates.
t2m_week = df_fe['t2m'].rolling(7)
t2m_week_p25 = t2m_week.quantile(0.25)
t2m_week_p75 = t2m_week.quantile(0.75)
t2m_week_max = t2m_week.max()

statistical_df = pd.DataFrame({
    # Moving percentiles
    't2m_rolling_p25': t2m_week_p25,
    't2m_rolling_p75': t2m_week_p75,
    't2m_rolling_iqr': t2m_week_p75 - t2m_week_p25,

    # Relative position in recent window (+0.001 guards against a zero max)
    't2m_position_in_week': df_fe['t2m'] / (t2m_week_max + 0.001),
    't2m_relative_to_recent_max': df_fe['t2m'] - t2m_week_max,
    't2m_relative_to_recent_min': df_fe['t2m'] - t2m_week.min(),

    # Momentum indicators
    't2m_momentum_3d': df_fe['t2m'] - df_fe['t2m'].shift(3),
    't2m_momentum_7d': df_fe['t2m'] - df_fe['t2m'].shift(7),

    # Trend strength: short-window mean minus long-window mean. The 14-row
    # window is the longest warm-up here (13 leading NaN rows).
    't2m_trend_strength': t2m_week.mean() - df_fe['t2m'].rolling(14).mean(),
})

print(f"   - Added {len(statistical_df.columns)} statistical features")

# 5. Cyclical and Temporal Features
print("5. Enhanced Temporal Features")

# pd.to_datetime is a no-op when "date" is already datetime and keeps this cell
# working if "date" is a string.
dates = pd.to_datetime(df_fe['date'])
day_of_week = dates.dt.dayofweek  # 0 = Monday .. 6 = Sunday
month_index = df_fe['month'] - 1

temporal_df = pd.DataFrame({
    # Weekly cycle, taken from the actual weekday. dayofyear/7 is not aligned
    # to the calendar week and its phase jumps at every New Year.
    'week_sin': np.sin(2*np.pi*day_of_week/7),
    'week_cos': np.cos(2*np.pi*day_of_week/7),

    # Quarterly patterns (period of 3 months)
    'quarter_sin': np.sin(2*np.pi*month_index/3),
    'quarter_cos': np.cos(2*np.pi*month_index/3),

    # Half-year patterns (period of 6 months)
    'half_year_sin': np.sin(2*np.pi*month_index/6),
    'half_year_cos': np.cos(2*np.pi*month_index/6),

    # Day position in month, in [0, 1): 0 on the 1st, using the real month
    # length. dayofyear % 30 drifts away from the day of month over the year.
    'month_progress': (dates.dt.day - 1) / dates.dt.days_in_month,
})

print(f"   - Added {len(temporal_df.columns)} temporal features")

# 6. Lag Features for New Variables
print("6. Advanced Lag Features")
new_vars_for_lag = ['t2m_daily_change', 't2m_volatility_3d', 't2m_anomaly',
                    'temp_gradient_surface', 'wind_chill_factor']

# Frames searched, in this order, for each derived variable.
derived_sources = (volatility_df, weather_df, interaction_df)

advanced_lag_frames = []
for var in new_vars_for_lag:
    # Locate the source column once per variable rather than once per lag.
    source = next(
        (frame for frame in derived_sources if var in frame.columns), None
    )
    if source is None:
        # Variable not produced by any of the derived frames: skip it.
        continue
    series = source[var]
    for lag in [1, 2, 3]:  # Short-term lags for derived features
        advanced_lag_frames.append(
            series.shift(lag).rename(f"{var}_lag{lag}")
        )

if advanced_lag_frames:
    advanced_lag_df = pd.concat(advanced_lag_frames, axis=1)
    print(f"   - Added {len(advanced_lag_df.columns)} advanced lag features")
else:
    # Empty frame so the later concat still works.
    advanced_lag_df = pd.DataFrame()
    print("   - No advanced lag features added")

print("\nCombining all advanced features...")

4. Statistical Pattern Features
   - Added 9 statistical features
5. Enhanced Temporal Features
   - Added 7 temporal features
6. Advanced Lag Features
   - Added 15 advanced lag features

Combining all advanced features...


In [None]:
# Join df_fe with every advanced feature block, drop rows that contain NaN
# (warm-up rows of the rolling windows, diffs and lags), and print a per-group
# column count.
advanced_blocks = {
    'Volatility': volatility_df,
    'Weather Patterns': weather_df,
    'Interactions': interaction_df,
    'Statistical': statistical_df,
    'Temporal': temporal_df,
    'Advanced Lags': advanced_lag_df,
}

df_advanced = (
    pd.concat([df_fe, *advanced_blocks.values()], axis=1)
    .dropna()
    .reset_index(drop=True)
)

n_original = len(df_fe.columns)
print(f"Original features: {n_original}")
print(
    f"Advanced features added: {len(df_advanced.columns) - n_original}")

# Feature categories summary: the base frame first, then each block in the
# order it was concatenated.
feature_categories = {'Original': n_original}
feature_categories.update(
    (label, len(block.columns)) for label, block in advanced_blocks.items()
)

for category, count in feature_categories.items():
    print(f"   - {category}: {count} features")

Original features: 342
Advanced features added: 57
   - Original: 342 features
   - Volatility: 7 features
   - Weather Patterns: 8 features
   - Interactions: 11 features
   - Statistical: 9 features
   - Temporal: 7 features
   - Advanced Lags: 15 features


In [21]:
# Save enhanced dataset
# NOTE: this is the same path the earlier df_fe.to_csv(...) cell writes to, so
# the base feature file is replaced by the enhanced one.
df_advanced.to_csv('../Dataset/FeatureEngineering_T2M_data.csv', index=False)

In [3]:
import pandas as pd

# Read the saved file back and show its shape as a round-trip check.
# NOTE(review): this rebinds `df`, which earlier held the raw prepared table;
# re-running the earlier cells after this one would operate on the engineered
# frame instead.
df = pd.read_csv('../Dataset/FeatureEngineering_T2M_data.csv')
df.shape

(16369, 399)

In [4]:
# List every column name of the reloaded file.
df.columns.tolist()

['date',
 'tsoil1',
 'tsoil2',
 'tsoil3',
 'tsoil4',
 't2m',
 't2m_max',
 't2m_min',
 't10m',
 't10m_max',
 't10m_min',
 'ts',
 'ts_min',
 'ts_max',
 'tsurf',
 't2m_range',
 't10m_range',
 'ts_range',
 'v2m',
 'v10m',
 'v50m',
 'rhoa',
 't2mwet',
 'gwettop',
 'gwetroot',
 'gwetprof',
 'evptrns',
 'et_total',
 'evland',
 'z0m',
 'to3',
 'month',
 't2m_d1_forecast',
 't2m_d2_forecast',
 't2m_d3_forecast',
 't2m_d4_forecast',
 't2m_d5_forecast',
 't2m_d6_forecast',
 't2m_d7_forecast',
 'dayofyear',
 'month_sin',
 'month_cos',
 'doy_sin',
 'doy_cos',
 'tsoil1_lag1',
 'tsoil1_lag2',
 'tsoil1_lag3',
 'tsoil1_lag4',
 'tsoil1_lag5',
 'tsoil1_lag6',
 'tsoil1_lag7',
 'tsoil2_lag1',
 'tsoil2_lag2',
 'tsoil2_lag3',
 'tsoil2_lag4',
 'tsoil2_lag5',
 'tsoil2_lag6',
 'tsoil2_lag7',
 'tsoil3_lag1',
 'tsoil3_lag2',
 'tsoil3_lag3',
 'tsoil3_lag4',
 'tsoil3_lag5',
 'tsoil3_lag6',
 'tsoil3_lag7',
 'tsoil4_lag1',
 'tsoil4_lag2',
 'tsoil4_lag3',
 'tsoil4_lag4',
 'tsoil4_lag5',
 'tsoil4_lag6',
 'tsoil4_lag7',