In [27]:
import pandas as pd 
import numpy as np 
path = "D:/PythonCode/DSEB65A_MachineLearningProject_Group5/data/hanoi_weather_data_hourly.csv"
df = pd.read_csv(path)

## Handling Missing Value

In [28]:
df = df.drop(columns=['name', 'address', 'resolvedAddress', 'source', 'latitude', 'longitude', 'severerisk','preciptype', 'snow', 'snowdepth', 'solarenergy', 'solarradiation'])

In [29]:
print(df.columns)

Index(['datetime', 'temp', 'feelslike', 'dew', 'humidity', 'precip',
       'precipprob', 'windgust', 'windspeed', 'winddir', 'sealevelpressure',
       'cloudcover', 'visibility', 'uvindex', 'conditions', 'icon'],
      dtype='object')


In [30]:
# Loại bỏ các hàng có giá trị null
df = df.dropna()

# Kiểm tra lại kích thước của DataFrame sau khi loại bỏ
print(df.shape)

(87421, 16)


## Time Conversion 
- The code converts the datetime column to a proper datetime format using pd.to_datetime, sets it as the DataFrame index with set_index, and resamples the data to a daily frequency using `resample('D')`. The `mean(numeric_only=True)` aggregates numeric columns by calculating their daily averages, simplifying hourly data into daily summaries for further analysis.

In [31]:
df['datetime'] = pd.to_datetime(df['datetime'])
df.set_index('datetime', inplace=True)  # Đặt cột thời gian làm chỉ số
df_daily = df.resample('D').mean(numeric_only=True)

## Feature Engineer

### Wind Feature Engineer
- The wind direction, measured in degrees, is converted into its sine and cosine components using numpy functions. This transformation allows us to represent the wind direction as cyclical features.

In [32]:
df['winddir_sin'] = np.sin(np.deg2rad(df['winddir']))
df['winddir_cos'] = np.cos(np.deg2rad(df['winddir']))

- Using the wind speed and the sine/cosine components of the wind direction, calculate the wind vector components for the north-south (`wind_vector_ns`) and east-west (`wind_vector_ew`) directions.

In [33]:
df['wind_vector_ns'] = df['windspeed'] * df['winddir_cos']
df['wind_vector_ew'] = df['windspeed'] * df['winddir_sin']
df = df.drop(columns=['winddir'])

### Time Feature Engineer

In [34]:
#Thêm cột ngày trong năm
df_daily['day_of_year'] = df_daily.index.dayofyear
#Mã hóa tuần hoàn bằng sin-cos (cyclical encoding)
df_daily['day_sin'] = np.sin(2 * np.pi * df_daily['day_of_year'] / 365)
df_daily['day_cos'] = np.cos(2 * np.pi * df_daily['day_of_year'] / 365)
#Thêm cả tháng, thứ trong tuần nếu cần
df_daily['month'] = df_daily.index.month
df_daily['day_of_week'] = df_daily.index.dayofweek
df_daily['is_weekend'] = df_daily['day_of_week'].isin([5, 6]).astype(int)  # binary

- Creating Seasonal Feature

In [35]:
# Feature mùa và mã hóa one-hot (Season Indicators)
def get_season(month):
    if month in [12, 1, 2]: return 'Winter'
    elif month in [3, 4, 5]: return 'Spring'
    elif month in [6, 7, 8]: return 'Summer'
    else: return 'Fall'
df_daily['season'] = df_daily['month'].apply(get_season)
season_dummies = pd.get_dummies(df_daily['season'], prefix='season')
df_daily = pd.concat([df_daily, season_dummies], axis=1)
# Drop the 'season' column
df_daily = df_daily.drop(columns=['season'])

# Convert all dummy features to numeric
df_daily = df_daily.astype({col: int for col in season_dummies.columns})

### Seasonal Decomposition

In [36]:
from statsmodels.tsa.seasonal import STL
# Đảm bảo index là datetime và sorted (tích hợp từ code mới)
if 'datetime' in df_daily.columns:
    df_daily['datetime'] = pd.to_datetime(df_daily['datetime'])
    df_daily.set_index('datetime', inplace=True)
df_daily.sort_index(inplace=True)  # sort

# Chọn target, handle NaN ban đầu
series_to_decompose = df_daily['temp'].dropna()  # Hoặc fill nếu cần: .fillna(method='ffill')

# STL yearly
stl_yearly = STL(series_to_decompose, period=365, seasonal=13, robust=True)
result_yearly = stl_yearly.fit()


In [37]:
# Thêm features vào dataset
df_daily['temp_trend_yearly'] = result_yearly.trend
df_daily['temp_seasonal_yearly'] = result_yearly.seasonal
df_daily['temp_resid_yearly'] = result_yearly.resid

In [38]:
if df_daily[['temp_trend_yearly', 'temp_seasonal_yearly', 'temp_resid_yearly']].isna().any().any():
    # Forward/backward fill để giữ tính liên tục time series
    df_daily[['temp_trend_yearly', 'temp_seasonal_yearly', 'temp_resid_yearly']] = \
        df_daily[['temp_trend_yearly', 'temp_seasonal_yearly', 'temp_resid_yearly']].fillna(method='ffill')
    df_daily[['temp_trend_yearly', 'temp_seasonal_yearly', 'temp_resid_yearly']] = \
        df_daily[['temp_trend_yearly', 'temp_seasonal_yearly', 'temp_resid_yearly']].fillna(method='bfill')

print("NaN after handling:", df_daily[['temp_trend_yearly', 'temp_seasonal_yearly', 'temp_resid_yearly']].isna().sum().sum())

# Kiểm tra head
print(df_daily[['temp', 'temp_seasonal_yearly', 'temp_trend_yearly', 'temp_resid_yearly']].head())

NaN after handling: 0
                 temp  temp_seasonal_yearly  temp_trend_yearly  \
datetime                                                         
2015-09-27  29.629167              4.729238          24.431184   
2015-09-28  30.283333              5.392690          24.433189   
2015-09-29  30.758333              5.714577          24.435196   
2015-09-30  29.933333              5.108645          24.437206   
2015-10-01  29.408696              4.265492          24.439217   

            temp_resid_yearly  
datetime                       
2015-09-27           0.468744  
2015-09-28           0.457455  
2015-09-29           0.608560  
2015-09-30           0.387482  
2015-10-01           0.703986  


### Handling Categorical Feature
- Chuyển dữ liệu định tính hourly thành daily bằng cách lấy mode (dữ liệu xuất hiện nhiều nhất trong 24h)

In [39]:
def get_mode(series:pd.Series):
    modes = series.mode()
    return modes.iloc[0] if not modes.empty else np.nan

categorical_cols = ['conditions', 'icon']  # ví dụ
df_cat = df[categorical_cols].resample('D').agg(lambda x: get_mode(x))
df_cat_encoded = pd.get_dummies(df_cat, prefix=categorical_cols)
# Convert boolean columns to binary (0 and 1)
df_cat_encoded = df_cat_encoded.astype(int)

# Concatenate the binary-encoded categorical data with the daily dataframe
df_daily = pd.concat([df_daily, df_cat_encoded], axis=1)

***Split dataframe to avoid leakage***


In [40]:
# Chia tập dữ liệu df_daily thành 3 phần train, valid, test
train_size = int(len(df_daily) * 0.7)
valid_size = int(len(df_daily) * 0.15)

df_train = df_daily.iloc[:train_size]
df_valid = df_daily.iloc[train_size:train_size + valid_size]
df_test = df_daily.iloc[train_size + valid_size:]

# Kiểm tra kích thước của các tập dữ liệu
print("Train size:", df_train.shape)
print("Validation size:", df_valid.shape)
print("Test size:", df_test.shape)

Train size: (2557, 37)
Validation size: (548, 37)
Test size: (549, 37)


### Lag Feature

In [41]:
def create_lag_features(df, lag_cols, lags, diff_periods=[1, 7]):
    """
    Create lag and difference features for specified columns.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - lag_cols (list): List of column names to create lag features for.
    - lags (list): List of lag values to use.
    - diff_periods (list): List of periods for difference features (default: [1, 7]).

    Returns:
    - pd.DataFrame: DataFrame with lag and difference features added.
    """
    for col in lag_cols:
        for lag in lags:
            df[f'{col}_lag_{lag}'] = df[col].shift(lag)
        for period in diff_periods:
            df[f'{col}_diff_{period}'] = df[col].diff(periods=period)
    return df

# Example usage:
lag_cols = ['temp', 'humidity', 'windspeed', 'cloudcover', 'precip']
lags = [1, 2, 3, 4, 5, 7, 14, 30]
df_train = create_lag_features(df_train, lag_cols, lags)
df_valid = create_lag_features(df_valid, lag_cols, lags)
df_test = create_lag_features(df_test, lag_cols, lags)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{col}_lag_{lag}'] = df[col].shift(lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{col}_lag_{lag}'] = df[col].shift(lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{col}_lag_{lag}'] = df[col].shift(lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try us

### Rolling Window Feature

In [42]:
def create_rolling_features(df, lag_cols, windows, min_periods=1):
    """
    Create rolling mean, std, min, max, and sum features for specified columns.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - lag_cols (list): List of column names to create rolling features for.
    - windows (list): List of window sizes for rolling calculations.
    - min_periods (int): Minimum number of observations in the window required to have a value.

    Returns:
    - pd.DataFrame: DataFrame with rolling features added.
    """
    for col in lag_cols:
        for window in windows:
            # Rolling on past data (shift(1) to avoid leakage)
            shifted = df[col]
            df[f'{col}_roll_mean_{window}'] = shifted.rolling(window=window, min_periods=min_periods).mean()
            df[f'{col}_roll_std_{window}'] = shifted.rolling(window=window, min_periods=min_periods).std()

            # Add min/max/sum for specific columns
            if col in ['temp', 'precip', 'windspeed']:
                df[f'{col}_roll_min_{window}'] = shifted.rolling(window=window, min_periods=min_periods).min()
                df[f'{col}_roll_max_{window}'] = shifted.rolling(window=window, min_periods=min_periods).max()
            if col == 'precip':
                df[f'{col}_roll_sum_{window}'] = shifted.rolling(window=window, min_periods=min_periods).sum()
    return df
# Define parameters
windows = [2, 3, 4, 5, 7, 14, 21, 30]

# Apply the function to your DataFrame
df_train = create_rolling_features(df_train, lag_cols, windows)
df_valid = create_rolling_features(df_valid, lag_cols, windows)
df_test = create_rolling_features(df_test, lag_cols, windows)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{col}_roll_mean_{window}'] = shifted.rolling(window=window, min_periods=min_periods).mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{col}_roll_std_{window}'] = shifted.rolling(window=window, min_periods=min_periods).std()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{col}_ro

### Weighted Rolling Feature

In [43]:
def create_ewma_features(df, lag_cols, spans):
    """
    Create Exponentially Weighted Moving Average (EWMA) features for specified columns.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - lag_cols (list): List of column names to create EWMA features for.
    - spans (list): List of span values for EWMA calculations.

    Returns:
    - pd.DataFrame: DataFrame with EWMA features added.
    """
    for col in lag_cols:
        for span in spans:
            df[f'{col}_ewm_{span}'] = df[col].ewm(span=span, adjust=False).mean()
    return df

# Define parameters
spans = [2, 3, 4, 5, 7, 14, 21, 30]

# Apply the function to your DataFrame
df_train = create_ewma_features(df_train, lag_cols, spans)
df_train = df_train.dropna()
df_valid = create_ewma_features(df_valid, lag_cols, spans)
df_valid = df_valid.dropna()
df_test = create_ewma_features(df_test, lag_cols, spans)
df_test = df_test.dropna()

  df[f'{col}_ewm_{span}'] = df[col].ewm(span=span, adjust=False).mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{col}_ewm_{span}'] = df[col].ewm(span=span, adjust=False).mean()
  df[f'{col}_ewm_{span}'] = df[col].ewm(span=span, adjust=False).mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{col}_ewm_{span}'] = df[col].ewm(span=span, adjust=False).mean()
  df[f'{col}_ewm_{span}'] = df[col].ewm(span=span, adjust=False).mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value

### Feature Interaction

In [44]:
def create_interaction_features(df):
    """
    Create interaction features for the given DataFrame.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.

    Returns:
    - pd.DataFrame: DataFrame with new interaction features added.
    """
    # Hiệu ứng của nhiệt độ 1 ngày trước trong mùa hè
    df['interaction_lag1_summer'] = df['temp_lag_1'] * df['season_Summer']

    # Hiệu ứng của nhiệt độ 1 ngày trước vào cuối tuần
    df['interaction_lag1_weekend'] = df['temp_lag_1'] * df['is_weekend']

    # "Hiệu ứng gió lạnh" = tốc độ gió * (mức độ quang đãng của bầu trời)
    df['interaction_wind_clearsky_effect'] = df['windspeed'] * (1 - df['cloudcover'] / 100.0)

    # Bức xạ hiệu dụng (Effective Solar Radiation)
    df['effective_radiation'] = df['uvindex'] * (1 - df['cloudcover'] / 100)

    # Tương tác giữa Độ ẩm và Nhiệt độ
    df['humidity_temp_interact'] = df['humidity'] * df['temp']

    return df
# Áp dụng hàm để tạo các feature tương tác
df_train = create_interaction_features(df_train)
df_valid = create_interaction_features(df_valid)
df_test = create_interaction_features(df_test)

  df['interaction_lag1_summer'] = df['temp_lag_1'] * df['season_Summer']
  df['interaction_lag1_weekend'] = df['temp_lag_1'] * df['is_weekend']
  df['interaction_wind_clearsky_effect'] = df['windspeed'] * (1 - df['cloudcover'] / 100.0)
  df['effective_radiation'] = df['uvindex'] * (1 - df['cloudcover'] / 100)
  df['humidity_temp_interact'] = df['humidity'] * df['temp']
  df['interaction_lag1_summer'] = df['temp_lag_1'] * df['season_Summer']
  df['interaction_lag1_weekend'] = df['temp_lag_1'] * df['is_weekend']
  df['interaction_wind_clearsky_effect'] = df['windspeed'] * (1 - df['cloudcover'] / 100.0)
  df['effective_radiation'] = df['uvindex'] * (1 - df['cloudcover'] / 100)
  df['humidity_temp_interact'] = df['humidity'] * df['temp']
  df['interaction_lag1_summer'] = df['temp_lag_1'] * df['season_Summer']
  df['interaction_lag1_weekend'] = df['temp_lag_1'] * df['is_weekend']
  df['interaction_wind_clearsky_effect'] = df['windspeed'] * (1 - df['cloudcover'] / 100.0)
  df['effective_radi

### Hourly Rolling Feature
- The rolling feature created by the mean of temperature in a range of hour.

In [45]:
rolling_hour_feature = ['temp', 'feelslike', 'dew', 'humidity', 'precip', 'precipprob', 'windspeed', 'uvindex']
window_sizes = [3, 6, 9, 12, 15, 18]  # Các giá trị window

def create_rolling_hour_feature(df_infunc:pd.DataFrame):
    # Tạo dataframe tạm thời để lưu các feature rolling
    df_temp = pd.DataFrame(index=df.index)
    
    # Tính rolling feature cho từng feature và từng window size
    for feature in rolling_hour_feature:
        for window in window_sizes:
            col_name = f'{feature}_rolling_last_{window}h'
            df_temp[col_name] = df[feature].rolling(window=window).mean()
    
    # Tái cấu trúc thời gian dữ liệu rolling về daily và lưu vào df_daily
    for col in df_temp.columns:
        df_infunc[col] = df_temp[col].resample('D').last()
    
    # Xóa dataframe tạm thời để bảo toàn tài nguyên
    del df_temp

# Gọi hàm để tạo rolling features cho train, test, split
create_rolling_hour_feature(df_train)
create_rolling_hour_feature(df_valid)
create_rolling_hour_feature(df_test)


  df_infunc[col] = df_temp[col].resample('D').last()
  df_infunc[col] = df_temp[col].resample('D').last()
  df_infunc[col] = df_temp[col].resample('D').last()
  df_infunc[col] = df_temp[col].resample('D').last()
  df_infunc[col] = df_temp[col].resample('D').last()
  df_infunc[col] = df_temp[col].resample('D').last()
  df_infunc[col] = df_temp[col].resample('D').last()
  df_infunc[col] = df_temp[col].resample('D').last()
  df_infunc[col] = df_temp[col].resample('D').last()
  df_infunc[col] = df_temp[col].resample('D').last()
  df_infunc[col] = df_temp[col].resample('D').last()
  df_infunc[col] = df_temp[col].resample('D').last()
  df_infunc[col] = df_temp[col].resample('D').last()
  df_infunc[col] = df_temp[col].resample('D').last()
  df_infunc[col] = df_temp[col].resample('D').last()
  df_infunc[col] = df_temp[col].resample('D').last()
  df_infunc[col] = df_temp[col].resample('D').last()
  df_infunc[col] = df_temp[col].resample('D').last()
  df_infunc[col] = df_temp[col].resample('D').

## X/y Splitting
- Creating Label

In [46]:
def create_target_features(df, horizons):
    """
    Create target features for forecasting based on specified horizons.

    Parameters:
    - df (pd.DataFrame): The input DataFrame containing the 'temp' column.
    - horizons (list): List of horizons (time steps) for which to create target features.

    Returns:
    - pd.DataFrame: DataFrame with target features added and NaN rows dropped.
    """
    for h in horizons:
        df[f'target_temp_t+{h}'] = df['temp'].shift(-h)
    return df.dropna()

# Example usage:
horizons = [1, 2, 3, 4, 5]
df_train = create_target_features(df_train, horizons)
df_train = df_train.dropna()
df_valid = create_target_features(df_valid, horizons)
df_valid = df_valid.dropna()
df_test = create_target_features(df_test, horizons)
df_test = df_test.dropna()

  df[f'target_temp_t+{h}'] = df['temp'].shift(-h)
  df[f'target_temp_t+{h}'] = df['temp'].shift(-h)
  df[f'target_temp_t+{h}'] = df['temp'].shift(-h)
  df[f'target_temp_t+{h}'] = df['temp'].shift(-h)
  df[f'target_temp_t+{h}'] = df['temp'].shift(-h)
  df[f'target_temp_t+{h}'] = df['temp'].shift(-h)
  df[f'target_temp_t+{h}'] = df['temp'].shift(-h)
  df[f'target_temp_t+{h}'] = df['temp'].shift(-h)
  df[f'target_temp_t+{h}'] = df['temp'].shift(-h)
  df[f'target_temp_t+{h}'] = df['temp'].shift(-h)
  df[f'target_temp_t+{h}'] = df['temp'].shift(-h)
  df[f'target_temp_t+{h}'] = df['temp'].shift(-h)
  df[f'target_temp_t+{h}'] = df['temp'].shift(-h)
  df[f'target_temp_t+{h}'] = df['temp'].shift(-h)
  df[f'target_temp_t+{h}'] = df['temp'].shift(-h)


- Feature\Label Splitting

In [47]:
def feature_label_split(df_infunc: pd.DataFrame, horizons=[1, 2, 3, 4, 5]):
    X = df_infunc.drop([f'target_temp_t+{h}' for h in horizons], axis=1)
    y = df_infunc[[f'target_temp_t+{h}' for h in horizons]]
    return X, y

X_train, y_train = feature_label_split(df_train)
X_valid, y_valid = feature_label_split(df_valid)
X_test, y_test = feature_label_split(df_test)
print(X_train.shape, y_train.shape)

(2522, 316) (2522, 5)


In [48]:
# Check for non-numeric features in the DataFrame
non_numeric_features = df_train.select_dtypes(exclude=[np.number]).columns.tolist()

if non_numeric_features:
    print("Non-numeric features found:")
    print(non_numeric_features)
else:
    print("All features are numeric.")

All features are numeric.


## Handle Multicollinearity

In [49]:
# Select numeric columns
num_df = X_train.select_dtypes(include=[np.number])

# Compute Pearson correlation
corr_matrix = num_df.corr()

top_corr = corr_matrix['temp'].abs().sort_values(ascending=False).head(40)

# Handling multicolinearity

def find_highly_correlated_features(corr_matrix, threshold=0.95):
    """
    Find feature pairs with correlation above the specified threshold.
    """
    # Get the upper triangle of the correlation matrix
    upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    # Find feature pairs with correlation > threshold
    highly_correlated_pairs = [
        (column, upper_tri[column].abs().idxmax(), upper_tri[column].abs().max())
        for column in upper_tri.columns if upper_tri[column].abs().max() > threshold
    ]

    # Sort for easier viewing
    sorted_pairs = sorted(highly_correlated_pairs, key=lambda x: x[2], reverse=True)

    return sorted_pairs


def drop_correlated_features(df, correlated_pairs, target_col):
    """
    Drop one feature from each highly correlated pair.
    This function keeps the feature that has higher correlation with the target variable.
    Args:
        df (pd.DataFrame): The DataFrame containing the data.
        correlated_pairs (list): List of highly correlated feature pairs from the find_highly_correlated_features function.
        target_col (str): The name of the target column (e.g., 'temp').
    Returns:
        pd.DataFrame: The new DataFrame with features dropped.
        set: A set of dropped columns.
    """
    cols_to_drop = set()

    # Calculate correlation of all features with the target column once
    corr_with_target = df.corr()[target_col].abs()

    # Iterate through the pairs found
    for feat1, feat2, _ in correlated_pairs:
        # Get the correlation values with the target for each feature in the pair
        corr1 = corr_with_target.get(feat1, 0)
        corr2 = corr_with_target.get(feat2, 0)

        # Decide which feature to drop
        if corr1 > corr2:
            # Keep feat1, drop feat2
            cols_to_drop.add(feat2)
        else:
            # Keep feat2, drop feat1
            cols_to_drop.add(feat1)

    # Drop the identified columns
    df_reduced = df.drop(columns=list(cols_to_drop))

    print(f"\nDropped {len(cols_to_drop)} features due to multicollinearity.")
    if cols_to_drop:
        print("- Dropped columns:", list(cols_to_drop))

    return df_reduced, cols_to_drop

correlated_pairs = find_highly_correlated_features(corr_matrix, threshold=0.70)

print("\n feature pairs with very high correlation (> 0.70):")
for feat1, feat2, corr_val in correlated_pairs:
    print(f"- {feat1:<25} and {feat2:<25} : {corr_val:.4f}")

X_train, dropped_cols = drop_correlated_features(df=X_train,
                                                  correlated_pairs=correlated_pairs,
                                                  target_col='temp')
X_valid = X_valid[X_train.columns]
X_test = X_test[X_train.columns]


 feature pairs with very high correlation (> 0.70):
- precip_roll_sum_3         and precip_roll_mean_3        : 1.0000
- precip_roll_sum_30        and precip_roll_mean_30       : 1.0000
- precip_roll_sum_21        and precip_roll_mean_21       : 1.0000
- precip_roll_sum_2         and precip_roll_mean_2        : 1.0000
- precip_roll_sum_4         and precip_roll_mean_4        : 1.0000
- precip_roll_sum_7         and precip_roll_mean_7        : 1.0000
- precip_roll_sum_5         and precip_roll_mean_5        : 1.0000
- precip_roll_sum_14        and precip_roll_mean_14       : 1.0000
- precip_rolling_last_15h   and precip_rolling_last_12h   : 0.9997
- uvindex_rolling_last_18h  and uvindex                   : 0.9993
- dew_rolling_last_18h      and dew_rolling_last_15h      : 0.9990
- temp_ewm_5                and temp_ewm_4                : 0.9990
- dew_rolling_last_15h      and dew_rolling_last_12h      : 0.9990
- temp_rolling_last_15h     and temp_rolling_last_12h     : 0.9988
- dew_rol

## Save dataframe after preprocessing

In [None]:
import pickle

# Đường dẫn lưu file
output_path = "Hourly Dataframe Preprocessed.pkl"

# Lưu các DataFrame vào file
with open(output_path, 'wb') as f:
    pickle.dump({
        'X_train': X_train,
        'y_train': y_train,
        'X_valid': X_valid,
        'y_valid': y_valid,
        'X_test': X_test,
        'y_test': y_test
    }, f)

print(f"DataFrames have been saved to {output_path}")