## Import Dataset

In [None]:
!pip install gdown

In [None]:
!gdown --folder https://drive.google.com/drive/folders/1dydbU9HlSIgGQBzYMLogDNI27uO6wga7?usp=drive_link

## Load & Clean the Data

In [None]:
import os
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool, cpu_count

# Define your cleaning function
def clean_header_and_drop_unuse_row(tmp_df):
    """Promote the first data row to the header and drop rows without a date.

    The first row of ``tmp_df`` becomes the column labels and is removed from
    the data; the remaining rows are re-indexed from 0. If the result has a
    ``'Date'`` column, rows whose ``'Date'`` is missing are dropped (the index
    is not reset again after that filter).

    The caller's frame is left untouched; a new DataFrame is returned.

    Raises:
        IndexError: if ``tmp_df`` has no rows, i.e. there is no header row.
    """
    header = tmp_df.iloc[0]
    # reset_index() hands back a new object, so relabelling its columns cannot
    # leak into the frame the caller passed in.
    cleaned = tmp_df.iloc[1:].reset_index(drop=True)
    cleaned.columns = header
    if 'Date' in cleaned.columns:
        cleaned = cleaned[cleaned['Date'].notna()]
    return cleaned

# Helper function to process a single file
def process_file(file_info):
    """Convert one Excel workbook into a cleaned CSV under /content/cleaned_data.

    Args:
        file_info: ``(file_path, rel_path)`` tuple: the workbook to read and
            its path relative to the input root. The relative path is reused
            below the output root, with the extension swapped to ``.csv``.

    Returns:
        A status string. Failures are reported in the returned string rather
        than raised, so a single bad file does not abort a pool run.
    """
    file_path, rel_path = file_info

    try:
        tmp_df = pd.read_excel(file_path)
        cleaned_df = clean_header_and_drop_unuse_row(tmp_df)

        # Swap only the file extension; a blanket str.replace would also
        # rewrite ".xlsx" wherever it occurs in a directory or base name.
        csv_rel_path = os.path.splitext(rel_path)[0] + ".csv"
        output_path = os.path.join("/content/cleaned_data", csv_rel_path)
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # Save to CSV
        cleaned_df.to_csv(output_path, index=False)
        return f"✅ Processed: {file_path}"
    except Exception as e:
        # Best-effort by design: the error text is surfaced in the summary.
        return f"❌ Error with {file_path}: {str(e)}"

# Gather all .xlsx files with relative paths
root_dir = "/content/Load-data"

# Pair each workbook's full path with its path relative to root_dir so the
# folder layout can be mirrored on output.
xlsx_files = [
    (
        os.path.join(folder, name),
        os.path.relpath(os.path.join(folder, name), root_dir),
    )
    for folder, _, names in os.walk(root_dir)
    for name in names
    if name.endswith(".xlsx")
]

# One worker per CPU; imap_unordered yields statuses in completion order
with Pool(cpu_count()) as pool:
    progress = tqdm(pool.imap_unordered(process_file, xlsx_files), total=len(xlsx_files))
    results = list(progress)

# Optional: print the per-file status lines
for status in results:
    print(status)

100%|██████████| 71/71 [00:03<00:00, 21.25it/s]

✅ Processed: /content/Load-data/Data_อาคารวิทยนิเวศน์/รายงานสรุป-Demand-รายวัน-อาคารวิทยนิเวศน์-03-2024.xlsx
✅ Processed: /content/Load-data/Data_อาคารวิทยนิเวศน์/รายงานสรุป-Demand-รายวัน-อาคารวิทยนิเวศน์-08-2024.xlsx
✅ Processed: /content/Load-data/Data_อาคารวิทยนิเวศน์/รายงานสรุป-Demand-รายวัน-อาคารวิทยนิเวศน์-02-2024.xlsx
✅ Processed: /content/Load-data/Data_อาคารวิทยนิเวศน์/รายงานสรุป-Demand-รายวัน-อาคารวิทยนิเวศน์-12-2023.xlsx
✅ Processed: /content/Load-data/Data_อาคารวิทยนิเวศน์/รายงานสรุป-Demand-รายวัน-อาคารวิทยนิเวศน์-04-2024.xlsx
✅ Processed: /content/Load-data/Data_อาคารวิทยนิเวศน์/รายงานสรุป-Demand-รายวัน-อาคารวิทยนิเวศน์-01-2024.xlsx
✅ Processed: /content/Load-data/Data_อาคารวิทยนิเวศน์/รายงานสรุป-Demand-รายวัน-อาคารวิทยนิเวศน์-05-2024.xlsx
✅ Processed: /content/Load-data/Data_อาคารวิทยนิเวศน์/รายงานสรุป-Demand-รายวัน-อาคารวิทยนิเวศน์-09-2024.xlsx
✅ Processed: /content/Load-data/Data_อาคารวิทยนิเวศน์/รายงานสรุป-Demand-รายวัน-อาคารวิทยนิเวศน์-10-2024.xlsx
✅ Processed: /conte




## Preprocess

In [None]:
import os
import pandas as pd
import re
from datetime import datetime
from tqdm import tqdm
from multiprocessing import Pool, cpu_count

# Updated function
def preprocess_convert_datatype_with_date(tmp_df, filename):
    """Rebuild the 'Date' column from the filename and coerce the rest to numbers.

    The first ``MM-YYYY`` pattern found in ``filename`` gives the month; the
    'Date' column is overwritten with consecutive daily timestamps starting on
    day 1 of that month, one per row. Every other column is converted with
    ``pd.to_numeric`` and unparseable values become NaN.

    ``tmp_df`` is modified in place and also returned.

    NOTE(review): this assumes row i corresponds to day i+1 of the month;
    confirm no days are missing from the input.

    Raises:
        ValueError: if ``filename`` contains no ``MM-YYYY`` pattern.
    """
    found = re.search(r"(\d{2})-(\d{4})", filename)
    if found is None:
        raise ValueError(f"Cannot extract date from filename: {filename}")

    month, year = (int(part) for part in found.groups())

    # One daily timestamp per row, starting on the first of the month
    first_day = datetime(year, month, 1)
    tmp_df['Date'] = pd.date_range(start=first_day, periods=len(tmp_df), freq='D')

    # Everything except 'Date' is treated as a numeric reading
    value_cols = [name for name in tmp_df.columns if name != "Date"]
    tmp_df[value_cols] = tmp_df[value_cols].apply(pd.to_numeric, errors='coerce')

    return tmp_df

# Wrapper for parallel processing
def process_csv_file(file_info):
    """Preprocess one cleaned CSV and write it under /content/preprocessed_data.

    Args:
        file_info: ``(file_path, rel_path)`` tuple: the CSV to read and its
            path relative to the input root (reused below the output root).

    Returns:
        A status string; any exception is reported in the string, not raised.
    """
    src_path, relative = file_info

    try:
        frame = preprocess_convert_datatype_with_date(
            pd.read_csv(src_path), os.path.basename(src_path)
        )

        # The first component of the relative path serves as the station label
        station = os.path.normpath(relative).split(os.sep)[0]
        frame.insert(0, 'station_name', station)

        # Mirror the relative layout in the output folder
        destination = os.path.join("/content/preprocessed_data", relative)
        os.makedirs(os.path.dirname(destination), exist_ok=True)
        frame.to_csv(destination, index=False)
    except Exception as err:
        return f"❌ Error with {src_path}: {str(err)}"
    return f"✅ Processed: {src_path}"

# Collect files
root_dir = "/content/cleaned_data"

# Pair each cleaned CSV's full path with its path relative to root_dir
csv_files = [
    (
        os.path.join(folder, name),
        os.path.relpath(os.path.join(folder, name), root_dir),
    )
    for folder, _, names in os.walk(root_dir)
    for name in names
    if name.endswith(".csv")
]

# One worker per CPU; statuses come back in completion order
with Pool(cpu_count()) as pool:
    progress = tqdm(pool.imap_unordered(process_csv_file, csv_files), total=len(csv_files))
    results = list(progress)

# Optional: print the per-file status lines
for status in results:
    print(status)

100%|██████████| 71/71 [00:01<00:00, 47.53it/s]

✅ Processed: /content/cleaned_data/Data_อาคารวิทยนิเวศน์/รายงานสรุป-Demand-รายวัน-อาคารวิทยนิเวศน์-09-2024.csv
✅ Processed: /content/cleaned_data/Data_อาคารวิทยนิเวศน์/รายงานสรุป-Demand-รายวัน-อาคารวิทยนิเวศน์-08-2024.csv
✅ Processed: /content/cleaned_data/Data_อาคารวิทยนิเวศน์/รายงานสรุป-Demand-รายวัน-อาคารวิทยนิเวศน์-10-2024.csv
✅ Processed: /content/cleaned_data/Data_อาคารวิทยนิเวศน์/รายงานสรุป-Demand-รายวัน-อาคารวิทยนิเวศน์-02-2024.csv
✅ Processed: /content/cleaned_data/Data_อาคารวิทยนิเวศน์/รายงานสรุป-Demand-รายวัน-อาคารวิทยนิเวศน์-11-2024.csv
✅ Processed: /content/cleaned_data/Data_อาคารวิทยนิเวศน์/รายงานสรุป-Demand-รายวัน-อาคารวิทยนิเวศน์-01-2024.csv
✅ Processed: /content/cleaned_data/Data_อาคารวิทยนิเวศน์/รายงานสรุป-Demand-รายวัน-อาคารวิทยนิเวศน์-07-2024.csv
✅ Processed: /content/cleaned_data/Data_อาคารวิทยนิเวศน์/รายงานสรุป-Demand-รายวัน-อาคารวิทยนิเวศน์-05-2024.csv
✅ Processed: /content/cleaned_data/Data_อาคารวิทยนิเวศน์/รายงานสรุป-Demand-รายวัน-อาคารวิทยนิเวศน์-06-2024.csv
✅




## Concatenate and Reshape to Long Format

In [None]:
import re

# After processing is done, concatenate all processed files
all_data = []

output_root = "/content/preprocessed_data"

for subdir, _, files in os.walk(output_root):
    for file in files:
        if file.endswith(".csv"):
            file_path = os.path.join(subdir, file)
            try:
                df = pd.read_csv(file_path)
                all_data.append(df)
            except Exception as e:
                print(f"❌ Failed to read {file_path}: {e}")

# Everything below needs all_data_df, so stop with a clear error instead of
# printing a warning and then failing later on an undefined name.
if not all_data:
    raise RuntimeError("⚠️ No data was loaded for concatenation.")

# Concatenate all data
all_data_df = pd.concat(all_data, ignore_index=True)
all_data_df.to_csv("/content/all_data_df.csv", index=False)
print("✅ All data concatenated and saved to /content/all_data_df.csv")

# Identify time columns (HH:MM format)
time_columns = [col for col in all_data_df.columns if re.match(r"^\d{1,2}:\d{2}$", str(col))]

# Melt the DataFrame to long format: one row per (station, day, time-of-day)
long_df = all_data_df.melt(
    id_vars=['station_name', 'Date'],
    value_vars=time_columns,
    var_name='Time',
    value_name='Electricity(kW)'
)

# Combine 'Date' and 'Time' into full datetime
long_df['Date'] = pd.to_datetime(long_df['Date'].astype(str) + ' ' + long_df['Time'])

# Drop 'Time' column (now folded into 'Date')
long_df.drop(columns=['Time'], inplace=True)

# Sort by station_name first, then by Date
long_df.sort_values(by=['station_name', 'Date'], inplace=True)

# Save to CSV
long_df.to_csv('/content/all_data_timeseries.csv', index=False)
print("✅ Time series data saved and sorted by station_name > Date to /content/all_data_timeseries.csv")

✅ All data concatenated and saved to /content/all_data_df.csv
✅ Time series data saved and sorted by station_name > Date to /content/all_data_timeseries.csv


## Define Weight

In [None]:
# Rows available per station (value_counts sorts by count, largest first)
station_counts = long_df['station_name'].value_counts()

# Inverse-frequency weight: the station with the most rows gets 1,
# stations with fewer rows get proportionally more.
max_count = station_counts.max()
normalized_reverse_weights = station_counts.rdiv(max_count)

# Two-column table for easier viewing
station_weights_df = pd.DataFrame({
    'station_name': normalized_reverse_weights.index,
    'normalized_reverse_weight': normalized_reverse_weights.to_numpy(),
})

# Print result
print(station_weights_df)

             station_name  normalized_reverse_weight
0         Data_สถานีชาร์จ                   1.000000
1     Data_อาคารจามจุรี 9                   1.000000
2   Data_อาคารวิทยนิเวศน์                   1.000000
3   Data_อาคารจุลจักรพงษ์                   1.002786
4  Data_อาคารบรมราชกุมารี                   1.002786
5      Data_อาคารจามจุรี4                   1.094225


## Experiment [Clean Data]

In [None]:
# Negative readings are replaced with zero (missing values are left as they are)
negative_reading = long_df['Electricity(kW)'].lt(0)
long_df.loc[negative_reading, 'Electricity(kW)'] = 0

## Split Train and Test

In [None]:
# Split fractions
train_ratio = 0.8
test_ratio = 0.2  # informational only (1 - train_ratio); not read below

# Per-station pieces, collected before recombining
train_list = []
test_list = []

# Chronological split per station: the earliest train_ratio share of rows
# goes to train, the remainder to test.
for _station, station_rows in long_df.groupby('station_name'):
    ordered = station_rows.sort_values('Date')
    cut = int(len(ordered) * train_ratio)

    train_list.append(ordered.iloc[:cut])
    test_list.append(ordered.iloc[cut:])

# Combine all stations back into global sets
train_df = pd.concat(train_list).reset_index(drop=True)
test_df = pd.concat(test_list).reset_index(drop=True)

# Save to CSV
for frame, target in (
    (train_df, '/content/train_timeseries.csv'),
    (test_df, '/content/test_timeseries.csv'),
):
    frame.to_csv(target, index=False)

print("✅ Split completed:")
print(f"Train set size: {len(train_df)} rows")
print(f"Test set size: {len(test_df)} rows")

✅ Split completed:
Train set size: 163353 rows
Test set size: 40839 rows


In [None]:
import pandas as pd
# Reload the saved split from disk, replacing the in-memory frames.
# read_csv is called without parse_dates, so 'Date' is not converted to
# datetime by this step.
test_df = pd.read_csv("/content/test_timeseries.csv")
train_df = pd.read_csv("/content/train_timeseries.csv")

In [None]:
# Station name -> coordinate pair, passed to add_astgcn_embeddings, whose
# docstring describes the values as (lat, lon). The keys must match the
# 'station_name' values in the data exactly (note the space in "จามจุรี 9").
locations = {
    "Data_สถานีชาร์จ": (13.73624, 100.52995),
    "Data_อาคารจามจุรี4": (13.73260, 100.53177),
    "Data_อาคารจามจุรี 9": (13.73380, 100.53045),
    "Data_อาคารจุลจักรพงษ์": (13.73684, 100.52852),
    "Data_อาคารบรมราชกุมารี": (13.73800, 100.52905),
    "Data_อาคารวิทยนิเวศน์": (13.73723, 100.53015),
}
# Quick look at column dtypes and non-null counts of the training frame
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163353 entries, 0 to 163352
Data columns (total 3 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   station_name     163353 non-null  object        
 1   Date             163353 non-null  datetime64[ns]
 2   Electricity(kW)  163231 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 3.7+ MB


In [None]:
!pip install torch-geometric-temporal

In [None]:
import torch
import numpy as np
import pandas as pd
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import LabelEncoder
from torch_geometric.data import Data
from torch_geometric.transforms import LaplacianLambdaMax
from torch_geometric_temporal.nn.attention.astgcn import ASTGCN

def add_astgcn_embeddings(train_df: pd.DataFrame,
                          locations: dict,
                          len_input: int = 12,
                          k_neighbors: int = 4,
                          astgcn_params: dict = None) -> pd.DataFrame:
    """
    Compute an ASTGCN output for each (station, timestamp) pair and merge it
    into ``train_df`` as a new column 'astgcn_emb'.

    The ASTGCN is constructed inside this function and run once under
    ``torch.no_grad()`` with no training step, so the values come from the
    model's initial weights, and every call builds a new model.

    Parameters
    ----------
    train_df : pd.DataFrame
        Input dataframe with columns ['station_name', 'Date', 'Electricity(kW)'].
        Each (Date, station_name) pair must be unique (required by ``pivot``).
    locations : dict
        Mapping from station_name to (lat, lon) coordinate. Its key order
        defines the node order of the graph.
    len_input : int
        Length of the input time window for ASTGCN (T_in).
    k_neighbors : int
        Number of neighbors for the KNN graph.
    astgcn_params : dict
        Keyword args for ASTGCN constructor
        (nb_block, in_channels, K,
         nb_chev_filter, nb_time_filter,
         time_strides, num_for_predict,
         len_input, num_of_vertices).
        When None, a single-block model with num_for_predict=1 is used.

    Returns
    -------
    pd.DataFrame
        A copy of train_df (left merge) with an added 'astgcn_emb' column.
        Rows whose Date is among the first ``len_input`` distinct timestamps
        have NaN, because no full history window precedes them.

    Raises
    ------
    ValueError
        If the data holds no more than ``len_input`` distinct timestamps, so
        no window can be formed.
    """
    # 1. Build edge_index via KNN on station coordinates
    station_names = list(locations.keys())
    coords = np.array([locations[name] for name in station_names])
    A = kneighbors_graph(coords, n_neighbors=k_neighbors,
                         mode='connectivity', include_self=False)
    edges = np.vstack(A.nonzero())
    edge_index = torch.tensor(edges, dtype=torch.long)

    # 2. Pivot to matrix: rows=time, cols=stations.
    #    Gaps are forward-filled, and anything still missing (leading gaps,
    #    stations absent from the data) becomes 0.
    df_pivot = (train_df
                .pivot(index='Date', columns='station_name',
                       values='Electricity(kW)')
                .reindex(columns=station_names)
                .ffill()  # fillna(method='ffill') is deprecated in pandas
                .fillna(0))
    data_matrix = df_pivot.values  # shape (T, N)
    T, N = data_matrix.shape

    # 3. Create sliding windows: window i covers rows [i, i + len_input)
    num_windows = T - len_input
    if num_windows <= 0:
        raise ValueError(
            f"Need more than len_input={len_input} distinct timestamps to "
            f"build a window, got {T}"
        )
    X = np.stack([data_matrix[i:i + len_input] for i in range(num_windows)])
    # reshape to (B, N, F_in, T_in)
    X = torch.tensor(X, dtype=torch.float).permute(0, 2, 1).unsqueeze(2)

    # 4. Initialize ASTGCN
    if astgcn_params is None:
        astgcn_params = dict(nb_block=1, in_channels=1, K=3,
                             nb_chev_filter=64, nb_time_filter=64,
                             time_strides=1, num_for_predict=1,
                             len_input=len_input, num_of_vertices=N)
    model = ASTGCN(**astgcn_params)
    model.eval()  # inference only; no training happens in this function

    # 5. Forward pass. lambda_max is not passed to the model; if a
    #    non-default Laplacian normalization is ever used it may need to be
    #    computed (e.g. via LaplacianLambdaMax) and supplied.
    with torch.no_grad():
        out = model(X, edge_index)  # (B, N, num_for_predict)
    emb = out.squeeze(-1).cpu().numpy()  # (B, N) when num_for_predict == 1

    # 6. Build embedding DataFrame. Window i is labelled with the timestamp
    #    right after its last row, so each value only uses earlier readings.
    emb_dates = df_pivot.index[len_input:]
    df_emb = (
        pd.DataFrame(emb, index=emb_dates, columns=station_names)
          .reset_index()
          .melt(id_vars='Date',
                var_name='station_name',
                value_name='astgcn_emb')
    )

    # 7. Merge back into original train_df
    train_df = train_df.merge(df_emb, on=['Date', 'station_name'], how='left')
    return train_df


# === Example usage ===

# Each call constructs its own ASTGCN inside add_astgcn_embeddings, so the
# train and test columns come from two separately initialised models.
# NOTE(review): confirm that is intended; the two 'astgcn_emb' columns may not
# be comparable with each other.
train_df_with_emb = add_astgcn_embeddings(train_df, locations, len_input=24, k_neighbors=2)
test_df_with_emb = add_astgcn_embeddings(test_df, locations, len_input=24, k_neighbors=2)

  .fillna(method='ffill')
  .fillna(method='ffill')


In [None]:
train_df_with_emb

Unnamed: 0,station_name,Date,Electricity(kW),astgcn_emb
0,Data_สถานีชาร์จ,2023-12-01 00:00:00,0.57,
1,Data_สถานีชาร์จ,2023-12-01 00:15:00,1.08,
2,Data_สถานีชาร์จ,2023-12-01 00:30:00,0.70,
3,Data_สถานีชาร์จ,2023-12-01 00:45:00,0.89,
4,Data_สถานีชาร์จ,2023-12-01 01:00:00,1.01,
...,...,...,...,...
163348,Data_อาคารวิทยนิเวศน์,2024-09-15 22:45:00,0.00,1.457748
163349,Data_อาคารวิทยนิเวศน์,2024-09-15 23:00:00,0.00,1.524272
163350,Data_อาคารวิทยนิเวศน์,2024-09-15 23:15:00,0.00,1.592857
163351,Data_อาคารวิทยนิเวศน์,2024-09-15 23:30:00,0.00,1.571176


## Modeling

In [None]:
!pip install -U autogluon

Collecting autogluon
  Downloading autogluon-1.3.1-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.core==1.3.1 (from autogluon.core[all]==1.3.1->autogluon)
  Downloading autogluon.core-1.3.1-py3-none-any.whl.metadata (12 kB)
Collecting autogluon.features==1.3.1 (from autogluon)
  Downloading autogluon.features-1.3.1-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.tabular==1.3.1 (from autogluon.tabular[all]==1.3.1->autogluon)
  Downloading autogluon.tabular-1.3.1-py3-none-any.whl.metadata (14 kB)
Collecting autogluon.multimodal==1.3.1 (from autogluon)
  Downloading autogluon.multimodal-1.3.1-py3-none-any.whl.metadata (13 kB)
Collecting autogluon.timeseries==1.3.1 (from autogluon.timeseries[all]==1.3.1->autogluon)
  Downloading autogluon.timeseries-1.3.1-py3-none-any.whl.metadata (12 kB)
Collecting boto3<2,>=1.10 (from autogluon.core==1.3.1->autogluon.core[all]==1.3.1->autogluon)
  Downloading boto3-1.39.4-py3-none-any.whl.metadata (6.6 kB)
Collecting autogluon.common==1.3.

## Baseline Model

In [None]:
import pandas as pd
import numpy as np
from autogluon.tabular import TabularPredictor
from autogluon.core.metrics import make_scorer

# -------------------------
# 1. Load and Clean Data
# -------------------------


# Clean non-finite values
def clean_non_finite(df):
    """Return a copy of ``df`` keeping only rows with a finite 'Electricity(kW)'.

    Rows where the value is NaN or +/-inf are removed; the surviving rows keep
    their original index labels.
    """
    finite_rows = np.isfinite(df['Electricity(kW)'])
    kept = df.loc[finite_rows]
    return kept.copy()

train_df = clean_non_finite(train_df)
test_df = clean_non_finite(test_df)

print("✅ Cleaned non-finite values from 'Electricity(kW)' in train and test sets.")

# -------------------------
# 2. Feature Engineering
# -------------------------
# Convert Date to datetime
train_df['Date'] = pd.to_datetime(train_df['Date'])
test_df['Date'] = pd.to_datetime(test_df['Date'])

# Extract time-based features (added to train_df / test_df in place)
for frame in [train_df, test_df]:
    frame['hour'] = frame['Date'].dt.hour
    frame['minute'] = frame['Date'].dt.minute
    frame['dayofweek'] = frame['Date'].dt.dayofweek
    frame['month'] = frame['Date'].dt.month
    frame['year'] = frame['Date'].dt.year

# Define features and target ('year' is extracted above but not used as a feature)
feature_cols = ['station_name', 'hour', 'minute', 'dayofweek', 'month']
target_col = 'Electricity(kW)'

# .copy() gives independent frames, so the column assignments below do not
# trigger pandas' SettingWithCopyWarning. Row labels still match train_df/test_df.
train_data = train_df[feature_cols + [target_col]].copy()
test_data = test_df[feature_cols + [target_col]].copy()

# Convert categorical features
for frame in [train_data, test_data]:
    frame['station_name'] = frame['station_name'].astype('category')




✅ Cleaned non-finite values from 'Electricity(kW)' in train and test sets.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['station_name'] = df['station_name'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['station_name'] = df['station_name'].astype('category')


In [None]:
train_data

Unnamed: 0,station_name,hour,minute,dayofweek,month,Electricity(kW)
0,Data_สถานีชาร์จ,0,0,4,12,0.57
1,Data_สถานีชาร์จ,0,15,4,12,1.08
2,Data_สถานีชาร์จ,0,30,4,12,0.70
3,Data_สถานีชาร์จ,0,45,4,12,0.89
4,Data_สถานีชาร์จ,1,0,4,12,1.01
...,...,...,...,...,...,...
163348,Data_อาคารวิทยนิเวศน์,22,45,6,9,0.00
163349,Data_อาคารวิทยนิเวศน์,23,0,6,9,0.00
163350,Data_อาคารวิทยนิเวศน์,23,15,6,9,0.00
163351,Data_อาคารวิทยนิเวศน์,23,30,6,9,0.00


In [None]:

# -------------------------
# 4. Train AutoGluon Model
# -------------------------
# Regression predictor scored with MSE
predictor = TabularPredictor(
    label=target_col,
    problem_type='regression',
    eval_metric="mse",
)

# hyperparameters={'RF': {}} requests only the 'RF' model family with its
# default settings; the run is capped at one hour.
predictor.fit(
    train_data=train_data,
    test_data=test_data,
    presets='high',
    hyperparameters={'RF': {}},
    time_limit=3600,
)

# -------------------------
# 5. Predict and Evaluate
# -------------------------
y_pred = predictor.predict(test_data)
y_true = test_data[target_col].values

# Keep truth and prediction side by side on the test frame
test_data['y_true'] = y_true
test_data['y_pred'] = y_pred
predictor.leaderboard(test_data, silent=True)

No path specified. Models will be saved in: "AutogluonModels/ag-20250714_142902"
Preset alias specified: 'high' maps to 'high_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.3.1
Python Version:     3.11.13
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Sun Mar 30 16:01:29 UTC 2025
CPU Count:          2
Memory Avail:       10.94 GB / 12.67 GB (86.3%)
Disk Space Avail:   69.31 GB / 112.64 GB (61.5%)
Presets specified: ['high']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Note: `save_bag_folds=False`! This will greatly reduce peak disk usage during fit (by ~8x), but runs the risk of an out-of-memory error during model refit if memory is small relative to the data size.
	You can avoid this risk by setting `save_bag_folds=True`.
DyStack is enabled 

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,RandomForest_BAG_L1_FULL,-5355.989773,,mean_squared_error,1.531101,3.79386,42.350259,1.531101,3.79386,42.350259,1,True,5
1,WeightedEnsemble_L2_FULL,-5355.989773,,mean_squared_error,1.533055,,42.356532,0.001953,,0.006273,2,True,6
2,RandomForest_BAG_L1,-5355.989773,-3352.69912,mean_squared_error,1.561499,3.79386,42.350259,1.561499,3.79386,42.350259,1,True,1
3,WeightedEnsemble_L2,-5355.989773,-3352.69912,mean_squared_error,1.563969,3.796815,42.356532,0.00247,0.002954,0.006273,2,True,2
4,RandomForest_BAG_L2_FULL,-5440.032022,,mean_squared_error,2.58349,8.316378,136.313401,1.052388,4.522518,93.963142,2,True,7
5,WeightedEnsemble_L3_FULL,-5440.032022,,mean_squared_error,2.585468,,136.397757,0.001978,,0.084356,3,True,8
6,RandomForest_BAG_L2,-5440.032022,-938.99483,mean_squared_error,3.143151,8.316378,136.313401,1.581652,4.522518,93.963142,2,True,3
7,WeightedEnsemble_L3,-5440.032022,-938.99483,mean_squared_error,3.145681,8.318685,136.397757,0.00253,0.002306,0.084356,3,True,4


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Group by 'station_name' and calculate metrics
grouped_results = []

# observed=True: 'station_name' is categorical, so only iterate stations that
# actually have rows (an empty group would make the metric functions fail).
for station, group in test_data.groupby('station_name', observed=True):
    # Loop-local names on purpose: reusing y_true / y_pred here would replace
    # the notebook-level prediction arrays with the last station's slice.
    station_true = group['y_true']
    station_pred = group['y_pred']

    # mean_squared_error returns MSE; take the square root so the value
    # matches the 'RMSE' column label.
    rmse = mean_squared_error(station_true, station_pred) ** 0.5
    mae = mean_absolute_error(station_true, station_pred)
    r2 = r2_score(station_true, station_pred)

    grouped_results.append({
        'station_name': station,
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2,
        'Count': len(group)
    })

# Create DataFrame for display, best (lowest RMSE) station first
grouped_df = pd.DataFrame(grouped_results).sort_values(by="RMSE")

  for station, group in test_data.groupby('station_name'):


In [None]:
grouped_df

Unnamed: 0,station_name,RMSE,MAE,R2,Count
3,Data_อาคารจุลจักรพงษ์,888.93963,15.916984,0.658083,6893
2,Data_อาคารจามจุรี4,964.332814,15.988963,0.265642,6317
5,Data_อาคารวิทยนิเวศน์,2905.740468,33.198933,-0.567957,6912
0,Data_สถานีชาร์จ,4360.023642,34.775482,0.471917,6912
4,Data_อาคารบรมราชกุมารี,7871.225189,42.617716,0.474449,6893
1,Data_อาคารจามจุรี 9,15258.825194,78.907716,0.496512,6912


In [None]:
predictor.leaderboard(test_data, silent=True) # Baseline

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,RandomForest_BAG_L1,-5610.431217,-3515.134109,mean_squared_error,2.876228,4.994084,60.134012,2.876228,4.994084,60.134012,1,True,1
1,WeightedEnsemble_L2,-5610.431217,-3515.134109,mean_squared_error,2.880094,4.997679,60.142329,0.003866,0.003595,0.008318,2,True,2
2,RandomForest_BAG_L2,-5645.500003,-982.451667,mean_squared_error,5.147612,10.914464,195.925424,2.271384,5.92038,135.791412,2,True,3
3,WeightedEnsemble_L3,-5645.500003,-982.451667,mean_squared_error,5.150254,10.918097,196.055096,0.002642,0.003633,0.129672,3,True,4


In [None]:
# Re-attach the prediction column. This relies on the notebook-level y_pred
# still holding one prediction per test row; the assignment aligns on index.
test_data['y_pred'] = y_pred

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Step 1: Work on a copy so the plotting column does not touch the frame used elsewhere
test_data = test_data.copy()

# Step 2: Attach the real timestamps instead of reconstructing them.
# test_data was column-sliced from test_df, so both share row labels, and
# test_data itself has no 'year' column to group on. A day counter restarted
# per month would also mislabel any month that does not begin on day 1 in
# this split.
test_data['datetime'] = pd.to_datetime(test_df['Date'].reindex(test_data.index))

# Rows without a matching timestamp cannot be placed on the time axis
test_data = test_data.dropna(subset=['datetime'])

# Step 3: Plot actual vs predicted by station with line + scatter
for station in test_data['station_name'].unique():
    station_df = test_data[test_data['station_name'] == station]

    plt.figure(figsize=(14, 5))

    # Line plot
    plt.plot(station_df['datetime'], station_df['Electricity(kW)'], label='Actual (Line)', linewidth=2)
    plt.plot(station_df['datetime'], station_df['y_pred'], label='Predicted (Line)', linestyle='--', linewidth=2)

    # Scatter plot
    plt.scatter(station_df['datetime'], station_df['Electricity(kW)'], label='Actual (Points)', color='blue', s=10, alpha=0.5)
    plt.scatter(station_df['datetime'], station_df['y_pred'], label='Predicted (Points)', color='orange', s=10, alpha=0.5)

    plt.title(f"🔌 Actual vs Predicted — {station}")
    plt.xlabel("Datetime")
    plt.ylabel("Electricity (kW)")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:
# Show test score
test_score = predictor.evaluate(test_data)

# The baseline predictor is configured with eval_metric="mse"; no custom WAPE
# scorer is involved at this point, so the message says what is printed.
print("\n✅ Test score (AutoGluon regression metrics; error metrics are reported negated):")
print(test_score)


✅ Test score (based on your custom WAPE metric):
{'mean_squared_error': -5451.992391489654, 'root_mean_squared_error': np.float64(-73.83760824599923), 'mean_absolute_error': -37.29465069917298, 'r2': 0.6543136590211078, 'pearsonr': 0.8179052729580331, 'median_absolute_error': np.float64(-9.948104858398438)}


## Experiment

In [None]:
test_data.info()

<class 'autogluon.timeseries.dataset.ts_dataframe.TimeSeriesDataFrame'>
MultiIndex: 40839 entries, ('Data_สถานีชาร์จ', Timestamp('2024-09-16 00:00:00')) to ('Data_อาคารวิทยนิเวศน์', Timestamp('2024-11-27 23:45:00'))
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   station_name     40839 non-null  object 
 1   Electricity(kW)  40839 non-null  float64
dtypes: float64(1), object(1)
memory usage: 1.0+ MB


In [None]:
test_df

Unnamed: 0,station_name,Date,Electricity(kW),astgcn_emb,hour,minute,dayofweek,month,year
0,Data_สถานีชาร์จ,2024-09-16 00:00:00,0.13,,0,0,0,9,2024
1,Data_สถานีชาร์จ,2024-09-16 00:15:00,0.13,,0,15,0,9,2024
2,Data_สถานีชาร์จ,2024-09-16 00:30:00,0.13,,0,30,0,9,2024
3,Data_สถานีชาร์จ,2024-09-16 00:45:00,0.12,,0,45,0,9,2024
4,Data_สถานีชาร์จ,2024-09-16 01:00:00,0.01,,1,0,0,9,2024
...,...,...,...,...,...,...,...,...,...
40834,Data_อาคารวิทยนิเวศน์,2024-11-27 22:45:00,0.00,1.096033,22,45,2,11,2024
40835,Data_อาคารวิทยนิเวศน์,2024-11-27 23:00:00,0.00,1.050954,23,0,2,11,2024
40836,Data_อาคารวิทยนิเวศน์,2024-11-27 23:15:00,0.00,1.036029,23,15,2,11,2024
40837,Data_อาคารวิทยนิเวศน์,2024-11-27 23:30:00,0.00,1.030732,23,30,2,11,2024


In [None]:
import pandas as pd
import numpy as np
from autogluon.timeseries import TimeSeriesPredictor, TimeSeriesDataFrame



# Ensure real datetimes and give every row its series id (the station name)
for frame in (train_df, test_df):
    frame['Date'] = pd.to_datetime(frame['Date'])
    frame['item_id'] = frame['station_name']

# 2. Convert to TimeSeriesDataFrame
train_data, test_data = (
    TimeSeriesDataFrame.from_data_frame(frame, id_column='item_id', timestamp_column='Date')
    for frame in (train_df, test_df)
)

# 3. Set prediction length (e.g., 96 steps = next 24 hours for 15min freq)
freq = "15min"
prediction_length = 96

# 4. Train the predictor
predictor = TimeSeriesPredictor(
    path="AutogluonTimeSeriesModels",
    label="Electricity(kW)",
    eval_metric="mse",
    freq=freq,
    prediction_length=prediction_length,
)

predictor.fit(
    train_data=train_data,
    presets='best_quality',
    hyperparameters={'PatchTST': {}},
    enable_ensemble=False,
    time_limit=3600,
)

Beginning AutoGluon training... Time limit = 3600s
AutoGluon will save models to '/content/AutogluonTimeSeriesModels'
AutoGluon Version:  1.3.1
Python Version:     3.11.13
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Sun Mar 30 16:01:29 UTC 2025
CPU Count:          2
GPU Count:          1
Memory Avail:       11.06 GB / 12.67 GB (87.3%)
Disk Space Avail:   69.85 GB / 112.64 GB (62.0%)
Setting presets to: best_quality

Fitting with arguments:
{'enable_ensemble': False,
 'eval_metric': MSE,
 'freq': '15min',
 'hyperparameters': {'PatchTST': {}},
 'known_covariates_names': [],
 'num_val_windows': 2,
 'prediction_length': 96,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'random_seed': 123,
 'refit_every_n_windows': 1,
 'refit_full': False,
 'skip_model_selection': False,
 'target': 'Electricity(kW)',
 'time_limit': 3600,
 'verbosity': 2}

train_data with frequency 'IRREG' has been resampled to frequency '15min'.
Provi

<autogluon.timeseries.predictor.TimeSeriesPredictor at 0x7e13f7bfe2d0>

In [None]:
predictions = predictor.predict(test_data, model="PatchTST")

data with frequency 'IRREG' has been resampled to frequency '15min'.


In [None]:
import pandas as pd
from autogluon.timeseries.metrics import TimeSeriesScorer

class WeightedWAPE(TimeSeriesScorer):
    """
    Weighted Absolute Percentage Error (WAPE), where each series
    is weighted by a user‐provided mapping of normalized_reverse_weight.

    score = sum(w_i * |y_i - yhat_i|) / sum(w_i * |y_i|), with w_i being the
    weight of the series that row i belongs to.
    """
    greater_is_better_internal = False  # lower is better
    optimum = 0.0

    def __init__(self, prediction_length: int, station_weights: pd.Series):
        """
        prediction_length : forwarded to TimeSeriesScorer.
        station_weights   : pd.Series indexed by item_id (i.e. station_name).
        """
        # Only pass prediction_length to the super; no `name=` argument.
        super().__init__(prediction_length=prediction_length)
        self.station_weights = station_weights

    def compute_metric(self, data_future, predictions, target, **kwargs) -> float:
        """
        data_future[target]  : true values over the forecast horizon
        predictions["mean"]  : point forecasts over the same horizon

        Rows where the true value or the forecast is missing are left out of
        both sums, so a single gap does not turn the whole score into NaN.

        Raises ValueError when a series in the horizon has no usable weight.
        If every remaining true value is 0 the denominator is 0 and the
        result is inf/NaN.
        """
        y_true = data_future[target]      # pd.Series, index: (item_id, timestamp)
        y_pred = predictions["mean"]      # pd.Series, same index

        # Look up the weight of the series each horizon row belongs to
        item_ids = y_true.index.get_level_values("item_id")
        weights = pd.Series(item_ids).map(self.station_weights).to_numpy()

        # An unmapped item_id yields NaN, which would silently poison the sums
        unknown = pd.isna(weights)
        if unknown.any():
            missing = sorted(set(item_ids[unknown]))
            raise ValueError(f"No weight defined for item_id(s): {missing}")

        abs_err = (y_true - y_pred).abs().to_numpy()
        abs_actual = y_true.abs().to_numpy()

        # Score only rows where both truth and forecast are present
        scored = ~(pd.isna(abs_err) | pd.isna(abs_actual))

        weighted_error = (weights[scored] * abs_err[scored]).sum()
        weighted_actual = (weights[scored] * abs_actual[scored]).sum()
        return float(weighted_error / weighted_actual)

In [None]:
from autogluon.timeseries import TimeSeriesPredictor
from autogluon.timeseries.metrics import SMAPE, MASE
# 1) Series of weights indexed by station_name, built from station_weights_df
station_weights = station_weights_df.set_index("station_name")["normalized_reverse_weight"]

# 2) Custom metric instance, using the predictor's own forecast horizon
wwape = WeightedWAPE(
    station_weights=station_weights,
    prediction_length=predictor.prediction_length,
)

# 3) Extra columns for the leaderboard: metric names as strings,
#    scorer objects for SMAPE / MASE, plus the custom weighted WAPE
extra_metrics = ["MSE", "MAE", SMAPE(), MASE(), wwape]

# Score every trained model on the held-out data
lb = predictor.leaderboard(data=test_data, extra_metrics=extra_metrics)
lb

data with frequency 'IRREG' has been resampled to frequency '15min'.
Additional data provided, testing on additional data. Resulting leaderboard will be sorted according to test score (`score_test`).


Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time_marginal,fit_order,MSE,MAE,SMAPE,MASE,WeightedWAPE
0,PatchTST,-1979.70683,-2829.959814,0.075835,0.08227,404.250046,11,-1979.70683,-21.84022,-0.535284,-0.538238,-0.209549
1,DirectTabular,-2174.131611,-2196.477459,0.38237,1.235511,14.216663,3,-2174.131611,-21.697794,-0.499528,-0.512376,-0.208101
2,DynamicOptimizedTheta,-2528.263916,-10392.182081,8.003008,5.602563,7.049314,5,-2528.263916,-28.548972,-0.5908,-0.722789,-0.27479
3,ChronosZeroShot[bolt_base],-2720.612089,-1357.175819,1.676965,1.665461,10.174235,7,-2720.612089,-24.0946,-0.523142,-0.534986,-0.230438
4,ChronosFineTuned[bolt_small],-2743.501322,-1159.91025,0.991712,0.162521,590.402603,8,-2743.501322,-24.371484,-0.519402,-0.549547,-0.233208
5,RecursiveTabular,-2747.43368,-2491.898946,1.523252,1.674302,26.087509,2,-2747.43368,-22.510119,-0.49029,-0.49897,-0.215372
6,TiDE,-2810.483691,-1116.094873,0.163878,0.598751,1410.30422,12,-2810.483691,-24.261119,-0.479123,-0.524727,-0.232225
7,SeasonalNaive,-3320.026493,-6608.071581,0.032696,0.055572,0.48292,1,-3320.026493,-25.506667,-0.290535,-0.545236,-0.243881
8,TemporalFusionTransformer,-3610.816286,-699.524238,0.136522,0.691653,585.756244,9,-3610.816286,-29.751925,-0.562481,-0.665116,-0.284624
9,DeepAR,-5105.787292,-2073.273384,0.535085,0.832941,421.826988,10,-5105.787292,-42.193726,-0.779038,-1.074585,-0.405665


In [None]:
predictor.evaluate(test_data)

data with frequency 'IRREG' has been resampled to frequency '15min'.
Model not specified in predict, will default to the model with the best validation score: TemporalFusionTransformer


{'MSE': -3160.452874392981}

In [None]:
predictor.plot(test_data,predictions)

TypeError: TimeSeriesPredictor.plot() got an unexpected keyword argument 'model'

In [None]:
# NOTE(review): this calls the predictor object itself. Nothing in this
# notebook shows TimeSeriesPredictor being callable, so this line presumably
# raises TypeError; confirm which predictor method was intended here.
predictor("PatchTST")