In [44]:
import os
import torch
import math
import os
import tempfile
import warnings
import time
import torch
import psutil
import shutil
import subprocess  # 用于调用 nvidia-smi 获取 GPU 内存信息
import threading

from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR
import torch.nn as nn
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments, set_seed
from transformers.integrations import INTEGRATION_TO_CALLBACK

from tsfm_public import TinyTimeMixerForPrediction, TrackingCallback, count_parameters, load_dataset
from tsfm_public.toolkit.lr_finder import optimal_lr_finder
from tsfm_public.toolkit.visualization import plot_predictions
import numpy as np

# 设定镜像网站
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
# 查看GPU情况
os.environ["CUDA_VISIBLE_DEVICES"] = "2"  # 指定使用第0个GPU
print(torch.cuda.device_count())  # 输出可用的 GPU 数量 # 输出可用的 GPU 数量
# Suppress all warnings
warnings.filterwarnings("ignore")

1


In [45]:
#seed
SEED = 42
set_seed(SEED)
# TTM Revision (1 or 2)
TTM_REVISION = 2
#上下文和预测长度
CONTEXT_LENGTH = 512
FORECAST_LENGTH = 96 
# Dataset
TARGET_DATASET = "electricity"
DATASET_PATH = "/home/zhupengtian/zhangqingliang/granite-tsfm/datasets/electricity/electricity.csv"
# Results dir
OUT_DIR = "/home/zhupengtian/zhangqingliang/granite-tsfm/ttm_finetuned_models/"

In [46]:
# ----- TTM model path -----
if TTM_REVISION == 1:
    TTM_MODEL_PATH = "ibm-granite/granite-timeseries-ttm-r1"
    # ----- TTM model branch -----
    # For R1 models
    if CONTEXT_LENGTH == 512:
        TTM_MODEL_REVISION = "main"
    elif CONTEXT_LENGTH == 1024:
        TTM_MODEL_REVISION = "1024_96_v1"
    else:
        raise ValueError(f"Unsupported CONTEXT_LENGTH for TTM_MODEL_PATH={TTM_MODEL_PATH}")
elif TTM_REVISION == 2:
    TTM_MODEL_PATH = "ibm-granite/granite-timeseries-ttm-r2"
    # ----- TTM model branch -----
    # For R2 models
    if CONTEXT_LENGTH == 512:
        TTM_MODEL_REVISION = "main"
    elif CONTEXT_LENGTH == 1024:
        TTM_MODEL_REVISION = "1024-96-r2"
    elif CONTEXT_LENGTH == 1536:
        TTM_MODEL_REVISION = "1536-96-r2"
    else:
        raise ValueError(f"Unsupported CONTEXT_LENGTH for TTM_MODEL_PATH={TTM_MODEL_PATH}")
else:
    raise ValueError("Wrong TTM_REVISION. Stay tuned for future models.")
print("Chosen TTM model:")
print(f"{TTM_MODEL_PATH}, revision = {TTM_MODEL_REVISION}")

Chosen TTM model:
ibm-granite/granite-timeseries-ttm-r2, revision = main


In [47]:
# 打印导入数据集
dataset = load_dataset('electricity', context_length=512, forecast_length=96, dataset_path=DATASET_PATH)
# 直接打印数据集的类型和内容
print(type(dataset))

# 得到训练集
train_dataset, val_dataset, test_dataset = dataset  # dataset 是加载的数据集

# 获取测试集的一个示例
test_index = 0  # 你可以修改为需要的索引
test_sample = test_dataset[test_index]

# 打印数据的形状
for key, value in test_sample.items():
    if isinstance(value, (np.ndarray, torch.Tensor)):
        print(f"{key} 的形状: {value.shape}")
        print(f"{key} 的类型:{type(value)}")
    else:
        print(f"{key} 的内容: {value}")  # 如果不是数组或张量，直接打印内容


INFO:p-2574695:t-129782190835200:data_handling.py:load_dataset:Dataset name: electricity, context length: 512, prediction length 96
INFO:p-2574695:t-129782190835200:data_handling.py:load_dataset:Data lengths: train = 708, val = 234, test = 4837


<class 'tuple'>
past_values 的形状: torch.Size([512, 321])
past_values 的类型:<class 'torch.Tensor'>
future_values 的形状: torch.Size([96, 321])
future_values 的类型:<class 'torch.Tensor'>
past_observed_mask 的形状: torch.Size([512, 321])
past_observed_mask 的类型:<class 'torch.Tensor'>
future_observed_mask 的形状: torch.Size([96, 321])
future_observed_mask 的类型:<class 'torch.Tensor'>
timestamp 的内容: 2016-10-14 01:00:00
id 的内容: (0,)


In [48]:
# 自定义评估指标
from sklearn.metrics import mean_absolute_error, mean_squared_error

def smape(y_true, y_pred):
    return 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

def mase(y_true, y_pred, train, m=1):
    # 计算训练集的绝对误差
    train_errors = np.abs(np.array(train[m:]) - np.array(train[:-m]))  # 滞后 m
    mae_train = np.mean(train_errors)  # 训练集的平均绝对误差
    # 计算 MASE
    return np.mean(np.abs(y_true - y_pred)) / mae_train if mae_train != 0 else np.nan

    

In [49]:
# 全局变量，用于存储最大内存占用
max_cpu_memory = 0.0  
max_gpu_memory = 0.0  
def monitor_memory():
    global max_cpu_memory, max_gpu_memory
    process = psutil.Process(os.getpid())
    while True:
        # 获取当前 CPU 物理内存使用
        cpu_memory = process.memory_info().rss / (1024 ** 2)  # 转换为 MB
        max_cpu_memory = max(max_cpu_memory, cpu_memory)
        
        # 获取 GPU 最大内存使用
        if torch.cuda.is_available():
            gpu_memory = torch.cuda.max_memory_reserved() / (1024 ** 2)  # 转为 MB
            max_gpu_memory = max(max_gpu_memory, gpu_memory)
        
        # 每隔一秒采样一次
        time.sleep(1)

In [50]:
# 评估函数
def zeroshot_eval(dataset_name, batch_size, context_length=512, forecast_length=96, prediction_filter_length=None):
    torch.cuda.empty_cache()  # 清理缓存

    global max_cpu_memory, max_gpu_memory
    max_cpu_memory = 0.0  
    max_gpu_memory = 0.0  
    
    # 启动监控线程
    monitor_thread = threading.Thread(target=monitor_memory)
    monitor_thread.daemon = True  # 设置为守护线程，在主线程结束时自动退出
    monitor_thread.start()

    if prediction_filter_length is not None:
        if prediction_filter_length >= forecast_length:
            raise ValueError(
                "`prediction_filter_length` should be less than the original `forecast_length` of the pre-trained TTM model."
            )
        # forecast_length = forecast_length - prediction_filter_length
        forecast_length =  prediction_filter_length

    # Get data
    _, _, dset_test = load_dataset(
        dataset_name=dataset_name,
        context_length=context_length,
        forecast_length=forecast_length,
        # fewshot_fraction=1.0,
        dataset_path=DATASET_PATH,
    )

    # Load model
    if prediction_filter_length is None:
        zeroshot_model = TinyTimeMixerForPrediction.from_pretrained(TTM_MODEL_PATH, revision=TTM_MODEL_REVISION)
    else:
        if prediction_filter_length <= forecast_length:
            zeroshot_model = TinyTimeMixerForPrediction.from_pretrained(
                TTM_MODEL_PATH,
                revision=TTM_MODEL_REVISION,
                prediction_filter_length=prediction_filter_length,
            )
        else:
            raise ValueError("`prediction_filter_length` should be <= `forecast_length")
    temp_dir = tempfile.mkdtemp()
    # zeroshot_trainer
    zeroshot_trainer = Trainer(
        model=zeroshot_model,
        args=TrainingArguments(
            output_dir=temp_dir,
            per_device_eval_batch_size=batch_size,
            seed=SEED,
            report_to="none",
        ),
    )

    # # 评估
    # print("+" * 20, "Test MSE zero-shot", "+" * 20)
    # zeroshot_output = zeroshot_trainer.evaluate(dset_test)
    # print(zeroshot_output)
    
    # 进行预测
    start_time = time.time()
    prediction_output = zeroshot_trainer.predict(dset_test)  # 获取预测输出
    total_time = time.time() - start_time
    predictions = prediction_output.predictions[0]
    
    # 获取标签
    future_values = np.array([sample['future_values'] for sample in dset_test])  
    # 根据 predictions 的长度切片 future_values
    # future_values_selected = future_values[:len(predictions)]  
    # 在缩短预测长度的时候，出现了预测样本数和真实样本数不一致的情况，这里做一些截断
    min_length = min(len(future_values), len(predictions))
    future_values = future_values[:min_length]
    predictions =  predictions[:min_length]  
    #计算指标
    mse = mean_squared_error(future_values.reshape(-1), predictions.reshape(-1))
    mae = mean_absolute_error(future_values.reshape(-1), predictions.reshape(-1))
    mape = np.mean(np.abs((future_values - predictions) / future_values)) * 100  # 转为百分比
    smape_value = smape(future_values, predictions)
    mase_value = mase(future_values, predictions, future_values)  # 传入训练集（或验证集）

    # 输出结果
    print("均方误差 (MSE):", mse)
    print("平均绝对误差 (MAE):", mae)
    print("平均绝对百分比误差 (MAPE):", mape)
    print("对称平均绝对百分比误差 (SMAPE):", smape_value)
    print("平均绝对误差比 (MASE):", mase_value)

    print("总运行时间: {:.2f} 秒".format(total_time))
    print("CPU 内存最大占用: {:.2f} MB".format(max_cpu_memory))
    print("GPU 内存最大占用: {:.2f} MB".format(max_gpu_memory) if torch.cuda.is_available() else "GPU不可用")


    # # plot
    # plot_predictions(
    #     model=zeroshot_trainer.model,
    #     dset=dset_test,
    #     plot_dir=os.path.join(OUT_DIR, dataset_name),
    #     plot_prefix="test_zeroshot",
    #     # indices=[685, 118, 902, 1984, 894, 967, 304, 57, 265, 1015],
    #     channel=0,
    # )
    
    # 清理临时目录
    shutil.rmtree(temp_dir)

In [51]:
torch.cuda.empty_cache() 

In [54]:
zeroshot_eval(dataset_name=TARGET_DATASET, context_length=CONTEXT_LENGTH, batch_size=4,prediction_filter_length=1)

INFO:p-2574695:t-129782190835200:data_handling.py:load_dataset:Dataset name: electricity, context length: 512, prediction length 1


INFO:p-2574695:t-129782190835200:data_handling.py:load_dataset:Data lengths: train = 803, val = 329, test = 4932


均方误差 (MSE): 0.79112315
平均绝对误差 (MAE): 0.71543497
平均绝对百分比误差 (MAPE): 344.65527534484863
对称平均绝对百分比误差 (SMAPE): 125.94932317733765
平均绝对误差比 (MASE): 0.98382115
总运行时间: 219.35 秒
CPU 内存最大占用: 33603.12 MB
GPU 内存最大占用: 48340.00 MB


In [55]:
def fewshot_finetune_eval(
    dataset_name,
    batch_size,
    learning_rate=None,
    context_length=512,
    forecast_length=96,
    fewshot_percent=5,
    freeze_backbone=True,
    num_epochs=50,
    save_dir=OUT_DIR,
    prediction_filter_length=None,
):
    torch.cuda.empty_cache()  # 清理缓存
    
    global max_cpu_memory, max_gpu_memory
    max_cpu_memory = 0.0  
    max_gpu_memory = 0.0  
    # 启动监控线程
    monitor_thread = threading.Thread(target=monitor_memory)
    monitor_thread.daemon = True  # 设置为守护线程，在主线程结束时自动退出
    monitor_thread.start()
    
    out_dir = os.path.join(save_dir, dataset_name)

    # print("-" * 20, f"Running few-shot {fewshot_percent}%", "-" * 20)

    if prediction_filter_length is not None:
        if prediction_filter_length >= forecast_length:
            raise ValueError(
                "`prediction_filter_length` should be less than the original `forecast_length` of the pre-trained TTM model."
            )
        # forecast_length = forecast_length - prediction_filter_length
        forecast_length = prediction_filter_length

    # Data prep: Get dataset
    dset_train, dset_val, dset_test = load_dataset(
        dataset_name,
        context_length,
        forecast_length,
        fewshot_fraction=fewshot_percent / 100,
        dataset_path=DATASET_PATH,
    )

    # change head dropout to 0.7 for ett datasets
    if "ett" in dataset_name:
        if prediction_filter_length is None:
            finetune_forecast_model = TinyTimeMixerForPrediction.from_pretrained(
                TTM_MODEL_PATH, revision=TTM_MODEL_REVISION, head_dropout=0.7
            )
        elif prediction_filter_length <= forecast_length:
            finetune_forecast_model = TinyTimeMixerForPrediction.from_pretrained(
                TTM_MODEL_PATH,
                revision=TTM_MODEL_REVISION,
                head_dropout=0.7,
                prediction_filter_length=prediction_filter_length,
            )
        else:
            raise ValueError("`prediction_filter_length` should be <= `forecast_length")
    else:
        if prediction_filter_length is None:
            finetune_forecast_model = TinyTimeMixerForPrediction.from_pretrained(
                TTM_MODEL_PATH,
                revision=TTM_MODEL_REVISION,
            )
        elif prediction_filter_length <= forecast_length:
            finetune_forecast_model = TinyTimeMixerForPrediction.from_pretrained(
                TTM_MODEL_PATH,
                revision=TTM_MODEL_REVISION,
                prediction_filter_length=prediction_filter_length,
            )
        else:
            raise ValueError("`prediction_filter_length` should be <= `forecast_length")
    if freeze_backbone:
        print(
            "Number of params before freezing backbone",
            count_parameters(finetune_forecast_model),
        )

        # Freeze the backbone of the model
        for param in finetune_forecast_model.backbone.parameters():
            param.requires_grad = False

        # Count params
        print(
            "Number of params after freezing the backbone",
            count_parameters(finetune_forecast_model),
        )

    # Find optimal learning rate
    # Use with caution: Set it manually if the suggested learning rate is not suitable
    if learning_rate is None:
        learning_rate, finetune_forecast_model = optimal_lr_finder(
            finetune_forecast_model,
            dset_train,
            batch_size=batch_size,
        )
        print("OPTIMAL SUGGESTED LEARNING RATE =", learning_rate)

    print(f"Using learning rate = {learning_rate}")
    finetune_forecast_args = TrainingArguments(
        output_dir=os.path.join(out_dir, "output"),
        overwrite_output_dir=True,
        learning_rate=learning_rate,
        num_train_epochs=num_epochs,
        do_eval=True,
        evaluation_strategy="epoch",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        dataloader_num_workers=8,
        report_to="none",
        save_strategy="epoch",
        logging_strategy="epoch",
        save_total_limit=1,
        logging_dir=os.path.join(out_dir, "logs"),  # Make sure to specify a logging directory
        load_best_model_at_end=True,  # Load the best model when training ends
        metric_for_best_model="eval_loss",  # Metric to monitor for early stopping
        greater_is_better=False,  # For loss
        seed=SEED,
    )

    # Create the early stopping callback
    early_stopping_callback = EarlyStoppingCallback(
        early_stopping_patience=10,  # Number of epochs with no improvement after which to stop
        early_stopping_threshold=1e-5,  # Minimum improvement required to consider as improvement
    )
    tracking_callback = TrackingCallback()

    # Optimizer and scheduler
    optimizer = AdamW(finetune_forecast_model.parameters(), lr=learning_rate)
    scheduler = OneCycleLR(
        optimizer,
        learning_rate,
        epochs=num_epochs,
        steps_per_epoch=math.ceil(len(dset_train) / (batch_size)),
    )

    finetune_forecast_trainer = Trainer(
        model=finetune_forecast_model,
        args=finetune_forecast_args,
        train_dataset=dset_train,
        eval_dataset=dset_val, 
        callbacks=[early_stopping_callback, tracking_callback],
        optimizers=(optimizer, scheduler),
    )
    finetune_forecast_trainer.remove_callback(INTEGRATION_TO_CALLBACK["codecarbon"])

    # Fine tune
    start_time = time.time();
    finetune_forecast_trainer.train()

    # # Evaluation
    # print("+" * 20, f"Test MSE after few-shot {fewshot_percent}% fine-tuning", "+" * 20)
    # fewshot_output = finetune_forecast_trainer.evaluate(dset_test)
    # print(fewshot_output)
    # print("+" * 60)

    # 进行预测
    prediction_output = finetune_forecast_trainer.predict(dset_test)  # 获取预测输出
    total_time = time.time() - start_time
    predictions = prediction_output.predictions[0]
    # 获取标签
    future_values = np.array([sample['future_values'] for sample in dset_test])  

    #计算指标
    mse = mean_squared_error(future_values.reshape(-1), predictions.reshape(-1))
    mae = mean_absolute_error(future_values.reshape(-1), predictions.reshape(-1))
    mape = np.mean(np.abs((future_values - predictions) / future_values)) * 100  # 转为百分比
    smape_value = smape(future_values, predictions)
    mase_value = mase(future_values, predictions, future_values)  # 传入训练集（或验证集）

    # 输出结果
    print("均方误差 (MSE):", mse)
    print("平均绝对误差 (MAE):", mae)
    print("平均绝对百分比误差 (MAPE):", mape)
    print("对称平均绝对百分比误差 (SMAPE):", smape_value)
    print("平均绝对误差比 (MASE):", mase_value)

    print("总运行时间: {:.2f} 秒".format(total_time))
    print("CPU 内存最大占用: {:.2f} MB".format(max_cpu_memory))
    print("GPU 内存最大占用: {:.2f} MB".format(max_gpu_memory) if torch.cuda.is_available() else "GPU不可用")

    # plot
    # plot_predictions(
    #     model=finetune_forecast_trainer.model,
    #     dset=dset_test,
    #     plot_dir=os.path.join(OUT_DIR, dataset_name),
    #     plot_prefix="test_fewshot",
    #     # indices=[685, 118, 902, 1984, 894, 967, 304, 57, 265, 1015],
    #     channel=0,
    # )

In [58]:
fewshot_finetune_eval(
    dataset_name=TARGET_DATASET, context_length=CONTEXT_LENGTH, batch_size=4, fewshot_percent=50, learning_rate=0.001,prediction_filter_length=1
)

INFO:p-2574695:t-129782190835200:data_handling.py:load_dataset:Dataset name: electricity, context length: 512, prediction length 1


INFO:p-2574695:t-129782190835200:data_handling.py:load_dataset:Data lengths: train = 401, val = 329, test = 4932


Number of params before freezing backbone 805280
Number of params after freezing the backbone 289696
Using learning rate = 0.001


Epoch,Training Loss,Validation Loss
1,0.913,0.875476
2,0.8763,0.854696
3,0.857,0.854766
4,0.845,0.855906
5,0.8366,0.86116
6,0.8275,0.862338
7,0.8247,0.871303
8,0.8135,0.88582
9,0.7878,0.900694
10,0.7744,0.904885


[TrackingCallback] Mean Epoch Time = 4.248980005582173 seconds, Total Train Time = 87.2284848690033


均方误差 (MSE): 0.8085191
平均绝对误差 (MAE): 0.74176455
平均绝对百分比误差 (MAPE): 315.4524326324463
对称平均绝对百分比误差 (SMAPE): 136.31083965301514
平均绝对误差比 (MASE): 1.020028
总运行时间: 291.94 秒
CPU 内存最大占用: 33603.58 MB
GPU 内存最大占用: 48340.00 MB
