In [1]:
from collections.abc import Generator
from pathlib import Path
from typing import Any
import numpy as np
import pandas as pd
import datasets
import pandas as pd
from datasets import Features, Sequence, Value
ds_cpu_usage = datasets.load_from_disk("dataset_cpu").with_format("numpy")
ds_memory_usage = datasets.load_from_disk("dataset_memory").with_format("numpy")
ds_request = datasets.load_from_disk("dataset_request").with_format("numpy")

  from .autonotebook import tqdm as notebook_tqdm


### Step 1: 获取要保留的时间片段的索引范围
原始时间序列是不连续的时间片段，需要切割为连续的时间片段，第一步是获取要保留的时间片段的索引范围。

In [2]:
date_rng = pd.date_range(start='2023-01-01 00:00:00', periods=67680, freq='5T')
def get_segment_period(date_rng):
    # 定义要保留的日期范围（天数）
    day_ranges = [
        (1, 19),    # 第1~19天
        (29, 61),   # 第29~61天
        (118, 129), # 第118~129天
        (132, 139), # 第132~139天
        (148, 166), # 第148~166天
        (169, 185), # 第169~185天
        (198, 208), # 第198~208天
        (212, 222), # 第212~222天
        (225, 235)  # 第225~235天
    ]
    # 每天的数据点数量
    points_per_day = 288

    # 为每个日期范围设置掩码
    mask = np.zeros(len(date_rng), dtype=bool)
    for start_day, end_day in day_ranges:
        start_idx = (start_day - 1) * points_per_day
        end_idx = end_day * points_per_day
        mask[start_idx:end_idx] = True

    segment_period = []
    i = 0
    while i < len(mask):
        if mask[i] and (i == 0 or not mask[i-1]):
            start = i
            j = i + 1
            while j < len(mask) and mask[j]:
                j += 1
            segment_period.append((start, j))
            i = j
        else:
            i += 1
    return segment_period

segment_period = get_segment_period(date_rng)

### Step 2: 切割时间序列

In [3]:
# 存储切割后的数据集
segmented_datasets = {}

segment_stats = {
    'cpu_usage': [],
    'memory_usage': [],
    'request':[],
}

segmented_datasets['cpu_usage'] =pd.DataFrame()
segmented_datasets['memory_usage'] =pd.DataFrame()
segmented_datasets['request']=pd.DataFrame()

df_cpu_usage=pd.DataFrame(ds_cpu_usage['target'])
df_memory_usage=pd.DataFrame(ds_memory_usage['target'])
df_request=pd.DataFrame(ds_request['target'])


df_memory_usage=df_memory_usage.T
df_cpu_usage=df_cpu_usage.T
df_request=df_request.T

for i, (start, end) in enumerate(segment_period):
    start_time = date_rng[start].strftime('%Y%m%d_%H%M')
    end_time = date_rng[end-1].strftime('%Y%m%d_%H%M')
    segment_id = f"{start_time}_to_{end_time}"
    for name, df in [('cpu_usage', df_cpu_usage), 
                     ('memory_usage', df_memory_usage),
                     ('request', df_request)]:
        segment_data = df.iloc[start:end, :].values 
        column_names = [f"function{col}_{i}" for col in range(segment_data.shape[1])]
        segment_data = pd.DataFrame(segment_data, columns=column_names)
        segment_length = end - start
        missing_values = np.isnan(segment_data).sum(axis=0)  # 计算每列的缺失值
        missing_ratio = missing_values / segment_length

        for col in range(segment_data.shape[1]):
            segment_stats[name].append({
                'id':f"function{col}_{i}",
                'segment_id': segment_id,
                'column_id': col,
                'start_idx': start,
                'end_idx': end,
                'length': segment_length,
                'missing_values': missing_values[col],
                'missing_ratio': missing_ratio[col],  # 添加缺失值比例
                'start_time': date_rng[start],
                'end_time': date_rng[end-1]
            })
            
        segmented_datasets[name] = pd.concat([segmented_datasets[name], segment_data], axis=1)

### Step 3: 删除缺失值过多的时间序列片段

In [4]:
stats_df_cpu_usage = pd.DataFrame(segment_stats['cpu_usage']).iloc[:,[0,7]]
stats_df_memory_usage = pd.DataFrame(segment_stats['memory_usage']).iloc[:,7]
stats_df_request = pd.DataFrame(segment_stats['request']).iloc[:,7]
stats_df=pd.concat([stats_df_cpu_usage,stats_df_memory_usage,stats_df_request],axis=1)

# 计算每行的最大缺失率,如果最大缺失率小于0.01，则保留
max_missing_ratio = stats_df.iloc[:, 1:].max(axis=1)
valid_rows = max_missing_ratio <= 0.01
valid_ids = stats_df.loc[valid_rows, 'id'].tolist()
print(len(valid_ids))

# 存储筛选后的数据集
filtered_datasets = {}
for name in segmented_datasets:
    filtered_datasets[name] = segmented_datasets[name][valid_ids]
filtered_datasets['cpu_usage'].shape
filtered_datasets['memory_usage'].shape
filtered_datasets['request'].shape


621


(9504, 621)

In [5]:
time_series_data = {}
id_lengths = pd.DataFrame(segment_stats['cpu_usage'])[['id', 'length']]
id_lengths = id_lengths.set_index('id')

for function_id in valid_ids:
    length = id_lengths.loc[function_id, 'length']
    series_data = pd.DataFrame({
        'cpu_usage': filtered_datasets['cpu_usage'][function_id][:length],
        'memory_usage': filtered_datasets['memory_usage'][function_id][:length],
        'request': filtered_datasets['request'][function_id][:length]
    })
    time_series_data[function_id] = series_data
print(len(time_series_data))
print(time_series_data.keys())

desired_columns = [
    'cpu_usage',
    'memory_usage',
    'request'
]
new_columns = [chr(65 + i) for i in range(3)]  
for function_id in time_series_data.keys():
    time_series_data[function_id] = (time_series_data[function_id][desired_columns]
                                   .rename(columns=dict(zip(desired_columns, new_columns))))


621
dict_keys(['function6_0', 'function10_0', 'function14_0', 'function27_0', 'function28_0', 'function34_0', 'function38_0', 'function39_0', 'function40_0', 'function54_0', 'function56_0', 'function57_0', 'function60_0', 'function65_0', 'function66_0', 'function72_0', 'function75_0', 'function87_0', 'function88_0', 'function89_0', 'function90_0', 'function91_0', 'function92_0', 'function93_0', 'function94_0', 'function96_0', 'function97_0', 'function100_0', 'function102_0', 'function104_0', 'function116_0', 'function124_0', 'function127_0', 'function129_0', 'function130_0', 'function131_0', 'function132_0', 'function134_0', 'function135_0', 'function136_0', 'function146_0', 'function147_0', 'function150_0', 'function155_0', 'function156_0', 'function163_0', 'function164_0', 'function165_0', 'function168_0', 'function169_0', 'function170_0', 'function171_0', 'function172_0', 'function173_0', 'function174_0', 'function179_0', 'function183_0', 'function190_0', 'function194_0', 'function1

### Step 2: 将数据存储为parrow格式
以dataset格式存储，便于后续处理。target为cpu_usage、memory_usage,shape为(var, time)；
past_feat_dynamic_real为request,shape为(1, time)


In [10]:
from collections.abc import Generator
from pathlib import Path
from typing import Any
import datasets
import pandas as pd
from datasets import Features, Sequence, Value

def multivar_gen_func() -> Generator[dict[str, Any], None, None]:
    for i, (function_id, df) in enumerate(time_series_data.items()):
        yield {
            "target": df.iloc[:,:2].to_numpy().T,  # array of shape (var, time)
            "start": df.index[0],
            "freq": "5T",
            "item_id": f"item_{i}",  
            "past_feat_dynamic_real":df.iloc[:,2:3].to_numpy().T,
        }

features = Features(
    dict(
        target=Sequence(
            Sequence(Value("float32")), length=2
        ),  
        start=Value("timestamp[s]"),
        freq=Value("string"),
        item_id=Value("string"),
        past_feat_dynamic_real=Sequence(
            Sequence(Value("float32")), length=1
        ),
    )
)

def reindex_data(df):
    start_time = pd.Timestamp('2023-01-01 00:00:00')
    df.index = pd.date_range(start=start_time, periods=len(df), freq='5T')
    df.index.name = 'timestamp'
    return df


In [11]:
for function_id in time_series_data.keys():
    time_series_data[function_id] = reindex_data(time_series_data[function_id])

dataset_fragment = datasets.Dataset.from_generator(
    multivar_gen_func, features=features
)
dataset_fragment.save_to_disk("hwfunc_dataset_multi")


Generating train split: 621 examples [00:14, 42.53 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 621/621 [00:00<00:00, 11285.66 examples/s]
