In [44]:
import os
import warnings

import pandas as pd
from tqdm import tqdm

In [45]:
# Notebook-wide presets
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime" load
# error raised by some numeric libraries -- confirm it is still needed.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'

# Silence every warning for the rest of the session. A single ignore-all filter
# is enough; calling simplefilter() and filterwarnings() with the same arguments
# installs the same rule twice.
warnings.filterwarnings('ignore')

pd.options.display.max_columns = None  # show every column when a frame is displayed

In [46]:
# One sub-folder per metric; each folder name is also used as the name of the
# value column produced when that metric's CSV is melted.
folders = [
    'requests_minute',
    'memory_usage_minute',
    'cpu_usage_minute',
    'instances_minute',
    'memory_limit_minute',
    'cpu_limit_minute',
]

# Root directory that contains the metric folders -- update to your dataset location.
base_path = '../datasets/huawei/private_dataset'

In [47]:
# Per-day merged frames are collected here and concatenated once after the
# loop. Concatenating onto a growing DataFrame inside the loop would copy all
# previously loaded rows on every iteration.
day_frames = []

day_count = 1  # number of days loaded so far, plus one
# Walk day_000.csv .. day_234.csv and stop once 52 days have actually been loaded
for day in tqdm(range(235)):
    if day_count > 52:
        break
    day_str = f'day_{day:03}.csv'
    day_data = None

    # Each folder holds one metric; read this day's file from every folder
    for folder in folders:
        file_path = os.path.join(base_path, folder, day_str)

        try:
            temp_df = pd.read_csv(file_path)
        except FileNotFoundError:
            # Best effort: a metric with no file for this day is skipped, so the
            # day's frame simply lacks that column (NaN after concatenation).
            continue

        # Wide -> long: every column other than 'day'/'time' is treated as an
        # API id and becomes one row per (day, time, API_ID), with the value
        # stored in a column named after the metric folder.
        temp_df_long = temp_df.melt(id_vars=['day', 'time'], var_name='API_ID', value_name=folder)

        # Column headers are read as strings; cast the ids to int for later sorting/joining
        temp_df_long['API_ID'] = temp_df_long['API_ID'].astype(int)

        if day_data is None:
            day_data = temp_df_long
        else:
            # Inner join (the pandas default, spelled out here): only
            # (day, time, API_ID) keys present in every metric loaded so far survive.
            # NOTE(review): if metric files can cover different API ids or
            # timestamps, rows are dropped here -- confirm that is intended.
            day_data = pd.merge(day_data, temp_df_long, on=['day', 'time', 'API_ID'], how='inner')

    # Only days for which at least one metric file was found count towards the limit
    if day_data is not None:
        day_frames.append(day_data)
        day_count += 1

# Single concatenation; fall back to an empty frame when nothing was loaded
# because pd.concat raises on an empty list.
final_df = pd.concat(day_frames, ignore_index=True) if day_frames else pd.DataFrame()

# Replace every missing value with 0. Assigning the result (instead of
# inplace=True) keeps the cell free of in-place mutation.
final_df = final_df.fillna(0)

# No rename step is needed: each metric column is already named after its
# source folder, and the mapping that used to be applied here mapped every one
# of those names to itself.

seconds_per_day = 24 * 3600

final_df = (
    final_df
    # Group rows by API and order them by timestamp within each API
    .sort_values(by=['API_ID', 'time'])
    .reset_index(drop=True)
    # Helper column: offset of 'time' from the start of its own 'day'
    .assign(seconds_since_midnight=lambda frame: frame['time'] - (frame['day'] * seconds_per_day))
    .assign(
        # Hour within the day
        hour=lambda frame: frame['seconds_since_midnight'] // 3600,
        # Minute within that hour
        minute=lambda frame: (frame['seconds_since_midnight'] % 3600) // 60,
        # Usage scaled by instance count, rounded to 3 decimals
        total_cpu_usage=lambda frame: (frame['cpu_usage_minute'] * frame['instances_minute']).round(3),
        total_memory_usage=lambda frame: (frame['memory_usage_minute'] * frame['instances_minute']).round(3),
        # Usage scaled by the configured limit, rounded to 3 decimals
        absolute_cpu_usage=lambda frame: (frame['cpu_usage_minute'] * frame['cpu_limit_minute']).round(3),
        absolute_memory_usage=lambda frame: (frame['memory_usage_minute'] * frame['memory_limit_minute']).round(3),
    )
    # The helper column is not part of the final dataset
    .drop(columns=['seconds_since_midnight'])
)

# Final column layout: keys, time breakdown, raw metrics, then derived metrics
order = [
    'day',
    'time',
    'hour',
    'minute',
    'API_ID',
    'requests_minute',
    'memory_usage_minute',
    'cpu_usage_minute',
    'instances_minute',
    'memory_limit_minute',
    'cpu_limit_minute',
    'total_cpu_usage',
    'total_memory_usage',
    'absolute_cpu_usage',
    'absolute_memory_usage',
]
# Select exactly these columns, in this order (raises KeyError if one is missing)
final_df = final_df.loc[:, order]

print(final_df.head())

# Write the combined dataset to CSV, without the row index
final_csv_path = os.path.join('../datasets/huawei', 'combined_dataset.csv')
final_df.to_csv(final_csv_path, index=False)

 26%|██▌       | 61/235 [00:36<01:44,  1.67it/s]


   day  time  hour  minute  API_ID  requests_minute  memory_usage_minute  \
0    0     0     0       0       0              0.0                  0.0   
1    0    60     0       1       0              0.0                  0.0   
2    0   120     0       2       0              0.0                  0.0   
3    0   180     0       3       0              0.0                  0.0   
4    0   240     0       4       0              0.0                  0.0   

   cpu_usage_minute  instances_minute  memory_limit_minute  cpu_limit_minute  \
0               0.0               0.0                  0.0               0.0   
1               0.0               0.0                  0.0               0.0   
2               0.0               0.0                  0.0               0.0   
3               0.0               0.0                  0.0               0.0   
4               0.0               0.0                  0.0               0.0   

   total_cpu_usage  total_memory_usage  absolute_cpu_usage  \
