## Convert to mini-batches

* mini_batch: dict with keys ["node_features", "column_features", "edges", "dual_offsets"]
  * "node_features" : (node_num, node_feature_dim)
  * "column_features" : (column_num, column_feature_dim)
  * "edges" : (2, column_num*column_path_length) [ [ node_idx ], [ column_idx ] ] 
  * "dual_offsets" : (node_num)

In [None]:
import numpy as np
import json
import sys
sys.path.append("D:\Code\ML4pricing\CGAlgs")
from CGAlgs import GraphTool

def pack_mini_batches(graph_file_path, iter_file_path):
    """Build one mini-batch per iteration stored in an iteration file.

    Args:
        graph_file_path: Path handed to ``GraphTool.Graph`` to load the graph.
        iter_file_path: Path of a JSON file whose ``"IterOfColumns"`` entry
            maps an iteration name to a dict with the keys ``"columns"``,
            ``"dual"`` and ``"offset"``.

    Returns:
        A list with one dict per iteration, holding:

        * ``"node_features"``: array with one 6-value row per node.
        * ``"column_features"``: array with one ``3 + nodeNum``-value row
          per column.
        * ``"edges"``: ``[node_indices, column_indices]``, two parallel lists
          with one entry per (node, column) incidence.
        * ``"dual_offsets"``: array built from the iteration's ``"offset"``.
    """
    graph = GraphTool.Graph(graph_file_path)
    # Close the file deterministically instead of leaking the handle.
    with open(iter_file_path, "r") as iter_file:
        iterations = json.load(iter_file)["IterOfColumns"]  # NOTE: key must match the iteration file schema

    mini_batches = []
    for iter_data in iterations.values():
        columns = iter_data["columns"]  # NOTE: key must match the iteration file schema
        duals = np.array(iter_data["dual"])  # NOTE: key must match the iteration file schema
        dual_offsets = np.array(iter_data["offset"])  # NOTE: key must match the iteration file schema

        # Feature engineering.
        # Node features (dim = 6): dual value, the two location components,
        # demand, ready time and due time of every node.
        node_features = np.array([
            [
                duals[i],
                graph.locations[i][0],
                graph.locations[i][1],
                graph.demand[i],
                graph.readyTime[i],
                graph.dueTime[i],
            ]
            for i in range(graph.nodeNum)
        ])

        # Column features (dim = 3 + nodeNum): summed duals, summed demand,
        # travelled distance, followed by a 0/1 node-membership vector.
        column_rows = []
        for column in columns:
            # The terminal index of the stored path is replaced by 0
            # (presumably the depot -- confirm against the column generator).
            path = column["path"][:-1] + [0]  # NOTE: key must match the iteration file schema
            dual_sum = sum(duals[path])
            demand = sum(graph.demand[path])
            distance = sum(
                graph.disMatrix[src][dst] for src, dst in zip(path, path[1:])
            )
            onehot_path = np.zeros(graph.nodeNum)
            onehot_path[path] = 1
            column_rows.append([dual_sum, demand, distance] + list(onehot_path))
        # The original assigned this array to a misspelled name, so the raw
        # nested list ended up in the mini-batch. The reshape keeps the matrix
        # 2-D even when the iteration has no columns.
        column_features = np.array(column_rows, dtype=float).reshape(
            len(column_rows), 3 + graph.nodeNum
        )

        # Edges (node -> column): one entry per node on a column's path,
        # excluding the path's last index.
        edge_nodes = []
        edge_columns = []
        for ci, column in enumerate(columns):
            visited = column["path"][:-1]  # NOTE: key must match the iteration file schema
            edge_nodes.extend(visited)
            edge_columns.extend([ci] * len(visited))

        # Store the data of this iteration.
        mini_batches.append({
            "node_features": node_features,
            "column_features": column_features,
            "edges": [edge_nodes, edge_columns],
            "dual_offsets": dual_offsets,
        })
    return mini_batches

In [None]:
# Folder paths used by the packing cells (machine-specific; adjust before running).
graph_folder_path = "D:/Code/ML4pricing/data/graph/" #! check path -- prefix of the "<file_name>.json" graph files
iter_folder_path = "D:/Code/ML4pricing/data/iter/" #! check path -- prefix of the "<file_name>.json" iteration files
dataset_save_path = "D:/Code/ML4pricing/data/dataset/" # prefix of the packed "<dataset_name>.json" output files

In [None]:
# Pack the data of a single instance.
file_name = "" # to set
dataset_name = "instance_" + file_name
graph_file_path = graph_folder_path + file_name + ".json"
iter_file_path = iter_folder_path + file_name + ".json"
mini_batches = pack_mini_batches(graph_file_path, iter_file_path)


def _numpy_to_builtin(obj):
    """Turn NumPy arrays and scalars into plain Python values for ``json.dump``."""
    if isinstance(obj, (np.ndarray, np.generic)):
        return obj.tolist()
    raise TypeError(
        "Object of type {} is not JSON serializable".format(type(obj).__name__)
    )


# The mini-batches contain NumPy arrays, which the json module cannot encode
# on its own; the ``default`` hook converts them to nested lists. The ``with``
# block makes sure the output is flushed and the file closed.
with open(dataset_save_path + dataset_name + ".json", "w") as dataset_file:
    json.dump(mini_batches, dataset_file, default=_numpy_to_builtin)

In [None]:
# Pack the data of several instances into one dataset.
file_name_list = [] # to set
dataset_name = "instances_{}".format(len(file_name_list))
mini_batches = []
for file_name in file_name_list:
    graph_file_path = graph_folder_path + file_name + ".json"
    iter_file_path = iter_folder_path + file_name + ".json"
    mini_batches.extend(pack_mini_batches(graph_file_path, iter_file_path))


def _numpy_to_builtin(obj):
    """Turn NumPy arrays and scalars into plain Python values for ``json.dump``."""
    if isinstance(obj, (np.ndarray, np.generic)):
        return obj.tolist()
    raise TypeError(
        "Object of type {} is not JSON serializable".format(type(obj).__name__)
    )


# The mini-batches contain NumPy arrays, which the json module cannot encode
# on its own; the ``default`` hook converts them to nested lists. The ``with``
# block makes sure the output is flushed and the file closed.
with open(dataset_save_path + dataset_name + ".json", "w") as dataset_file:
    json.dump(mini_batches, dataset_file, default=_numpy_to_builtin)