In [1]:
import json
import re

def read_jsonl_file(file_path):
    """
    Load a JSON Lines file into a list of parsed objects.

    Blank lines are ignored. A line that is not valid JSON is reported on
    stdout (the decode error followed by the offending line) and skipped.

    :param file_path: path of the JSONL file, read as UTF-8
    :return: list with one parsed JSON value per valid, non-empty line
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            text = raw_line.strip()
            # Whitespace-only lines carry no record.
            if not text:
                continue
            try:
                records.append(json.loads(text))
            except json.JSONDecodeError as e:
                print(f"Error parsing JSON: {e}")
                print(f"Offending line: {text}")
    return records

def convert_to_sharegpt_format_geospatial(input_json):
    """
    Convert one raw tool-calling trace into a ShareGPT-style record.

    The input messages (``input_json["inputs"]["messages"]``) are mapped as follows:

    * ``system``    -> becomes the ``system`` field of the result (last one wins)
    * ``user``      -> ``{"from": "human", ...}`` turn
    * ``assistant`` with tool calls -> each call is parked until its matching
      ``tool`` message arrives; the first parked call carries the assistant
      text wrapped in ``<think>...</think>``
    * ``assistant`` without tool calls -> its text is remembered as a fallback
      "thought" for the final turn
    * ``tool``      -> emits the parked ``function_call`` turn followed by an
      ``observation`` turn; tool messages with an unknown id are ignored

    The final turn is built from ``input_json["outputs"]["message"]``: its first
    tool call, prefixed by its own content (or the fallback thought) in
    ``<think>`` tags.

    :param input_json: dict with ``inputs`` (``messages``, ``tools``) and ``outputs`` (``message``)
    :return: dict with ``conversations`` (list of turns), ``system`` (str, ``""`` when
             no system message exists) and ``tools`` (JSON string)
    :raises KeyError, IndexError: if the output message carries no tool call
    """
    def convert(js):
        """Serialize ``js`` to a JSON string after expanding nested JSON-encoded strings."""
        def deep_parse_json(data):
            """
            Recursively expand JSON-encoded strings found in dicts, lists and tuples.

            Note that any string that is itself valid JSON (including scalars such
            as "123" or "true") is replaced by its parsed value.
            """
            if isinstance(data, dict):
                return {key: deep_parse_json(value) for key, value in data.items()}
            elif isinstance(data, (list, tuple)):
                # Preserve the container type: tuples stay tuples, lists stay lists.
                parsed_list = [deep_parse_json(item) for item in data]
                return tuple(parsed_list) if isinstance(data, tuple) else parsed_list
            elif isinstance(data, str):
                try:
                    parsed_data = json.loads(data)
                except json.JSONDecodeError:
                    # Not JSON: keep the plain string.
                    return data
                # The decoded value may itself contain JSON-encoded strings.
                return deep_parse_json(parsed_data)
            else:
                return data
        return json.dumps(deep_parse_json(js), ensure_ascii=False)

    inputs = input_json.get("inputs", {})
    messages_in = inputs.get("messages", [])
    tools_raw = inputs.get("tools", [])
    tools = [tool['function'] for tool in tools_raw]
    message_out = input_json.get("outputs", {}).get("message", {})

    conversations = []
    system = ""              # stays empty when the trace has no system message
    pending_call_ids = []    # ids of tool calls still waiting for their tool result
    pending_call_turns = []  # function_call turns, index-aligned with pending_call_ids
    ready_content = ""       # text of the last assistant message without tool calls

    for message in messages_in:
        role = message.get("role", "")
        # Content may be present but null (common for assistant tool-call messages).
        content = message.get("content") or ""
        tool_calls = message.get("tool_calls") or []
        if role == "system":
            system = content
        elif role == "user":
            conversations.append({"from": "human", "value": content})
        elif role == "assistant":
            if tool_calls:
                for tool_call in tool_calls:
                    pending_call_ids.append(tool_call.get("id", ""))
                    call_json = convert(tool_call.get("function", ""))
                    if pending_call_turns:
                        pending_call_turns.append({"from": "function_call", "value": call_json})
                    else:
                        # Only the first parked call carries the assistant's reasoning text.
                        pending_call_turns.append({"from": "function_call", "value": "<think>"+content+"</think>"+call_json})
            else:
                ready_content = content
        elif role == "tool":
            tool_call_id = message.get("tool_call_id", "")
            if tool_call_id in pending_call_ids:
                i = pending_call_ids.index(tool_call_id)
                # Emit the call right before its observation so turns stay paired.
                conversations.append(pending_call_turns.pop(i))
                conversations.append({"from": "observation", "value": content})
                pending_call_ids.pop(i)

    # Prefer the output message's own text; otherwise fall back to the last plain assistant text.
    final_thought = message_out.get("content") or ready_content
    final_call = convert(message_out['tool_calls'][0].get("function", ""))
    conversations.append({"from": "function_call", "value": "<think>"+final_thought+"</think>"+final_call})

    # Assemble the ShareGPT-format record.
    sharegpt_data = {
        "conversations": conversations,
        "system": system,
        "tools": convert(tools)
    }

    return sharegpt_data

def convert_to_sharegpt_format_sturctured(input_json, tools_path='data/tools-Plans.json'):
    """
    Convert one structured-extraction trace into a ShareGPT-style record.

    The message list is taken from ``input_json["inputs"]["messages"][0]``; each
    message is expected to expose ``kwargs["type"]`` and ``kwargs["content"]``.
    The first system message becomes the system prompt, with the literal marker
    "Below are few-shot examples:" (plus its trailing newline and two spaces)
    removed. The second system message only acts as a separator: human messages
    are kept solely once two system messages have been seen, so anything
    between the two system messages is dropped.

    The single ``function_call`` turn is built from the first tool call found at
    ``outputs.generations[0][0].message.kwargs.additional_kwargs.tool_calls``.

    :param input_json: one parsed record of the raw JSONL trace
    :param tools_path: JSON file holding the tool schema placed in the ``tools`` field
    :return: dict with ``conversations``, ``system`` (``""`` if no system message) and
             ``tools`` (JSON string of a one-element list holding the schema)
    :raises KeyError, IndexError: if messages, generations or tool calls are missing
    :raises OSError: if ``tools_path`` cannot be read
    """
    def convert(js):
        """Serialize ``js`` to a JSON string after expanding nested JSON-encoded strings."""
        def deep_parse_json(data):
            """
            Recursively expand JSON-encoded strings found in dicts, lists and tuples.

            Note that any string that is itself valid JSON (including scalars such
            as "123" or "true") is replaced by its parsed value.
            """
            if isinstance(data, dict):
                return {key: deep_parse_json(value) for key, value in data.items()}
            elif isinstance(data, (list, tuple)):
                # Preserve the container type: tuples stay tuples, lists stay lists.
                parsed_list = [deep_parse_json(item) for item in data]
                return tuple(parsed_list) if isinstance(data, tuple) else parsed_list
            elif isinstance(data, str):
                try:
                    parsed_data = json.loads(data)
                except json.JSONDecodeError:
                    # Not JSON: keep the plain string.
                    return data
                # The decoded value may itself contain JSON-encoded strings.
                return deep_parse_json(parsed_data)
            else:
                return data
        return json.dumps(deep_parse_json(js), ensure_ascii=False)

    inputs = input_json.get("inputs", {})
    messages_in = inputs["messages"][0]

    # Explicit UTF-8 keeps the schema readable regardless of the platform default encoding.
    with open(tools_path, 'r', encoding='utf-8') as file:
        tools_raw = json.load(file)
    tools = [tools_raw]

    generation = input_json.get("outputs", {}).get("generations", [])[0][0]
    message_kwargs = generation.get("message", {}).get("kwargs", {})
    tool_calls = message_kwargs.get("additional_kwargs", {}).get("tool_calls", [])
    tool_call_function = tool_calls[0].get("function", {})

    conversations = []
    system = ""        # stays empty when the trace has no system message
    record_system = 0  # number of system messages seen so far (capped at 2)

    for message in messages_in:
        kwargs = message.get("kwargs", {})
        msg_type = kwargs.get("type", "")
        content = kwargs.get("content", "")
        if record_system == 0 and msg_type == "system":
            record_system += 1
            system = re.sub("Below are few-shot examples:\n  ", "", content)
        elif record_system == 1 and msg_type == "system":
            # Second system message: marks the end of the section that is skipped.
            record_system += 1
        elif record_system == 2 and msg_type == "human":
            conversations.append({"from": "human", "value": content})

    conversations.append({"from": "function_call", "value": convert(tool_call_function)})

    # Assemble the ShareGPT-format record.
    sharegpt_data = {
        "conversations": conversations,
        "system": system,
        "tools": convert(tools)
    }
    return sharegpt_data

def generate_preferred_dataset(input_json, tools_path='data/tools-Plans.json'):
    """
    Build a preference (accepted / rejected) sample from one structured trace.

    The conversation part is extracted like this: the message list is
    ``input_json["inputs"]["messages"][0]``; the first system message becomes the
    system prompt with the literal marker "Below are few-shot examples:" (plus
    its trailing newline and two spaces) removed; the second system message is a
    separator; only human messages seen after two system messages are kept.
    A final ``function_call`` turn holds the first tool call of the generation.

    The preference pair is derived from the serialized tool call ``s``:

    * generation has a non-empty ``message["invalid_tool_calls"]``:
      ``accepted = s[:-1]`` and ``rejected = s``
    * otherwise: ``accepted = s`` and ``rejected = s + "}"``

    NOTE(review): the pair differs only by one trailing character; presumably
    this models a surplus closing brace -- confirm the intended direction.

    :param input_json: one parsed record of the raw JSONL trace
    :param tools_path: JSON file holding the tool schema placed in the ``tools`` field
    :return: dict with ``conversations``, ``system``, ``tools``, ``accepted``, ``rejected``
    :raises KeyError, IndexError: if messages, generations or tool calls are missing
    :raises OSError: if ``tools_path`` cannot be read
    """
    def convert(js):
        """Serialize ``js`` to a JSON string after expanding nested JSON-encoded strings."""
        def deep_parse_json(data):
            """
            Recursively expand JSON-encoded strings found in dicts, lists and tuples.

            Note that any string that is itself valid JSON (including scalars such
            as "123" or "true") is replaced by its parsed value.
            """
            if isinstance(data, dict):
                return {key: deep_parse_json(value) for key, value in data.items()}
            elif isinstance(data, (list, tuple)):
                # Preserve the container type: tuples stay tuples, lists stay lists.
                parsed_list = [deep_parse_json(item) for item in data]
                return tuple(parsed_list) if isinstance(data, tuple) else parsed_list
            elif isinstance(data, str):
                try:
                    parsed_data = json.loads(data)
                except json.JSONDecodeError:
                    # Not JSON: keep the plain string.
                    return data
                # The decoded value may itself contain JSON-encoded strings.
                return deep_parse_json(parsed_data)
            else:
                return data
        return json.dumps(deep_parse_json(js), ensure_ascii=False)

    inputs = input_json.get("inputs", {})
    messages_in = inputs["messages"][0]

    # Explicit UTF-8 keeps the schema readable regardless of the platform default encoding.
    with open(tools_path, 'r', encoding='utf-8') as file:
        tools_raw = json.load(file)
    tools = [tools_raw]

    generation = input_json.get("outputs", {}).get("generations", [])[0][0]
    generation_message = generation.get("message", {})
    tool_calls = generation_message.get("kwargs", {}).get("additional_kwargs", {}).get("tool_calls", [])
    tool_call_function = tool_calls[0].get("function", {})
    invalid_tool_call = generation_message.get("invalid_tool_calls", {})

    # Serialize once; the same string feeds the conversation turn and the preference pair.
    call_json = convert(tool_call_function)
    if invalid_tool_call:
        invalid = {"accepted": call_json[:-1], "rejected": call_json}
    else:
        invalid = {"accepted": call_json, "rejected": call_json + "}"}

    conversations = []
    system = ""        # stays empty when the trace has no system message
    record_system = 0  # number of system messages seen so far (capped at 2)

    for message in messages_in:
        kwargs = message.get("kwargs", {})
        msg_type = kwargs.get("type", "")
        content = kwargs.get("content", "")
        if record_system == 0 and msg_type == "system":
            record_system += 1
            system = re.sub("Below are few-shot examples:\n  ", "", content)
        elif record_system == 1 and msg_type == "system":
            # Second system message: marks the end of the section that is skipped.
            record_system += 1
        elif record_system == 2 and msg_type == "human":
            conversations.append({"from": "human", "value": content})

    conversations.append({"from": "function_call", "value": call_json})

    # Assemble the ShareGPT-format record with the preference pair.
    sharegpt_data = {
        "conversations": conversations,
        "system": system,
        "tools": convert(tools),
        "accepted": invalid["accepted"],
        "rejected": invalid["rejected"]
    }
    return sharegpt_data



# 推理

In [2]:
# js = read_jsonl_file('data/geospatial-raw.jsonl')
# output = []
# for j in js:
#     output.append(convert_to_sharegpt_format_geospatial(j))
# with open("data/output.json", "w", encoding="utf-8") as file:
#     # 写入json
#     json.dump(output, file, ensure_ascii=False, indent=4)

# 结构化

In [3]:
# js = read_jsonl_file('data/structured-raw.jsonl')

# Turn every raw trace into a preference sample and store them as one JSON array.
js = read_jsonl_file('data/temp_more.jsonl')
output = [generate_preferred_dataset(j) for j in js]

with open("data/output.json", "w", encoding="utf-8") as file:
    # ensure_ascii=False keeps non-ASCII text human-readable in the output file.
    json.dump(output, file, ensure_ascii=False, indent=4)

# 获得结构化的input和output (187对输入输出)

In [24]:
import json
# Split the ShareGPT-style dataset into parallel prompt / reply collections:
#   inputs  - the first turn's value with its first 10 and last 1 characters removed
#   outputs - the second turn's value
#   save    - prompt -> reply mapping (a duplicated prompt keeps only its last reply)
inputs = []
outputs = []
with open('data/structured-information-extraction.json','r', encoding='utf-8') as file:
    data = json.load(file)

for dictionary in data:
    try:
        # NOTE(review): [10:-1] presumably strips a fixed wrapper around the prompt
        # text -- confirm against the dataset.
        prompt_text = dictionary['conversations'][0]['value'][10:-1]
        reply_text = dictionary['conversations'][1]['value']
    except (KeyError, IndexError, TypeError):
        # Skip records without two well-formed turns; other errors are not swallowed.
        continue
    inputs.append(prompt_text)
    outputs.append(reply_text)

save = dict(zip(inputs, outputs))
# Show the first prompt as a quick sanity check.
if save:
    print(next(iter(save)))

with open('data/structured-input.json', "w", encoding="utf-8") as file:
    json.dump(inputs, file, ensure_ascii=False, indent=4)
with open('data/structured-output.json', "w", encoding="utf-8") as file:
    json.dump(outputs, file, ensure_ascii=False, indent=4)
with open('data/structured-input-output.json', "w", encoding="utf-8") as file:
    json.dump(save, file, ensure_ascii=False, indent=4)
# output = []
# for j in js:
#     output.append(generate_preferred_dataset(j))


# with open("data/output.json", "w", encoding="utf-8") as file:
#     # 写入json
#     json.dump(output, file, ensure_ascii=False, indent=4)

 'Blue Army Deployment Plan\n- Mobile Repair Unit 1: [-73.9879519, 40.7477058]\n- Mobile Repair Unit 2: [-73.9878032, 40.7476405]\n- Mobile Repair Unit 3: [-73.9878032, 40.7475099]\n- Mobile Repair Unit 4: [-73.9879519, 40.7474446]\n- Mobile Repair Unit 5: [-73.9881006, 40.7475099]\n- Mobile Repair Unit 6: [-73.9881006, 40.7476405]'


### 制作复杂结构化输入集

In [None]:
import random
import json
def save_list(l, path):
    """
    Write the items of ``l`` to ``path`` as Python source defining ``orders``.

    The file contains a dict literal named ``orders`` whose keys are
    ``"order_<index>"`` (0-based position in ``l``) and whose values are the
    ``repr`` of each item, one entry per line.

    :param l: iterable of values to store
    :param path: destination file; overwritten if it exists, written as UTF-8
    """
    # repr() keeps non-ASCII text as-is, so the encoding must be explicit to avoid
    # depending on the platform default.
    with open(path, 'w', encoding='utf-8') as f:
        f.write("orders = {\n")
        for index, value in enumerate(l):
            # repr guarantees the value is emitted as a valid Python literal.
            f.write(f"    \"order_{index}\": {value!r},\n")
        f.write("}\n")
# Build synthetic multi-order inputs: for k = 2..5, draw 50 random k-line
# combinations from the source file and join each combination with '; '.
with open('data/structured-input-1.json','r', encoding='utf-8') as file:
    # Every physical line of the file is one candidate.
    # NOTE(review): lines keep their trailing newline, so the joined strings embed
    # newlines before each '; ' -- confirm this is intended.
    file_list = list(file)

    two_n = 50
    two = ['; '.join(random.sample(file_list, 2)) for _ in range(two_n)]

    three_n = 50
    three = ['; '.join(random.sample(file_list, 3)) for _ in range(three_n)]

    four_n = 50
    four = ['; '.join(random.sample(file_list, 4)) for _ in range(four_n)]

    five_n = 50
    five = ['; '.join(random.sample(file_list, 5)) for _ in range(five_n)]

    # save_list(list(set(two)), 'data/structured_input_2.py')
    # save_list(list(set(three)), 'data/structured_input_3.py')
    # save_list(list(set(four)), 'data/structured_input_4.py')
    # save_list(list(set(five)), 'data/structured_input_5.py')
    # save_list(list(set(file_list)), 'data/structured_input_1.py')
    # save_list(list(set(random.sample(file_list, 10)+random.sample(two, 10)+random.sample(three, 10)+random.sample(four, 10)+random.sample(five, 10))), 'data/structured_input_all.py')


# jsonl格式化

In [2]:
# js = read_jsonl_file('data/structured-raw.jsonl')
import sys
sys.path.append('/home/root/coorGen')

from scripts.convert import convert

# Convert every raw record and export the results in JSON Lines format.
js = read_jsonl_file('data/test1.jsonl')
output = [convert(j) for j in js]

with open("data/output.jsonl", "w", encoding="utf-8") as file:
    # One compact JSON document per line: an indented array is not valid JSONL
    # and could not be read back line by line with read_jsonl_file.
    for record in output:
        file.write(json.dumps(record, ensure_ascii=False) + "\n")


# with open(file_path, 'r', encoding='utf-8') as file:
#         for line in file:
#             # 去掉行首行尾的空白字符
#             line = line.strip()
#             if line:  # 确保不处理空行
#                 try:
#                     # 将每一行解析为 JSON 对象
#                     json_object = json.loads(line)
#                     json_objects.append(json_object)
#                 except json.JSONDecodeError as e:
#                     print(f"Error parsing JSON: {e}")
#                     print(f"Offending line: {line}")

Error parsing JSON: Expecting value: line 1 column 362 (char 361)
Offending line: {"config": {"tags": [], "metadata": "ChainMap({})", "callbacks": "", "recursion_limit": 25, "configurable": {"checkpoint_ns": "", "thread_id": "dc7f5b9d-4b7e-4029-8a1c-138df1984f7c", "checkpoint_id": "1f0443ca-af8c-648e-bfff-ea819b4b3646"}}, "parent_config": "", "values": {"messages": []}, "metadata": {"source": "input", "writes": {"__start__": {"messages": [("user", "正五边形队形，中心点在[121.1489224, 24.9048013]，部署数量5，两两间距30米，其他要求为空")]}}, "step": -1, "parents": {}}, "next": ["__start__"], "tasks": [{"id": "c4e24f80-d687-1880-b68f-f9b9308136b8", "name": "__start__", "interrupts": (), "state": ""}]}
Error parsing JSON: Expecting value: line 1 column 515 (char 514)
Offending line: {"config": {"tags": [], "metadata": "ChainMap({})", "callbacks": "", "recursion_limit": 25, "configurable": {"checkpoint_ns": "", "thread_id": "dc7f5b9d-4b7e-4029-8a1c-138df1984f7c", "checkpoint_id": "1f0443ca-af90-6c6e-8000-94147baa7fe0"}