In [1]:
import os
import json

def mkdir(dir):
    """Ensure that the directory ``dir`` exists, reporting what happened.

    Parameters:
    - dir (str): Path of the directory to create. Missing parent
      directories are created as well (``os.makedirs``).

    Prints one status line in either case and returns ``None``. Any error
    raised by ``os.makedirs`` propagates to the caller.
    """
    # Guard clause: only touch the filesystem when the directory is missing.
    if not os.path.isdir(dir):
        os.makedirs(dir)
        print(f"Directory '{dir}' created successfully.")
        return

    print(f"Directory '{dir}' already exists.")

def json_load(file_path):
    """Read a JSON file and return the decoded Python object.

    Parameters:
    - file_path (str): Path of the JSON file to read.

    Returns:
    - The object produced by ``json.load`` (dict, list, ...).

    The file is opened explicitly as UTF-8 so that non-ASCII content is
    decoded the same way on every platform instead of depending on the
    locale's default encoding.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def json_dump(file_path, data):
    """
    Dumps a JSON-serializable object to a JSON file (UTF-8, indent of 1).

    Parameters:
    - file_path (str or path-like): The path to the file where the JSON will be saved.
    - data (dict): The object to be dumped.

    Returns None. This is a best-effort helper: any exception raised while
    opening or writing is caught and reported with a printed message rather
    than propagated.
    """
    try:
        # ensure_ascii=False writes non-ASCII characters verbatim, so the
        # file encoding must be pinned; the locale default may not be able
        # to encode them.
        with open(file_path, 'w', encoding='utf-8') as f:
            # f-string (not "+") so path-like objects do not raise a
            # TypeError after the file has already been truncated.
            print(f"write result to: {file_path}")
            json.dump(data, f, indent=1, ensure_ascii=False)

    except Exception as e:
        print(f"Error occurred: {e}")

# Input locations: model responses are read from <source_dir>/temp and the
# reference dataset from <source_dir>/source.
source_dir = os.path.join('..', 'day18', 'data')
response_dir, answer_dir = (
    os.path.join(source_dir, leaf) for leaf in ('temp', 'source')
)

# Show what is available in each input directory.
print(os.listdir(response_dir))
print(os.listdir(answer_dir))

['4_gemma.json', '3_llama_chat.json', '5_json_gemma.json', '2_llama_zh.json', '1_llama_en.json']
['structured_output_dataset.json']


In [2]:
# SemanticSimilarityEvaluator
## input: response, reference_answer
### response: json_gemma, llama_en
## output: score, passing and similarity_threshold

# CorrectnessEvaluator:
## input: query, response, reference_answer
### query: extract_prompt + text
### response: json_gemma, llama_en
### reference_answer: re
## output: feedback, score, passing, and _score_threshold

# FaithfulnessEvaluator:
## input: prompt_faithfulness, response, context
### response: json_gemma, fake_json_gemma
### context: context
## output: (0/1) feedback, passing, score

In [3]:
# Start with an empty dict; later cells fill it with the evaluation inputs.
data = dict()

In [4]:
# Load the reference dataset from answer_dir and display its first example.
redata = json_load(os.path.join(answer_dir, 'structured_output_dataset.json'))
redata['examples'][0]

{'query': 'Pydantic',
 'query_by': 'human',
 'reference_answer': {'qid': '1',
  'stem': '常見針灸配穴法中,所指的「四關穴」,為下列何穴位之組合?',
  'A': '上星、日月',
  'B': '合谷、太衝',
  'C': '內關、外關',
  'D': '上關、下關'},
 'reference_answer_by': {'model_name': 're', 'type': 'ai'},
 'reference_context': ['1.常見針灸配穴法中，所指的「四關穴」，為下列何穴位之組合？\n\xa0\nA.上星、日月\nB.合谷、太衝\nC.內關、外關\nD.上關、下關']}

In [5]:
# Use the first 'reference_context' entry of the first example as the context,
# then display it.
context = redata['examples'][0]['reference_context'][0]
context

'1.常見針灸配穴法中，所指的「四關穴」，為下列何穴位之組合？\n\xa0\nA.上星、日月\nB.合谷、太衝\nC.內關、外關\nD.上關、下關'

In [6]:
# Take the 'reference_answer' of the same first example, then display it.
reference_answer = redata['examples'][0]['reference_answer']
reference_answer

{'qid': '1',
 'stem': '常見針灸配穴法中,所指的「四關穴」,為下列何穴位之組合?',
 'A': '上星、日月',
 'B': '合谷、太衝',
 'C': '內關、外關',
 'D': '上關、下關'}

In [7]:
# Get the json_gemma response: load 5_json_gemma.json from response_dir and
# keep the 'response' field of its first entry, then display it.
json_gemma_answer = json_load(os.path.join(response_dir, '5_json_gemma.json'))
json_gemma_response = json_gemma_answer[0]['response']
json_gemma_response

{'qid': 1,
 'stem': '常見針灸配穴法中，所指的「四關穴」，為下列何穴位之組合？',
 'A': '上星、日月',
 'B': '合谷、太衝',
 'C': '內關、外關',
 'D': '上關、下關'}

In [8]:
# Get the llama_en response: load 1_llama_en.json from response_dir and keep
# the 'response' field of the entry at index 7, then display it.
# NOTE(review): index 7 is hard-coded, unlike index 0 used for json_gemma;
# presumably it was picked by hand as the entry for the same question -- confirm.
llama_en_answer = json_load(os.path.join(response_dir, '1_llama_en.json'))
llama_en_response = llama_en_answer[7]['response']
llama_en_response

{'A': '主蒙，曜月',
 'B': '合座，夡里',
 'C': '到递，割递',
 'D': '主递，一递',
 'qid': 1,
 'stem': '台九気动组化。\n\nA.主蒙，曜月\nB.合座，夡里\nC.到递，割递\nD.主递，一递'}

In [9]:
# Store the four collected pieces in `data` (updated in place, keys added in
# this order), then display the result.
data.update(
    context=context,
    reference_answer=reference_answer,
    json_gemma_response=json_gemma_response,
    llama_en_response=llama_en_response,
)
data

{'context': '1.常見針灸配穴法中，所指的「四關穴」，為下列何穴位之組合？\n\xa0\nA.上星、日月\nB.合谷、太衝\nC.內關、外關\nD.上關、下關',
 'reference_answer': {'qid': '1',
  'stem': '常見針灸配穴法中,所指的「四關穴」,為下列何穴位之組合?',
  'A': '上星、日月',
  'B': '合谷、太衝',
  'C': '內關、外關',
  'D': '上關、下關'},
 'json_gemma_response': {'qid': 1,
  'stem': '常見針灸配穴法中，所指的「四關穴」，為下列何穴位之組合？',
  'A': '上星、日月',
  'B': '合谷、太衝',
  'C': '內關、外關',
  'D': '上關、下關'},
 'llama_en_response': {'A': '主蒙，曜月',
  'B': '合座，夡里',
  'C': '到递，割递',
  'D': '主递，一递',
  'qid': 1,
  'stem': '台九気动组化。\n\nA.主蒙，曜月\nB.合座，夡里\nC.到递，割递\nD.主递，一递'}}

In [10]:
# Work out where the collected example goes: data/source/example_test_data.json
# relative to the current working directory.
DEST_DIR = os.path.join('data', 'source')
save_file_path = os.path.join(DEST_DIR, 'example_test_data.json')

# Make sure the destination directory exists, then write `data` as JSON.
mkdir(DEST_DIR)
json_dump(save_file_path, data)

Directory 'data/source' created successfully.
write result to: data/source/example_test_data.json
