In [35]:
import json
import os
import random

from openai import OpenAI

In [None]:
# Merge the two source datasets into a single dictionary keyed by fresh,
# consecutive string ids ("0", "1", ...).
with open('Dataset/magyar_tortenelem_cr.json', 'r', encoding='utf-8') as first_file, \
        open('Dataset/tortenelem_cr.json', 'r', encoding='utf-8') as second_file:
    first_part = json.load(first_file)
    second_part = json.load(second_file)

# Order is preserved: every entry of the first file, then every entry of the second.
merged_entries = list(first_part.values()) + list(second_part.values())
final_data = {str(new_id): entry for new_id, entry in enumerate(merged_entries)}

# Persist the merged dataset (non-ASCII characters are written as-is).
with open('Dataset/data.json', 'w', encoding='utf-8') as f_out:
    json.dump(final_data, f_out, ensure_ascii=False, indent=4)

In [None]:
# Check the quality of the merged dataset with a metric and an LLM call.
# The LLM scores the chosen and the rejected answer separately, each in [0, 5].
# Metric: correctness_relevance(chosen) + delta(correctness_relevance(chosen), correctness_relevance(rejected)),
# i.e. chosen + (chosen - rejected), so the combined score lies in [-5, 10].
# Entry is accepted if the score is above 5.0
#
# Credentials come from the environment so that no secret is stored in the notebook:
#   LLM_API_KEY  - required
#   LLM_BASE_URL - optional, defaults to the endpoint this notebook has used so far
# NOTE: the key that used to be hard-coded in this cell should be treated as leaked and rotated.
llm_api_key = os.environ.get("LLM_API_KEY")
if not llm_api_key:
    raise RuntimeError("Set the LLM_API_KEY environment variable before running this cell.")

client = OpenAI(
    base_url=os.environ.get("LLM_BASE_URL", "http://mobydick.elte-dh.hu:23432/v1"),
    api_key=llm_api_key,
)

In [None]:
# First few-shot example: the chosen answer receives the full score (5.0),
# the rejected answer receives 0.0.
example_1_json = {
    "prompt": [
        {
            "role": "user",
            # Adjacent literals are concatenated into one source-text + question string.
            "content": (
                "Forrás szöveg: Mussolini apja elkötelezett és szűkebb környezetében elismert szocialista volt. "
                "Gyakran megfordultak nála olasz anarchista marxisták, otthonában pedig több ilyen szellemiségű művet tartott - "
                "nem csoda, hogy Benito már 17 évesen anarchistának tartotta magát "
                "Kérdés: Milyen politikai irányultságúnak tartotta magát Benito Mussolini?"
            ),
        },
    ],
    "chosen": [{"role": "assistant", "content": "Anarchista."}],
    "rejected": [{"role": "assistant", "content": "Kapitalista."}],
}

# Target output for the first example: the input echoed under "context"
# plus a one-element "score" list holding both scores.
score_1_json = {
    "context": example_1_json,
    "score": [{"chosen_score": 5.0, "rejected_score": 0.0}],
}

In [None]:
# Second few-shot example: here the answer labelled "chosen" scores 0.0 and the
# one labelled "rejected" scores 5.0, i.e. the scores do not simply follow the labels.
example_2_json = {
    "prompt": [
        {
            "role": "user",
            # Adjacent literals are concatenated into one source-text + question string.
            "content": (
                "Forrás szöveg: Joszif Sztálin 1904-ben megszökött a száműzetésből, majd Tbiliszibe ment az Egyesült Kaukázusi Bizottságot vezetni. "
                "„Credo” címmel programtervezetet készített, amiben hangot adott egyet nem értésének a Párttal, "
                "és annak egyes eszméivel kapcsolatos kérdésekkel. "
                "Kérdés: Milyen névvel tervezett programtervezetet Sztálin, miután megszökött a száműzetésből?"
            ),
        },
    ],
    "chosen": [{"role": "assistant", "content": "A kommunista manifesztó."}],
    "rejected": [{"role": "assistant", "content": "Credo."}],
}

# Target output for the second example: the input echoed under "context"
# plus a one-element "score" list holding both scores.
score_2_json = {
    "context": example_2_json,
    "score": [{"chosen_score": 0.0, "rejected_score": 5.0}],
}

In [None]:
# Third few-shot example, added to give the model contextual understanding as well:
# the chosen answer is scored 0.0 and the rejected answer "~50." is scored 1.0.
example_3_json = {
    "prompt": [
        {
            "role": "user",
            # Adjacent literals are concatenated into one source-text + question string.
            "content": (
                "Forrás szöveg: A két szám összege (vagyis összegük) az a végeredmény, ami akkor adódik, ha a két számot összeadjuk. "
                "Kérdés: Mennyi 17 és 35 összege?"
            ),
        },
    ],
    "chosen": [{"role": "assistant", "content": "Az 1914-ben kirobbanó háborút később az első világháborúnak nevezték."}],
    "rejected": [{"role": "assistant", "content": "~50."}],
}

# Target output for the third example: the input echoed under "context"
# plus a one-element "score" list holding both scores.
score_3_json = {
    "context": example_3_json,
    "score": [{"chosen_score": 0.0, "rejected_score": 1.0}],
}

In [10]:
# Few-shot part of the user message: task description followed by the three
# worked examples, each rendered as an input JSON and its target JSON.
# The instruction names the 'context' field because that is where the target
# examples echo the input and what the validation step checks for.
few_shot_prompt = f"""
You are a dataset validator. Assign a score to the chosen and rejected answers based on correctness and relevance to the source.
The score is between 0 and 5, where 0 is completely incorrect/irrelevant and 5 is completely correct/relevant.
IMPORTANT: You must copy the input JSON unchanged into the 'context' field of your answer.

### Example 1
[Input JSON]:
{json.dumps(example_1_json, ensure_ascii=False)}

[Target JSON]:
{json.dumps(score_1_json, ensure_ascii=False)}

### Example 2
[Input JSON]:
{json.dumps(example_2_json, ensure_ascii=False)}

[Target JSON]:
{json.dumps(score_2_json, ensure_ascii=False)}

### Example 3
[Input JSON]:
{json.dumps(example_3_json, ensure_ascii=False)}

[Target JSON]:
{json.dumps(score_3_json, ensure_ascii=False)}
"""

In [11]:
# System message for the scoring call. The output layout described here must
# match the few-shot target examples and the keys read back later:
# "context" plus "score": [{"chosen_score": ..., "rejected_score": ...}].
system_prompt = """You are an expert dataset validator.
Task:
1. Analyze the provided [Input JSON].
2. Score the chosen and rejected answers from 0 to 5 based on correctness and relevance to the source.
3. Construct a valid JSON object with exactly these keys:
   - "context": The provided [Input JSON], copied unchanged.
   - "score": A list containing exactly one object with:
       - "chosen_score": The score for the chosen answer.
       - "rejected_score": The score for the rejected answer.
"""

In [12]:
# Small hand-made entry in the same prompt/chosen/rejected layout as the
# few-shot examples (presumably for trying out the scorer manually).
data = {
    "prompt": [
        {"role": "user", "content": "Forrás szöveg: Az ég kék. Kérdés: Milyen színű az ég?"},
    ],
    "chosen": [
        {"role": "assistant", "content": "Türkiz kék."},
    ],
    "rejected": [
        {"role": "assistant", "content": "Fekete ha este van."},
    ],
}

In [16]:
def score_chosen_rejected(data, model="zai-org/GLM-4.5-Air-FP8", temperature=0.7):
    """Ask the LLM to score the chosen and rejected answers of one dataset entry.

    Args:
        data: One dataset entry (prompt / chosen / rejected) that can be
            serialized with ``json.dumps``.
        model: Name of the model to query.
        temperature: Sampling temperature passed to the API. Lower values
            make repeated scoring runs more consistent.

    Returns:
        The raw JSON text returned by the model. On any failure (request
        error, unserializable input, empty answer) the string ``"{}"`` is
        returned, so the result can always be passed to ``json.loads``.
    """
    try:
        # Send real JSON (double quotes, same rendering as the few-shot examples)
        # rather than the Python repr of the dict, and use the same
        # "[Input JSON]" label the system prompt and the examples refer to.
        input_json = json.dumps(data, ensure_ascii=False)

        chat_completion = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {
                    "role": "user",
                    "content": f"{few_shot_prompt}\n\n### New Task\n[Input JSON]:\n{input_json}\n\n[Target JSON]:",
                },
            ],
            temperature=temperature,
            stream=False,
            response_format={"type": "json_object"},
        )
        # The message content is optional in the API response; fall back to
        # an empty JSON object so the return type stays a string.
        answer_json = chat_completion.choices[0].message.content or "{}"
    except Exception as e:
        # Best effort: report the problem and let the caller skip this entry.
        print(f"Error scoring dataset entry: {e}")
        answer_json = "{}"
    return answer_json

In [36]:
def _check_scored_entry(entry):
    """Raise ValueError unless `entry` has the layout of the few-shot target examples."""
    if not isinstance(entry, dict) or not all(k in entry for k in ["context", "score"]):
        raise ValueError("Missing keys")
    score = entry["score"]
    if not (isinstance(score, list) and score and isinstance(score[0], dict)):
        raise ValueError("Malformed 'score' field")
    for key in ("chosen_score", "rejected_score"):
        if not isinstance(score[0].get(key), (int, float)):
            raise ValueError(f"Missing or non-numeric '{key}'")


def validate_chosen_rejected_dataset(input_file, output_file, sample_size=50):
    """Score a random sample of a chosen/rejected dataset and save the results.

    Args:
        input_file: Path of a JSON file holding a dict of dataset entries.
        output_file: Path the scored entries are written to as JSON, keyed by
            the same ids as in the input.
        sample_size: Maximum number of entries to score. If the dataset holds
            fewer entries, all of them are scored.

    Entries whose LLM answer cannot be parsed or does not have the expected
    "context" / "score" layout are reported and left out of the output.
    """
    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request at the dataset size.
    all_keys = list(data.keys())
    sample_keys = random.sample(all_keys, max(0, min(sample_size, len(all_keys))))

    dataset = {}
    for item in sample_keys:
        ans = score_chosen_rejected(data[item])
        try:
            json_ans = json.loads(ans)
            # Guarantee the structure that later cells index into
            # (["score"][0]["chosen_score"] / ["rejected_score"]).
            _check_scored_entry(json_ans)
        except (TypeError, ValueError) as e:
            # json.JSONDecodeError is a ValueError; TypeError covers non-string answers.
            print(f"Failed to parse generation for item {item}: {e}")
            continue
        print(f"Successfully parsed generation for item {item}.")
        dataset[item] = json_ans

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(dataset, f, ensure_ascii=False, indent=4)
        


In [37]:
# Score a random sample of the dataset and write the scored entries to a new JSON file.
# NOTE(review): the merge cell in this notebook writes 'Dataset/data.json', while this call
# reads 'Dataset/merged_cr.json' - confirm that both files hold the same merged data.
validate_chosen_rejected_dataset("Dataset/merged_cr.json", "Dataset/scored_merged_cr.json")

Successfully parsed generation for item 1454.
Successfully parsed generation for item 951.
Successfully parsed generation for item 1495.
Successfully parsed generation for item 1438.
Successfully parsed generation for item 357.
Successfully parsed generation for item 86.
Successfully parsed generation for item 514.
Successfully parsed generation for item 124.
Successfully parsed generation for item 128.
Successfully parsed generation for item 1419.
Successfully parsed generation for item 1022.
Successfully parsed generation for item 737.
Successfully parsed generation for item 1234.
Failed to parse generation: Missing keys
Successfully parsed generation for item 239.
Successfully parsed generation for item 1246.
Successfully parsed generation for item 335.
Successfully parsed generation for item 552.
Successfully parsed generation for item 1177.
Successfully parsed generation for item 1378.
Successfully parsed generation for item 445.
Successfully parsed generation for item 508.
Succes

In [49]:
# Compute one quality score per scored entry so the dataset quality can be
# summarized afterwards. Per entry: chosen + (chosen - rejected).
with open("Dataset/scored_merged_cr.json", "r", encoding="utf-8") as scored_file:
    scored_data = json.load(scored_file)

# Each entry stores its two scores in the first element of its "score" list.
score_pairs = [entry["score"][0] for entry in scored_data.values()]
scores = [
    pair["chosen_score"] + (pair["chosen_score"] - pair["rejected_score"])
    for pair in score_pairs
]

In [50]:
# Report the mean of the per-entry quality scores. An empty list would make
# the division raise ZeroDivisionError, so that case is reported explicitly.
if scores:
    print(f"Quality of the dataset - Average score: {sum(scores)/len(scores)}")
else:
    print("Quality of the dataset - no scored entries to average.")

Quality of the dataset - Average score: 9.931818181818182


In [52]:
# Load the merged dataset into a DataFrame as the first step of converting it
# to JSON Lines for Hugging Face.
import pandas as pd

df = pd.read_json(path_or_buf='Dataset/data.json')


In [55]:
# Swap rows and columns of the loaded frame (presumably so that each dataset
# entry becomes one row - the JSON file is keyed by entry id).
df_transposed = df.T

In [56]:
# Write one JSON record per line, keeping non-ASCII characters unescaped.
df_transposed.to_json(
    'Dataset/data.jsonl',
    orient='records',
    lines=True,
    force_ascii=False,
)