In [None]:
import json
import pandas as pd

def safe_json_load(value):
    """
    Best-effort conversion of a DataFrame cell into parsed JSON.

    Parameters:
        value: a JSON string, an already-parsed object (dict, list, ...),
            or a missing-value marker such as None or NaN.

    Returns:
        - the decoded object when value is a valid JSON string;
        - value itself when it is a non-empty, non-string object;
        - an empty dict for blank or malformed strings, missing values and
          any other falsy input (including empty containers).
    """
    if isinstance(value, str):
        if not value.strip():
            return {}
        try:
            return json.loads(value)
        except json.JSONDecodeError:
            return {}
    if isinstance(value, (dict, list)):
        # Containers are handled before pd.isna: for a list pd.isna returns an
        # element-wise array, and using that array in a boolean context raises
        # ValueError as soon as the list has more than one item.
        return value if value else {}
    # pd.isna goes first because `not pd.NA` raises TypeError.
    if pd.isna(value) or not value:
        return {}
    return value

def analyze_json_references(data):
    """
    Walk a parsed JSON structure and tally every dict that carries both a
    'referenced' and a 'text_reference' entry.

    A dict counts as a true reference only when its 'referenced' value is the
    boolean True; any other value counts as a false reference. For each false
    reference the 'text_reference' value is stored under the key the dict was
    found under in its parent ('unknown' when there is none, e.g. the root).
    Items of a list inherit the key of the list, and a later false reference
    with the same key replaces the earlier entry.

    Returns a summary dictionary with the percentage of true references
    ('prompt_related_score', 0 when nothing was found), the total / true /
    false counts and the false-reference details.
    """
    tally = {True: 0, False: 0}
    unreferenced = {}

    def visit(node, owner_key=None):
        if isinstance(node, list):
            # List elements are attributed to the key that holds the list.
            for element in node:
                visit(element, owner_key)
            return
        if not isinstance(node, dict):
            return

        if "referenced" in node and "text_reference" in node:
            is_referenced = node["referenced"] is True
            tally[is_referenced] += 1
            if not is_referenced:
                label = 'unknown' if owner_key is None else owner_key
                unreferenced[label] = node["text_reference"]

        # Annotated dicts are still descended into: children may be annotated too.
        for child_key, child in node.items():
            visit(child, child_key)

    visit(data)

    hits = tally[True]
    misses = tally[False]
    total = hits + misses
    score = (hits / total * 100) if total else 0

    return {
        "prompt_related_score": score,
        "total_references": total,
        "true_references": hits,
        "false_references": misses,
        "false_references_details": unreferenced,
    }

def analyze_json_column(json_value):
    """
    Analyze a single DataFrame cell.

    A string cell is decoded with json.loads first (a malformed string raises
    json.JSONDecodeError); any other value is analyzed as it is. Returns the
    summary dictionary built by analyze_json_references.
    """
    parsed = json.loads(json_value) if isinstance(json_value, str) else json_value
    return analyze_json_references(parsed)

def count_json_fields(data):
    """
    Count every key of every dict found anywhere in a JSON structure.

    Metadata keys such as 'type', 'enum' or 'required' are counted like any
    other key. A string argument is decoded first; None is returned when it
    is not valid JSON. Values that are neither dict nor list contribute 0.
    """
    if isinstance(data, str):
        try:
            data = json.loads(data)
        except json.JSONDecodeError:
            return None

    def tally(node):
        # Each dict contributes its own keys plus whatever its values hold.
        if isinstance(node, dict):
            return len(node) + sum(tally(child) for child in node.values())
        if isinstance(node, list):
            return sum(tally(child) for child in node)
        return 0

    return tally(data)

def count_json_properties(data):
    """
    Count the fields declared under 'properties' in a JSON schema.

    Only the entries of 'properties' mappings are counted, at every nesting
    level; metadata such as 'type', 'enum' or 'required' is ignored. Array
    schemas are followed through 'items', and lists (a list of schemas, or
    'items' given as a list) are traversed element by element, in line with
    extract_schema_properties.

    Parameters:
        data: a schema as a JSON string or as an already-parsed dict/list.

    Returns:
        The number of declared properties. 0 is returned for malformed JSON
        strings and for anything that is not a dict or list after decoding
        (None, NaN, numbers, ...).
    """
    if isinstance(data, str):
        try:
            data = json.loads(data)
        except json.JSONDecodeError:
            return 0
    # Type check instead of pd.isna: pd.isna on a list yields an element-wise
    # array that cannot be used as a single truth value.
    if not isinstance(data, (dict, list)):
        return 0

    def count(node):
        if isinstance(node, list):
            return sum(count(item) for item in node)
        if not isinstance(node, dict):
            return 0
        declared = node.get("properties")
        if isinstance(declared, dict):
            return len(declared) + sum(count(sub) for sub in declared.values())
        # A 'properties' value that is not a mapping has nothing countable.
        if "items" in node:
            return count(node["items"])
        return 0

    return count(data)

def extract_schema_properties(schema):
    """
    Collect the dot-separated paths of all properties declared in a JSON schema.

    Only names listed under a "properties" entry become part of a path. Array
    schemas are followed through "items" without adding a path segment, and
    lists are traversed element by element. Returns a set of path strings.
    """
    def walk(node, prefix):
        if isinstance(node, list):
            for element in node:
                yield from walk(element, prefix)
        elif isinstance(node, dict):
            if "properties" in node:
                for name, subschema in node["properties"].items():
                    # Nested properties are addressed as "parent.child".
                    path = f"{prefix}.{name}" if prefix else name
                    yield path
                    yield from walk(subschema, path)
            elif "items" in node:
                # Arrays whose items are objects with their own properties.
                yield from walk(node["items"], prefix)

    return set(walk(schema, ""))

def extract_referenced_json_keys(data):
    """
    Collect every key of a JSON structure as a dot-separated path.

    Keys of nested dicts are prefixed with the path of their parent; lists are
    looked through without adding a path segment. Returns a set of paths.
    """
    found = set()
    # Explicit work list of (node, path prefix) pairs still to be examined.
    pending = [(data, "")]
    while pending:
        node, prefix = pending.pop()
        if isinstance(node, dict):
            for name, child in node.items():
                path = f"{prefix}.{name}" if prefix else name
                found.add(path)
                pending.append((child, path))
        elif isinstance(node, list):
            pending.extend((element, prefix) for element in node)
    return found

def extract_referenced_json_keys_nonmetadata(data):
    """
    Collect the keys of a JSON structure as dot-separated paths, leaving out
    the metadata keys "type", "referenced" and "text_reference".

    A skipped key is neither recorded nor descended into. Lists are looked
    through without adding a path segment. Returns a set of paths.
    """
    skipped = frozenset({"type", "referenced", "text_reference"})
    collected = set()

    def descend(node, prefix):
        if isinstance(node, list):
            for element in node:
                descend(element, prefix)
            return
        if not isinstance(node, dict):
            return
        for name, child in node.items():
            if name not in skipped:
                path = f"{prefix}.{name}" if prefix else name
                collected.add(path)
                descend(child, path)

    descend(data, "")
    return collected

def analyze_json_and_schema(referenced_json, schema_json):
    """
    Analyze the reference annotations of one row's referenced JSON.

    Parameters:
        referenced_json: JSON string or already-parsed object holding the
            'referenced' / 'text_reference' annotations.
        schema_json: schema belonging to the same row. Accepted so existing
            callers keep working; it does not influence the returned summary.

    Returns:
        The summary dictionary produced by analyze_json_references
        (prompt_related_score, total/true/false reference counts and the
        false-reference details). A string that is not valid JSON is treated
        as an empty object, so the summary reports zero references instead
        of raising.
    """
    # Decode once, tolerating malformed input. Previously the analysis ran on
    # the raw string first and raised before the fallback could take effect.
    if isinstance(referenced_json, str):
        try:
            data = json.loads(referenced_json)
        except json.JSONDecodeError:
            data = {}
    else:
        data = referenced_json

    # NOTE: schema/data key matching statistics are not part of the result;
    # the key lists and counts are computed by the get_* helpers in this file.
    return analyze_json_references(data)


# Helper: schema property paths, sorted and joined into one comma-separated string
def get_sorted_schema_keys(schema_value):
    """Return the schema's property paths sorted and joined with ', '."""
    property_paths = extract_schema_properties(safe_json_load(schema_value))
    ordered = sorted(property_paths)
    return ", ".join(ordered)

# Helper: non-metadata data key paths, sorted and joined into one comma-separated string
def get_referenced_json_keys(data_value):
    """Return the non-metadata key paths of the data sorted and joined with ', '."""
    key_paths = extract_referenced_json_keys_nonmetadata(safe_json_load(data_value))
    ordered = sorted(key_paths)
    return ", ".join(ordered)

def get_schema_key_count(schema_value):
    """Return how many distinct property paths the schema declares."""
    return len(extract_schema_properties(safe_json_load(schema_value)))

def get_referenced_json_key_count(data_value):
    """Return how many distinct non-metadata key paths the data contains."""
    return len(extract_referenced_json_keys_nonmetadata(safe_json_load(data_value)))

def create_key_comparison_aligned(row):
    """
    Build a side-by-side table of schema keys and referenced-JSON keys.

    Parameters:
        row: a mapping (e.g. a DataFrame row) whose "schema_keys" and
            "referenced_json_keys" entries are comma-separated strings.

    Returns:
        A DataFrame with one line per distinct key, sorted by key, and the
        columns "Key", "Schema Key" and "Referenced JSON Key". The latter two
        repeat the key when that side contains it and are "" otherwise. The
        three columns are present even when there are no keys at all.
    """
    def split_keys(joined):
        # Surrounding whitespace and empty fragments are dropped.
        return {part.strip() for part in joined.split(",") if part.strip()}

    schema_keys = split_keys(row["schema_keys"])
    data_keys = split_keys(row["referenced_json_keys"])

    records = [
        {
            "Key": key,
            "Schema Key": key if key in schema_keys else "",
            "Referenced JSON Key": key if key in data_keys else "",
        }
        for key in sorted(schema_keys | data_keys)
    ]
    # Explicit columns keep the layout stable when `records` is empty;
    # pd.DataFrame([]) alone would have no columns.
    return pd.DataFrame(records, columns=["Key", "Schema Key", "Referenced JSON Key"])


In [124]:
# Read the two result batches and stack them into one frame with a fresh index.
df1 = pd.read_csv('/Users/ezequiel.manoukian/repos/reviewing_json_schema_dataset/data/batch_004_L-1_400-500/test.csv')
df2 = pd.read_csv('/Users/ezequiel.manoukian/repos/reviewing_json_schema_dataset/data/batch_004_L-1_400-500/02_results_410_500.csv')
df = pd.concat([df1, df2], ignore_index=True)


def _analysis_as_json(row):
    """Run the reference analysis for one row and serialize it as indented JSON text."""
    summary = analyze_json_and_schema(row['REFERENCED_JSON_FORMATED'], row['schema'])
    return json.dumps(summary, indent=2)


# One analysis string per row, built from the referenced JSON and the schema columns.
df['analysis'] = df.apply(_analysis_as_json, axis=1)

# Key listings and counts derived from the schema column ...
df["schema_keys"] = df["schema"].apply(get_sorted_schema_keys)
df["schema_key_count"] = df["schema"].apply(get_schema_key_count)
# ... and from the referenced JSON column.
df["referenced_json_keys"] = df["REFERENCED_JSON_FORMATED"].apply(get_referenced_json_keys)
df["referenced_json_key_count"] = df["REFERENCED_JSON_FORMATED"].apply(get_referenced_json_key_count)

In [125]:
df

Unnamed: 0,languageCode,internal_id,prompt,schema,response,SUMMARY,MR_EVAL_SUB_SCHEMA,REFERENCED_JSON,SCHEMA_SIMPLIFIED,REFERENCED_JSON_FORMATED,analysis,schema_keys,schema_key_count,referenced_json_keys,referenced_json_key_count
0,en_US,401,Imagine you are the lead cybersecurity analyst...,"{""type"": ""object"", ""properties"": {""system_info...","```json\n{\n ""system_info"": {\n ""system_na...",,"[{'property_name': 'system_info', 'analysis': ...","[{'system_name': {'type': 'string', 'reference...","{\n ""type"": ""object"",\n ""properties"": [\...","{\n ""system_info"": {\n ""system_name"": {\n ...","{\n ""prompt_related_score"": 97.10144927536231...","alerts, alerts.additional_info, alerts.alert_i...",92,"alerts, alerts.additional_info, alerts.alert_i...",78
1,en_US,402,Imagine you are a seasoned screenwriter tasked...,"{""type"": ""object"", ""properties"": {""story_title...","```json\n{\n ""story_title"": ""Neon City Blues""...",,"[{'property_name': 'story_title', 'analysis': ...","[{'story_title': {'type': 'string', 'reference...","{\n ""type"": ""object"",\n ""properties"": [\...","{\n ""story_title"": {\n ""type"": ""string"",\n...","{\n ""prompt_related_score"": 70.0,\n ""total_r...","characters, characters.backstory, characters.c...",53,"characters, characters.backstory, characters.c...",45
2,en_US,403,"Imagine you are the creative director at ""Tren...","{""type"": ""object"", ""properties"": {""forecastPer...","```json\n{\n ""forecastPeriod"": {\n ""season...",,"[{'property_name': 'forecastPeriod', 'analysis...",[{'forecastPeriod': {'season': {'type': 'strin...,"{\n ""type"": ""object"",\n ""properties"": [\...","{\n ""forecastPeriod"": {\n ""season"": {\n ...","{\n ""prompt_related_score"": 89.65517241379311...","analystTeam, forecastDate, forecastPeriod, for...",14,"analystTeam, forecastDate, forecastPeriod, for...",31
3,en_US,404,Imagine you are a tech journalist writing a de...,"{""type"": ""object"", ""properties"": {""systemName""...","```json\n{\n ""systemName"": ""GeminiDB"",\n ""ve...",,"[{'property_name': 'systemName', 'analysis': '...","[{'systemName': {'type': 'string', 'referenced...","{\n ""type"": ""object"",\n ""properties"": [\...","{\n ""systemName"": {\n ""type"": ""string"",\n ...","{\n ""prompt_related_score"": 98.79518072289156...","backupAndRecovery, backupAndRecovery.backupSch...",99,"backupAndRecovery, backupAndRecovery.backupSch...",104
4,en_US,405,Imagine you are advising a newly established p...,"{""type"": ""object"", ""properties"": {""donorProfil...","```json\n{\n ""donorProfile"": {\n ""donorTyp...",,"[{'property_name': 'donorProfile', 'analysis':...","[{'donorProfile': {'type': 'object', 'referenc...","{\n ""type"": ""object"",\n ""properties"": [\...","{\n ""donorProfile"": {\n ""donorProfile"": {\...","{\n ""prompt_related_score"": 92.10526315789474...","budgetAndAllocation, budgetAndAllocation.alloc...",58,"budgetAndAllocation, budgetAndAllocation.alloc...",46
5,en_US,406,"Imagine you are the lead analyst at ""Innovate ...","{""type"": ""object"", ""properties"": {""productsRan...","```json\n{\n ""productsRanking"": [\n {\n ...",,"[{'property_name': 'productsRanking', 'analysi...","[{'productsRanking': {'type': 'array', 'refere...","{\n ""type"": ""object"",\n ""properties"": [\...","{\n ""productsRanking"": {\n ""type"": ""array""...","{\n ""prompt_related_score"": 100.0,\n ""total_...","productsRanking, productsRanking.attributes, p...",47,productsRanking,1
6,en_US,407,Please provide the detailed information about ...,"{""type"": ""object"", ""properties"": {""eventTitle""...","```json\n{\n ""eventTitle"": ""Global AI Summit ...",,"[{'property_name': 'eventTitle', 'analysis': '...","[{'property_a': {'type': 'string', 'referenced...","{\n ""type"": ""object"",\n ""properties"": [\...","{\n ""eventTitle"": ""Global AI Summit 2024"",\n ...","{\n ""prompt_related_score"": 0,\n ""total_refe...","accessibilityInformation, accessibilityInforma...",94,"accessibilityInformation, accessibilityInforma...",79
7,en_US,408,"As the newly appointed Head of HR at ""Synergy ...","{""type"": ""object"", ""properties"": {""plan_name"":...","```json\n{\n ""plan_name"": ""Synergy Harmony In...",,"[{'property_name': 'plan_name', 'analysis': '<...","[{'plan_name': {'type': 'string', 'referenced'...","{\n ""type"": ""object"",\n ""properties"": [\...","{\n ""plan_name"": {\n ""type"": ""string"",\n ...","{\n ""prompt_related_score"": 100.0,\n ""total_...","approval_information, approval_information.app...",56,"approval_information, approval_information.app...",61
8,en_US,409,Please provide a JSON object that represents t...,"{""type"": ""object"", ""properties"": {""document"": ...","```json\n{\n ""document"": {\n ""paperSize"": ...",,"[{'property_name': 'document', 'analysis': '<A...","[{'paperSize': {'type': 'string', 'referenced'...","{\n ""type"": ""object"",\n ""properties"": [\...","{\n ""document"": {\n ""paperSize"": {\n ...","{\n ""prompt_related_score"": 100.0,\n ""total_...","document, document.footer, document.footer.ali...",83,"document, document.footer.alignment, document....",62
9,en_US,410,Imagine you are tasked with developing a compr...,"{""type"": ""object"", ""properties"": {""strategy_na...","```json\n{\n ""strategy_name"": ""Sustainable Pa...",,"[{'property_name': 'strategy_name', 'analysis'...","[{'strategy_name': {'type': 'string', 'referen...","{\n ""type"": ""object"",\n ""properties"": [\...","{\n ""strategy_name"": {\n ""type"": ""string"",...","{\n ""prompt_related_score"": 83.33333333333334...","evaluation, evaluation.lessons_learned, evalua...",60,"evaluation, evaluation.lessons_learned, evalua...",42


In [128]:
# Show the aligned key comparison for every row of df. Each iteration uses its
# own row (the loop previously recomputed row 0 every time) and prints the
# table explicitly, because a bare expression inside a loop body produces no
# notebook output.
for row_label, row in df.iterrows():
    print(f"Row {row_label}")
    print(create_key_comparison_aligned(row).to_string(index=False))
