In [4]:
import json
import pandas as pd

def analyze_json_references(data):
    """
    Count and summarize reference annotations in a nested JSON-like structure.

    ``data`` is walked recursively through dicts and lists; any other value
    is ignored. Every dict that contains both a "referenced" key and a
    "text_reference" key is treated as one reference annotation:

    * ``obj["referenced"] is True`` counts as a true reference.
    * Any other value (``False``, ``None``, ``"true"``, ``1``, ...) counts as
      a false reference, and its "text_reference" value is recorded in the
      details mapping.

    False references are recorded under the key the annotated dict was stored
    at in its parent (list items inherit the key of the list). The name
    'unknown' is used when no such key exists, e.g. for the root object.
    If that name is already taken by an earlier false reference, the later
    ones are stored under "<name>#2", "<name>#3", ... so that no entry is
    overwritten and the number of detail entries always equals
    "false_references".

    Args:
        data: A dict, list, or any other value (typically parsed JSON).

    Returns:
        dict with the keys:
            "percentage_true": true references as a percentage of all
                references (0 when there are none),
            "total_references": number of annotated dicts found,
            "true_references": how many of them had ``referenced is True``,
            "false_references": how many of them did not,
            "false_references_details": mapping of property name to the
                "text_reference" value of each false reference.
    """
    true_count = 0
    false_count = 0
    false_details = {}

    def unique_detail_key(name):
        # The first occurrence keeps the plain property name; repeated names
        # get a numeric suffix so earlier details are never overwritten.
        if name not in false_details:
            return name
        suffix = 2
        candidate = "{}#{}".format(name, suffix)
        while candidate in false_details:
            suffix += 1
            candidate = "{}#{}".format(name, suffix)
        return candidate

    def process_json(obj, parent_key=None):
        # Only the counters are rebound here; false_details is mutated in
        # place and therefore needs no nonlocal declaration.
        nonlocal true_count, false_count
        if isinstance(obj, dict):
            # A dict is an annotation only if it carries both marker keys.
            if "referenced" in obj and "text_reference" in obj:
                # Strict identity check: only the boolean True counts.
                if obj["referenced"] is True:
                    true_count += 1
                else:
                    false_count += 1
                    # Use parent key if available; otherwise, 'unknown'
                    prop_name = parent_key if parent_key is not None else 'unknown'
                    false_details[unique_detail_key(prop_name)] = obj["text_reference"]
            # Annotated dicts are descended into as well, so nested
            # annotations below them are still counted.
            for key, value in obj.items():
                process_json(value, parent_key=key)
        elif isinstance(obj, list):
            # List items inherit the key under which the list was stored.
            for item in obj:
                process_json(item, parent_key=parent_key)

    process_json(data)
    total_references = true_count + false_count
    # Guard against division by zero when no annotations were found.
    percentage_true = (true_count / total_references * 100) if total_references else 0

    return {
        "percentage_true": percentage_true,
        "total_references": total_references,
        "true_references": true_count,
        "false_references": false_count,
        "false_references_details": false_details
    }

def analyze_json_column(json_value):
    """
    Summarize the references held in a single DataFrame cell.

    A string is decoded with ``json.loads`` first; any other value is handed
    to ``analyze_json_references`` unchanged. Returns that function's
    summary dictionary.
    """
    # Decode only when the cell holds raw JSON text.
    parsed = json.loads(json_value) if isinstance(json_value, str) else json_value
    return analyze_json_references(parsed)


In [5]:
# Read the batch CSV and attach a per-row reference summary computed from
# the 'REFERENCED_JSON_FORMATED' column.
# NOTE(review): the path is an absolute, machine-specific location; consider
# moving it to a configurable data directory.
df = pd.read_csv(
    '/Users/dan.rambado/Documents/reviewing_json_schema_dataset/output/batch_004_L-1_400-500/test.csv'
).assign(
    analysis=lambda frame: frame['REFERENCED_JSON_FORMATED'].apply(analyze_json_column)
)

In [6]:
# Show the DataFrame, including the newly added 'analysis' column, as the cell output.
df

Unnamed: 0,languageCode,internal_id,prompt,schema,response,SUMMARY,MR_EVAL_SUB_SCHEMA,REFERENCED_JSON,SCHEMA_SIMPLIFIED,REFERENCED_JSON_FORMATED,analysis
0,en_US,401,Imagine you are the lead cybersecurity analyst...,"{""type"": ""object"", ""properties"": {""system_info...","```json\n{\n ""system_info"": {\n ""system_na...",,"[{'property_name': 'system_info', 'analysis': ...","[{'system_name': {'type': 'string', 'reference...","{\n ""type"": ""object"",\n ""properties"": [\...","{\n ""system_info"": {\n ""system_name"": {\n ...","{'percentage_true': 97.10144927536231, 'total_..."
1,en_US,402,Imagine you are a seasoned screenwriter tasked...,"{""type"": ""object"", ""properties"": {""story_title...","```json\n{\n ""story_title"": ""Neon City Blues""...",,"[{'property_name': 'story_title', 'analysis': ...","[{'story_title': {'type': 'string', 'reference...","{\n ""type"": ""object"",\n ""properties"": [\...","{\n ""story_title"": {\n ""type"": ""string"",\n...","{'percentage_true': 70.0, 'total_references': ..."
2,en_US,403,"Imagine you are the creative director at ""Tren...","{""type"": ""object"", ""properties"": {""forecastPer...","```json\n{\n ""forecastPeriod"": {\n ""season...",,"[{'property_name': 'forecastPeriod', 'analysis...",[{'forecastPeriod': {'season': {'type': 'strin...,"{\n ""type"": ""object"",\n ""properties"": [\...","{\n ""forecastPeriod"": {\n ""season"": {\n ...","{'percentage_true': 89.65517241379311, 'total_..."
3,en_US,404,Imagine you are a tech journalist writing a de...,"{""type"": ""object"", ""properties"": {""systemName""...","```json\n{\n ""systemName"": ""GeminiDB"",\n ""ve...",,"[{'property_name': 'systemName', 'analysis': '...","[{'systemName': {'type': 'string', 'referenced...","{\n ""type"": ""object"",\n ""properties"": [\...","{\n ""systemName"": {\n ""type"": ""string"",\n ...","{'percentage_true': 98.79518072289156, 'total_..."
4,en_US,405,Imagine you are advising a newly established p...,"{""type"": ""object"", ""properties"": {""donorProfil...","```json\n{\n ""donorProfile"": {\n ""donorTyp...",,"[{'property_name': 'donorProfile', 'analysis':...","[{'donorProfile': {'type': 'object', 'referenc...","{\n ""type"": ""object"",\n ""properties"": [\...","{\n ""donorProfile"": {\n ""donorProfile"": {\...","{'percentage_true': 92.10526315789474, 'total_..."
5,en_US,406,"Imagine you are the lead analyst at ""Innovate ...","{""type"": ""object"", ""properties"": {""productsRan...","```json\n{\n ""productsRanking"": [\n {\n ...",,"[{'property_name': 'productsRanking', 'analysi...","[{'productsRanking': {'type': 'array', 'refere...","{\n ""type"": ""object"",\n ""properties"": [\...","{\n ""productsRanking"": {\n ""type"": ""array""...","{'percentage_true': 100.0, 'total_references':..."
6,en_US,407,Please provide the detailed information about ...,"{""type"": ""object"", ""properties"": {""eventTitle""...","```json\n{\n ""eventTitle"": ""Global AI Summit ...",,"[{'property_name': 'eventTitle', 'analysis': '...","[{'property_a': {'type': 'string', 'referenced...","{\n ""type"": ""object"",\n ""properties"": [\...","{\n ""eventTitle"": ""Global AI Summit 2024"",\n ...","{'percentage_true': 0, 'total_references': 0, ..."
7,en_US,408,"As the newly appointed Head of HR at ""Synergy ...","{""type"": ""object"", ""properties"": {""plan_name"":...","```json\n{\n ""plan_name"": ""Synergy Harmony In...",,"[{'property_name': 'plan_name', 'analysis': '<...","[{'plan_name': {'type': 'string', 'referenced'...","{\n ""type"": ""object"",\n ""properties"": [\...","{\n ""plan_name"": {\n ""type"": ""string"",\n ...","{'percentage_true': 100.0, 'total_references':..."
8,en_US,409,Please provide a JSON object that represents t...,"{""type"": ""object"", ""properties"": {""document"": ...","```json\n{\n ""document"": {\n ""paperSize"": ...",,"[{'property_name': 'document', 'analysis': '<A...","[{'paperSize': {'type': 'string', 'referenced'...","{\n ""type"": ""object"",\n ""properties"": [\...","{\n ""document"": {\n ""paperSize"": {\n ...","{'percentage_true': 100.0, 'total_references':..."
9,en_US,410,Imagine you are tasked with developing a compr...,"{""type"": ""object"", ""properties"": {""strategy_na...","```json\n{\n ""strategy_name"": ""Sustainable Pa...",,"[{'property_name': 'strategy_name', 'analysis'...","[{'strategy_name': {'type': 'string', 'referen...","{\n ""type"": ""object"",\n ""properties"": [\...","{\n ""strategy_name"": {\n ""type"": ""string"",...","{'percentage_true': 83.33333333333334, 'total_..."
