In [1]:
import pandas as pd
# Load the generated training set, then print its column / dtype summary.
df = pd.read_csv(filepath_or_buffer='../data/generated_training.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1300 entries, 0 to 1299
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   languageCode  1300 non-null   object
 1   prompt        1300 non-null   object
 2   schema        1300 non-null   object
 3   response      1300 non-null   object
dtypes: object(4)
memory usage: 40.8+ KB


In [None]:
# Derive json_response from response by stripping the Markdown code fence:
# first the opening "```json" line, then every remaining "```" marker, so
# only the JSON text is left.
# regex=False is passed explicitly so both patterns are matched literally
# regardless of the pandas version's default for `regex`.
df['json_response'] = (
    df['response']
    .str.replace('```json\n', '', regex=False)
    .str.replace('```', '', regex=False)
)

In [5]:
import jsonschema
import json

def validate_response(row):
    """Validate one row's ``json_response`` against the row's ``schema``.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide ``row['json_response']`` and ``row['schema']``; both are
        parsed with ``json.loads`` before validation.

    Returns
    -------
    dict
        ``{"validate": bool, "error_code": int, "error_message": ...}`` where
        ``error_code`` is:

        * 0 - the response satisfies the schema,
        * 1 - the schema itself is invalid (``SchemaError``),
        * 2 - the response violates the schema; ``error_message`` is then the
          list produced by ``validate_response_all_errors(row)``,
        * 3 - the response is missing, or any other exception was raised
          (for example the JSON could not be parsed).
    """
    raw_response = row['json_response']
    # A missing cell usually reaches us as float NaN rather than None when the
    # frame comes from pandas, so check both. NaN is the only float that is
    # not equal to itself.
    is_missing = raw_response is None or (
        isinstance(raw_response, float) and raw_response != raw_response
    )
    if is_missing:
        return {"validate": False,
                "error_code": 3,
                "error_message": "Empty JSON"}

    try:
        schema = json.loads(row['schema'])
        response = json.loads(raw_response)
        # validate() returns silently when the instance is valid and raises
        # otherwise.
        jsonschema.validate(response, schema=schema)

        return {"validate": True,
                "error_code": 0,
                "error_message": "Valid JSON"}
    except jsonschema.exceptions.SchemaError as e:
        return {"validate": False,
                "error_code": 1,  # schema
                "error_message": f"{e.message}"}
    except jsonschema.exceptions.ValidationError:
        # validate() surfaces a single error; re-run the validation to list
        # every violation for this row.
        return {"validate": False,
                "error_code": 2,
                "error_message": validate_response_all_errors(row)}
    except Exception as e:
        # Deliberate catch-all so one bad row does not abort a DataFrame.apply.
        return {"validate": False,
                "error_code": 3,
                "error_message": f"Exception {e}"}

def validate_response_all_errors(row):
    """Return every schema violation found in one row's ``json_response``.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide ``row['schema']`` and ``row['json_response']`` as JSON
        text; both are parsed with ``json.loads``.

    Returns
    -------
    list of dict
        One entry per validation error with the keys ``message``,
        ``validator``, ``validator_type`` (the error's ``validator_value``
        rendered as text) and ``json_path``. Empty when nothing is violated.
    """
    schema = json.loads(row['schema'])
    response = json.loads(row['json_response'])
    # Pick the validator class from the schema's "$schema" declaration, the
    # same way jsonschema.validate() does, so this error list agrees with the
    # validation that triggered it. Schemas that declare no draft keep using
    # Draft 2020-12.
    validator_cls = jsonschema.validators.validator_for(
        schema, default=jsonschema.Draft202012Validator
    )
    return [
        {
            "message": f'{error.message}',
            "validator": f'{error.validator}',
            "validator_type": f'{error.validator_value}',
            "json_path": error.json_path,
        }
        for error in validator_cls(schema).iter_errors(response)
    ]


In [None]:
# Run validate_response once per row, then unpack the returned dict into
# flat columns so the results can be filtered and counted.
df['validation_result'] = df.apply(validate_response, axis=1)
df['error_message'] = df['validation_result'].map(lambda result: result['error_message'])
df['valid'] = df['validation_result'].map(lambda result: result['validate'])
df['error_code'] = df['validation_result'].map(lambda result: result['error_code'])
# Number of entries when error_message is a list; anything else counts as 0.
df['error_message_len'] = df['error_message'].map(
    lambda message: len(message) if isinstance(message, list) else 0
)

In [11]:
# Count how many rows passed (True) and failed (False) schema validation.
df['valid'].value_counts()

valid
False    928
True     372
Name: count, dtype: int64

In [None]:
# The error_message_len column holds the length of error_message when it is a list of errors, and 0 otherwise.


In [16]:
# Distribution of error_message_len: how many rows have 0, 1, 2, ... entries
# in their error_message list (0 also covers rows whose message is not a list).
df['error_message_len'].value_counts()

error_message_len
0     428
1     359
2     173
3      91
4      44
5      43
6      33
8      19
10     17
7      15
9      14
12     11
13      5
11      5
16      4
18      4
33      3
20      3
17      3
15      3
14      3
30      2
37      2
23      1
27      1
25      1
57      1
31      1
24      1
42      1
21      1
46      1
68      1
43      1
28      1
32      1
71      1
36      1
26      1
Name: count, dtype: int64

In [22]:
# Preview the first 20 validation results among rows with exactly two errors.
# NOTE(review): the variable is named "one_error" but the filter keeps rows
# where error_message_len == 2 - confirm which of the two is intended.
# Swap 'validation_result' for 'error_message' to print only the messages.
filter_one_error = df.loc[df['error_message_len'].eq(2)]
print(filter_one_error['validation_result'].iloc[:20].to_markdown())

|     | validation_result                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               