# QA on EES API data

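This notebook runs quality assurance (QA) checks on data published through the Explore Education Statistics (EES) API. It asks an LLM to generate the QA code and then executes that code.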


In [0]:
%pip install openai

In [0]:
# CSV download URL of the EES API data set (and version) to be quality assured.
API_URL = (
    "https://api.education.gov.uk/statistics/v1/data-sets/"
    "1d419801-435d-c676-b428-1217e08290c3/csv?dataSetVersion=1.0"
)

# Columns the prompt asks the generated code to convert to a numeric type.
numeric_cols = [
    "achievements_percent",
    "starts_percent",
    "participation_count",
    "achievement_count",
    "start_count",
]

# Label the prompt asks the generated code to include in the output filename.
output_name = "app headlines"

In [0]:
from openai import OpenAI
import os


def _strip_code_fences(text):
    """Return *text* without a surrounding Markdown code fence.

    Models often wrap code in a fence (three backticks, optionally followed
    by a language tag) even when told not to. Only a fence on the first
    and/or last line is removed; backticks and the word "python" inside the
    code itself are left untouched.
    """
    lines = text.strip().splitlines()
    if lines and lines[0].lstrip().startswith("```"):
        lines = lines[1:]
    if lines and lines[-1].strip().startswith("```"):
        lines = lines[:-1]
    return "\n".join(lines)


# Authenticate against the serving endpoints with the notebook's own API token.
DATABRICKS_TOKEN = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()

client = OpenAI(
    api_key=DATABRICKS_TOKEN,
    base_url="https://adb-2220072380334347.7.azuredatabricks.net/serving-endpoints"
)

# Ask the API to write code which will QA the data on the API in the input
response = client.chat.completions.create(
    model="databricks-llama-4-maverick",
    messages=[
        {
            "role": "system",
            "content": "you are an analyst"
        },

        {
            "role": "user",
            "content": f"""write Python code to read in and quality assure the data on this API: {API_URL}. Please check for outliers,  missing values, and data type. Please also present summary statistics on the data. Do not include any backticks.
            
            Please convert any columns mentioned in {numeric_cols} to numeric type using pandas. This is very important.

            Please output **only** valid, bug-free Python code. Ensure that:
            1. The code is properly indented.
            2. Do not include any markdown or formatting characters like backticks (`), triple quotes (```), or explanations.
            3. The code should be plain Python without any comments, explanations, or descriptions.
            4. Ensure the code is formatted to be executable without further modifications, with the correct indentation maintained throughout.
            5. Print all the outputs at the end, along with an explanation of what they are.
            6. Save the outputs in a text file in the Outputs folder which is stored a level above this script, and include {output_name} in the filename."""
        }
    ],
    max_tokens=8000
)

raw_response = response.choices[0].message.content
if not raw_response:
    # Fail loudly rather than silently executing nothing.
    raise ValueError("The model returned an empty response; there is no code to run.")

# Drop a wrapping Markdown fence so the reply is executable as plain Python.
generated_code = _strip_code_fences(raw_response)

# SECURITY: this executes unreviewed, LLM-generated code with the notebook's
# full privileges and in the notebook's global namespace, so it can overwrite
# any variable defined here. Only run it in an environment where that is
# acceptable, and review `generated_code` when the results matter.
exec(generated_code, globals())

In [0]:
# Display the LLM-generated code currently held in `generated_code`.
print(generated_code)

In [0]:

# Serving endpoints to compare; each is asked to generate the same QA script.
model_opts = [
    "databricks-gpt-oss-120b",
    "databricks-gpt-oss-20b",
    "databricks-claude-3-7-sonnet",
    "databricks-claude-sonnet-4",
    "databricks-llama-4-maverick",
    "databricks-gemma-3-12b",
    "databricks-meta-llama-3-3-70b-instruct",
    "databricks-meta-llama-3-1-8b-instruct",
]

from openai import OpenAI
import os


def _strip_code_fences(text):
    """Return *text* without a surrounding Markdown code fence.

    Only a fence (three backticks, optionally followed by a language tag) on
    the first and/or last line is removed. Unlike a blanket
    ``replace("python", "")``, this leaves backticks and the word "python"
    inside the code itself untouched.
    """
    lines = text.strip().splitlines()
    if lines and lines[0].lstrip().startswith("```"):
        lines = lines[1:]
    if lines and lines[-1].strip().startswith("```"):
        lines = lines[:-1]
    return "\n".join(lines)


# Authenticate against the serving endpoints with the notebook's own API token.
DATABRICKS_TOKEN = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()

client = OpenAI(
    api_key=DATABRICKS_TOKEN,
    base_url="https://adb-2220072380334347.7.azuredatabricks.net/serving-endpoints"
)

for model in model_opts:
    print(f"Testing model {model}")
    # Ask the API to write code which will QA the data on the API in the input
    response = client.chat.completions.create(
        # Query the endpoint under test, not one fixed model.
        model=model,
        messages=[
            {
                "role": "system",
                "content": "you are an analyst"
            },

            {
                "role": "user",
                "content": f"""write Python code to read in and quality assure the data on this API: {API_URL}. Please check for outliers,  missing values, and data type. Please also present summary statistics on the data. Do not include any backticks.
            
                Please convert any columns mentioned in {numeric_cols} to numeric type using pandas. This is very important.

                Please output **only** valid, bug-free Python code. Ensure that:
                1. The code is properly indented.
                2. Do not include any markdown or formatting characters like backticks (`), triple quotes (```), or explanations.
                3. The code should be plain Python without any comments, explanations, or descriptions.
                4. Ensure the code is formatted to be executable without further modifications, with the correct indentation maintained throughout.
                5. Print all the outputs at the end, along with an explanation of what they are."""
            }
        ],
        max_tokens=8000
    )

    raw_response = response.choices[0].message.content
    if not raw_response:
        # An empty reply would "pass" trivially if executed; report it instead.
        print(f"Model {model} failed due to an empty response.")
        continue

    generated_code = _strip_code_fences(raw_response)

    try:
        # Run the code the LLM generated
        # SECURITY: this executes unreviewed, LLM-generated code with the
        # notebook's full privileges and in its global namespace, so it can
        # overwrite variables defined here (including `model` and `client`).
        exec(generated_code, globals())
    except Exception as e:
        # Generated code can fail in many ways (NameError, KeyError, ...);
        # catch them all so one bad model does not abort the comparison.
        print(f"Model {model} failed due to {e}.")
    else:
        print(f"Model {model} passed!")

In [0]:
import pandas as pd
import numpy as np
from urllib.request import urlretrieve

# Download the data set as CSV to a local file, then load it into a DataFrame.
url = 'https://api.education.gov.uk/statistics/v1/data-sets/1d419801-435d-c676-b428-1217e08290c3/csv?dataSetVersion=1.0'
urlretrieve(url, 'data.csv')

df = pd.read_csv('data.csv')

# Force the measure columns to a numeric dtype. errors='coerce' turns any
# value that cannot be parsed into NaN, so such values are counted in the
# missing-values report below.
cols_to_convert = ['achievements_percent', 'starts_percent', 'participation_count', 'achievement_count', 'start_count']
for col in cols_to_convert:
    df[col] = pd.to_numeric(df[col], errors='coerce')

print('Data Types:')
print(df.dtypes)
print()

print('Summary Statistics:')
print(df.describe())
print()

print('Missing Values Count:')
print(df.isnull().sum())
print()

# Quantiles and the </> comparisons are only defined for numeric data; on
# recent pandas versions running them on the full frame raises TypeError as
# soon as a text column is present, so restrict the check to numeric columns.
numeric_df = df.select_dtypes(include='number')

# Flag values more than 1.5 * IQR below the first or above the third quartile.
Q1 = numeric_df.quantile(0.25)
Q3 = numeric_df.quantile(0.75)
IQR = Q3 - Q1
is_outlier = (numeric_df < (Q1 - 1.5 * IQR)) | (numeric_df > (Q3 + 1.5 * IQR))
print('Outliers Count:')
print(is_outlier.sum())