
## Create policy-focused narrative sections

In [0]:
%pip install pandas
%pip install openai
%restart_python

In [0]:
import pandas as pd
import openai
import os
import json

In [0]:
# Load the 2023/24 revised national characteristics data file and its metadata file.
national_char_data = pd.read_csv("/Workspace/Users/daniel.dodgson@education.gov.uk/hackathon-2025-llm-stats-qa/data/202324_national_char_data_revised.csv")
national_char_metadata = pd.read_csv("/Workspace/Users/daniel.dodgson@education.gov.uk/hackathon-2025-llm-stats-qa/data/202324_national_char_data_revised.meta.csv")

# Keep only rows for 'All state-funded' establishments where every breakdown
# column that is not passed on to the prompt sits at its 'Total' level.
# A single combined mask replaces seven successive re-filterings of the frame.
totals_mask = (
    (national_char_data['establishment_type_group'] == 'All state-funded')
    & (national_char_data['religious_denomination'] == 'Total')
    & (national_char_data['admission_type'] == 'Total')
    & (national_char_data['sen_primary_need'] == 'Total')
    & (national_char_data['sen_status'] == 'Total')
    & (national_char_data['free_school_meals'] == 'Total')
    & (national_char_data['ethnicity_minor'] == 'Total')
)

# Columns handed to the model: the time period, the characteristic columns
# that are kept, and the measure columns.
# 'pt_ebacc_94' and 'pt_ebacc_95' are included because the EBacc narrative
# prompt names them as focus indicators; without them the model is asked to
# summarise figures that are absent from the data it receives.
# NOTE(review): assumes both columns exist in the CSV under these names (as
# the indicator list implies) - a KeyError here means the data file and the
# indicator list disagree and should be reconciled.
prompt_columns = [
    'time_period', 'sex', 'ethnicity_major', 'sen_provision', 'disadvantage', 'first_language',
    't_pupils', 'avg_att8', 'pt_l2basics_95', 'avg_p8score',
    'pt_ebacc_e_ptq_ee', 'pt_ebacc_94', 'pt_ebacc_95', 'avg_ebaccaps',
]

national_char_filtered_data = national_char_data.loc[totals_mask, prompt_columns]



In [0]:
# Serialise both frames as JSON arrays of row records (one object per row)
# so they can be embedded directly in the prompt text.
json_data_for_prompt, json_metadata_for_prompt = (
    frame.to_json(orient="records")
    for frame in (national_char_filtered_data, national_char_metadata)
)

In [0]:
# Use the API token of the current notebook context as the credential.
# `dbutils` is not imported in this notebook; it is expected to be supplied by
# the Databricks notebook environment.
_notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
DATABRICKS_TOKEN = _notebook_context.apiToken().get()

# OpenAI client pointed at the workspace's serving-endpoints URL instead of
# the default OpenAI API.
client = openai.OpenAI(
    base_url="https://adb-2220072380334347.7.azuredatabricks.net/serving-endpoints",
    api_key=DATABRICKS_TOKEN,
)

# Heading of the accordion section that the generated narrative must fit under.
accordion_title = "EBacc entry and achievement"

# Indicator names the narrative should concentrate on; this list is
# interpolated into the prompt as-is.
performance_measures = ["pt_ebacc_e_ptq_ee", "pt_ebacc_94", "pt_ebacc_95", "avg_ebaccaps"]

# Build the user prompt: the filtered data and the metadata (both already
# serialised to JSON strings) followed by the summarisation instructions.
# The instructions interpolate the indicator list (rendered as a Python list
# literal) and the accordion heading. The text below is sent to the model
# verbatim, so whitespace and wording inside the f-string are significant.
prompt = f"""
Here is the data:
{json_data_for_prompt}
Here is the metadata: 
{json_metadata_for_prompt}
Please summarize the data within the data file with a focus on the following indicators: {performance_measures}. Use the metadata to understand what those indicators are; don't guess what the indicators are the metadata will tell you. The summary needs to fit under the heading {accordion_title}. Focus on the overall trends over time (the time_period column specifies the academic year in the format 202324 is 2023/24). but do pick out any very (emphasis on very!) interesting characteristic effects. Please also provide a table summarising the most interesting story. Please note the structure of the data, there are characteristic columns, i.e., denoted as filter columns in the metadata. When all these filter columns equal Total then you are looking at the overall data.
"""
# System message: fixes the model's role and the matter-of-fact tone required
# for the release text.
system_instruction = "You are creating narrative sections around specific policy areas based on the data provided. The narrative is a national statistics, statistical release so it has to be very matter-of-fact. "

# Conversation sent to the model: the system instruction, then the data prompt.
chat_messages = [
    {"role": "system", "content": system_instruction},
    {"role": "user", "content": prompt},
]

# Request a single chat completion; the temperature is set close to zero.
response = client.chat.completions.create(
    model="databricks-meta-llama-3-3-70b-instruct",
    messages=chat_messages,
    temperature=0.01,
    max_tokens=5000,
)

# Show the text of the first returned choice.
narrative_text = response.choices[0].message.content
print(narrative_text)