Install libraries

In [0]:
%pip install -qU openai databricks_langchain transformers
dbutils.library.restartPython()

Load libraries

In [0]:
# LLM
from openai import OpenAI  # client used for every chat-completion call in this notebook
import mlflow
# Switch off the MLflow trace display in notebook cell outputs
mlflow.tracing.disable_notebook_display()
import tiktoken  # used further down to count the tokens in the CSV sent to the model

# Analysis
import pandas as pd

# General
import os

Getting the LLM to do the work for me

In [0]:
# Authenticate with the API token of the current notebook session, so that no
# credential has to be written into the notebook.
DATABRICKS_TOKEN = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()

# Workspace URL. It can be overridden through the DATABRICKS_HOST environment
# variable so the notebook is not tied to one workspace; when the variable is
# unset or empty, the original workspace URL is used.
DATABRICKS_HOST = (
    os.environ.get("DATABRICKS_HOST")
    or "https://adb-2220072380334347.7.azuredatabricks.net"
).rstrip("/")

# OpenAI-compatible client pointed at the workspace's serving endpoints.
client = OpenAI(
    api_key=DATABRICKS_TOKEN,
    base_url=f"{DATABRICKS_HOST}/serving-endpoints"
)

# Ask the model how it would approach the task itself.
response = client.chat.completions.create(
    model="databricks-meta-llama-3-3-70b-instruct",
    messages=[
        {
            "role": "user",
            "content": "My task is to build a LLM process so that it can take a table of data and its metadata and provide an accurate and insightful overview of the data. How do you recommend doing this?"
        }
    ],
    max_tokens=5000
)

print(response.choices[0].message.content)

### Step 1: Data Preparation

In [0]:
# Locations of the EES data file and its accompanying metadata file
EES_DATA_PATH = "../data/202324_national_char_data_revised.csv"
EES_META_PATH = "../data/202324_national_char_data_revised.meta.csv"

# Read in data
ees_data = pd.read_csv(EES_DATA_PATH)
ees_meta = pd.read_csv(EES_META_PATH)

# `display()` is the notebook built-in for rich rendering; `.display()` is not
# part of the pandas DataFrame API, so the built-in function is used instead.
print("EES Data:\n")
display(ees_data)
print("\n---\nEES Metadata:\n")
display(ees_meta)

Give columns their full names

In [0]:
# Build a lookup from each raw column name listed in the metadata to its
# human-readable label, then relabel the data with it. Columns that are not
# listed in the metadata keep their original names.
col_map = {
    raw_name: full_label
    for raw_name, full_label in zip(ees_meta["col_name"], ees_meta["label"])
}
ees_data_labelled = ees_data.rename(columns=col_map)

Chunking the data

In [0]:
# Keep only the rows whose "breakdown" value is "Total".
# No time-period filter is applied, so every time period stays in the extract
# (the summary prompt further down asks for time series comparisons). To keep
# just the latest period, also filter on `time_period == time_period.max()`.
# NOTE(review): this assumes "breakdown" keeps its raw name after the
# relabelling step above - confirm against the metadata labels.
is_total_breakdown = ees_data_labelled["breakdown"] == "Total"
ees_data_max_date_total = ees_data_labelled[is_total_breakdown]

# `display()` is the notebook built-in; `.display()` is not part of the pandas API.
display(ees_data_max_date_total)

In [0]:
# Serialise the filtered table to CSV text for inclusion in the prompts.
# Note: no row limit is applied here - the whole filtered table is serialised.
ees_data_snippet = ees_data_max_date_total.to_csv(index=False)

# Estimate the size of the CSV payload in tokens. "cl100k_base" is an OpenAI
# encoding rather than the tokenizer of 'databricks-meta-llama-3-3-70b-instruct',
# so treat the figure as an approximation for that model.
enc = tiktoken.get_encoding("cl100k_base")
encoded_snippet = enc.encode(ees_data_snippet)
token_count = len(encoded_snippet)
print(f"Token count: {token_count}")

### Step 2: LLM

Simple prompt

In [0]:

# Persona given to the model for the summary request.
simple_system_message = {
    "role": "system",
    "content": "You are an expert data analyst at the Department for Education."
}

# User prompt: the CSV extract followed by the summary instructions.
simple_user_prompt = f"""
            Below is the Data set from Key stage 4 performance National characteristics data in CSV format:
            {ees_data_snippet}
            
            Please provide a short, accurate and insightful summary of the Latest headline data for pupils at the end of Key Stage 4 in 2023/24. Including:
            1. An overview of what the story the data is telling.
            2. Key statistics, including time series comparisons.
            3. Notable patterns or trends.
            """

# Request the summary from the model and show the generated text.
simple_response = client.chat.completions.create(
    model="databricks-meta-llama-3-3-70b-instruct",
    messages=[
        simple_system_message,
        {"role": "user", "content": simple_user_prompt},
    ],
    max_tokens=5000
)

print(simple_response.choices[0].message.content)

Asking the LLM to QA the headline summary

In [0]:
# Headline summary text that the LLM is asked to quality-assure.
latest_headline_data_summary = """
Average Attainment 8 has decreased compared with last year and 2018/19. The average Attainment 8 score is 45.9 in 2024, which has decreased by 0.4 points since 2022/23 from 46.3, and decreased by 0.8 points from 46.7 in 2018/19. The decrease in Attainment 8 in 2023/24 has been driven by a decrease in the open bucket, and in particular non-GCSEs i.e. Vocational Technical Qualifications (VTQs). This follows reforms designed to strengthen KS4 Technical Awards awarded for the first time in 2024. See the ‘Comparing KS4 measures over time’ section for more information.
45.9% of pupils achieved a grade 5 or higher in both English and maths. This is an increase of 0.6 percentage points (from 45.3%) compared to 2022/23, and an increase of 2.7 percentage points (from 43.2%) in comparison with 2018/19.
40.4% of pupils were entered into the full EBacc. This is the highest EBacc entry rate since the measure was introduced. This is also an increase of 1.1 percentage points in comparison with 2022/23. In 2018/19 40.0% of pupils were entered into the full EBacc.
Average EBacc APS in 2024 has increased by 0.02 to 4.07 compared with last year, and is now the same score as in 2019.
The KS4 disadvantage gap index has narrowed slightly compared to 2022/23, from 3.94 to 3.92. Before the pandemic, the gap index had widened going from 3.66 to 3.70 between 2017 and 2019, before narrowing slightly in 2020 to 3.66 when centre assessed grades were used.
"""

# Deliberately wrong copy used to check whether the LLM spots an error: the
# same text with one figure changed - the Attainment 8 score reads 50.1
# instead of 45.9. The full phrase is matched (and only once) so that the
# unrelated "45.9% of pupils" sentence is left untouched.
latest_headline_data_summary_incorrect = latest_headline_data_summary.replace(
    "The average Attainment 8 score is 45.9 in 2024",
    "The average Attainment 8 score is 50.1 in 2024",
    1,
)

QA check on the true headline summary

In [0]:
# QA run on the correct headline summary.
# The QA instructions below are worded exactly like those of the QA run on the
# incorrect summary, so that the two runs differ only in the text being
# checked and their outputs can be compared fairly.
simple_response = client.chat.completions.create(
    model="databricks-meta-llama-3-3-70b-instruct",
    messages=[
        {
            "role": "system",
            "content": "You are an expert data analyst at the Department for Education."
        },
        {
            "role": "user",
            "content": f"""
            Below is the Data set from Key stage 4 performance National characteristics data in CSV format:
            {ees_data_snippet}
            
            Please quality assure this piece of text which summarises the latest headline data:
            {latest_headline_data_summary}

            QA Instructions:
            - Using the data provided report any errors you find in the text. If there are no discrepencies return "Information accurate".
            - Also suggest specific missing statsitic insights at a national level. Provide the statistic too.
            """
        }
    ],
    max_tokens=5000
)

print(simple_response.choices[0].message.content)


An incorrect version of the headline summary

In [0]:
# Persona given to the model for the QA request.
qa_incorrect_system_message = {
    "role": "system",
    "content": "You are an expert data analyst at the Department for Education."
}

# User prompt: the CSV extract, the deliberately incorrect summary, and the QA
# instructions.
qa_incorrect_user_prompt = f"""
            Below is the Data set from Key stage 4 performance National characteristics data in CSV format:
            {ees_data_snippet}
            
            Please quality assure this piece of text which summarises the latest headline data:
            {latest_headline_data_summary_incorrect}

            QA Instructions:
            - Using the data provided report any errors you find in the text. If there are no discrepencies return "Information accurate".
            - Also suggest specific missing statsitic insights at a national level. Provide the statistic too.
            """

# Run the QA check and show the model's findings.
simple_response = client.chat.completions.create(
    model="databricks-meta-llama-3-3-70b-instruct",
    messages=[
        qa_incorrect_system_message,
        {"role": "user", "content": qa_incorrect_user_prompt},
    ],
    max_tokens=5000
)

print(simple_response.choices[0].message.content)


### Accessible tables

In [0]:
# Accessible "Table 1" CSV from the release, loaded into a DataFrame.
accessible_table_path = "../data/Table 1 Key Stage 4 headline measures for all pupils in state-funded schools in England between 201415 and 202324.csv"
ees_table = pd.read_csv(accessible_table_path)

# CSV text of the table (index column omitted) for embedding in the prompt.
ees_tabl_str = ees_table.to_csv(index=False)


In [0]:
# Print the CSV text rather than echoing the bare string: the echoed repr is a
# single line with escaped "\n" characters, whereas print shows the table
# row by row exactly as it is sent to the model.
print(ees_tabl_str)

In [0]:
# Persona given to the model for the alt-text request.
access_table_system_message = {
    "role": "system",
    "content": "You are an expert data analyst at the Department for Education."
}

# User prompt: the table as CSV followed by the request for a per-measure
# summary to be used as alternative text.
access_table_user_prompt = f"""
            Below is the a Table set from the Academic year 2023/24 Key stage 4 performance statistical release in CSV format:
            {ees_tabl_str}
            
            Please provide an accurate and insightful summary of the table.
            Do this for each measure.
            This will be provided as alternative text for visually impaired users.
            """

# Generate the alternative text and show it.
access_table_reponse = client.chat.completions.create(
    model="databricks-meta-llama-3-3-70b-instruct",
    messages=[
        access_table_system_message,
        {"role": "user", "content": access_table_user_prompt},
    ],
    max_tokens=5000
)

print(access_table_reponse.choices[0].message.content)

### ChatBot

In [0]:
# Load the scraped release text in full; the chatbot prompt below embeds it
# as the only source the model may answer from.
with open("../Scraped_data/KS4_performance.txt", "r") as release_file:
    ks4_performance_txt = release_file.read()

In [0]:
# Question to put to the chatbot; edit this line to ask something else.
user_question = "What is the most worrying trend from this release"

# Persona and task description given to the model.
chatbot_system_message = {
    "role": "system",
    "content": "You are an expert data analyst at the Department for Education. Your task is to answer people's questions regarding a DfE statistical release."
}

# User prompt: the release text, the answering rules, then the question.
chatbot_user_prompt = f"""
            Below is the text from the DfE performance statistical release:
            {ks4_performance_txt}
            
            Instructions:
            - Answer the below user question.
            - Use only the text provided and do not make anything up.
            - Provide accurate statistics and information.

            User question:
            {user_question}
            """

# Ask the model and show its answer.
chatbot_response = client.chat.completions.create(
    model="databricks-meta-llama-3-3-70b-instruct",
    messages=[
        chatbot_system_message,
        {"role": "user", "content": chatbot_user_prompt},
    ],
    max_tokens=5000
)

print(chatbot_response.choices[0].message.content)