In [1]:
# BIG GOTCHA WARNING - Must figure out why column for year/FSM/SEN/gender
# is not being created for some schools, likely to be an issue in dataset (e.g. missing data)

from pathlib import Path

import pandas as pd


In [2]:
# Read in the RAG dataset
rag_df = pd.read_csv(
    "../data/real/standard_area_aggregate_scores_rag.csv",
)
rag_df.head()

Unnamed: 0,variable,mean,count,school_lab,year_group_lab,fsm_lab,sen_lab,gender_lab,total_pupils,group_n,group_wt_mean,group_wt_std,lower,upper,rag,variable_lab,description
0,autonomy_score,20.021661,277.0,BRAUNTON SCHOOL AND C.C.,All,All,All,All,1189.0,5.0,19.206056,0.757852,18.448203,19.963908,above,Autonomy,How 'in control' young people feel of their life
1,life_satisfaction_score,6.90681,279.0,BRAUNTON SCHOOL AND C.C.,All,All,All,All,1216.0,5.0,6.401316,0.396911,6.004405,6.798226,above,Life satisfaction,How satisfied young people feel with their life
2,optimism_score,11.409594,271.0,BRAUNTON SCHOOL AND C.C.,All,All,All,All,1179.0,5.0,11.087362,0.393545,10.693817,11.480907,average,Optimism,Young people's hopefulness and confidence for ...
3,wellbeing_score,22.876812,276.0,BRAUNTON SCHOOL AND C.C.,All,All,All,All,1197.0,5.0,21.84127,0.784775,21.056495,22.626044,above,Psychological wellbeing,How positive and generally happy young people ...
4,esteem_score,14.666667,273.0,BRAUNTON SCHOOL AND C.C.,All,All,All,All,1188.0,5.0,13.978956,0.462649,13.516307,14.441605,above,Self-esteem,How much young people value themselves


In [3]:
RESULT_NUMBER_THRESHOLD = 10

# Select relevant columns
rag_columns = [
    "variable_lab",
    "rag",
    "school_lab",
    "year_group_lab",
    "gender_lab",
    "fsm_lab",
    "sen_lab",
    "count",
]
filtered_rag_df = rag_df[rag_columns]

filtered_rag_df["rag"] = filtered_rag_df["rag"].replace(
    {
        "average": "Average",
        "above": "Above average",
        "below": "Below average",
    },
)

# **1. Convert 'count' column to numeric**
filtered_rag_df["count"] = (
    pd.to_numeric(filtered_rag_df["count"], errors="coerce").fillna(0).astype(int)
)
# Create dictionary to store dfs for each school
school_dfs: dict[str, pd.DataFrame] = {}

# Get the unique school names
schools = filtered_rag_df["school_lab"].unique()

# Iterate over each school and create a DataFrame for "All pupils"
for school in schools:
    # Filter by school
    school_df = filtered_rag_df[filtered_rag_df["school_lab"] == school]

    # Filter by "All pupils" (where all grouping variables are set to "All")
    all_pupils_df = school_df[
        (school_df["year_group_lab"] == "All")
        & (school_df["gender_lab"] == "All")
        & (school_df["fsm_lab"] == "All")
        & (school_df["sen_lab"] == "All")
    ].copy()

    # **2. Ensure 'count' is numeric in all_pupils_df as well**
    all_pupils_df["count"] = (
        pd.to_numeric(all_pupils_df["count"], errors="coerce").fillna(0).astype(int)
    )
    # Insert "n<10" into the 'rag' column where 'count' is less than 10
    all_pupils_df.loc[all_pupils_df["count"] < RESULT_NUMBER_THRESHOLD, "rag"] = f"n<{RESULT_NUMBER_THRESHOLD}"

    # Select relevant columns: 'variable_lab', 'rag', 'school_lab', and add 'All pupils' as the group
    all_pupils_df = all_pupils_df[["variable_lab", "rag", "school_lab"]]
    all_pupils_df["group"] = "All pupils"

    # Store the DataFrame in the school_dfs dictionary
    school_dfs[f"{school}_all_pupils"] = all_pupils_df

# To view the DataFrame for 'School B' for "All pupils"
school_b_all_pupils_df = school_dfs.get("School B_all_pupils")
# Display the first few rows of the DataFrame for 'School B'


# Save each DataFrame to a CSV file
for school in schools:
    # Ensure the directory exists
    output_dir = Path(f"outputs/{school}/rag_ratings")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save the DataFrame to a CSV file
    school_dfs[f"{school}_all_pupils"].to_csv(output_dir / f"rag_all_pupils_{school}.csv", index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_rag_df["rag"] = filtered_rag_df["rag"].replace(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_rag_df["count"] = (


In [4]:
# By year group
# Create a year group pivot DataFrame for each school
for school in schools:
    school_df = filtered_rag_df[filtered_rag_df["school_lab"] == school][rag_columns]

    # Filter by year groups
    year_group_rag: pd.DataFrame = school_df[
        school_df["year_group_lab"].isin(["Year 8", "Year 10"])
    ]

    # Pivot the table for Year 8 and Year 10 RAG summaries
    year_group_pivot = year_group_rag.pivot_table(
        index=["school_lab", "variable_lab"],
        columns="year_group_lab",
        values="rag",
        aggfunc="first",
    ).reset_index()

    # Store the pivoted DataFrame in the school_dfs dictionary
    school_dfs[f"{school}_year_group"] = year_group_pivot

# Save each DataFrame to a CSV file
for school in schools:
    # Ensure the directory exists
    output_dir = Path(f"outputs/{school}/rag_ratings")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save the DataFrame to a CSV file
    school_dfs[f"{school}_year_group"].to_csv(
        output_dir / f"rag_year_group_{school}.csv", index=False,
    )





In [5]:
# By FSM status
for school in schools:
    school_df = filtered_rag_df[filtered_rag_df["school_lab"] == school][rag_columns]

    # Filter by FSM
    fsm_pivot_rag: pd.DataFrame = school_df[
        school_df["fsm_lab"].isin(["FSM", "Non-FSM"])
    ]

    # Pivot the table for SEN status
    fsm_pivot = fsm_pivot_rag.pivot_table(
        index=["school_lab", "variable_lab"],
        columns="fsm_lab",
        values="rag",
        aggfunc="first",
    ).reset_index()

    # Store the pivoted DataFrame in the school_dfs dictionary
    school_dfs[f"{school}_fsm"] = fsm_pivot

# Save each DataFrame to a CSV file
for school in schools:
    # Ensure the directory exists
    output_dir = Path(f"outputs/{school}/rag_ratings")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save the DataFrame to a CSV file
    school_dfs[f"{school}_fsm"].to_csv(
        output_dir / f"rag_fsm_{school}.csv", index=False,
    )


In [6]:
# By Gender
for school in schools:
    school_df = filtered_rag_df[filtered_rag_df["school_lab"] == school][rag_columns]

    # Filter by gender
    gender_rag: pd.DataFrame = school_df[school_df["gender_lab"].isin(["Boy", "Girl"])]

    # Pivot
    gender_pivot = gender_rag.pivot_table(
        index=["school_lab", "variable_lab"],
        columns="gender_lab",
        values="rag",
        aggfunc="first",
    ).reset_index()

    # Store the pivoted DataFrame in the school_dfs dictionary
    school_dfs[f"{school}_gender"] = gender_pivot

# Save each DataFrame to a CSV file
for school in schools:
    # Ensure the directory exists
    output_dir = Path(f"outputs/{school}/rag_ratings")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save the DataFrame to a CSV file
    school_dfs[f"{school}_gender"].to_csv(
        output_dir / f"rag_gender_{school}.csv", index=False,
    )


In [7]:
# By SEN status

# Create a year group pivot DataFrame for each school
for school in schools:
    school_df = filtered_rag_df[filtered_rag_df["school_lab"] == school][rag_columns]

    # Filter by year groups
    sen_rag: pd.DataFrame = school_df[school_df["sen_lab"].isin(["SEN", "Non-SEN"])]

    # Pivot the table for SEN status
    sen_pivot = sen_rag.pivot_table(
        index=["school_lab", "variable_lab"],
        columns="sen_lab",
        values="rag",
        aggfunc="first",
    ).reset_index()

    # Store the pivoted DataFrame in the school_dfs dictionary
    school_dfs[f"{school}_sen"] = sen_pivot

# Save each DataFrame to a CSV file
for school in schools:
    # Ensure the directory exists
    output_dir = Path(f"outputs/{school}/rag_ratings")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save the DataFrame to a CSV file
    school_dfs[f"{school}_sen"].to_csv(
        output_dir / f"rag_sen_{school}.csv", index=False,
    )
