In [6]:
# BIG GOTCHA WARNING - Must figure out why column for year/FSM/SEN/gender is not being created for some schools, likely to be an issue in dataset (e.g. missing data)

import pandas as pd
from pathlib import Path


In [7]:
# Read in the RAG dataset
rag_df = pd.read_csv(
    "../data/synthetic/standard/synthetic_standard_survey.standard_school_aggregate_scores_rag.0000000010000.csv"
)
rag_df.head()

Unnamed: 0,variable,mean,count,school_lab,year_group_lab,gender_lab,fsm_lab,sen_lab,total_pupils,group_n,group_wt_mean,group_wt_std,lower,upper,rag,variable_lab,description
0,autonomy_score,17.18421052631579,76.0,School A,All,All,All,All,490,7,17.808163,0.439816,17.368347,18.247979,below,Autonomy,\nHow 'in control' young people feel of their ...
1,life_satisfaction_score,5.050420168067227,119.0,School A,All,All,All,All,725,7,5.06069,0.339266,4.721424,5.399955,average,Life satisfaction,\nHow satisfied young people feel with their life
2,optimism_score,12.035714285714286,112.0,School A,All,All,All,All,731,7,12.032832,0.260359,11.772473,12.293191,average,Optimism,\nYoung people's hopefulness and confidence fo...
3,wellbeing_score,21.21505376344086,93.0,School A,All,All,All,All,547,7,20.934186,0.507017,20.427169,21.441204,average,Psychological wellbeing,\nHow positive and generally happy young peopl...
4,esteem_score,12.546666666666669,75.0,School A,All,All,All,All,481,7,12.773389,0.406382,12.367007,13.179771,average,Self-esteem,\nHow much young people value themselves


In [8]:
# Select relevant columns
rag_columns = [
    "variable_lab",
    "rag",
    "school_lab",
    "year_group_lab",
    "gender_lab",
    "fsm_lab",
    "sen_lab",
]
filtered_rag_df = rag_df[rag_columns]

filtered_rag_df["rag"] = filtered_rag_df["rag"].replace(
    {
        "average": "Average",
        "Above average": "Above average",
        "Below average": "Below average",
    }
)


# Create dictionary to store dfs for each school
school_dfs: dict[str, pd.DataFrame] = {}

# Get the unique school names
schools = filtered_rag_df["school_lab"].unique()

# Iterate over each school and create a DataFrame for "All pupils"
for school in schools:
    # Filter by school
    school_df = filtered_rag_df[filtered_rag_df["school_lab"] == school]

    # Filter by "All pupils" (where all grouping variables are set to "All")
    all_pupils_df = school_df[
        (school_df["year_group_lab"] == "All")
        & (school_df["gender_lab"] == "All")
        & (school_df["fsm_lab"] == "All")
        & (school_df["sen_lab"] == "All")
    ]

    # Select relevant columns: 'variable_lab', 'rag', 'school_lab', and add 'All pupils' as the group
    all_pupils_df = all_pupils_df[["variable_lab", "rag", "school_lab"]]
    all_pupils_df["group"] = "All pupils"

    # Store the DataFrame in the school_dfs dictionary
    school_dfs[f"{school}_all_pupils"] = all_pupils_df

# To view the DataFrame for 'School B' for "All pupils"
school_b_all_pupils_df = school_dfs.get("School B_all_pupils")
# Display the first few rows of the DataFrame for 'School B'
# print whole dataframe
print(school_b_all_pupils_df)


# Save each DataFrame to a CSV file
for school in schools:
    # Ensure the directory exists
    output_dir = Path(f"outputs/{school}/rag_ratings")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save the DataFrame to a CSV file
    school_dfs[f"{school}_all_pupils"].to_csv(
        output_dir / f"{school}_all_pupils.csv", index=False
    )


                      variable_lab      rag school_lab       group
288                       Autonomy  Average   School B  All pupils
289              Life satisfaction    above   School B  All pupils
290                       Optimism  Average   School B  All pupils
291        Psychological wellbeing  Average   School B  All pupils
292                    Self-esteem  Average   School B  All pupils
293              Stress and coping  Average   School B  All pupils
294     Feelings around appearance    above   School B  All pupils
295                Negative affect  Average   School B  All pupils
296                     Loneliness    above   School B  All pupils
297       Supporting own wellbeing  Average   School B  All pupils
298                          Sleep  Average   School B  All pupils
299              Physical activity  Average   School B  All pupils
300                      Free time  Average   School B  All pupils
301               Social media use  Average   School B  All pu

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_rag_df["rag"] = filtered_rag_df["rag"].replace(


In [9]:
# By year group
# Create a year group pivot DataFrame for each school
for school in schools:
    school_df = filtered_rag_df[filtered_rag_df["school_lab"] == school][rag_columns]

    # Filter by year groups
    year_group_rag: pd.DataFrame = school_df[
        school_df["year_group_lab"].isin(["Year 8", "Year 10"])
    ]

    # Pivot the table for Year 8 and Year 10 RAG summaries
    year_group_pivot = year_group_rag.pivot_table(
        index=["school_lab", "variable_lab"],
        columns="year_group_lab",
        values="rag",
        aggfunc="first",
    ).reset_index()

    # Store the pivoted DataFrame in the school_dfs dictionary
    school_dfs[f"{school}_year_group"] = year_group_pivot

# Save each DataFrame to a CSV file
for school in schools:
    # Ensure the directory exists
    output_dir = Path(f"outputs/{school}/rag_ratings")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save the DataFrame to a CSV file
    school_dfs[f"{school}_year_group"].to_csv(
        output_dir / f"{school}_year_group.csv", index=False
    )


In [10]:
# By FSM status
for school in schools:
    school_df = filtered_rag_df[filtered_rag_df["school_lab"] == school][rag_columns]

    # Filter by FSM
    fsm_pivot_rag: pd.DataFrame = school_df[
        school_df["fsm_lab"].isin(["FSM", "Non-FSM"])
    ]

    # Pivot the table for SEN status
    fsm_pivot = fsm_pivot_rag.pivot_table(
        index=["school_lab", "variable_lab"],
        columns="fsm_lab",
        values="rag",
        aggfunc="first",
    ).reset_index()

    # Store the pivoted DataFrame in the school_dfs dictionary
    school_dfs[f"{school}_fsm"] = fsm_pivot

for school in schools:
    # Ensure the directory exists
    output_dir = Path(f"outputs/{school}/rag_ratings")
    output_dir.mkdir(parents=True, exist_ok=True)

    school_dfs[f"{school}_fsm"].to_csv(
        f"outputs/{school}/rag_ratings/{school}_fsm.csv", index=False
    )


In [11]:
# By Gender
for school in schools:
    school_df = filtered_rag_df[filtered_rag_df["school_lab"] == school][rag_columns]

    # Filter by gender
    gender_rag: pd.DataFrame = school_df[school_df["gender_lab"].isin(["Boy", "Girl"])]

    # Pivot
    gender_pivot = gender_rag.pivot_table(
        index=["school_lab", "variable_lab"],
        columns="gender_lab",
        values="rag",
        aggfunc="first",
    ).reset_index()

    # Store the pivoted DataFrame in the school_dfs dictionary
    school_dfs[f"{school}_gender"] = gender_pivot

# Save each DataFrame to a CSV file
for school in schools:
    # Ensure the directory exists
    output_dir = Path(f"outputs/{school}/rag_ratings")
    output_dir.mkdir(parents=True, exist_ok=True)
    
    school_dfs[f"{school}_gender"].to_csv(
        f"outputs/{school}/rag_ratings/{school}_gender.csv", index=False
    )


In [12]:
# By SEN status

# Create a year group pivot DataFrame for each school
for school in schools:
    school_df = filtered_rag_df[filtered_rag_df["school_lab"] == school][rag_columns]

    # Filter by year groups
    sen_rag: pd.DataFrame = school_df[school_df["sen_lab"].isin(["SEN", "Non-SEN"])]

    # Pivot the table for SEN status
    sen_pivot = sen_rag.pivot_table(
        index=["school_lab", "variable_lab"],
        columns="sen_lab",
        values="rag",
        aggfunc="first",
    ).reset_index()

    # Store the pivoted DataFrame in the school_dfs dictionary
    school_dfs[f"{school}_sen"] = sen_pivot

# Save each DataFrame to a CSV file
for school in schools:
    # Ensure the directory exists
    output_dir = Path(f"outputs/{school}/rag_ratings")
    output_dir.mkdir(parents=True, exist_ok=True)

    school_dfs[f"{school}_sen"].to_csv(
        f"outputs/{school}/rag_ratings/{school}_sen.csv", index=False
    )
