In [5]:
import pandas as pd

In [6]:
# Path to the Excel workbook holding the ASCED classification structures
file_path = "../raw/1272.0 australian standard classification of education (asced) structures.xlsx"

# Read the "Table 1" worksheet into a DataFrame.
# A read failure is deliberately allowed to propagate: the cells below all use
# `df`, so swallowing the error here would only resurface as a NameError that
# hides the real cause (missing file, wrong sheet name, missing Excel engine).
df = pd.read_excel(file_path, sheet_name="Table 1")
print(df.head())  # printing first few rows for brevity

df

                     Australian Bureau of Statistics     Unnamed: 1  \
0  1272.0 Australian Standard Classification of E...            NaN   
1  Released at 11.30am (Canberra time) 29 Septemb...            NaN   
2          Table 1 Level of Education classification            NaN   
3                                       Broad Levels            NaN   
4                                                NaN  Narrow Levels   

  Unnamed: 2 Unnamed: 3  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


Unnamed: 0,Australian Bureau of Statistics,Unnamed: 1,Unnamed: 2,Unnamed: 3
0,1272.0 Australian Standard Classification of E...,,,
1,Released at 11.30am (Canberra time) 29 Septemb...,,,
2,Table 1 Level of Education classification,,,
3,Broad Levels,,,
4,,Narrow Levels,,
...,...,...,...,...
91,,,991,Statements of Attainment not Identifiable by L...
92,,,992,Bridging and Enabling Courses not Identifiable...
93,,,999,"Education, n.e.c."
94,,,,


In [7]:
# Take the two leftmost columns of the raw sheet and give them descriptive names
leading_columns = df.iloc[:, :2]
broad_fields = leading_columns.rename(
    columns={
        leading_columns.columns[0]: "level_of_education_broad_code",
        leading_columns.columns[1]: "level_of_education_broad_name",
    }
)

# Keep only rows that have a value in the code column, then trim the first
# four and the last of those remaining rows by position
has_broad_code = broad_fields["level_of_education_broad_code"].notna()
broad_fields = broad_fields.loc[has_broad_code].iloc[4:-1]

broad_fields

Unnamed: 0,level_of_education_broad_code,level_of_education_broad_name
6,1,POSTGRADUATE DEGREE LEVEL
20,2,GRADUATE DIPLOMA AND GRADUATE CERTIFICATE LEVEL
32,3,BACHELOR DEGREE LEVEL
38,4,ADVANCED DIPLOMA AND DIPLOMA LEVEL
49,5,CERTIFICATE LEVEL
63,6,SECONDARY EDUCATION
73,7,PRIMARY EDUCATION
83,8,PRE-PRIMARY EDUCATION
86,9,OTHER EDUCATION


In [8]:
# Slice rows 4 .. second-to-last (by position) of the two leftmost columns,
# rename them, and keep only rows that have a value in the code column.
broad_fields = (
    df.iloc[4:-1, :2]
    .rename(
        columns={
            df.columns[0]: "level_of_education_broad_code",
            df.columns[1]: "level_of_education_broad_name",
        }
    )
    .loc[lambda frame: frame["level_of_education_broad_code"].notna()]
)

broad_fields

Unnamed: 0,level_of_education_broad_code,level_of_education_broad_name
6,1,POSTGRADUATE DEGREE LEVEL
20,2,GRADUATE DIPLOMA AND GRADUATE CERTIFICATE LEVEL
32,3,BACHELOR DEGREE LEVEL
38,4,ADVANCED DIPLOMA AND DIPLOMA LEVEL
49,5,CERTIFICATE LEVEL
63,6,SECONDARY EDUCATION
73,7,PRIMARY EDUCATION
83,8,PRE-PRIMARY EDUCATION
86,9,OTHER EDUCATION


In [11]:
# Work on a copy of the first three columns of the raw sheet
narrow_fields = df.iloc[:, :3].copy()

# Locate the header row: the row whose second column equals "Narrow Levels"
header_rows = narrow_fields[narrow_fields.iloc[:, 1] == "Narrow Levels"]
if header_rows.empty:
    raise ValueError("Could not find the 'Narrow Levels' header in the second column.")
header_row_index = header_rows.index[0]

# Keep every row after the header row. `.loc` slices by index label, so
# `header_row_index + 1` relies on the integer row labels of `df`.
narrow_fields = narrow_fields.loc[header_row_index + 1:].copy()

narrow_fields

Unnamed: 0,Australian Bureau of Statistics,Unnamed: 1,Unnamed: 2
5,,,Detailed Levels
6,1,POSTGRADUATE DEGREE LEVEL,
7,,11,Doctoral Degree Level
8,,,111
9,,,112
...,...,...,...
91,,,991
92,,,992
93,,,999
94,,,


In [12]:
# Trim by position: skip the first six rows and the final row of the raw sheet
df_clean = df.iloc[6:-1].copy()

# Short positional labels for the four columns
df_clean.columns = ["L1", "L2", "L3", "L4"]

df_clean

Unnamed: 0,L1,L2,L3,L4
6,1,POSTGRADUATE DEGREE LEVEL,,
7,,11,Doctoral Degree Level,
8,,,111,Higher Doctorate
9,,,112,Doctorate by Research
10,,,113,Doctorate by Coursework
...,...,...,...,...
90,,99,Miscellaneous Education,
91,,,991,Statements of Attainment not Identifiable by L...
92,,,992,Bridging and Enabling Courses not Identifiable...
93,,,999,"Education, n.e.c."


In [13]:
# Rows that have values in both L1 (code) and L2 (name) are the broad levels
is_broad_row = df_clean["L1"].notna() & df_clean["L2"].notna()

# Keep the code/name pair as strings under descriptive column names
broad_fields = (
    df_clean.loc[is_broad_row, ["L1", "L2"]]
    .astype(str)
    .rename(
        columns={
            "L1": "level_of_education_broad_code",
            "L2": "level_of_education_broad_name",
        }
    )
)

broad_fields

Unnamed: 0,level_of_education_broad_code,level_of_education_broad_name
6,1,POSTGRADUATE DEGREE LEVEL
20,2,GRADUATE DIPLOMA AND GRADUATE CERTIFICATE LEVEL
32,3,BACHELOR DEGREE LEVEL
38,4,ADVANCED DIPLOMA AND DIPLOMA LEVEL
49,5,CERTIFICATE LEVEL
63,6,SECONDARY EDUCATION
73,7,PRIMARY EDUCATION
83,8,PRE-PRIMARY EDUCATION
86,9,OTHER EDUCATION


In [16]:
# Rows that have values in both L2 (code) and L3 (name) are the narrow levels
is_narrow_row = df_clean["L2"].notna() & df_clean["L3"].notna()

narrow_fields = (
    df_clean.loc[is_narrow_row, ["L2", "L3"]]
    .astype(str)
    .rename(
        columns={
            "L2": "level_of_education_narrow_code",
            "L3": "level_of_education_narrow_name",
        }
    )
)

# The parent broad code is derived from the first character of the narrow code
narrow_fields = narrow_fields.assign(
    level_of_education_broad_code=narrow_fields["level_of_education_narrow_code"].str[:1]
)

# Attach the broad-level name. The join key is spelled out instead of relying
# on whichever columns the two frames happen to share, and `validate` makes the
# merge fail loudly if a broad code ever appears more than once in the lookup
# (which would silently duplicate narrow rows).
narrow_fields = pd.merge(
    narrow_fields,
    broad_fields[["level_of_education_broad_code", "level_of_education_broad_name"]],
    on="level_of_education_broad_code",
    how="left",
    validate="many_to_one",
)

# Final column order: broad level first, then narrow level
narrow_fields = narrow_fields[
    [
        "level_of_education_broad_code",
        "level_of_education_broad_name",
        "level_of_education_narrow_code",
        "level_of_education_narrow_name",
    ]
]

narrow_fields

Unnamed: 0,level_of_education_broad_code,level_of_education_broad_name,level_of_education_narrow_code,level_of_education_narrow_name
0,1,POSTGRADUATE DEGREE LEVEL,11,Doctoral Degree Level
1,1,POSTGRADUATE DEGREE LEVEL,12,Master Degree Level
2,2,GRADUATE DIPLOMA AND GRADUATE CERTIFICATE LEVEL,21,Graduate Diploma Level
3,2,GRADUATE DIPLOMA AND GRADUATE CERTIFICATE LEVEL,22,Graduate Certificate Level
4,3,BACHELOR DEGREE LEVEL,31,Bachelor Degree Level
5,4,ADVANCED DIPLOMA AND DIPLOMA LEVEL,41,Advanced Diploma and Associate Degree Level
6,4,ADVANCED DIPLOMA AND DIPLOMA LEVEL,42,Diploma Level
7,5,CERTIFICATE LEVEL,51,Certificate III & IV Level
8,5,CERTIFICATE LEVEL,52,Certificate I & II Level
9,6,SECONDARY EDUCATION,61,Senior Secondary Education


In [17]:
# Rows that have values in both L3 (code) and L4 (name) are the detailed levels
is_detailed_row = df_clean["L3"].notna() & df_clean["L4"].notna()

detailed_fields = (
    df_clean.loc[is_detailed_row, ["L3", "L4"]]
    .astype(str)
    .rename(
        columns={
            "L3": "level_of_education_detailed_code",
            "L4": "level_of_education_detailed_name",
        }
    )
)

# Parent codes are prefixes of the detailed code: one character for the broad
# level, two characters for the narrow level
detailed_code = detailed_fields["level_of_education_detailed_code"]
detailed_fields = detailed_fields.assign(
    level_of_education_broad_code=detailed_code.str[:1],
    level_of_education_narrow_code=detailed_code.str[:2],
)

# Bring in the narrow and broad names via the narrow code
narrow_lookup = narrow_fields[
    [
        "level_of_education_narrow_code",
        "level_of_education_narrow_name",
        "level_of_education_broad_name",
    ]
]
detailed_fields = detailed_fields.merge(
    narrow_lookup,
    on="level_of_education_narrow_code",
    how="left",
)

# Final column order: broad, narrow, then detailed level
output_columns = [
    "level_of_education_broad_code",
    "level_of_education_broad_name",
    "level_of_education_narrow_code",
    "level_of_education_narrow_name",
    "level_of_education_detailed_code",
    "level_of_education_detailed_name",
]
detailed_fields = detailed_fields[output_columns]

detailed_fields

Unnamed: 0,level_of_education_broad_code,level_of_education_broad_name,level_of_education_narrow_code,level_of_education_narrow_name,level_of_education_detailed_code,level_of_education_detailed_name
0,1,POSTGRADUATE DEGREE LEVEL,11,Doctoral Degree Level,111,Higher Doctorate
1,1,POSTGRADUATE DEGREE LEVEL,11,Doctoral Degree Level,112,Doctorate by Research
2,1,POSTGRADUATE DEGREE LEVEL,11,Doctoral Degree Level,113,Doctorate by Coursework
3,1,POSTGRADUATE DEGREE LEVEL,11,Doctoral Degree Level,114,Professional Specialist Qualification at Docto...
4,1,POSTGRADUATE DEGREE LEVEL,11,Doctoral Degree Level,115,Statement of Attainment at Doctoral Degree Level
...,...,...,...,...,...,...
59,9,OTHER EDUCATION,91,Non-award Courses,911,Non-award Courses in Higher Education
60,9,OTHER EDUCATION,91,Non-award Courses,912,Other Non-award Courses
61,9,OTHER EDUCATION,99,Miscellaneous Education,991,Statements of Attainment not Identifiable by L...
62,9,OTHER EDUCATION,99,Miscellaneous Education,992,Bridging and Enabling Courses not Identifiable...


In [18]:
# Index each table by its own code; drop=False keeps the code as a column too
broad_fields = broad_fields.set_index("level_of_education_broad_code", drop=False)
narrow_fields = narrow_fields.set_index("level_of_education_narrow_code", drop=False)
detailed_fields = detailed_fields.set_index("level_of_education_detailed_code", drop=False)

In [19]:
from pathlib import Path

# Root folder for the exported files; CSV and JSON go into sibling sub-folders.
# NOTE(review): the folder is named "asced-field-of-education" although these
# tables hold the level-of-education classification — confirm this is intended.
output_root = Path("../datasets/asced-field-of-education/formats")

# Output file stem -> table to export
exports = {
    "level_of_education_broad": broad_fields,
    "level_of_education_narrow": narrow_fields,
    "level_of_education_detailed": detailed_fields,
}

for stem, frame in exports.items():
    file_path = output_root / "csv" / f"{stem}.csv"
    json_path = output_root / "json" / f"{stem}.json"

    # Create BOTH target folders; previously only the csv folder was created,
    # so the JSON export failed whenever the json folder did not exist yet.
    file_path.parent.mkdir(parents=True, exist_ok=True)
    json_path.parent.mkdir(parents=True, exist_ok=True)

    # CSV: plain rows, without the index
    frame.to_csv(file_path, index=False)
    # JSON: one object per row, keyed by the frame's index
    frame.to_json(json_path, orient="index")