# In [None]:
# The code below converts a CSV file into a custom XML file that is usable for DCaR LERS submissions
# ***IMPORTANT***: Save and close the relevant CSV and XML files before running this script

# Import modules
import csv
import xml.etree.ElementTree as ET
import xml.dom.minidom

# Place the CSV file in the same folder as this script
# Open the CSV file for reading and create a DictReader object named "reader"
# DictReader reads each row into an ordered dictionary, with column headers as keys
# The CSV data must have the following columns: 
#   - "AcademicYear"
#   - "Provider"
#   - "ASN"
#   - "StudentID"
#   - "Birthdate"
#   - "Gender"
#   - "Language"
#   - "CountryOfCitizenship"
#   - "SourceCountry"
#   - "SourcePostalCode"
#   - "ProgramID"
#   - "SpecializationID"
#   - "ProviderLocation"
#   - "Session"
#   - "YearOfStudy"
#   - "RegistrationStatus"
#   - "CreditsEnrolled"
#   - "OnlineDistanceDelivery"
#   - "WorkIntegratedLearning"
#   - "LegalStatus"
#   - "IndigenousIndicator"
#   - "CompletionStatus"
with open("data_extract.csv", mode="r", newline="") as file:
    # newline="" is what the csv module documentation asks for: it lets the csv
    # reader do its own line-ending handling, so quoted fields that contain
    # line breaks are parsed correctly.
    # NOTE(review): no encoding is passed, so the platform default is used. If the
    # extract is saved with a UTF-8 byte order mark, the first column header will
    # not match "AcademicYear" -- confirm how the CSV file is produced.
    reader = csv.DictReader(file)

    # "data_dictionary" holds the enrolment data grouped by learner.
    # Each key is a "StudentID" value; each value is a dictionary with two keys:
    #   - "attributes": a nested dictionary with the Parent (Learner) element attributes
    #   - "enrolments": a list with one dictionary of Child (Enrolment) element attributes per CSV row
    data_dictionary = {}

    # Every CSV row describes one enrolment of one learner
    for row in reader:
        student_id = row["StudentID"]

        # The first row seen for a "student_id" value creates the learner entry.
        # The Parent (Learner) attributes are taken from that first row only;
        # later rows for the same "student_id" value only add enrolments.
        if student_id not in data_dictionary:
            data_dictionary[student_id] = {
                "attributes": {
                    "StudentID": row["StudentID"],
                    "Gender": row["Gender"],
                    "Birthdate": row["Birthdate"],
                    "SourcePostalCode": row["SourcePostalCode"],
                    "ASN": row["ASN"],
                    "Language": row["Language"],
                    "CountryOfCitizenship": row["CountryOfCitizenship"],
                    "SourceCountry": row["SourceCountry"]
                },
                "enrolments": []
            }

        # Add this row's Child (Enrolment) attributes to the learner's "enrolments" list.
        # The key order below is the order in which the attributes are stored.
        # The "IndigenousIndicator" CSV column is stored under the key "AboriginalIndicator".
        data_dictionary[student_id]["enrolments"].append({
            "Provider": row["Provider"],
            "ProviderLocation": row["ProviderLocation"],
            "Session": row["Session"],
            "LegalStatus": row["LegalStatus"],
            "RegistrationStatus": row["RegistrationStatus"],
            "ProgramID": row["ProgramID"],
            "SpecializationID": row["SpecializationID"],
            "YearOfStudy": row["YearOfStudy"],
            "CreditsEnrolled": row["CreditsEnrolled"],
            "CompletionStatus": row["CompletionStatus"],
            "OnlineDistanceDelivery": row["OnlineDistanceDelivery"],
            "WorkIntegratedLearning": row["WorkIntegratedLearning"],
            "AcademicYear": row["AcademicYear"],
            "AboriginalIndicator": row["IndigenousIndicator"]
        })

# Collect the StudentIDs of "data_dictionary" in ascending order in a list called "sorted_learner_ids"
# The StudentIDs read from the CSV file are strings, so the order is lexicographical ("10" sorts before "9")
sorted_learner_ids = sorted(data_dictionary)

# Create the root element <LERS xmlns="http://psdata.eae.alberta.ca/enrol/3"></LERS>
# The attributes of an element can be given as a dictionary
lers_element = ET.Element("LERS", {"xmlns": "http://psdata.eae.alberta.ca/enrol/3"})

# Walk through the learners in the order of "sorted_learner_ids"
for student_id in sorted_learner_ids:
    learner_record = data_dictionary[student_id]

    # Parent (Learner) element, attached to the root element
    # The dictionary stored in the "attributes" key becomes the attributes of the element
    # Format: <Learner StudentID="#########" Gender="X" ... SourceCountry="XX"></Learner>
    learner_node = ET.SubElement(lers_element, "Learner", learner_record["attributes"])

    # Child (Enrolment) elements, one per dictionary in the "enrolments" list, attached to the Parent (Learner) element
    # Format: <Enrolment Provider="XX" ProviderLocation="##" ... AboriginalIndicator="#"/>
    for enrolment_attributes in learner_record["enrolments"]:
        ET.SubElement(learner_node, "Enrolment", enrolment_attributes)

# "lers_element" holds the XML tree with all learner and enrolment data
# ET.tostring() with encoding="utf-8" serialises that tree into a bytes object of raw XML
raw_xml_string = ET.tostring(lers_element, encoding="utf-8")

# xml.dom.minidom is a DOM (Document Object Model) parser
# parseString() turns the raw XML bytes into a DOM object, an in-memory representation that can be pretty-printed
xml_parsed_DOM_object = xml.dom.minidom.parseString(raw_xml_string)

# toprettyxml() writes the DOM object back out with one element per line (newl="\n")
# and 2 spaces of indentation per nesting level (indent="  ")
# Because encoding="utf-8" is passed, the result is a bytes object and the XML declaration
# reads <?xml version="1.0" encoding="utf-8"?> instead of the default <?xml version="1.0"?>
pretty_xml_bytes = xml_parsed_DOM_object.toprettyxml(indent="  ", newl="\n", encoding="utf-8")

# Final clean-up of the text, applied in this order:
#   1. decode("utf-8") converts the bytes into a Python string
#   2. replace() puts a space before the closing syntax '/>' of every self-closing element,
#      which is the format required for the Child (Enrolment) elements
#   3. rstrip() removes all trailing whitespace, so nothing follows </LERS>
xml_readable_string = (
    pretty_xml_bytes.decode("utf-8")
    .replace('"/>', '" />')
    .rstrip()
)

# The output file starts with the UTF-8 Byte Order Mark (BOM), followed by the XML string encoded as UTF-8 bytes
utf8_bom = b'\xef\xbb\xbf'

# Open "LERS_CONVERTED_DATA.xml" in binary write mode ("wb") so the bytes are written exactly as given
# The "with" statement closes the XML file once both parts have been written
with open("LERS_CONVERTED_DATA.xml", "wb") as xml_file:
    xml_file.writelines((utf8_bom, xml_readable_string.encode('utf-8')))