# Test Data Generation: Canvas Roster Tables Class

**Affiliation**: *Kwantum Edu Analytics*. **Last Modified**: *5/12/2023*.

This OEA test data generation class notebook generates fictitious Canvas tables, as seen in the Canvas module. This notebook is needed to successfully run the canvas_test_data_gen_demo notebook.

For reference of all Canvas tables outlined below, see Canvas table schemas here: https://canvas.instructure.com/doc/api/all_resources.html

This class notebook primarily leans on the use of the OEA_py class notebook, ```Faker``` and ```random``` python packages, and already-generated base-truth tables to generate **7** Canvas module SIS/rostering tables:

 1. **accounts**
 2. **courses**
 3. **enrollments**
 4. **enrollment_terms**
 5. **roles**
 6. **sections**
 7. **users** 

There is one main method ```genCanvasRoster(startdate, enddate, reportgendate, use_general_module_base_truth)``` to generate the tables described. Parameter descriptions are given:
  - *startdate*: roster start date.
  - *enddate*: roster end date.
  - *reportgendate*: date the report(s) were generated (i.e., fictitious date when all tables were landed in the data lake).
  - *use_general_module_base_truth*: boolean argument indicating whether to use the general-module base-truth tables (i.e., base-truth tables that link students, courses, etc. across OEA modules)
    * If ```True``` - lands the general-module base-truth tables if they don't already exist, and generates Canvas test data based on these tables.
    * If ```False``` - uses the default, user-generated base-truth tables to generate Canvas test data.

In [1]:
import logging
import random, decimal
from tokenize import Ignore, String
from faker import Faker
import pandas as pd
import datetime as dt
import numpy as np
import json
from pyspark.sql import functions as F

class CanvasRosterDataGen():
    def __init__(self, startdate='2022-01-03T00:00:00', enddate='2022-06-03T00:00:00'):
        """Create the Faker instance, the empty Canvas tables and the static IDs.

        Args:
            startdate: accepted but not used here; the roster window is set by
                the arguments of ``genCanvasRoster``.
            enddate: accepted but not used here (see ``startdate``).
        """
        def empty_table(columns):
            # empty frame with one object-typed column per name, in the given order
            return pd.DataFrame({column: [] for column in columns}, dtype=object)

        self.faker = Faker('en_US')

        # set current datetime for rundate folder for writing out files
        self.currentDateTime = dt.datetime.now().strftime("%Y-%m-%d %H-%M-%S")

        # Tables the gen* methods append to. The column order matches the order
        # of the values in the rows those methods build. Previously only the
        # "*_dim" frames below existed, so genUsers/genCourses/genAccounts
        # failed on the missing attributes.
        self.canvas_users = empty_table([
            "id", "name", "sortable_name", "last_name", "first_name", "short_name",
            "sis_user_id", "sis_import_id", "integration_id", "login_id", "enrollments",
            "email", "locale", "last_login", "time_zone", "bio",
        ])
        self.canvas_courses = empty_table([
            "id", "sis_course_id", "uuid", "integration_id", "sis_import_id", "name",
            "course_code", "original_name", "workflow_state", "account_id", "root_account_id",
            "enrollment_term_id", "grading_periods", "grading_standard_id",
            "grade_passback_setting", "created_at", "start_at", "end_at", "locale",
            "enrollments", "total_students", "calendar", "default_view", "is_public",
            "is_public_to_auth_users", "public_syllabus", "public_syllabus_to_auth",
            "storage_quota_mb", "storage_quota_used_mb", "hide_final_grades", "license",
            "allow_student_assignment_edits", "allow_wiki_comments",
            "allow_student_forum_attachments", "open_enrollment", "self_enrollment",
            "restrict_enrollments_to_course_dates", "course_format",
        ])
        self.canvas_accounts = empty_table([
            "id", "name", "uuid", "parent_account_id", "root_account_id",
            "default_storage_quota_mb", "default_user_storage_quota_mb",
            "default_group_storage_quota_mb", "default_time_zone", "sis_account_id",
            "integration_id", "sis_import_id", "lti_guid", "workflow_state",
        ])
        self.canvas_enrollments = empty_table([
            "id", "course_id", "sis_course_id", "course_integration_id", "course_section_id",
            "section_integration_id", "sis_account_id", "sis_section_id", "sis_user_id",
            "enrollment_state", "limit_privileges_to_course_section", "sis_import_id",
            "root_account_id", "type", "user_id", "associated_user_id", "role", "role_id",
            "created_at", "updated_at", "start_at", "end_at", "last_activity_at",
            "last_attended_at", "total_activity_time", "html_url", "grades", "user",
            "override_grade", "override_score", "unposted_current_grade",
            "unposted_final_grade", "unposted_current_score", "unposted_final_score",
        ])
        self.canvas_sections = empty_table([
            "id", "name", "sis_section_id", "integration_id", "sis_import_id", "course_id",
            "sis_course_id", "start_at", "end_at", "restrict_enrollments_to_section_dates",
            "nonxlist_course_id", "total_students",
        ])
        self.canvas_roles = empty_table([
            "label", "role", "base_role_type", "account", "workflow_state", "permissions",
        ])
        # single-column frame whose only cell holds a dict of empty lists (unchanged)
        enroll_terms = {
            "enrollment_terms":[
                {
                    "id":[],
                    "name":[],
                    "start_at":[],
                    "end_at":[],
                    "created_at":[],
                    "workflow_state":[],
                    "grading_period_group_id":[],
                    "sis_term_id":[],
                    "course_count":[]
                }
            ]
        }
        self.canvas_enrollment_terms = pd.DataFrame(enroll_terms, dtype=object)

        # Legacy "*_dim" frames: no method of this class reads or writes them;
        # they are kept so existing references to these attributes keep working.
        subaccount_columns = []
        for level in range(1, 16):
            subaccount_columns += [f"subaccount{level}", f"subaccount{level}_id"]
        self.canvas_account_dim = empty_table([
            "id", "canvas_id", "name", "depth", "workflow_state",
            "parent_account", "parent_account_id",
            "grandparent_account", "grandparent_account_id",
            "root_account", "root_account_id",
            *subaccount_columns,
            "sis_source_id",
        ])
        self.canvas_course_dim = empty_table([
            "id", "canvas_id", "root_account_id", "account_id", "enrollment_term_id", "name",
            "code", "type", "created_at", "start_at", "conclude_at", "publicly_visible",
            "sis_source_id", "workflow_state", "wiki_id", "syllabus_body",
        ])
        self.canvas_user_dim = empty_table([
            "id", "canvas_id", "root_account_id", "name", "time_zone", "created_at",
            "visibility", "school_name", "school_position", "gender", "locale", "public",
            "birthdate", "country_code", "workflow_state", "sortable_name", "global_canvas_id",
        ])

        # set static integration_id and sis_import_id for all imported SIS information
        self.integrationid = self.faker.uuid4() # NOTE: also acts as the education system's LTI guid for account
        self.sisimportid = self.faker.random_int(min=100000, max=999999)
        # create static root account_id of the Canvas account of the entire education system
        self.accountid = self.faker.random_int(min=100000, max=999999)
        # create static enrollment_term_id of the semester term for the test data
        self.enrollmenttermid = 2

    def genCanvasRoster(self,startdate='2022-01-01T00:00:00',enddate='2022-06-01T00:00:00',reportgendate='2022-02-02T00:00:00',use_general_module_base_truth=False):
        """Load the base-truth tables and generate all seven Canvas rostering tables.

        Relies on the names ``oea``, ``logger`` and (for the download path)
        ``requests`` being available in the notebook session; they are not
        imported in this file.

        Args:
            startdate: roster start date, formatted as ``%Y-%m-%dT%H:%M:%S``.
            enddate: roster end date, same format.
            reportgendate: date the report(s) were generated, same format.
            use_general_module_base_truth: if True, use the general-module
                base-truth tables, downloading and landing them first when the
                ``base_general_modules`` folder does not exist; if False, use
                the base-truth tables directly under ``test_data/v0.1/``.

        Raises:
            ValueError: if one of the dates does not match the expected format.
            requests.HTTPError: if downloading a base-truth CSV fails. Nothing
                is landed in that case.
        """
        timestamp_format = "%Y-%m-%dT%H:%M:%S"
        self.startdate = dt.datetime.strptime(startdate, timestamp_format)
        self.enddate = dt.datetime.strptime(enddate, timestamp_format)
        self.reportdate = dt.datetime.strptime(reportgendate, timestamp_format)
        self.use_general_module_base_truth = use_general_module_base_truth

        # (attribute on self, base-truth folder, CSV name in the OpenEduAnalytics repository)
        base_truth_tables = (
            ('students', 'base_students', 'students.csv'),
            ('schools', 'base_schools', 'schools.csv'),
            ('courses', 'base_courses', 'courses.csv'),
            ('sections', 'base_sections', 'sections.csv'),
            ('enrollment', 'base_student_enrollment', 'enrollment.csv'),
            ('instructors', 'base_instructors', 'instructors.csv'),
            ('instructors_enroll', 'base_instructors_enroll', 'instructors_enroll.csv'),
        )

        if use_general_module_base_truth:
            sourcepath = 'stage1/Transactional/test_data/v0.1/base_general_modules'
            if oea.path_exists(sourcepath):
                logger.info('General module base-truth tables already exist - delete the "base_general_modules" folder/directory if you want to replace these.')
            else:
                # manually delete and replace the general module base_truth_tables CSVs as needed
                logger.info('General module base-truth tables do not currently exist - landing in stage1/.../test_data/v0.1/base_general_modules/')
                repo_url = 'https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/main/modules/module_test_data_generation_kit/test_data/base_truth_tables/'
                # Download everything before landing anything: a failed request must not
                # leave a partial folder that the path_exists check above would accept later.
                downloads = []
                for _, folder, remote_csv in base_truth_tables:
                    response = requests.get(repo_url + remote_csv)
                    response.raise_for_status()  # do not land an HTTP error page as CSV
                    downloads.append((folder, response.text))
                for folder, data in downloads:
                    landed_name = 'general_module_base_truth_' + folder[len('base_'):] + '.csv'
                    oea.land(data, 'test_data/v0.1/base_general_modules/' + folder, landed_name, oea.SNAPSHOT_BATCH_DATA)
            loaded_message = 'Generating Canvas test data based on general module base-truth tables...'
        else:
            # expectation is that base_truth_tables exist
            sourcepath = 'stage1/Transactional/test_data/v0.1'
            loaded_message = 'Generating Canvas test data based on user-generated base-truth tables...'

        # NOTE: if tables are not read in properly - you may need to rename the rundate folder to replace colons with hyphens
        for attribute, folder, _ in base_truth_tables:
            setattr(self, attribute, oea.load_csv(f"{sourcepath}/{folder}/", header=True).toPandas())
        logger.info(loaded_message)

        # generate Canvas test data tables, based on base-truth tables
        # (order matters: sections, accounts and enrollments read the tables generated before them)
        self.genUsers()
        self.genCourses()
        self.genSections()
        self.genAccounts()
        self.genEnrollments()
        self.genRoles()
        self.genEnrollmentTerms()
        logger.info('Successfully generated Canvas rostering tables.')
        logger.info('Finished Canvas generation.')

    def __get_daterange(self):
        daterange = []
        startdate = dt.datetime(2022,1,3)
        enddate = dt.datetime(2022,1,28)
        while(startdate < enddate):
            daterange.append(startdate)
            startdate = startdate + dt.timedelta(days=1)
        return daterange
    
    def genUsers(self):
        """Build the Canvas ``users`` table and write it out as JSON.

        Instructors come first, then students; Canvas user IDs are assigned
        sequentially starting at 1 in that order. The table is rebuilt from
        scratch on every call and stored in ``self.canvas_users``.

        Reads ``self.instructors`` (columns ``InstructorID``, ``FirstName``,
        ``LastName``, ``Email``) and ``self.students`` (columns ``StudentID``,
        ``FirstName``, ``LastName``, ``Email``).
        """
        columns = [
            "id", "name", "sortable_name", "last_name", "first_name", "short_name",
            "sis_user_id", "sis_import_id", "integration_id", "login_id", "enrollments",
            "email", "locale", "last_login", "time_zone", "bio",
        ]
        # set base date for the "last_login" field
        last_login = f"{dt.datetime(2022,2,2)}"
        # (base-truth table, column holding the SIS ID, bio text)
        people_sources = (
            (self.instructors, 'InstructorID', "sample instructor bio."),
            (self.students, 'StudentID', "sample student bio."),
        )
        rows = []
        for people, sis_id_column, bio in people_sources:
            for _, person in people.iterrows():
                first_name = person['FirstName']
                last_name = person['LastName']
                rows.append([
                    len(rows) + 1,                   # id: sequential across both groups
                    f"{first_name} {last_name}",     # name
                    f"{last_name}, {first_name}",    # sortable_name: family name first (was "First, Last")
                    last_name,
                    first_name,
                    first_name,                      # short_name
                    person[sis_id_column],           # sis_user_id
                    self.sisimportid,
                    f"{self.integrationid}",
                    person['Email'],                 # login_id
                    "",                              # enrollments - NOTE: nulled for now
                    person['Email'],                 # email
                    "en",
                    last_login,
                    "America/Los_Angeles",
                    bio,
                ])
        # one construction instead of growing the frame row by row
        self.canvas_users = pd.DataFrame(rows, columns=columns, dtype=object)
        self.writetojsonfile('users', self.canvas_users)

    def genCourses(self):
        """Build the Canvas ``courses`` table and write it out as JSON.

        One row is produced per row of ``self.courses`` (columns ``CourseID``
        and ``CourseName``). ``total_students`` is the number of rows in
        ``self.enrollment`` with the same ``CourseID``, compared as text, and
        is 0 for a course without enrollments (previously that case raised
        IndexError). The table is rebuilt from scratch on every call and stored
        in ``self.canvas_courses``.
        """
        columns = [
            "id", "sis_course_id", "uuid", "integration_id", "sis_import_id", "name",
            "course_code", "original_name", "workflow_state", "account_id", "root_account_id",
            "enrollment_term_id", "grading_periods", "grading_standard_id",
            "grade_passback_setting", "created_at", "start_at", "end_at", "locale",
            "enrollments", "total_students", "calendar", "default_view", "is_public",
            "is_public_to_auth_users", "public_syllabus", "public_syllabus_to_auth",
            "storage_quota_mb", "storage_quota_used_mb", "hide_final_grades", "license",
            "allow_student_assignment_edits", "allow_wiki_comments",
            "allow_student_forum_attachments", "open_enrollment", "self_enrollment",
            "restrict_enrollments_to_course_dates", "course_format",
        ]
        # one grading standard shared by every course
        gradingstandardid = self.faker.random_int(min=1, max=20)
        # enrollment rows per course, counted once instead of one Spark job per course
        students_per_course = self.enrollment['CourseID'].astype(str).value_counts().to_dict()
        created_at = f"{self.startdate}"
        start_at = f"{self.startdate + dt.timedelta(days=2)}"
        end_at = f"{self.enddate}"

        rows = []
        for _, course in self.courses.iterrows():
            canvas_id = self.faker.unique.random_int(min=100000, max=999999)
            sis_course_id = course['CourseID']
            course_uuid = self.faker.uuid4()
            name = course['CourseName']
            total_students = int(students_per_course.get(f"{sis_course_id}", 0))
            storage_quota_mb = self.faker.random_int(min=3, max=10)
            # values are listed in the order of `columns`
            # there are some fields skipped, as they are optional depending on the API call
            rows.append([
                canvas_id, sis_course_id, course_uuid, f"{self.integrationid}", self.sisimportid,
                name,
                sis_course_id,   # course_code - NOTE: unsure
                name,            # original_name
                "available", self.accountid, self.accountid, self.enrollmenttermid,
                '',              # grading_periods
                gradingstandardid, "nightly_sync",
                created_at, start_at, end_at, "en",
                "",              # enrollments
                total_students,
                "",              # calendar
                "feed",          # default_view
                True, True, True, True,   # is_public, is_public_to_auth_users, public_syllabus, public_syllabus_to_auth
                storage_quota_mb,
                2,               # storage_quota_used_mb
                False,           # hide_final_grades
                "Creative Commons",
                False, False, False,      # allow_student_assignment_edits, allow_wiki_comments, allow_student_forum_attachments
                True,            # open_enrollment
                False,           # self_enrollment
                True,            # restrict_enrollments_to_course_dates
                "hybrid",        # course_format
            ])
        self.canvas_courses = pd.DataFrame(rows, columns=columns, dtype=object)
        self.writetojsonfile('courses', self.canvas_courses)
    
    def genSections(self):
        """Build the Canvas ``sections`` table and write it out as JSON.

        One row is produced per row of ``self.sections`` (columns
        ``SectionID``, ``SectionName``, ``CourseID``). The Canvas course ID is
        looked up in ``self.canvas_courses``, so ``genCourses`` must run first.
        ``total_students`` is the number of rows in ``self.enrollment`` with
        the same ``SectionID``, compared as text, and is 0 for a section
        without enrollments (previously that case raised IndexError). The table
        is rebuilt from scratch on every call and stored in
        ``self.canvas_sections``.

        Raises:
            KeyError: if a section refers to a course that is not in
                ``self.canvas_courses``.
        """
        columns = [
            "id", "name", "sis_section_id", "integration_id", "sis_import_id", "course_id",
            "sis_course_id", "start_at", "end_at", "restrict_enrollments_to_section_dates",
            "nonxlist_course_id", "total_students",
        ]
        # enrollment rows per section, counted once instead of one Spark job per section
        students_per_section = self.enrollment['SectionID'].astype(str).value_counts().to_dict()
        # Canvas course ID per SIS course ID (as text); the first generated course wins on duplicates
        canvas_course_ids = {}
        for canvas_id, sis_id in zip(self.canvas_courses['id'], self.canvas_courses['sis_course_id']):
            canvas_course_ids.setdefault(f"{sis_id}", canvas_id)
        start_at = f"{self.startdate + dt.timedelta(days=2)}"
        end_at = f"{self.enddate}"

        rows = []
        for _, section in self.sections.iterrows():
            section_id = self.faker.unique.random_int(min=1000, max=9999)
            sis_section_id = section['SectionID']
            sis_course_id = section['CourseID']
            rows.append([
                section_id,
                section['SectionName'],
                sis_section_id,
                f"{self.integrationid}",
                self.sisimportid,
                canvas_course_ids[f"{sis_course_id}"],
                sis_course_id,
                start_at,
                end_at,
                True,   # restrict_enrollments_to_section_dates
                '',     # nonxlist_course_id
                int(students_per_section.get(f"{sis_section_id}", 0)),
            ])
        self.canvas_sections = pd.DataFrame(rows, columns=columns, dtype=object)
        self.writetojsonfile('sections', self.canvas_sections)
    
    def genAccounts(self):
        """Build the Canvas ``accounts`` table and write it out as JSON.

        The table holds, in this order: one root account, one account per row
        of ``self.canvas_courses`` (parent: the root account) and one account
        per row of ``self.canvas_sections`` (parent: the account of the
        section's course). ``genCourses`` and ``genSections`` must therefore
        run first. The table is rebuilt from scratch on every call and stored
        in ``self.canvas_accounts``.

        Raises:
            KeyError: if a section's ``sis_course_id`` matches no account
                generated before it.
        """
        columns = [
            "id", "name", "uuid", "parent_account_id", "root_account_id",
            "default_storage_quota_mb", "default_user_storage_quota_mb",
            "default_group_storage_quota_mb", "default_time_zone", "sis_account_id",
            "integration_id", "sis_import_id", "lti_guid", "workflow_state",
        ]
        integration_id = f"{self.integrationid}"

        def account_row(account_id, name, parent_account_id, group_quota_mb, sis_account_id, lti_guid):
            # values every account shares are filled in here; a fresh uuid is drawn per account
            return [
                account_id, name, self.faker.uuid4(), parent_account_id, self.accountid,
                500,            # default_storage_quota_mb
                5,              # default_user_storage_quota_mb
                group_quota_mb, # default_group_storage_quota_mb
                "America/Los_Angeles", sis_account_id, integration_id, self.sisimportid,
                lti_guid, "active",
            ]

        # root account: its SIS account ID and LTI guid are the integration ID
        rows = [account_row(self.accountid, "Contoso University Canvas Account", self.accountid, 500,
                            integration_id, integration_id)]
        # account ID per SIS account ID (as text), used to find the parent of each
        # section account without a Spark job per section; the first account wins on duplicates
        account_id_by_sis_id = {f"{integration_id}": self.accountid}

        for _, course in self.canvas_courses.iterrows():
            account_id = self.faker.unique.random_int(min=100000, max=999999)
            sis_account_id = course['sis_course_id'] # NOTE: unsure if this is an accurate assumption
            # lti_guid mirrors sis_account_id - NOTE: also unsure if this is an accurate assumption
            rows.append(account_row(account_id, course['name'], self.accountid, 50, sis_account_id, sis_account_id))
            account_id_by_sis_id.setdefault(f"{sis_account_id}", account_id)

        # section accounts are resolved only against the root and course accounts above
        for _, section in self.canvas_sections.iterrows():
            account_id = self.faker.unique.random_int(min=100000, max=999999)
            parent_account_id = account_id_by_sis_id[f"{section['sis_course_id']}"]
            sis_account_id = section['sis_section_id'] # NOTE: unsure if this is an accurate assumption
            rows.append(account_row(account_id, section['name'], parent_account_id, 15, sis_account_id, sis_account_id))

        self.canvas_accounts = pd.DataFrame(rows, columns=columns, dtype=object)
        self.writetojsonfile('accounts', self.canvas_accounts)

    def genEnrollments(self):
        """Build the Canvas ``enrollments`` table and write it out as JSON.

        Instructor enrollments (from ``self.instructors_enroll``) come first,
        then student enrollments (from ``self.enrollment``); enrollment IDs are
        assigned sequentially starting at 1 in that order. Canvas course,
        section and user IDs are looked up in ``self.canvas_courses``,
        ``self.canvas_sections`` and ``self.canvas_users``, so those tables must
        be generated first. Keys are compared as text. The table is rebuilt
        from scratch on every call and stored in ``self.canvas_enrollments``.

        Raises:
            KeyError: if an enrollment refers to a course, section or user
                that is missing from the generated Canvas tables.
        """
        columns = [
            "id", "course_id", "sis_course_id", "course_integration_id", "course_section_id",
            "section_integration_id", "sis_account_id", "sis_section_id", "sis_user_id",
            "enrollment_state", "limit_privileges_to_course_section", "sis_import_id",
            "root_account_id", "type", "user_id", "associated_user_id", "role", "role_id",
            "created_at", "updated_at", "start_at", "end_at", "last_activity_at",
            "last_attended_at", "total_activity_time", "html_url", "grades", "user",
            "override_grade", "override_score", "unposted_current_grade",
            "unposted_final_grade", "unposted_current_score", "unposted_final_score",
        ]

        def first_id_by(table, key_column):
            # maps the text form of key_column to the table's "id"; the first row wins on duplicates
            lookup = {}
            for canvas_id, key in zip(table['id'], table[key_column]):
                lookup.setdefault(f"{key}", canvas_id)
            return lookup

        # built once, replacing three Spark jobs per enrollment row
        course_ids = first_id_by(self.canvas_courses, 'sis_course_id')
        section_ids = first_id_by(self.canvas_sections, 'sis_section_id')
        user_ids = first_id_by(self.canvas_users, 'sis_user_id')

        integration_id = f"{self.integrationid}"
        created_at = f"{self.startdate}"
        start_at = f"{self.startdate + dt.timedelta(days=2)}"
        end_at = f"{self.enddate}"

        # (base-truth table, course column, section column, user column,
        #  enrollment type (also used as role), role_id, limit_privileges_to_course_section)
        enrollment_sources = (
            (self.instructors_enroll, 'Section_fromCourseId', 'InstructsClass_SectionId', 'InstructorId',
             "TeacherEnrollment", 1, False),
            (self.enrollment, 'CourseID', 'SectionID', 'StudentID',
             "StudentEnrollment", 2, True),
        )

        rows = []
        for table, course_column, section_column, user_column, enrollment_type, role_id, limit_to_section in enrollment_sources:
            for _, enroll in table.iterrows():
                sis_course_id = enroll[course_column]
                sis_section_id = enroll[section_column]
                sis_user_id = enroll[user_column]
                course_id = course_ids[f"{sis_course_id}"]
                course_section_id = section_ids[f"{sis_section_id}"]
                user_id = user_ids[f"{sis_user_id}"]
                # random moment up to 3 days, 23 hours and 59 minutes before the report date
                last_activity_at = f"{self.reportdate - dt.timedelta(days=random.randint(0,3),hours=random.randint(0,23),minutes=random.randint(0,59))}"
                total_activity_time = self.faker.random_int(min=3600, max=270000) # in seconds; between 1 and 75 hours
                # values are listed in the order of `columns`
                # additional (optional) skipped fields are excluded
                rows.append([
                    len(rows) + 1,          # id: sequential across both groups
                    course_id,
                    sis_course_id,
                    integration_id,         # course_integration_id
                    course_section_id,
                    integration_id,         # section_integration_id
                    sis_section_id,         # sis_account_id: the section ID is used here
                    sis_section_id,
                    sis_user_id,
                    "active",
                    limit_to_section,
                    self.sisimportid,
                    self.accountid,         # root_account_id
                    enrollment_type,        # type
                    user_id,
                    "",                     # associated_user_id
                    enrollment_type,        # role
                    role_id,
                    created_at,
                    created_at,             # updated_at
                    start_at,
                    end_at,
                    last_activity_at,
                    last_activity_at,       # last_attended_at
                    total_activity_time,
                    f"https://canvas.cu.edu/courses/{str(course_id)}",
                    # grades, user, override_grade, override_score and the four
                    # unposted_* fields - NOTE: nulled for now
                    *[""] * 8,
                ])
        self.canvas_enrollments = pd.DataFrame(rows, columns=columns, dtype=object)
        self.writetojsonfile('enrollments', self.canvas_enrollments)

    def genRoles(self):
        """Append the teacher and the student role to ``self.canvas_roles`` and write the table out.

        Both rows share the same base role type and workflow state and leave
        ``account`` and ``permissions`` empty.
        """
        # (label, role) - NOTE: unsure for the role field
        # NOTE(review): base_role_type is "AccountMembership" for both rows; confirm against
        # the Canvas Roles API whether course-level roles should carry the enrollment type instead.
        role_definitions = (
            ("Teacher", "TeacherEnrollment"),  # instructor or teacher role
            ("Student", "StudentEnrollment"),  # student role
        )
        for role_label, role_name in role_definitions:
            next_position = len(self.canvas_roles)
            # account and permissions - NOTE: nulled for now
            self.canvas_roles.loc[next_position] = [role_label, role_name, "AccountMembership", "", "active", ""]
        self.writetojsonfile('roles', self.canvas_roles)
    
    def genEnrollmentTerms(self):
        """Build the Canvas ``enrollment_terms`` table and write it out as JSON.

        Two terms are generated: an old, deleted "Fall 2021" term with fixed
        values, and the active "Spring 2022" term derived from
        ``self.startdate`` / ``self.enddate`` whose ``course_count`` is the
        number of rows in ``self.courses``.

        The result is stored in ``self.canvas_enrollment_terms`` as a one-row
        frame whose single ``enrollment_terms`` cell holds the list of term
        records, so the written JSON line has the shape
        ``{"enrollment_terms": [{...}, {...}]}``. Dates are written as text,
        like in the other generated tables. (The previous implementation
        indexed the frame with ``.loc["enrollment_terms"]``, a row label that
        does not exist, and raised KeyError.)
        """
        term_fields = [
            "id", "name", "start_at", "end_at", "created_at", "workflow_state",
            "grading_period_group_id", "sis_term_id", "course_count",
        ]
        # the "overrides" field is excluded from the test data for now, as though the API call did not include it
        # first term is an old fake term to demonstrate the data structure of the resulting API call
        past_term = [
            1,
            "Fall 2021",
            f"{dt.datetime(2021,9,7)}",
            f"{dt.datetime(2021,12,17)}",
            f"{dt.datetime(2021,9,1)}",
            "deleted",
            1,
            self.faker.uuid4(),
            92,
        ]
        # second term is the term the generated roster belongs to
        current_term = [
            self.enrollmenttermid,
            "Spring 2022",
            f"{self.startdate + dt.timedelta(days=2)}",
            f"{self.enddate}",
            f"{self.startdate}",
            "active",
            self.enrollmenttermid,
            self.faker.uuid4(),
            len(self.courses),  # number of courses for this term
        ]
        terms = [dict(zip(term_fields, values)) for values in (past_term, current_term)]
        self.canvas_enrollment_terms = pd.DataFrame({"enrollment_terms": [terms]}, dtype=object)
        self.writetojsonfile('enrollment_terms', self.canvas_enrollment_terms, True)
    
    def writetojsonfile(self,filename,dfOutfile,create_nest=False):
        """Write a generated table as line-delimited JSON records.

        The file goes to
        ``stage1/Transactional/test_data/v0.1/canvas_gen2/<filename>/<filename>.json``,
        resolved through ``oea.to_url``.

        Args:
            filename: table name, used for both the folder and the file name.
            dfOutfile: pandas DataFrame to write.
            create_nest: kept for backward compatibility. The frame is written
                as given either way; callers that need nested output pass a
                frame that already holds the nested records. Previously,
                passing True with any name other than 'enrollment_terms'
                silently wrote nothing.
        """
        finalgenfilepath = 'stage1/Transactional/test_data/v0.1/canvas_gen2/' + filename + '/' + filename + '.json'
        dfOutfile.to_json(oea.to_url(finalgenfilepath), orient='records', force_ascii=False, lines=True)

StatementMeta(, , , Cancelled, )