# Test Data Generation: Canvas Roster Tables Class

**Affiliation**: *Kwantum Edu Analytics*. **Last Modified**: *5/18/2023*.

This OEA test data generation class notebook generates fictitious Canvas tables, as seen in the Canvas module. This notebook is needed to successfully run the canvas_test_data_gen_demo notebook.

For reference of all Canvas tables outlined below, see Canvas table schemas in the following resources: 
 - [**Canvas Data 2 vs. Canvas Data 1 schema info as of May 2023**](https://docs.google.com/spreadsheets/d/1kqCXAD9K45L0QeEtbuuMAFp2fW8o0oC8EBzJf58SjrY/edit#gid=2091525305)- this resource was used as the main reference for test data generation.
 - [**General Canvas Data Info**](https://portal.inshosteddata.com/docs)
 - [**Canvas API Info**](https://api-gateway.instructure.com/doc/)

This class notebook primarily leans on the use of the OEA_py class notebook, ```Faker``` and ```random``` python packages, and already-generated base-truth tables to generate **7** Canvas module SIS/rostering tables:

 1. **accounts**
 2. **courses**
 3. **course_sections**
 4. **enrollments**
 5. **enrollment_terms**
 6. **roles**
 7. **users** 

There is one main method ```genCanvasRoster(startdate, enddate, reportgendate, use_general_module_base_truth)``` to generate the tables described. Parameter descriptions are given:
  - *startdate*: roster start date.
  - *enddate*: roster end date.
  - *reportgendate*: date the report(s) were generated (i.e., fictitious date when all tables were landed in the data lake).
  - *use_general_module_base_truth*: boolean argument indicating whether to use the general-module base-truth tables (i.e., base-truth tables that link students, courses, etc. across OEA modules)
    * If ```True``` - lands the general-module base-truth tables if they don't already exist, and generates Canvas test data based on these tables.
    * If ```False``` - uses the default, user-generated base-truth tables to generate Canvas test data.

In [1]:
import logging
import random, decimal
from tokenize import Ignore, String
from faker import Faker
import pandas as pd
import datetime as dt
import numpy as np
import json
from pyspark.sql import functions as F

class CanvasRosterDataGen():
    def __init__(self):
        """Create the Faker source, a run timestamp, the empty Canvas tables and the static IDs.

        Every ``canvas_*`` attribute starts as a zero-row, object-typed
        DataFrame whose columns are the fields of the matching Canvas table;
        the ``gen*`` methods append their rows to these frames.
        """
        self.faker = Faker('en_US')

        # timestamp of this run, formatted for use as a rundate folder name when writing out files
        self.currentDateTime = dt.datetime.now().strftime("%Y-%m-%d %H-%M-%S")

        def empty_table(*columns):
            # zero-row, object-typed frame with one column per given name, in the given order
            return pd.DataFrame({column: [] for column in columns}, dtype=object)

        self.canvas_accounts = empty_table(
            "id",
            "name",
            "depth",
            "workflow_state",
            "parent_account_id",
            "sis_source_id",
        )
        self.canvas_courses = empty_table(
            "id",
            "account_id",
            "enrollment_term_id",
            "name",
            "course_code",
            "created_at",
            "start_at",
            "conclude_at",
            "is_public",
            "sis_source_id",
            "workflow_state",
            "wiki_id",
            "syllabus_body",
        )
        self.canvas_course_sections = empty_table(
            "id",
            "name",
            "course_id",  # NOTE: unsure if this field is in this table
            "default_section",
            "accepting_enrollments",
            "start_at",
            "end_at",
            "created_at",
            "updated_at",
            "workflow_state",
            "restrict_enrollments_to_section_dates",
            "nonxlist_course_id",
            "sis_source_id",
        )
        self.canvas_enrollments = empty_table(
            "id",
            "type",
            "workflow_state",
            "created_at",
            "updated_at",
            "start_at",
            "end_at",
            "completed_at",
            "self_enrolled",
            "last_activity_at",
            "course_section_id",
            "user_id",  # NOTE: unsure if this field is in this table
        )
        self.canvas_enrollment_terms = empty_table(
            "id",
            "name",
            "start_at",
            "end_at",
            "sis_source_id",
        )
        self.canvas_roles = empty_table(
            "id",
            "account_id",
            "name",
            "base_role_type",
            "workflow_state",
            "created_at",
            "updated_at",
            "deleted_at",
        )
        self.canvas_users = empty_table(
            "id",
            "name",
            "time_zone",
            "created_at",
            "locale",
            "workflow_state",
            "sortable_name",
            "school_name",
            "school_position",
            "public",
            "global_canvas_id",  # NOTE: unsure if this field is in this table
        )

        # static identifiers shared by all imported SIS information
        # NOTE: integrationid also acts as the education system's LTI guid for account
        self.integrationid = self.faker.uuid4()
        # root account_id of the Canvas account of the entire education system
        self.accountid = self.faker.unique.random_int(min=10000000, max=99999999)
        # enrollment_term_id of the semester term used for the test data
        self.enrollmenttermid = self.faker.unique.random_int(min=10000000, max=99999999)

    def genCanvasRoster(self,startdate='2022-01-01T00:00:00',enddate='2022-06-01T00:00:00',reportgendate='2022-02-02T00:00:00',use_general_module_base_truth=False):
        """Load the base-truth tables and generate every Canvas rostering table.

        Args:
            startdate: roster start date, formatted ``YYYY-MM-DDTHH:MM:SS``.
            enddate: roster end date, same format.
            reportgendate: date the report(s) were generated, same format.
            use_general_module_base_truth: if True, use the general-module
                base-truth tables (downloading and landing them first when the
                ``base_general_modules`` folder does not exist); if False, use
                the user-generated base-truth tables, which must already exist.

        Raises:
            ValueError: if a date string does not match the expected format.
            requests.HTTPError: if a base-truth CSV cannot be downloaded.
        """
        # NOTE(review): `oea`, `logger` and `requests` are not imported in this
        # notebook; presumably they come from the OEA_py notebook run beforehand - confirm.
        date_format = "%Y-%m-%dT%H:%M:%S"
        self.startdate = dt.datetime.strptime(startdate, date_format)
        self.enddate = dt.datetime.strptime(enddate, date_format)
        self.reportdate = dt.datetime.strptime(reportgendate, date_format)
        self.use_general_module_base_truth = use_general_module_base_truth
        if use_general_module_base_truth:
            sourcepath = 'stage1/Transactional/test_data/v0.1/base_general_modules'
            if oea.path_exists(sourcepath):
                logger.info('General module base-truth tables already exist - delete the "base_general_modules" folder/directory if you want to replace these.')
            else:
                # manually delete and replace the general module base_truth_tables CSVs as needed
                logger.info('General module base-truth tables do not currently exist - landing in stage1/.../test_data/v0.1/base_general_modules/')
                repository_url = 'https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/main/modules/module_test_data_generation_kit/test_data/base_truth_tables/'
                # (landed table name, CSV name in the repository)
                general_tables = [
                    ('students', 'students'),
                    ('schools', 'schools'),
                    ('courses', 'courses'),
                    ('sections', 'sections'),
                    ('student_enrollment', 'enrollment'),
                    ('instructors', 'instructors'),
                    ('instructors_enroll', 'instructors_enroll'),
                ]
                # Download everything before landing anything: a failed download must
                # not leave a half-populated folder that the next run treats as complete.
                downloads = []
                for table, remote_name in general_tables:
                    response = requests.get(f'{repository_url}{remote_name}.csv')
                    # without this check an HTTP error page would be landed as CSV data
                    response.raise_for_status()
                    downloads.append((table, response.text))
                for table, data in downloads:
                    oea.land(data, f'test_data/v0.1/base_general_modules/base_{table}', f'general_module_base_truth_{table}.csv', oea.SNAPSHOT_BATCH_DATA)
            base_truth_root = sourcepath + '/'
            progress_message = 'Generating Canvas test data based on general module base-truth tables...'
        else:
            # expectation is that base_truth_tables exist
            base_truth_root = 'stage1/Transactional/test_data/v0.1/'
            progress_message = 'Generating Canvas test data based on user-generated base-truth tables...'

        def load_base_truth(folder):
            return oea.load_csv(base_truth_root + folder + '/', header=True).toPandas()

        # NOTE: if tables are not read in properly - you may need to rename the rundate folder to replace colons with hyphens
        self.students = load_base_truth('base_students')
        self.schools = load_base_truth('base_schools')
        self.courses = load_base_truth('base_courses')
        self.sections = load_base_truth('base_sections')
        self.enrollment = load_base_truth('base_student_enrollment')
        self.instructors = load_base_truth('base_instructors')
        self.instructors_enroll = load_base_truth('base_instructors_enroll')
        logger.info(progress_message)
        # generate Canvas test data tables, based on base-truth tables;
        # accounts and enrollments rely on the users/courses/sections tables built before them
        self.genUsers()
        self.genCourses()
        self.genCourseSections()
        self.genAccounts()
        self.genEnrollments()
        self.genRoles()
        self.genEnrollmentTerms()
        logger.info('Successfully generated Canvas rostering tables.')
        logger.info('Finished Canvas generation.')

    def __get_daterange(self):
        daterange = []
        startdate = dt.datetime(2022,1,3)
        enddate = dt.datetime(2022,1,28)
        while(startdate < enddate):
            daterange.append(startdate)
            startdate = startdate + dt.timedelta(days=1)
        return daterange

    def genUsers(self):
        # iterates through both the base-truth instructors and students tables and adds them to the canvas users table
        for index, instructor in self.instructors.iterrows():
            id = self.faker.unique.random_int(min=1000, max=9999)
            lastname = instructor['LastName']
            firstname = instructor['FirstName']
            name = f"{firstname} {lastname}"
            time_zone = "America/Los_Angeles"
            created_at = f"{self.startdate}"
            locale = "en"
            workflow_state = "registered"
            sortable_name = f"{firstname}, {lastname}"
            school_name = "Contoso University"
            school_position = "Professor"
            public = True
            global_canvas_id = instructor['InstructorID'] # NOTE: currently using this field for SIS IDs
            self.canvas_users.loc[len(self.canvas_users.index)] = [id,name,time_zone,created_at,locale,workflow_state,sortable_name,school_name,school_position,public,global_canvas_id]
        for index, student in self.students.iterrows():
            id = self.faker.unique.random_int(min=1000, max=9999)
            lastname = student['LastName']
            firstname = student['FirstName']
            name = f"{firstname} {lastname}"
            time_zone = "America/Los_Angeles"
            created_at = f"{self.startdate}"
            locale = "en"
            workflow_state = "registered"
            sortable_name = f"{firstname}, {lastname}"
            school_name = student['SchoolName']
            school_position = "Student"
            global_canvas_id = student['StudentID'] # NOTE: currently using this field for SIS IDs
            self.canvas_users.loc[len(self.canvas_users)] = [id,name,time_zone,created_at,locale,workflow_state,sortable_name,school_name,school_position,public,global_canvas_id]
        self.writetojsonfile('users', self.canvas_users)

    def genCourses(self):
        for index, course in self.courses.iterrows():
            id = self.faker.unique.random_int(min=100000, max=999999)
            account_id = self.faker.unique.random_int(min=10000000, max=99999999)
            enrollment_term_id = self.enrollmenttermid
            name = course['CourseName']
            course_code = course['CourseID'] # NOTE: unsure
            created_at = f"{self.startdate}"
            start_at = f"{self.startdate + dt.timedelta(days=2)}"
            conclude_at = f"{self.enddate}"
            is_public = True
            sis_source_id = course['CourseID']
            workflow_state = "available"
            wiki_id = None # NOTE: nulled at the moment
            syllabus_body = "<p>Students taking the course will learn the following...</p>"
            self.canvas_courses.loc[len(self.canvas_courses.index)] = [id,account_id,enrollment_term_id,name,course_code,created_at,start_at,conclude_at,is_public,sis_source_id, \
                                                                        workflow_state,wiki_id,syllabus_body]
        self.writetojsonfile('courses', self.canvas_courses)
    
    def genCourseSections(self):
        # set previously generated course table to get the Canvas course ID
        df_courses = spark.createDataFrame(self.canvas_courses)
        df_courses = df_courses.select('id', 'sis_source_id')
        for index, section in self.sections.iterrows():
            id = self.faker.unique.random_int(min=10000, max=99999)
            name = section['SectionName']
            siscourseid = section['CourseID']
            course_id = df_courses.filter(df_courses['sis_source_id'] == f'{siscourseid}').collect()[0][0]
            default_section = True
            accepting_enrollments = False
            start_at = f"{self.startdate + dt.timedelta(days=2)}"
            end_at = f"{self.enddate}"
            created_at = f"{self.startdate}"
            updated_at = f"{self.startdate}"
            workflow_state = "active"
            restrict_enrollments_to_section_dates = True
            nonxlist_course_id = ""
            sis_source_id = section['SectionID']
            self.canvas_course_sections.loc[len(self.canvas_course_sections.index)] = [id,name,course_id,default_section,accepting_enrollments,start_at,end_at,created_at, \
                                                                                updated_at,workflow_state,restrict_enrollments_to_section_dates,nonxlist_course_id,sis_source_id]
        self.writetojsonfile('course_sections', self.canvas_course_sections)

    def genAccounts(self):
        # NOTE: courses and course_sections must already be created
        # generally unsure if this is accurate
        id = self.accountid
        name = "Contoso University Canvas Accounts"
        depth = 0
        workflow_state = "active"
        parent_account_id = None
        sis_source_id = f"{self.integrationid}" # currently, just use the same sis_source_id for all accounts
        self.canvas_accounts.loc[len(self.canvas_accounts)] = [id,name,depth,workflow_state,parent_account_id,sis_source_id]
        for index, school in self.schools.iterrows():
            id = self.faker.unique.random_int(min=10000000, max=99999999) # create unique id per school
            schoolname = school['SchoolName']
            name = f"{schoolname} Canvas Account in Contoso University"
            depth = 1
            workflow_state = "active"
            parent_account_id = self.accountid
            self.canvas_accounts.loc[len(self.canvas_accounts)] = [id,name,depth,workflow_state,parent_account_id,sis_source_id]
        # use these two tables to find the previously generated id for finding the parent accounts (schools) of each course
        dfBT_courses = spark.createDataFrame(self.courses)
        df_accounts = spark.createDataFrame(self.canvas_accounts)
        for index, course in self.canvas_courses.iterrows():
            id = course['account_id'] # create unique id per course
            coursename = course['name']
            name = f"{coursename} Course Canvas Account in Contoso University"
            depth = 2
            workflow_state = "active"
            course_school = dfBT_courses.filter(dfBT_courses['CourseID'] == course['sis_source_id']).select('SchoolName').collect()[0][0] # find the school the course belongs to
            parent_account_id = df_accounts.filter(df_accounts['name'].contains(f"{course_school}")).collect()[0][0]
            self.canvas_accounts.loc[len(self.canvas_accounts)] = [id,name,depth,workflow_state,parent_account_id,sis_source_id]
        # use these two tables to find the previously generated id for finding the parent accounts (courses) of each section
        dfBT_sections = spark.createDataFrame(self.sections)
        df_accounts = spark.createDataFrame(self.canvas_accounts)
        df_accounts = df_accounts.filter(df_accounts['depth'] == 2)
        for index, section in self.canvas_course_sections.iterrows():
            id = self.faker.unique.random_int(min=10000000, max=99999999) # create unique id per section
            sectionname = section['name']
            name = f"{sectionname} Section Canvas Account in Contoso University"
            depth = 3
            workflow_state = "active"
            section_course = dfBT_sections.filter(dfBT_sections['SectionID'] == section['sis_source_id']).select('CourseName').collect()[0][0] # find the course the section belongs to
            parent_account_id = df_accounts.filter(df_accounts['name'].contains(f"{section_course}")).collect()[0][0]
            sis_source_id = f"{self.integrationid}"
            self.canvas_accounts.loc[len(self.canvas_accounts)] = [id,name,depth,workflow_state,parent_account_id,sis_source_id]
        self.writetojsonfile('accounts', self.canvas_accounts)

    def genEnrollments(self):
        # set courses, sections and users tables to extract the Canvas course, section and user IDs
        df_courses = spark.createDataFrame(self.canvas_courses)
        df_sections = spark.createDataFrame(self.canvas_course_sections)
        df_users = spark.createDataFrame(self.canvas_users)
        # add instructor enrollments to the canvas enrollments table
        for index, enroll in self.instructors_enroll.iterrows():
            id = self.faker.unique.random_int(min=1000000, max=9999999)
            type = "TeacherEnrollment"
            workflow_state = "active"
            created_at = f"{self.startdate}"
            updated_at = f"{self.startdate}"
            start_at = f"{self.startdate + dt.timedelta(days=2)}"
            end_at = f"{self.enddate}"
            completed_at = ""
            self_enrolled = True
            last_activity_at = f"{self.reportdate - dt.timedelta(days=random.randint(0,3),hours=random.randint(0,23),minutes=random.randint(0,59))}"
            sis_section_id = enroll['InstructsClass_SectionId'] # use the SIS section ID to find the associated/generated canvas course_section ID
            course_section_id = df_sections.filter(df_sections['sis_source_id'] == f"{sis_section_id}").collect()[0][0]
            sis_user_id = enroll['InstructorId'] # use the SIS instructor ID to find the associated/generated canvas user ID
            user_id = df_users.filter(df_users['global_canvas_id'] == f"{sis_user_id}").collect()[0][0]
            self.canvas_enrollments.loc[len(self.canvas_enrollments.index)] = [id,type,workflow_state,created_at,updated_at,start_at,end_at,completed_at,self_enrolled,last_activity_at,course_section_id,user_id]
        # now add student enrollments
        for index, enroll in self.enrollment.iterrows():
            id = self.faker.unique.random_int(min=1000000, max=9999999)
            type = "StudentEnrollment"
            workflow_state = "active"
            created_at = f"{self.startdate}"
            updated_at = f"{self.startdate}"
            start_at = f"{self.startdate + dt.timedelta(days=2)}"
            end_at = f"{self.enddate}"
            completed_at = ""
            self_enrolled = False
            last_activity_at = f"{self.reportdate - dt.timedelta(days=random.randint(0,3),hours=random.randint(0,23),minutes=random.randint(0,59))}"
            sis_section_id = enroll['SectionID'] # use the SIS section ID to find the associated/generated canvas course_section ID
            course_section_id = df_sections.filter(df_sections['sis_source_id'] == f"{sis_section_id}").collect()[0][0]
            sis_user_id = enroll['StudentID'] # use the SIS student ID to find the associated/generated canvas user ID
            user_id = df_users.filter(df_users['global_canvas_id'] == f"{sis_user_id}").collect()[0][0]
            self.canvas_enrollments.loc[len(self.canvas_enrollments)] = [id,type,workflow_state,created_at,updated_at,start_at,end_at,completed_at,self_enrolled,last_activity_at,course_section_id,user_id]
        self.writetojsonfile('enrollments', self.canvas_enrollments)

    def genRoles(self):
        # add instructor or teacher role
        id = self.faker.unique.random_int(min=10, max=99)
        account_id = self.accountid
        name = "TeacherEnrollment" 
        base_role_type = "AccountMembership"
        workflow_state = "active" 
        created_at = f"{self.startdate}"
        updated_at = f"{self.startdate}"
        deleted_at = f"{self.enddate}"
        self.canvas_roles.loc[len(self.canvas_roles)] = [id,account_id,name,base_role_type,workflow_state,created_at,updated_at,deleted_at]
        # add student role
        id = self.faker.unique.random_int(min=10, max=99)
        account_id = self.accountid
        name = "StudentEnrollment" 
        base_role_type = "AccountMembership"
        workflow_state = "active" 
        created_at = f"{self.startdate}"
        updated_at = f"{self.startdate}"
        deleted_at = f"{self.enddate}"
        self.canvas_roles.loc[len(self.canvas_roles)] = [id,account_id,name,base_role_type,workflow_state,created_at,updated_at,deleted_at]
        self.writetojsonfile('roles', self.canvas_roles)
    
    def genEnrollmentTerms(self):
        # currently holds two enrollment terms for the test data
        # first term is an old fake term to demonstrate data in the enrollment_terms table
        id = self.faker.unique.random_int(min=10000000, max=99999999)
        name = "Fall 2021" 
        start_at = dt.datetime(2021,9,7)
        end_at = dt.datetime(2021,12,17)
        sis_source_id = self.faker.uuid4()
        self.canvas_enrollment_terms.loc[len(self.canvas_enrollment_terms)] = [id,name,start_at,end_at,sis_source_id]
        # add the spring 2022 term (which is currently the setting for the test data)
        id = self.enrollmenttermid
        name = "Spring 2022" 
        start_at = f"{self.startdate + dt.timedelta(days=2)}"
        end_at = f"{self.enddate}"
        sis_source_id = self.faker.uuid4()
        self.canvas_enrollment_terms.loc[len(self.canvas_enrollment_terms)] = [id,name,start_at,end_at,sis_source_id]
        self.writetojsonfile('enrollment_terms', self.canvas_enrollment_terms)
    
    def writetojsonfile(self,filename,pdfOutfile):
        """Write a generated pandas table as JSON under the ``canvas_gen3`` test-data folder.

        Args:
            filename: table name, used as the sub-folder the JSON is written to.
            pdfOutfile: pandas DataFrame holding the generated table.

        The frame is converted to a Spark DataFrame, reduced to one partition
        and saved in overwrite mode, replacing any earlier output for the table.
        """
        target_folder = 'stage1/Transactional/test_data/v0.1/canvas_gen3/' + filename + '/'
        spark_frame = spark.createDataFrame(pdfOutfile)
        single_partition = spark_frame.coalesce(1)
        single_partition.write.save(
            oea.to_url(target_folder),
            format='json',
            mode='overwrite',
            header='true',
            mergeSchema='true',
        )

StatementMeta(, , , Cancelled, )