# Test Data Generation: Canvas Activity Tables Class

**Affiliation**: *Kwantum Edu Analytics*. **Last Modified**: *5/19/2023*.

This OEA test data generation class notebook generates fictitious Canvas tables, as seen in the Canvas module. This notebook is needed to successfully run the canvas_test_data_gen_demo notebook.

For reference of all Canvas tables outlined below, see Canvas table schemas here: https://canvas.instructure.com/doc/api/all_resources.html

This class notebook primarily leans on the use of the OEA_py class notebook, ```Faker``` and ```random``` python packages, and already-generated base-truth tables to generate **9** Canvas module tables (only the activity tables; rostering tables are expected to already have been created):

 1. **assignments**
 2. **assignment_submissions**
 3. **assignment_submission_summary**
 4. **modules**
 5. **module_items**
 6. **outcome_results**
 7. **quizzes** 
 8. **quiz_submissions**
 9. **results**

There is one main method ```genCanvasActivity(startdate, enddate, reportgendate, canvas_roster_tables_source_path, max_num_activities_per_class)``` to generate the tables described. Parameter descriptions are given:
  - *startdate*: semester start date.
  - *enddate*: semester end date.
  - *reportgendate*: date the report(s) were generated (i.e., fictitious date when all tables were landed in the data lake).
  - *canvas_roster_tables_source_path*: source of Canvas roster/SIS tables previously generated.
  - *max_num_activities_per_class*: randomly samples all courses from Canvas roster data, then randomly selects the number of activities per course from 0 up to this parameter value. For example:
    * ```max_num_activities_per_class = 3``` means when generating assignment test data, it will go through every course and randomly choose how many assignments will be generated per class - from 0 to 3.

In [1]:
import logging
import random, decimal
from tokenize import Ignore
from faker import Faker
import pandas as pd
import datetime as dt
import numpy as np
from pyspark.sql import functions as F

class CanvasActivityDataGen():
    def __init__(self, startdate='2022-01-03T00:00:00', enddate='2022-06-03T00:00:00'):
        """Create the Faker instance, a run timestamp and one empty frame per Canvas activity table.

        NOTE: ``startdate`` and ``enddate`` are accepted but not used here; the
        semester dates are parsed and stored by ``genCanvasActivity``.
        """

        def empty_table(column_names):
            # every column starts out as an empty list and is typed as object
            return pd.DataFrame({column: [] for column in column_names}, dtype=object)

        self.faker = Faker('en_US')

        # run timestamp (hyphens instead of colons), kept for the rundate folder when writing out files
        self.currentDateTime = dt.datetime.now().strftime("%Y-%m-%d %H-%M-%S")

        # one empty frame per Canvas table to be generated;
        # columns that are not generated are left as comments at their position
        self.canvas_assignments = empty_table((
            "id",
            "name",
            "description",
            "created_at",
            "updated_at",
            "due_at",
            "lock_at",
            "unlock_at",
            "has_overrides",
            "course_id",
            "html_url",
            "submissions_download_url",
            "assignment_group_id",
            "due_date_required",
            "allow_extensions",
            "max_name_length",
            "turnitin_enabled",
            "verticite_enabled",
            "turnitin_settings",
            "grade_group_students_individually",
            # "external_tool_tag_attributes",
            "peer_reviews",
            "automatic_peer_reviews",
            "peer_review_count",
            "peer_reviews_assign_at",
            "intra_group_peer_reviews",
            "group_category_id",
            "needs_grading_count",
            "needs_grading_count_by_section",
            "position",
            "points_possible",
            "submission_types",
            "has_submitted_submissions",
            "grading_type",
            "grading_standard_id",
            "published",
            "unpublishable",
            "only_visible_to_overrides",
            "locked_for_user",
            # "submission",
            "moderated_grading",
            "grader_count",
            "final_grader_id",
            "grader_comments_visible_to_graders",
            "grader_anonymous_to_graders",
            "grader_names_visible_to_final_grader",
            "anonymous_grading",
            "allowed_attempts",
            "post_manually",
            # "score_statistics",
            "annotatable_attachment_id",
        ))
        self.canvas_assignment_submissions = empty_table((
            "assignment_id",
            "assignment",
            "course",
            "attempt",
            "body",
            "grade",
            "grade_matches_current_submission",
            "html_url",
            "preview_url",
            "score",
            # "submission_comments",
            "submission_type",
            "submitted_at",
            "url",
            "user_id",
            "grader_id",
            "graded_at",
            "user",
            "late",
            "assignment_visible",
            "excused",
            "missing",
            "late_policy_status",
            "points_deducted",
            "seconds_late",
            "workflow_state",
            "extra_attempts",
            "anonymous_id",
            "posted_at",
            # "read_status",
            "redo_request",
        ))
        self.canvas_assignment_submission_summary = empty_table((
            'assignment_id',  # NOTE: this field is not in production data - will need to be updated
            'graded',
            'ungraded',
            'not_submitted',
        ))
        self.canvas_modules = empty_table((
            "id",
            "workflow_state",
            "position",
            "name",
            "unlock_at",
            "require_sequential_progress",
            "prerequisite_module_ids",
            "items_count",
            "items_url",
            "items",
            "state",
            "completed_at",
            "publish_final_grade",
            "published",
        ))
        self.canvas_module_items = empty_table((
            "id",
            "module_id",
            "position",
            "title",
            "indent",
            "type",
            "content_id",
            "html_url",
            "url",
            "page_url",
            "external_url",
            "new_tab",
            "completion_requirement",
            "content_details",
            "published",
        ))
        self.canvas_outcome_results = empty_table((
            "id",
            "score",
            "submitted_or_assessed_at",
            "links",
            "percent",
        ))
        self.canvas_quizzes = empty_table((
            "id",
            "title",
            "html_url",
            "mobile_url",
            # "preview_url",
            "description",
            "quiz_type",
            "assignment_group_id",
            "time_limit",
            "shuffle_answers",
            "hide_results",
            "show_correct_answers",
            "show_correct_answers_last_attempt",
            "show_correct_answers_at",
            "hide_correct_answers_at",
            "one_time_results",
            "scoring_policy",
            "allowed_attempts",
            "one_question_at_a_time",
            "question_count",
            "points_possible",
            "cant_go_back",
            "access_code",
            "ip_filter",
            "due_at",
            "lock_at",
            "unlock_at",
            "published",
            "unpublishable",
            "locked_for_user",
            # "lock_info",
            # "lock_explanation",
            "speedgrader_url",
            "quiz_extensions_url",
            "permissions",
            "all_dates",
            "version_number",
            "question_types",
            "anonymous_submissions",
        ))
        self.canvas_quiz_submissions = empty_table((
            "id",
            "quiz_id",
            "user_id",
            "submission_id",
            "started_at",
            "finished_at",
            "end_at",
            "attempt",
            "extra_attempts",
            "extra_time",
            "manually_unlocked",
            "time_spent",
            "score",
            "score_before_regrade",
            "kept_score",
            "fudge_points",
            "has_seen_results",
            "workflow_state",
            "overdue_and_needs_submission",
        ))

    def genCanvasActivity(self,startdate='2022-01-01T00:00:00',enddate='2022-06-01T00:00:00',reportgendate='2022-02-02T00:00:00',canvas_roster_tables_source_path='stage1/Transactional/test_data/v0.1/canvas_gen',max_num_activities_per_class=5,use_general_module_base_truth=True):
        """Load the base-truth and Canvas roster tables, then generate the Canvas activity tables.

        Args:
            startdate: semester start date, formatted as ``%Y-%m-%dT%H:%M:%S``.
            enddate: semester end date, same format.
            reportgendate: date the report(s) were generated, same format.
            canvas_roster_tables_source_path: folder holding the previously generated Canvas
                roster JSON tables (accounts, courses, enrollments, roles, sections, users).
            max_num_activities_per_class: upper bound handed to the assignment and quiz generators.
            use_general_module_base_truth: when True (the default, and the only behavior
                available before this flag became a parameter) the general-module base-truth
                CSVs are used and downloaded first if their folder does not exist; when False
                the user-generated base-truth tables under stage1/Transactional/test_data/v0.1/
                are expected to exist already.

        Raises:
            ValueError: if one of the date strings does not match the expected format.
            requests.HTTPError: if a base-truth CSV download answers with an error status.

        NOTE(review): ``oea``, ``logger`` and ``requests`` are not defined or imported in the
        visible part of this notebook - presumably they are provided by the OEA_py class
        notebook session; confirm.
        """
        timestamp_format = "%Y-%m-%dT%H:%M:%S"
        self.startdate = dt.datetime.strptime(startdate, timestamp_format)
        self.enddate = dt.datetime.strptime(enddate, timestamp_format)
        self.reportdate = dt.datetime.strptime(reportgendate, timestamp_format)

        # (attribute on self, base-truth table suffix, CSV file name in the OEA repository)
        base_truth_tables = (
            ('students', 'students', 'students.csv'),
            ('schools', 'schools', 'schools.csv'),
            ('courses', 'courses', 'courses.csv'),
            ('sections', 'sections', 'sections.csv'),
            ('enrollment', 'student_enrollment', 'enrollment.csv'),
            ('instructors', 'instructors', 'instructors.csv'),
            ('instructors_enroll', 'instructors_enroll', 'instructors_enroll.csv'),
        )

        if use_general_module_base_truth:
            sourcepath = 'stage1/Transactional/test_data/v0.1/base_general_modules'
            if oea.path_exists(sourcepath):
                logger.info('General module base-truth tables already exist - delete the "base_general_modules" folder/directory if you want to replace these.')
            else:
                # manually delete and replace the general module base_truth_tables CSVs as needed
                logger.info('General module base-truth tables do not currently exist - landing in stage1/.../test_data/v0.1/base_general_modules/')
                repo_url = 'https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/main/modules/module_test_data_generation_kit/test_data/base_truth_tables'
                for _, suffix, csv_name in base_truth_tables:
                    response = requests.get(f'{repo_url}/{csv_name}')
                    # fail loudly instead of landing an HTTP error page as if it were CSV data
                    response.raise_for_status()
                    oea.land(response.text, f'test_data/v0.1/base_general_modules/base_{suffix}', f'general_module_base_truth_{suffix}.csv', oea.SNAPSHOT_BATCH_DATA)
            loaded_message = 'Generating Canvas test data based on general module base-truth tables...'
        else:
            # expectation is that base_truth_tables exist
            sourcepath = 'stage1/Transactional/test_data/v0.1'
            loaded_message = 'Generating Canvas test data based on user-generated base-truth tables...'

        # NOTE: if tables are not read in properly - you may need to rename the rundate folder to replace colons with hyphens
        for attribute, suffix, _ in base_truth_tables:
            setattr(self, attribute, oea.load_csv(f'{sourcepath}/base_{suffix}/', header=True).toPandas())
        logger.info(loaded_message)

        # load in Canvas SIS/roster tables (NOTE: these are expected to already have been created)
        for table in ('accounts', 'courses', 'enrollments', 'roles', 'sections', 'users'):
            roster_url = oea.to_url(f'{canvas_roster_tables_source_path}/{table}/*.json')
            setattr(self, f'canvas_{table}', pd.read_json(roster_url, lines=True))
        logger.info('Successfully loaded Canvas SIS/rostering tables. Now generating Canvas activity tables...')

        self.genAssignment_tables(max_num_assigns=max_num_activities_per_class)
        self.genQuiz_tables(max_num_quizzes=max_num_activities_per_class)
        # not generated yet:
        #self.genForum_tables(max_num_forums=max_num_activities_per_class)
        #self.genLesson_tables(max_num_lessons=max_num_activities_per_class)
        #self.genMessage_tables(num_convos=num_courses_to_gen_activity)
        logger.info('Successfully generated Canvas activity tables (for assignments, quizzes, and modules).')
        logger.info('Finished Canvas generation.')

    def __get_daterange(self,start_date=dt.datetime(2022,1,3),end_date=dt.datetime(2022,1,28)):
        daterange = []
        startdate = start_date
        enddate = end_date
        while(startdate < enddate):
            daterange.append(startdate)
            startdate = startdate + dt.timedelta(days=1)
        return daterange
    
    def __get_lettergrade(self,percentage):
        if percentage >= 0.93:
            lettergrade = "A"
        elif percentage >= 0.9:
            lettergrade = "A-"
        elif percentage >= 0.87:
            lettergrade = "B+" 
        elif percentage >= 0.83:
            lettergrade = "B" 
        elif percentage >= 0.8:
            lettergrade = "B-" 
        elif percentage >= 0.77:
            lettergrade = "C+"
        elif percentage >= 0.73:
            lettergrade = "C" 
        elif percentage >= 0.7:
            lettergrade = "C-"
        elif percentage >= 0.67:
            lettergrade = "D+" 
        elif percentage >= 0.63:
            lettergrade = "D" 
        elif percentage >= 0.6:
            lettergrade = "D-"
        else:
            lettergrade = "F"
        return lettergrade
    
    def genAssignment_tables(self,max_num_assigns=3):
        """Generate 3 assignment tables: assignments, assignment_submissions and assignment_submission_summary.

        For every row of ``self.canvas_sections`` a random number of assignments
        (0 up to ``max_num_assigns``, inclusive) is created. For each assignment the
        assignment row, its submissions and its submission summary are generated, and
        the three accumulated tables are finally handed to ``self.writetojsonfile``.

        An assignment counts as graded when its due date is not after the report date
        (``self.reportdate``; 2022-02-02 is used when that attribute has not been set).

        Args:
            max_num_assigns: upper bound (inclusive) on the assignments generated per section.

        Raises:
            IndexError: if a section has no instructor in ``self.instructors_enroll`` or
                the instructor is missing from ``self.canvas_users``.

        NOTE(review): ``spark`` is not defined in the visible part of this notebook -
        presumably the Spark session supplied by the notebook environment; confirm.
        """
        # NOTE: assignment days are drawn from this fixed window rather than from self.startdate
        date_range = self.__get_daterange(start_date=dt.datetime(2022,1,3),end_date=dt.datetime(2022,1,21))
        report_date = getattr(self, 'reportdate', dt.datetime(2022,2,2))
        # the Spark views of the lookup tables do not change per section, so build them once
        dfBT_enroll = spark.createDataFrame(self.enrollment)
        dfBT_instructor_enroll = spark.createDataFrame(self.instructors_enroll)
        df_users = spark.createDataFrame(self.canvas_users)
        # running count of assignments across all sections (used as the assignment position)
        assignnumber_in_system = 1
        for _, section in self.canvas_sections.iterrows():
            sis_section_id = section['sis_section_id']
            course_id = section['course_id']
            section_name = section['name']
            # students enrolled in this section
            dfEnroll = dfBT_enroll.filter(dfBT_enroll['SectionID'] == f'{sis_section_id}')
            num_students_in_section = dfEnroll.count()
            # instructor of the section: the first column of the first matching row is taken in both lookups
            instructor_sis_id = dfBT_instructor_enroll.filter(dfBT_instructor_enroll['InstructsClass_SectionId'] == f'{sis_section_id}').collect()[0][0]
            instructor_id = df_users.filter(df_users['sis_user_id'] == f'{instructor_sis_id}').collect()[0][0]
            # randomly choose how many assignments have been assigned in this class
            num_assigns = random.randint(0,max_num_assigns)
            for assignnumber_in_section in range(1, num_assigns + 1):
                # choose the day this assignment was assigned
                assign_day = random.choice(date_range)
                # generate the assignment id
                assign_id = self.faker.unique.random_int(min=100000, max=999999)
                # finally generate the assignment tables
                allowed_attempts,duedate = self._genAssignments(assign_id,course_id,assignnumber_in_section,assignnumber_in_system,section_name,assign_day,num_students_in_section,instructor_id)
                graded = duedate <= report_date
                self._genAssignmentSubmissions(assign_id,course_id,assignnumber_in_section,dfEnroll,allowed_attempts,graded,duedate,section_name,assign_day,num_students_in_section,instructor_id)
                self._genAssignmentSubmissionSummary(assign_id,graded,num_students_in_section,dfEnroll)
                assignnumber_in_system += 1
        self.writetojsonfile('assignments', self.canvas_assignments)
        self.writetojsonfile('assignment_submissions', self.canvas_assignment_submissions)
        self.writetojsonfile('assignment_submission_summary', self.canvas_assignment_submission_summary)

    def _genAssignments(self,assignid,courseid,assignnumber_in_section,assignnumber_in_system,sectionname,assignday,num_students_in_section,instructor_id):
        # NOTE: this code assumes there's supposed to be one row per assignment
        id = assignid
        name = f"Assignment {str(assignnumber_in_section)} for {sectionname}" # NOTE: can be modified for more unique names
        description = "<p>Do the following:</p>..."
        created_at = f"{assignday - dt.timedelta(days=random.randint(0,1),hours=random.randint(0,23),minutes=random.randint(0,59))}"
        updated_at = f"{created_at}"
        duedate = assignday + dt.timedelta(days=random.randint(7,14))
        due_at = f"{duedate}"
        lock_at = f"{self.enddate}"
        unlock_at = f"{assignday}"
        has_overrides = True
        course_id = courseid
        html_url = f"https://canvas.cu.edu/courses/{str(courseid)}/assignments/{str(assignid)}"
        submissions_download_url = f"https://canvas.cu.edu/courses/{str(courseid)}/assignments{str(assignid)}/submissions?zip=1"
        assignment_group_id = self.faker.random_int(min=1000, max=9999)
        due_date_required = True
        allow_extensions = "[docx, pptx, xlsx, pdf]" # NOTE: this is supposed to be an array, but temporarily creating this field as a string
        max_name_length = 20
        turnitin_enabled = True
        verticite_enabled = False 
        turnitin_settings = ""
        grade_group_students_individually = None # NOTE: typically boolean for group assignments - temp assumption that all assignments are individual
        #external_tool_tag_attributes =
        peer_reviews = False
        automatic_peer_reviews = False 
        peer_review_count = 0
        peer_reviews_assign_at = ""
        intra_group_peer_reviews = False 
        group_category_id = None
        if duedate < dt.datetime(2022,2,2):
            needs_grading_count = 0
        else:
            needs_grading_count = num_students_in_section
        needs_grading_count_by_section = "" # NOTE: supposed to be an array representing section_ids with the number of assignments needed to be graded
        position = assignnumber_in_system
        points_possible = 100
        submission_types = ["on_paper"] # NOTE: should be an arry; using string for now. can be: discussion_topic, online_quiz, on_paper, none, external_tool, online_text_entry, online_url, online_upload, media_recording, student_annotation
        has_submitted_submissions = True
        grading_type = "points" # NOTE: can be: pass_fail, percent, letter_grade, gpa_scale or points
        grading_standard_id = None
        published = True
        unpublishable = False
        only_visible_to_overrides = False 
        locked_for_user = False
        moderated_grading = True
        grader_count = 1
        final_grader_id = instructor_id
        grader_comments_visible_to_graders = True
        grader_anonymous_to_graders = False
        grader_names_visible_to_final_grader = True
        anonymous_grading = True
        allowed_attempts = random.randint(1,3)
        post_manually = True
        annotatable_attachment_id = None
        self.canvas_assignments.loc[len(self.canvas_assignments)] = [id,name,description,created_at,updated_at,due_at,lock_at,unlock_at,has_overrides,course_id,html_url,submissions_download_url,assignment_group_id, \
                                                                    due_date_required,allow_extensions,max_name_length,turnitin_enabled,verticite_enabled,turnitin_settings,grade_group_students_individually, \
                                                                    peer_reviews,automatic_peer_reviews,peer_review_count,peer_reviews_assign_at,intra_group_peer_reviews,group_category_id,needs_grading_count,needs_grading_count_by_section, \
                                                                    position,points_possible,submission_types,has_submitted_submissions,grading_type,grading_standard_id,published,unpublishable,only_visible_to_overrides,locked_for_user, \
                                                                    moderated_grading,grader_count,final_grader_id,grader_comments_visible_to_graders,grader_anonymous_to_graders,grader_names_visible_to_final_grader,anonymous_grading, \
                                                                    allowed_attempts,post_manually,annotatable_attachment_id]
        return allowed_attempts,duedate

    def _genAssignmentSubmissions(self,assignid,courseid,assignnumber_in_section,dfEnroll,allowed_attempts,graded,duedate,sectionname,assignday,num_students_in_section,instructor_id):
        """Append the submission rows of one assignment to ``self.canvas_assignment_submissions``.

        Between half and all of the students of the section are sampled as submitters; each of
        them gets one row per attempt (1 up to ``allowed_attempts``). When the assignment has
        been graded, every student that did not submit additionally gets a single "missing"
        row with a score of 0.

        Args:
            assignid: id of the assignment.
            courseid: id of the course the assignment belongs to.
            assignnumber_in_section: running number of the assignment within its section (used in the name).
            dfEnroll: Spark DataFrame with the section's enrollment rows (needs a ``StudentID`` column).
            allowed_attempts: maximum number of attempts a student may have.
            graded: whether the assignment has already been graded.
            duedate: due datetime of the assignment.
            sectionname: section name, used in the assignment name.
            assignday: datetime on which the assignment was assigned.
            num_students_in_section: number of students enrolled in the section.
            instructor_id: stored as ``grader_id`` on graded rows.

        Raises:
            IndexError: if a student's id is not found in ``self.canvas_users``.

        NOTE(review): ``spark`` is not defined in the visible part of this notebook -
        presumably the Spark session supplied by the notebook environment; confirm.
        NOTE(review): rows of students that did submit are written with ``excused = True``
        (and ``False`` for missing ones) - confirm this is intended.
        """
        # randomly sample the students enrolled in the course for assignment submissions
        half_of_students_in_class = round(num_students_in_section / 2)
        num_students_submit = random.randint(half_of_students_in_class, num_students_in_section)
        df_enroll = dfEnroll.toPandas()
        df_students_submit = df_enroll.sample(n=num_students_submit)
        # set users roster table to extract user IDs
        df_users = spark.createDataFrame(self.canvas_users)

        def lookup_user(sis_user_id):
            # a single query per student: the first column of the match is the user id, 'name' the display name
            user_row = df_users.filter(df_users['sis_user_id'] == sis_user_id).collect()[0]
            return user_row[0], user_row['name']

        # values shared by every row of this assignment
        assignment_id = assignid
        assignment = f"Assignment {str(assignnumber_in_section)} for {sectionname}" # NOTE: can be modified for more unique names
        course = courseid
        body = "" # NOTE: assumption is that the assignment was uploaded rather than a filled field
        submission_type = "online_upload" # NOTE: accepted values - online_text_entry, online_url, online_upload, media_recording or student_annotation
        url = "" # NOTE: only for online_url submission type
        extra_attempts = 0
        redo_request = False
        # optional table attributes currently left out: submission_comments, read_status

        for _, student in df_students_submit.iterrows():
            user_id, user = lookup_user(student['StudentID'])
            html_url = f"https://canvas.cu.edu/courses/{str(courseid)}/assignments/{str(assignid)}/submissions/{str(user_id)}"
            assignment_visible = True
            excused = True
            missing = False
            late_policy_status = "can be late" # NOTE: accepted values - can be late, missing, extended, none, or null
            points_deducted = 0
            workflow_state = "submitted"
            # randomly create the number of attempts/assignment submissions for this student
            if allowed_attempts == 1:
                random_num_attempts = 1
            else:
                random_num_attempts = random.randint(1,allowed_attempts)
            for attempt in range(1, random_num_attempts + 1):
                # the preview link is derived from the submission link; html_url itself stays untouched
                preview_url = f"{html_url}?preview={str(attempt)}"
                submitted_daytime = assignday + dt.timedelta(days=random.randint(1,10),hours=random.randint(0,23),minutes=random.randint(0,59))
                submitted_at = f"{submitted_daytime}"
                anonymous_id = self.faker.uuid4()
                if graded:
                    grader_id = instructor_id
                    score = round(random.triangular(40.00,100.00,78.00))
                    # calculate letter grade associated
                    grade = self.__get_lettergrade(score/100)
                    graded_daytime = submitted_daytime + dt.timedelta(days=random.randint(0,3),hours=random.randint(0,23),minutes=random.randint(0,59))
                    graded_at = f"{graded_daytime}"
                    grade_matches_current_submission = True # NOTE: manual work will be needed for correcting this
                    posted_at = f"{graded_daytime}"
                else:
                    grader_id = None
                    score = None
                    grade = ""
                    graded_at = ""
                    grade_matches_current_submission = False
                    posted_at = ""
                late = submitted_daytime > duedate
                seconds_late = round((submitted_daytime - duedate).total_seconds()) if late else 0
                # values are listed in the column order of self.canvas_assignment_submissions
                self.canvas_assignment_submissions.loc[len(self.canvas_assignment_submissions)] = [
                    assignment_id,assignment,course,attempt,body,grade,grade_matches_current_submission,html_url,preview_url,score,
                    submission_type,submitted_at,url,user_id,grader_id,graded_at,user,late,assignment_visible,excused,missing,late_policy_status,
                    points_deducted,seconds_late,workflow_state,extra_attempts,anonymous_id,posted_at,redo_request,
                ]

        # if assignment has been graded, then add students without submissions
        if graded:
            # enrolled students whose id is not among the sampled submitters; done in pandas so an
            # empty sample needs no special handling
            did_submit = df_enroll['StudentID'].isin(df_students_submit['StudentID'])
            df_students_not_submit = df_enroll[~did_submit]
            for _, student in df_students_not_submit.iterrows():
                user_id, user = lookup_user(student['StudentID'])
                attempt = None
                grade = "F"
                grade_matches_current_submission = True
                html_url = ""
                preview_url = ""
                score = 0
                submitted_at = ""
                grader_id = instructor_id
                graded_daytime = duedate + dt.timedelta(days=random.randint(0,2),hours=random.randint(0,23),minutes=random.randint(0,59))
                graded_at = f"{graded_daytime}"
                late = False
                assignment_visible = False
                excused = False
                missing = True
                late_policy_status = "missing" # NOTE: accepted values - can be late, missing, extended, none, or null
                points_deducted = 100
                seconds_late = 0
                workflow_state = "not submitted"
                anonymous_id = self.faker.uuid4()
                posted_at = f"{graded_daytime}"
                self.canvas_assignment_submissions.loc[len(self.canvas_assignment_submissions)] = [
                    assignment_id,assignment,course,attempt,body,grade,grade_matches_current_submission,html_url,preview_url,score,
                    submission_type,submitted_at,url,user_id,grader_id,graded_at,user,late,assignment_visible,excused,missing,late_policy_status,
                    points_deducted,seconds_late,workflow_state,extra_attempts,anonymous_id,posted_at,redo_request,
                ]
    
    def _genAssignmentSubmissionSummary(self,assignid,bool_graded,num_students_in_section,dfEnroll):
        """Append one row to the assignment_submission_summary table for a single assignment.

        The appended row is [assignment_id, graded, ungraded, not_submitted]:
          - graded assignment: the whole section counts as graded, and not_submitted is
            the number of (user_id, workflow_state) groups in the "not submitted" state.
          - ungraded assignment: the whole section counts as ungraded, and not_submitted
            is the section size minus the number of groups in the "submitted" state.

        dfEnroll is accepted but not read by this method.
        """
        # NOTE: This table is not supposed to include the assignment_id in this base level structure - this will need updating
        all_submissions = spark.createDataFrame(self.canvas_assignment_submissions)
        # one row per (student, workflow state) for the assignment being summarized
        states_per_student = (
            all_submissions
            .filter(all_submissions['assignment_id'] == f'{assignid}')
            .groupBy('user_id','workflow_state')
            .count()
        )

        def groups_in_state(state):
            # number of (user_id, workflow_state) groups whose state matches
            return states_per_student.filter(states_per_student['workflow_state'] == state).count()

        if bool_graded == True:
            summary_row = [assignid, num_students_in_section, 0, groups_in_state("not submitted")]
        else:
            submitted_count = groups_in_state("submitted")
            summary_row = [assignid, 0, num_students_in_section, num_students_in_section - submitted_count]
        summary_table = self.canvas_assignment_submission_summary
        summary_table.loc[len(summary_table)] = summary_row

    
    def genQuiz_tables(self,max_num_quizzes=5):
        """This method generates 2 quiz tables: quizzes and quiz_submissions.

        For every row of self.canvas_sections, between 0 and max_num_quizzes quizzes
        are generated. Each quiz gets one row in canvas_quizzes (via _genQuizzes) and
        its student attempts in canvas_quiz_submissions (via _genQuizSubmissions).
        Both tables are then written out with writetojsonfile.

        Parameters:
          max_num_quizzes: upper bound (inclusive) on the number of quizzes per section.
        """
        # NOTE: quiz days are drawn from a fixed window (2022-01-03 through 2022-01-21)
        date_range = self.__get_daterange(start_date=dt.datetime(2022,1,3),end_date=dt.datetime(2022,1,21))
        sections = self.canvas_sections
        # The enrollment table is not modified while quizzes are generated, so it is
        # converted to Spark once rather than once per section. The conversion is
        # skipped when there are no sections, matching the previous behaviour.
        dfBT_enroll = spark.createDataFrame(self.enrollment) if len(sections) > 0 else None
        for index, section in sections.iterrows():
            # find the SIS section id, the associated course id and the section name
            sis_section_id = section['sis_section_id']
            course_id = section['course_id']
            section_name = section['name']
            # find the students enrolled in the section
            dfEnroll = dfBT_enroll.filter(dfBT_enroll['SectionID'] == f'{sis_section_id}')
            num_students_in_section = dfEnroll.count()
            # randomly choose how many quizzes have been assigned in this class
            num_quizzes = random.randint(0,max_num_quizzes)
            for quiz_number in range(1, num_quizzes + 1):
                # choose the day this quiz was assigned
                quiz_day = random.choice(date_range)
                # generate the quiz id
                quiz_id = self.faker.unique.random_int(min=100000, max=999999)
                # finally generate the quiz tables
                allowed_attempts,_duedate = self._genQuizzes(quiz_id,course_id,quiz_number,section_name,quiz_day)
                # _genQuizSubmissions takes (quizid, quizday, num_students_in_course, dfEnroll, maxattempts).
                # The previous call passed undefined names (assign_id, assign_day) and extra
                # arguments (graded flag, due date, instructor id) that the method does not accept.
                self._genQuizSubmissions(quiz_id,quiz_day,num_students_in_section,dfEnroll,allowed_attempts)
        self.writetojsonfile('quizzes', self.canvas_quizzes)
        self.writetojsonfile('quiz_submissions', self.canvas_quiz_submissions)

    def _genQuizzes(self,quizid,courseid,quiznumber_in_section,sectionname,quizday):
        id = quizid
        title = f"Quiz {str(quiznumber_in_section)} for {sectionname}" # NOTE: can be modified for more unique names
        html_url = f"https://canvas.cu.edu/courses/{str(courseid)}/quizzes/{str(quizid)}"
        mobile_url = f"https://canvas.cu.edu/courses/{str(courseid)}/quizzes/{str(quizid)}?persist_headless=1&force_user=1"
        description = "Sample quiz description for a section."
        quiz_type = "assignment" # NOTE: options - practice_quiz, assignment, graded_survey or survey
        assignment_group_id = self.faker.random_int(min=1000, max=9999) # does not mean anything at the moment
        time_limit = 10 # NOTE: in minutes
        shuffle_answers = False
        hide_results = "always" # NOTE: options - null, always or until_after_last_attempt
        show_correct_answers = False # NOTE: only valid if hide_results = null
        show_correct_answers_last_attempt = True
        duedate = quizday + dt.timedelta(days=random.randint(0,2))
        due_at = f"{duedate}"
        show_correct_answers_at = f"{duedate + dt.timedelta(days=1)}"
        hide_correct_answers_at = f"{self.enddate}"
        one_time_results = False
        allowed_attempts = random.randint(1,3)
        if allowed_attempts == 1:
            scoring_policy = ""
        else:
            scoring_policy = "keep_highest" # NOTE: options - keep_highest or keep_latest
        one_question_at_a_time = False
        question_count = random.randint(10,20)
        points_possible = 20
        cant_go_back = False # only true if one_question_at_a_time is true
        access_code = ""
        ip_filter = ""
        lock_at = f"{duedate}"
        unlock_at = f"{quizday}"
        published = True
        unpublishable = False
        locked_for_user = False
        speedgrader_url = ""
        quiz_extensions_url = f"https://canvas.cu.edu/courses/{str(courseid)}/quizzes/{str(quizid)}/quiz_extensions"
        permissions = ""
        all_dates = ""
        version_number = 1 
        question_types = ["multiple_choice"]
        anonymous_submissions = False
        self.canvas_quizzes.loc[len(self.canvas_quizzes)] = [id,title,html_url,mobile_url,description,quiz_type,assignment_group_id,time_limit,shuffle_answers,hide_results, \
                                                            show_correct_answers,show_correct_answers_last_attempt,show_correct_answers_at,hide_correct_answers_at,one_time_results,scoring_policy, \
                                                            allowed_attempts,one_question_at_a_time,question_count,points_possible,cant_go_back,access_code,ip_filter,due_at,lock_at,unlock_at,published, \
                                                            unpublishable,locked_for_user,speedgrader_url,quiz_extensions_url,permissions,all_dates,version_number,question_types,anonymous_submissions]
        return allowed_attempts,duedate
    
    def _genQuizSubmissions(self,quizid,quizday,num_students_in_course,dfEnroll,maxattempts):
        # randomly sample the students enrolled in the course for quiz submissions
        half_of_students_in_class = round(num_students_in_course / 2)
        num_students_submit = random.randint(half_of_students_in_class, num_students_in_course)
        df_enroll = dfEnroll.toPandas()
        df_students_submit = df_enroll.sample(n=num_students_submit)
        for index, student in df_students_submit.iterrows():
            # assign static varibles per student submission(s)
            quiz = quizid
            userid = student['StudentID']
            layout = ''
            currentpage = 0
            preview = 0
            timemodifiedoffline = 0
            timecheckstate = 0
            gradednotificationsenttime = ''
            random_num_attempts = random.randint(1,maxattempts)
            if random_num_attempts == 1:
                # if only one attempt for the student, then simply generate that single submission
                id = self.faker.uuid4()
                uniqueid = self.faker.uuid4()
                state = 'finished'
                attempt = 1
                timestart = quizday + dt.timedelta(hours=random.randint(0,23),minutes=random.randint(0,59))
                timefinish = timestart + dt.timedelta(hours=random.randint(0,1),minutes=random.randint(0,20))
                timemodified = timefinish
                sumgrades = round(random.triangular(35.00,100.00,73.50))
                self.moodle_quiz_attempts.loc[len(self.moodle_quiz_attempts)] = [id,quiz,userid,attempt,uniqueid,layout,currentpage,preview,state,timestart,timefinish,timemodified,timemodifiedoffline,timecheckstate,sumgrades,gradednotificationsenttime]
            else:
            # create variables for keeping track of the last attempt from same student
                last_attempt_daytime = quizday + dt.timedelta(hours=random.randint(20,23),minutes=random.randint(0,59))
                previous_attempt = None
                # iterate through total number of attempts
                for n in range(0,random_num_attempts):
                    id = self.faker.uuid4()
                    attempt = n + 1
                    uniqueid = self.faker.uuid4()
                    state = 'finished'
                    timemodified = last_attempt_daytime
                    sumgrades = round(random.triangular(35.00,100.00,73.50))
                    if n == (random_num_attempts - 1):
                        timestart = last_attempt_daytime - dt.timedelta(minutes=random.randint(0,20))
                        timefinish = last_attempt_daytime
                    else:
                        if isinstance(previous_attempt, type(None)):
                            timestart = quizday + dt.timedelta(hours=random.randint(6,20),minutes=random.randint(0,59))
                            timefinish = timestart + dt.timedelta(hours=random.randint(0,1),minutes=random.randint(0,20))
                        else:
                            timestart = previous_attempt + dt.timedelta(hours=random.randint(0,2),minutes=random.randint(0,59))
                            timefinish = timestart + dt.timedelta(hours=random.randint(0,1),minutes=random.randint(0,20))
                    previous_attempt = timefinish
                    self.canvas_quiz_submissions.loc[len(self.canvas_quiz_submissions)] = [id,quiz_id,user_id,submission_id,started_at,finished_at,end_at,attempt,extra_attempts,extra_time,manually_unlocked,time_spent,score,score_before_regrade, \
                                                                                        kept_score,fudge_points,has_seen_results,workflow_state,overdue_and_needs_submission]

    def genForum_tables(self,max_num_forums=5):
        """This method generates 4 forum tables: forum, forum_discussions, forum_grades and forum_posts.

        For every row of self.moodle_course, between 0 and max_num_forums forums are
        generated together with their discussions and grades. The forum_posts table is
        built once afterwards from the accumulated discussions and grades, and all four
        tables are written out with writetofile.

        Parameters:
          max_num_forums: upper bound (inclusive) on the number of forums per course.
        """
        # NOTE(review): this method still targets the moodle_* tables and self.writetofile,
        # neither of which is defined in the part of the class reviewed here - confirm they exist.
        date_range = self.__get_daterange()
        courses = self.moodle_course
        # The enrollment table is not modified while forums are generated, so it is
        # converted to Spark once rather than once per course. The conversion is
        # skipped when there are no courses, matching the previous behaviour.
        dfBT_enroll = spark.createDataFrame(self.enrollment) if len(courses) > 0 else None
        forums_generated = 0
        for index, course in courses.iterrows():
            # find course id
            course_id = course['id']
            # find number of students in course
            dfEnroll = dfBT_enroll.filter(dfBT_enroll['SectionID'] == f'{course_id}')
            num_students_in_course = dfEnroll.count()
            # randomly choose how many forums have been assigned in this class
            for _ in range(random.randint(0,max_num_forums)):
                # choose the day the forum was assigned
                forum_day = random.choice(date_range)
                # generate the forum ID
                forum_id = self.faker.uuid4()
                # _genForum returns three values; the replies requirement is not needed here
                time_modified,complete_discuss,_complete_replies = self._genForum(forum_id,course_id,forum_day,num_students_in_course)
                self._genForumDiscussions(forum_id,course_id,dfEnroll,time_modified,complete_discuss,date_range)
                self._genForumGrades(forum_id,time_modified)
                forums_generated += 1
        # _genForumPosts processes the whole discussions table, so it runs once after all
        # forums exist; running it per forum re-created posts for earlier discussions.
        if forums_generated > 0:
            self._genForumPosts()
        self.writetofile('forum', self.moodle_forum)
        self.writetofile('forum_discussions', self.moodle_forum_discussions)
        self.writetofile('forum_grades', self.moodle_forum_grades)
        self.writetofile('forum_posts', self.moodle_forum_posts)

    def _genForum(self,forumid,courseid,forumgradedate,num_students_in_course):
        id = forumid
        course = courseid
        type = 'general'
        name = 'Forum for Course' # NOTE: can be modified for unique names
        intro = 'This is a forum for a course'
        introformat = 0
        duedate = forumgradedate 
        cutoffdate = self.enddate # NOTE: currently set to end of semester
        assessed = num_students_in_course # NOTE: unsure if this should be representing how many are graded
        assesstimestart = forumgradedate + dt.timedelta(days=random.randint(0,5),hours=random.randint(0,23),minutes=random.randint(0,59))
        assesstimefinish = assesstimestart + dt.timedelta(hours=random.randint(0,4),minutes=random.randint(0,59))
        scale = 1 # NOTE: unsure
        grade_forum = 1 # NOTE: unsure
        grade_forum_notify = 0
        maxbytes = 0
        maxattachments = 1
        forcesubscribe = 0
        trackingtype = 1
        rsstype = 0
        rssarticles = 0
        timemodified = assesstimefinish
        warnafter = duedate - dt.timedelta(hours=12)
        blockafter = self.enddate
        blockperiod = 0
        completiondiscussions = random.randint(1,3)
        completionreplies = 0
        completionposts = completiondiscussions + completionreplies # NOTE: generated by the num posts and replies needed to be marked as complete
        displaywordcount = 0
        lockdiscussionafter = cutoffdate
        self.moodle_forum.loc[len(self.moodle_forum)] = [id,course,type,name,intro,introformat,duedate,cutoffdate,assessed,assesstimestart,assesstimefinish,scale,grade_forum, \
                                                            grade_forum_notify,maxbytes,maxattachments,forcesubscribe,trackingtype,rsstype,rssarticles,timemodified,warnafter,blockafter, \
                                                            blockperiod,completiondiscussions,completionreplies,completionposts,displaywordcount,lockdiscussionafter]
        return timemodified,completiondiscussions,completionreplies

    def _genForumDiscussions(self,forumid,courseid,dfEnroll,time_modified,complete_discuss,daterange):
        # currently set to all students have successfully completed the necessary posts/replies
        df_enroll = dfEnroll.toPandas()
        for index, student in df_enroll.iterrows():
            # assign static varibles per student submission(s)
            course = courseid
            forum = forumid
            firstpost = self.faker.uuid4()
            userid = student['StudentID']
            groupid = -1
            assessed = 1
            timemodified = time_modified
            pinned = 0
            timelocked = self.enddate
            # randomly set variable for the datetime the student added a post or reply
            last_post = random.choice(daterange)
            if complete_discuss == 1:
                id = firstpost
                name = 'post'
                timestart = last_post + dt.timedelta(hours=random.randint(0,23),minutes=random.randint(0,59))
                timeend = timestart + dt.timedelta(hours=random.randint(0,1),minutes=random.randint(0,59))
                usermodified = timeend
                self.moodle_forum_discussions.loc[len(self.moodle_forum_discussions)] = [id,course,forum,name,firstpost,userid,groupid,assessed,timemodified,usermodified,timestart,timeend,pinned,timelocked]
            else:
                # iterate through adding student posts based on the completion requirement
                for n in range (0,complete_discuss):
                    name = 'post'
                    timestart = last_post + dt.timedelta(hours=random.randint(0,23),minutes=random.randint(0,59))
                    timeend = timestart + dt.timedelta(hours=random.randint(0,1),minutes=random.randint(0,59))
                    usermodified = timeend
                    if n == 0:
                        id = firstpost
                    else:
                        id = self.faker.uuid4()
                    last_post = timeend
                    self.moodle_forum_discussions.loc[len(self.moodle_forum_discussions)] = [id,course,forum,name,firstpost,userid,groupid,assessed,timemodified,usermodified,timestart,timeend,pinned,timelocked]

    def _genForumGrades(self,forumid,time_modified):
        # grade each student forum post/reply in moodle_forum_discussions table
        df_discuss = self.moodle_forum_discussions.copy()
        for index, discuss in df_discuss.iterrows():
            id = self.faker.uuid4()
            forum = forumid
            itemnumber = discuss['id']
            userid = discuss['userid']
            grade = round(random.triangular(60,100,85))
            timecreated = discuss['timeend']
            timemodified = time_modified
            self.moodle_forum_grades.loc[len(self.moodle_forum_grades)] = [id,forum,itemnumber,userid,grade,timecreated,timemodified]

    def _genForumPosts(self):
        """Append one row to self.moodle_forum_posts per graded 'post' discussion.

        The discussions table is inner-joined with the user, course and forum-grade
        tables, so only discussions with a matching user, course and grade produce a
        post. The joined user and course names are used in the post subject and message.
        """
        # holds all forum posts; joins tables together to make data look realistic
        user_lookup = spark.createDataFrame(self.moodle_user).select('id', 'firstname', 'lastname')
        for current_name, new_name in (('id', 'uid'), ('firstname', 'user_firstname'), ('lastname', 'user_lastname')):
            user_lookup = user_lookup.withColumnRenamed(current_name, new_name)
        course_lookup = spark.createDataFrame(self.moodle_course).select('id', 'fullname', 'shortname')
        for current_name, new_name in (('id', 'cid'), ('fullname', 'course_fullname'), ('shortname', 'course_shortname')):
            course_lookup = course_lookup.withColumnRenamed(current_name, new_name)
        grade_lookup = spark.createDataFrame(self.moodle_forum_grades).select('itemnumber', 'grade')
        grade_lookup = grade_lookup.withColumnRenamed('itemnumber', 'discuss_id')
        # attach user, course and grade to every discussion; each join key is dropped after use
        enriched = spark.createDataFrame(self.moodle_forum_discussions)
        enriched = enriched.join(user_lookup, enriched['userid'] == user_lookup['uid'], how='inner').drop('uid')
        enriched = enriched.join(course_lookup, enriched['course'] == course_lookup['cid'], how='inner').drop('cid')
        enriched = enriched.join(grade_lookup, enriched['id'] == grade_lookup['discuss_id'], how='inner').drop('discuss_id')
        # then only extract the posts, convert back to pandas df, and fill table
        posts_only = enriched.filter(enriched['name'] == 'post').toPandas()
        posts_table = self.moodle_forum_posts
        for index, post in posts_only.iterrows():
            post_id = self.faker.uuid4()
            author_first = post['user_firstname']
            author_last = post['user_lastname']
            course_title = post['course_fullname']
            posts_table.loc[len(posts_table)] = [
                post_id,                                   # id
                post['id'],                                # discussion
                f'{post_id}',                              # parent
                post['userid'],                            # userid
                post['timestart'],                         # created
                post['usermodified'],                      # modified
                0,                                         # mailed
                f'Post by {author_first} {author_last}',   # subject
                f'This post is for the course {course_title}. This is a sample discussion-post message.', # message
                0,                                         # messageformat
                0,                                         # messagetrust
                '',                                        # attachment - unsure
                post['grade'],                             # totalscore
                0,                                         # mailnow
                0,                                         # deleted
                0,                                         # privatereplyto
                15,                                        # wordcount - NOTE: inaccurate since this varies
                35,                                        # charcount - NOTE: inaccurate since this varies
            ]

    def writetojsonfile(self,filename,dfOutfile):
        """Write dfOutfile as line-delimited JSON records under the canvas_activity_gen test-data folder.

        Parameters:
          filename: table name; used both as the sub-folder and as the .json file name.
          dfOutfile: pandas DataFrame to write.
        """
        relative_path = ''.join(['stage1/Transactional/test_data/v0.1/canvas_activity_gen/', filename, '/', filename, '.json'])
        # the relative path is passed through oea.to_url before writing
        destination = oea.to_url(relative_path)
        dfOutfile.to_json(destination, orient='records', force_ascii=False, lines=True)

StatementMeta(, , , Cancelled, )