# Test Data Generation: Base-Truth Tables Class

**Author**: *Jordyn Leonauskas (minor updates by Christian)*. **Affiliation**: *Kwantum Edu Analytics*. **Last Modified**: *2/3/2023*.

This OEA test data generation class notebook generates the base-truth tables, which are then used to generate test datasets for modules (or other purposes); this notebook is needed to successfully run the J_base_test_data_gen_demo notebook.

This class notebook relies primarily on the OEA_py class notebook and on the Faker and random-address Python packages to generate 5 base-truth tables:
 1. **Students**,
 2. **Schools**,
 3. **Courses**: general classes offered,
 4. **Sections**: specific classes offered, and
 5. **Enrollment**: student enrollment assignments in specific sections.

In [9]:
import random
from tokenize import Ignore, String
from faker import Faker
import pandas as pd
import datetime as dt
from datetime import date
import numpy as np
from pyparsing import nums
from torch import real
import random_address

class TestDataGen:
    def __init__(self):
        """Create the Faker generator and the five empty base-truth tables.

        Every table starts as an empty, object-typed pandas DataFrame whose
        column names are fixed here; the generator methods of this class
        append rows to these frames.
        """
        self.faker = Faker('en_US')
        # Education level the generators branch on.
        # Still in production: prek, elementary, middle, hed
        # ('middle' is the spelling __genstudent compares against).
        self.edlevel = 'high'

        def empty_table(*column_names):
            # One empty list per column yields a zero-row frame with a fixed schema.
            return pd.DataFrame({name: [] for name in column_names}, dtype=object)

        self.students = empty_table(
            'Gender', 'FirstName', 'MiddleName', 'LastName', 'StudentID',
            'Birthday', 'SchoolName', 'SchoolID', 'Grade', 'Performance',
            'HispanicLatino', 'Race', 'Flag', 'Email', 'Phone', 'Address',
            'City', 'State', 'Zipcode',
        )
        self.schools = empty_table('SchoolName', 'SchoolID')
        self.courses = empty_table(
            'CourseName', 'CourseID', 'SchoolName', 'SchoolID',
            'CourseSubject', 'CourseGradeLevel',
        )
        self.sections = empty_table(
            'SectionName', 'SectionID', 'CourseName', 'CourseID',
            'SchoolName', 'SchoolID', 'SectionSubject', 'SectionGradeLevel',
        )
        self.enrollment = empty_table(
            'StudentName', 'StudentID', 'SectionName', 'SectionID',
            'CourseName', 'CourseID', 'CourseGradeLevel', 'SchoolName',
            'SchoolID',
        )

    def genbasetables(self, numstudents, numschools, numcourses):
        """Generate all five base-truth tables and write each one out as CSV.

        Args:
            numstudents: number of student rows to generate.
            numschools: number of school rows to generate.
            numcourses: number of courses sampled for each school.

        Side effects:
            Fills ``self.students``, ``self.schools``, ``self.courses``,
            ``self.sections`` and ``self.enrollment``, stores the run
            timestamp in ``self.currentDateTime`` and writes one CSV folder
            per table under ``stage1/Transactional/test_data/v0.1/``.

        NOTE(review): ``spark`` and ``oea`` are not defined in this class;
        presumably they are provided by the notebook session -- confirm.
        """
        # Order matters: schools must exist before they are assigned to
        # students or given courses, sections are built from courses, and
        # enrollment reads students, courses and sections.
        self._genstudents(numstudents)
        self._genschools(numschools)
        self._assignschools()
        self._gencourses(numcourses)
        self._gensections()
        self._genenrollment()

        # Convert every table to Spark before anything is written.
        spark_tables = {
            'base_students': spark.createDataFrame(self.students),
            'base_schools': spark.createDataFrame(self.schools),
            'base_courses': spark.createDataFrame(self.courses),
            'base_sections': spark.createDataFrame(self.sections),
            'base_enrollment': spark.createDataFrame(self.enrollment),
        }

        # The current date and time name the rundate folder shared by all tables.
        self.currentDateTime = dt.datetime.now().strftime("%Y-%m-%d %H-%M-%S")

        for table_name, spark_table in spark_tables.items():
            # coalesce(1) is applied so each table goes through a single partition.
            spark_table.coalesce(1).write.save(
                oea.to_url(
                    'stage1/Transactional/test_data/v0.1/' + table_name
                    + '/snapshot_batch_data/rundate=' + self.currentDateTime
                ),
                format='csv',
                mode='overwrite',
                header='true',
                mergeSchema='true',
            )

    # helper methods

    def __get_age(self, born):
        today = date.today()
        return today.year - born.year - ((today.month, today.day) < (born.month, born.day))

    def __genstudent(self):
        gender = random.choices(['M','F','O'],weights=[0.45,0.5,0.05])
        if gender == ['M']:
            firstname = self.faker.first_name_male()
            middlename = self.faker.first_name_male()
        elif gender == ['F']: 
            firstname = self.faker.first_name_female()
            middlename = self.faker.first_name_female()
        elif gender == ['O']:
            firstname = self.faker.first_name_nonbinary()
            middlename = self.faker.first_name_nonbinary()
        lastname = self.faker.last_name()
        studentid = self.faker.uuid4()
        if self.edlevel == 'high':
            birthday = self.faker.date_of_birth(minimum_age=15,maximum_age=18)
        elif self.edlevel == 'middle':
            birthday = self.faker.date_of_birth(minimum_age=12,maximum_age=14)
        elif self.edlevel == 'elementary':
            birthday = self.faker.date_of_birth(minimum_age=6,maximum_age=11)
        elif self.edlevel == 'prek':
            birthday = self.faker.date_of_birth(minimum_age=2,maximum_age=5)
        elif self.edlevel == 'hed':
            birthday = self.faker.date_of_birth(minimum_age=18,maximum_age=22)
        # education
        school = ''
        schoolid = ''
        grade = self.__get_age(birthday) - 6
        performance = random.choices(['high','avg','low'], weights=[0.3,0.6,0.1])
        # demographics
        hispaniclatino = random.choices(['True','False'], weights=[0.189,0.811])
        race = random.choices(['white','blackafricanamerican','americanindianalaskanative','asian','nativehawaiianpacificislander','twoormoreraces'], weights=[0.758,0.136,0.013,0.061,0.003,0.029])
        flag = random.choices(['','FreeLunch','ReducedLunch','Homeless'],weights=[0.78,0.1,0.1,0.02])
        # contact
        email = f'{firstname}{lastname}@contoso.edu'
        phone = self.faker.phone_number()
        address = random_address.real_random_address_by_state('CA')
        try:
            city = address['city']
        except KeyError:
            city = 'Quantico'
        state = address['state']
        zipcode = address['postalCode']
        address = address['address1']
        self.students.loc[len(self.students.index)] = [gender[0], firstname, middlename, lastname, studentid, birthday, school,
        schoolid, grade, performance[0], hispaniclatino[0], race[0], flag[0], email, phone, address, city, state, zipcode]

    def _genstudents(self, numstudents=100):
        while numstudents > 0:
            self.__genstudent()
            numstudents = numstudents - 1

    def __genschool(self):
        if self.edlevel == 'high':
            schoolname = f'{self.faker.last_name()} High'
        schoolid = self.faker.uuid4()
        self.schools.loc[len(self.schools.index)] = [schoolname, schoolid]

    def _genschools(self, numschools=3):
        while numschools > 0:
            self.__genschool()
            numschools = numschools - 1

    # in production
    def _genhedschools(self, numschools=3):
        options = ['School of Fine Arts', 'School of Science and Health', 'School of Music', 'School of Engineering', 'School of Information Technology', 'School of Culinary Arts']

    def _assignschools(self):
        i = len(self.students.index) - 1
        while i >= 0:
            school = random.randint(0, len(self.schools.index) - 1)
            self.students.at[i, 'SchoolName'] = self.schools.at[school, 'SchoolName']
            self.students.at[i, 'SchoolID'] = self.schools.at[school, 'SchoolID']
            i = i - 1

    def _gencourses(self, numcourses=6):
        options = [
            'Trigonomotry',
            'Pre-Calculus',
            'Calculus',
            'Choir',
            'Band',
            'Orchestra',
            'Reading',
            'English',
            'World Literature',
            'Astronomy',
            'Biology',
            'Chemistry',
            'Physics',
            'Physical Education',
            'Health',
            'Pottery',
            'Art',
            'Theatre',
            'Computer Science',
            'US History',
            'World History'
        ]
        grades = {
            'Trigonomotry':'0',
            'Pre-Calculus':'11',
            'Calculus':'12',
            'Choir':'0',
            'Band':'0',
            'Orchestra':'0',
            'Reading':'0',
            'English':'0',
            'World Literature':'11',
            'Astronomy':'9',
            'Biology':'10',
            'Chemistry':'11',
            'Physics':'12',
            'Physical Education':'0',
            'Health':'0',
            'Pottery':'0',
            'Art':'0',
            'Theatre':'0',
            'Computer Science':'12',
            'US History':'10',
            'World History':'11'
        }
        subjects = {
            'Trigonomotry':'Mathematics',
            'Pre-Calculus':'Mathematics',
            'Calculus':'Mathematics',
            'Choir':'Visual and Performing Arts',
            'Band':'Visual and Performing Arts',
            'Orchestra':'Visual and Performing Arts',
            'Reading':'English Language and Literature',
            'English':'English Language and Literature',
            'World Literature':'English Language and Literature',
            'Astronomy':'Life and Physical Sciences',
            'Biology':'Life and Physical Sciences',
            'Chemistry':'Life and Physical Sciences',
            'Physics':'Life and Physical Sciences',
            'Physical Education':'Physical Health and Safety Education',
            'Health':'Physical Health and Safety Education',
            'Pottery':'Visual and Performing Arts',
            'Art':'Visual and Performing Arts',
            'Theatre':'Visual and Performing Arts',
            'Computer Science':'Information Technology',
            'US History':'Social Sciences and History',
            'World History':'Social Sciences and History'
        }
        for index, school in self.schools.iterrows():
            courses = random.sample(options,numcourses)
            for course in courses:
                coursename = course
                courseid = self.faker.uuid4()
                schoolname = school['SchoolName']
                schoolid = school['SchoolID'] 
                coursesubject = subjects[course]
                coursegradelevel = grades[course]
                self.courses.loc[len(self.courses.index)] = [coursename,courseid,schoolname,schoolid,coursesubject,coursegradelevel]

    def _gensections(self):
        for index, course in self.courses.iterrows():
            numsections = random.randint(1,2)
            while numsections > 0:
                sectionnum = random.randint(100,300)
                coursename = course['CourseName'] 
                sectionname = f'{coursename} {sectionnum}'
                sectionid = self.faker.uuid4()
                courseid = course['CourseID']
                schoolname = course['SchoolName']
                schoolid = course['SchoolID']
                sectionsubject = course['CourseSubject']
                sectiongradelevel = course['CourseGradeLevel']
                self.sections.loc[len(self.sections.index)] = [sectionname,sectionid,coursename,courseid,schoolname,schoolid,sectionsubject,sectiongradelevel]
                numsections = numsections - 1

    def _genenrollment(self):
        for index, student in self.students.iterrows():
            studentcourses = self.courses[self.courses['SchoolID'] == student['SchoolID']]
            studentgrade = student['Grade']
            studentgradecourses = studentcourses[(studentcourses['CourseGradeLevel'] == f'{studentgrade}') | (studentcourses['CourseGradeLevel'] == '0')]
            coursesample = studentgradecourses.sample(n=6)
            for index, course in coursesample.iterrows():
                coursesections = self.sections[self.sections['CourseID'] == course['CourseID']]
                coursesection = coursesections.sample(n=1)
                coursesection.reset_index(inplace=True)
                studentfirstname = student['FirstName']
                studentlastname = student['LastName']
                studentname = f'{studentfirstname} {studentlastname}'
                studentid = student['StudentID']
                sectionname = coursesection.loc[0,'SectionName']
                sectionid = coursesection.loc[0,'SectionID']
                coursename = course['CourseName']
                courseid = course['CourseID']
                coursegradelevel = course['CourseGradeLevel']
                schoolname = course['SchoolName'] 
                schoolid = course['SchoolID']
                self.enrollment.loc[len(self.enrollment.index)] = [studentname,studentid,sectionname,sectionid,coursename,courseid,coursegradelevel,schoolname,schoolid]