# Test Data Generation: Base Truth Tables Class

This OEA test data generation class notebook generates the base truth tables, which are then used to generate test datasets for modules (or for other purposes). The test_data_gen_demo notebook depends on this notebook and cannot run successfully without it.

This class notebook relies primarily on the OEA_py class notebook and on the Faker and random-address Python packages to generate these base tables.

This notebook defines and uses one main method and six helper methods. The main method is described below:
 1. **gen_base_tables(numstudents, numschools, ed_level)**: Generates base truth tables for students and schools. Accepts the number of students to generate, the number of schools to generate, and the desired education level of the students. Accepted **ed_level** values are:
    * ```high``` - generates high schools and high-school students.
    * ```middle``` - generates middle schools and middle-school students.
    * ```elementary``` - generates elementary schools and elementary-school students.
    * ```prek``` - generates preschools and pre-kindergarten students.

In [1]:
import random
from tokenize import Ignore, String
from faker import Faker
import pandas as pd
import datetime as dt
from datetime import date
import numpy as np
from pyparsing import nums
from torch import real
import random_address

class TestDataGen_BaseTables():
    """Generate base "truth" tables of fake students and schools.

    Rows are accumulated in two pandas DataFrames, ``self.students`` and
    ``self.schools``. ``gen_base_tables`` fills them and writes both out as
    CSV using the ``spark`` and ``oea`` globals, which are not defined in this
    class and must already exist in the notebook session.
    """

    # Inclusive (minimum_age, maximum_age) passed to Faker for each ed_level.
    _AGE_RANGES = {
        'high': (15, 18),
        'middle': (12, 14),
        'elementary': (6, 11),
        'prek': (2, 5),
    }

    # Suffix appended to a random surname to build a school name.
    _SCHOOL_SUFFIXES = {
        'high': 'High',
        'middle': 'Middle',
        'elementary': 'Elementary',
        'prek': 'Preschool',
    }

    # Column order of the students table; rows are appended in this order.
    _STUDENT_COLUMNS = [
        'Gender',
        'FirstName',
        'MiddleName',
        'LastName',
        'StudentID',
        'Birthday',
        'School',
        'SchoolID',
        'Grade',
        'Performance',
        'HispanicLatino',
        'Race',
        'Flag',
        'Email',
        'Phone',
        'Address',
        'City',
        'State',
        'Zipcode',
    ]

    # Column order of the schools table.
    _SCHOOL_COLUMNS = ['SchoolName', 'SchoolID']

    def __init__(self, source_folder='test_data'):
        """Create the Faker instance and the empty students/schools tables.

        Args:
            source_folder: Accepted for interface compatibility; this class
                does not currently read it.
        """
        # Timestamp used as the ``rundate=`` folder name when tables are written.
        self.currentDateTime = dt.datetime.now().strftime("%Y-%m-%d %H-%M-%S")

        self.faker = Faker('en_US')

        # Default level so the helper generators also work when they are
        # called before gen_base_tables (matches that method's default).
        self.edlevel = 'high'

        self.students = pd.DataFrame(
            {column: [] for column in self._STUDENT_COLUMNS}, dtype=object
        )
        self.schools = pd.DataFrame(
            {column: [] for column in self._SCHOOL_COLUMNS}, dtype=object
        )

    def gen_base_tables(self, numstudents, numschools, ed_level='high'):
        """Generate students and schools, link them, and write both tables.

        Args:
            numstudents: Number of student rows to add.
            numschools: Number of school rows to add.
            ed_level: One of 'high', 'middle', 'elementary' or 'prek'.

        Raises:
            ValueError: If ``ed_level`` is not a supported value, or if
                students exist but no school is available to assign.
        """
        # Validate before generating anything so a typo fails immediately.
        self.edlevel = self._validate_ed_level(ed_level)
        self._gen_students(numstudents)
        self._gen_schools(numschools)
        self._assign_schools()
        dfStudents = spark.createDataFrame(self.students)
        dfSchools = spark.createDataFrame(self.schools)
        # NOTE: type of batch data can be updated as needed
        dfStudents.coalesce(1).write.save(oea.to_url('stage1/Transactional/test_data/v0.1/base_students/snapshot_batch_data/rundate='+self.currentDateTime), format='csv', mode='overwrite', mergeSchema='true')
        dfSchools.coalesce(1).write.save(oea.to_url('stage1/Transactional/test_data/v0.1/base_schools/snapshot_batch_data/rundate='+self.currentDateTime), format='csv', mode='overwrite', mergeSchema='true')

    # helper methods

    @classmethod
    def _validate_ed_level(cls, ed_level):
        """Return ``ed_level`` unchanged, or raise ValueError if unsupported."""
        if ed_level not in cls._AGE_RANGES:
            supported = ', '.join(sorted(cls._AGE_RANGES))
            raise ValueError(
                f'Unsupported ed_level {ed_level!r}; expected one of: {supported}'
            )
        return ed_level

    @classmethod
    def _school_name(cls, surname, ed_level):
        """Build a school name such as 'Smith High' for the given level."""
        suffix = cls._SCHOOL_SUFFIXES[cls._validate_ed_level(ed_level)]
        return f'{surname} {suffix}'

    def __get_age(self, born):
        """Return the age in whole years, as of today, for a birth date."""
        today = date.today()
        # Subtract one year when this year's birthday has not happened yet.
        return today.year - born.year - ((today.month, today.day) < (born.month, born.day))

    def __gen_student(self):
        """Append one randomly generated student row to ``self.students``.

        School and SchoolID are left empty here; ``_assign_schools`` fills
        them in afterwards.
        """
        gender = random.choices(['M', 'F', 'O'], weights=[0.45, 0.5, 0.05])[0]
        if gender == 'M':
            first_name_source = self.faker.first_name_male
        elif gender == 'F':
            first_name_source = self.faker.first_name_female
        else:
            first_name_source = self.faker.first_name_nonbinary
        firstname = first_name_source()
        middlename = first_name_source()
        lastname = self.faker.last_name()
        studentid = self.faker.uuid4()

        min_age, max_age = self._AGE_RANGES[self._validate_ed_level(self.edlevel)]
        birthday = self.faker.date_of_birth(minimum_age=min_age, maximum_age=max_age)

        # education
        school = ''
        schoolid = ''
        # NOTE(review): age - 6 gives 0 or negative grades for the youngest
        # students (e.g. 'prek'); confirm this is what downstream data expects.
        grade = self.__get_age(birthday) - 6
        performance = random.choices(['high', 'avg', 'low'], weights=[0.3, 0.6, 0.1])[0]

        # demographics
        hispaniclatino = random.choices(['True', 'False'], weights=[0.189, 0.811])[0]
        race = random.choices(
            ['white', 'blackafricanamerican', 'americanindianalaskanative', 'asian',
             'nativehawaiianpacificislander', 'twoormoreraces'],
            weights=[0.708, 0.149, 0.013, 0.061, 0.009, 0.06],
        )[0]
        flag = ''

        # contact
        email = f'{firstname}{lastname}@contoso.edu'
        phone = self.faker.phone_number()
        location = random_address.real_random_address_by_state('CA')
        # NOTE(review): .get() is used defensively in case an address record
        # lacks one of these fields - confirm against the random_address data.
        city = location.get('city', '')
        state = location.get('state', '')
        zipcode = location.get('postalCode', '')
        address = location.get('address1', '')

        # Setting .loc at the next integer label appends a new row.
        self.students.loc[len(self.students.index)] = [
            gender, firstname, middlename, lastname, studentid, birthday, school,
            schoolid, grade, performance, hispaniclatino, race, flag, email, phone,
            address, city, state, zipcode,
        ]

    def _gen_students(self, numstudents=10):
        """Append ``numstudents`` generated student rows."""
        for _ in range(numstudents):
            self.__gen_student()

    def __gen_school(self):
        """Append one generated school row to ``self.schools``."""
        schoolname = self._school_name(self.faker.last_name(), self.edlevel)
        schoolid = self.faker.uuid4()
        self.schools.loc[len(self.schools.index)] = [schoolname, schoolid]

    def _gen_schools(self, numschools=3):
        """Append ``numschools`` generated school rows."""
        for _ in range(numschools):
            self.__gen_school()

    def _assign_schools(self):
        """Assign a uniformly random school to every student row.

        Raises:
            ValueError: If there are students but no schools to choose from.
        """
        student_count = len(self.students.index)
        school_count = len(self.schools.index)
        if student_count == 0:
            return
        if school_count == 0:
            raise ValueError('Cannot assign schools: no schools have been generated.')
        # Walk from the last row to the first, one random draw per student.
        for row in reversed(range(student_count)):
            school = random.randint(0, school_count - 1)
            self.students.at[row, 'School'] = self.schools.at[school, 'SchoolName']
            self.students.at[row, 'SchoolID'] = self.schools.at[school, 'SchoolID']