In [1]:
import numpy as np
import pandas as pd
# Seed NumPy's legacy global RNG so the np.random.* draws in the cells below
# produce the same simulated data on every Restart & Run All.
np.random.seed(100) 

In [2]:
# Simulation parameters and exogenous factors

# Cohort sizes (regular and experimental) and the total student count
n_reg = 10000
n_exp = 10000
n_stu = n_reg + n_exp

# Race: one categorical draw per student using fixed shares
race_codes = ['WHITE', 'BLACK', 'HISPANIC', 'ASIAN']
race_probs = [0.5, 0.15, 0.2, 0.15]
race = np.random.choice(race_codes, size=n_stu, replace=True, p=race_probs)

# Ability score: uniform draw between 0 and 25
ability_u = np.random.uniform(low=0, high=25, size=n_stu)
ability_span = np.max(ability_u) - np.min(ability_u)

# Log family income: log of the baseline income plus Gaussian noise,
# then shifted by a race-specific offset.
inc_mean = 60000
log_inc_sd = 0.2
log_fam_inc = np.log(inc_mean) + np.random.normal(loc=0.0, scale=log_inc_sd, size=n_stu)
for race_code, log_shift in (('ASIAN', 0.20), ('BLACK', -0.30), ('HISPANIC', -0.25)):
    log_fam_inc = log_fam_inc + log_shift*(race==race_code)
# The ability term carries a 0.0 coefficient, so it currently contributes nothing to income.
log_fam_inc = log_fam_inc + 0.0*(ability_u - np.mean(ability_u))/ability_span
inc = np.exp(log_fam_inc)

# Regular cohort: probability of landing in a small class.
# Built from min-max scaled ability and log income (weight 0.50 each) minus a
# 0.25 penalty for BLACK and HISPANIC students. The result is not clipped, so
# it can fall below 0 for some students.
log_inc_span = np.max(log_fam_inc) - np.min(log_fam_inc)
prob_small_terms = [
    0.50*(ability_u - np.min(ability_u))/ability_span,
    0.50*(log_fam_inc - np.min(log_fam_inc))/log_inc_span,
    -0.25*(race=='BLACK'),
    -0.25*(race=='HISPANIC'),
]
prob_small_reg = 0.0
for term in prob_small_terms:
    prob_small_reg = prob_small_reg + term

# Experimental cohort: the same small-class probability for every student
prob_small_exp = 0.5

# Target number of students per class, by class type
class_sz_small = 15.0
class_sz_large = 30.0

# Probability that a class's teacher holds an MA, by class type
ma_small = 0.75
ma_large = 0.25

# Schools: how many there are and the half-width of the school-effect range
n_schools = 20
school_effect_range = 5



In [3]:
# Initialize student dataframe: one row per simulated student
stu_df = pd.DataFrame({
    'log_fam_inc': log_fam_inc,
    'race': race,
    'ability': ability_u,
})

# Cohort is positional: the first n_reg rows are REGULAR, every later row is EXPERIMENTAL
row_pos = np.arange(len(stu_df))
stu_df['cohort'] = np.where(row_pos < n_reg, 'REGULAR', 'EXPERIMENTAL')

# Everyone starts in a LARGE class; the assignment step flips some rows to SMALL
stu_df['class_size'] = 'LARGE'



In [4]:
# One uniform draw per student and per cohort rule.
# The regular-cohort draw is taken first so the random stream is consumed in a fixed order.
draw_reg = np.random.uniform(size=n_stu)
draw_exp = np.random.uniform(size=n_stu)

cohort = stu_df['cohort']

# Regular cohort: the small-class probability varies per student (prob_small_reg)
small_reg = (cohort == 'REGULAR') & (draw_reg < prob_small_reg)

# Experimental cohort: a single probability shared by every student (prob_small_exp)
small_exp = (cohort == 'EXPERIMENTAL') & (draw_exp < prob_small_exp)

stu_df.loc[small_reg | small_exp, 'class_size'] = 'SMALL'


In [5]:
# Allocate students to classes
#
# Students are dealt round-robin into classes of their own size category:
# the k-th SMALL student goes to small class (k mod n_classes_small), and
# likewise for LARGE. Small classes get ids starting at 50001, large classes
# ids starting at 52001.

small = stu_df['class_size']=='SMALL'

n_stu_small = int(small.sum())
n_stu_large = int((~small).sum())

# Number of classes = students / target size, rounded down. Keep at least one
# class per category so the modulo below never divides by zero when a category
# holds fewer students than one target class.
n_classes_small = max(1, int(n_stu_small/class_sz_small))
n_classes_large = max(1, int(n_stu_large/class_sz_large))

# Small ids occupy 50001..50000+n_classes_small and must stay below the first large id.
assert n_classes_small <= 52001 - 50001, "small-class ids would collide with large-class ids"

# Running position of each student within its own size category
stu_df['ord'] = 0
stu_df.loc[small, 'ord'] = np.arange(n_stu_small)
stu_df.loc[~small, 'ord'] = np.arange(n_stu_large)

# Build the whole column in one shot. Creating it through a partial .loc
# assignment would first fill the other rows with NaN and upcast the ids to
# float (50001.0), which then leaks into the exported CSV files.
stu_df['class_id'] = np.where(
    small,
    50001 + (stu_df['ord'] % n_classes_small),
    52001 + (stu_df['ord'] % n_classes_large),
)

print(f"SMALL: {n_stu_small} students, {n_classes_small} classes, class_id range: {stu_df.loc[small,'class_id'].min()}-{stu_df.loc[small,'class_id'].max()}")
print(f"LARGE: {n_stu_large} students, {n_classes_large} classes, class_id range: {stu_df.loc[~small,'class_id'].min()}-{stu_df.loc[~small,'class_id'].max()}")


SMALL: 9192 students, 612 classes, class_id range: 50001.0-50612.0
LARGE: 10808 students, 360 classes, class_id range: 52001.0-52360.0


In [6]:
# Create class dataframe: one row per distinct (class_id, class_size) pair
cls_df = stu_df[['class_id', 'class_size']].drop_duplicates().copy()

# class_size now lives on the class table, so remove it from the student table
stu_df = stu_df.drop(columns='class_size')

# Assign teacher MA: one uniform draw per class, compared against the MA
# probability that applies to that class's size category
ma_draw = np.random.uniform(size=len(cls_df))
is_small_class = cls_df['class_size']=='SMALL'
ma_threshold = np.where(is_small_class, ma_small, ma_large)
cls_df['teacher_has_ma'] = ma_draw < ma_threshold

# Randomly allocate classrooms to schools (school ids 63001 .. 63000 + n_schools)
school_offset = np.random.randint(low=0, high=n_schools, size=len(cls_df))
cls_df['school_id'] = 63001 + school_offset


In [7]:
# School effects dataframe: one row per school that received at least one class,
# with its class count and a school-level score effect drawn uniformly from
# [-school_effect_range, school_effect_range).

sch_df = cls_df.groupby('school_id').agg(n_classes = ('class_id','count')).reset_index()

# Draw one effect per row of sch_df rather than per n_schools: a school that was
# allocated no classes is absent from the groupby result, and a fixed
# size=n_schools would then fail with a length mismatch.
sch_df['school_effect'] = np.random.uniform(low=-school_effect_range, high=school_effect_range, size=len(sch_df))


In [8]:
# Test scores

# Attach class-level, then school-level, attributes to every student row
stu_df = (
    stu_df
    .merge(cls_df, on='class_id', how='left')
    .merge(sch_df, on='school_id', how='left')
)

ability = stu_df['ability']
log_inc = stu_df['log_fam_inc']

# Additive score components on top of a baseline of 60
ability_term = 30*(ability - ability.mean()) / (ability.max() - ability.min())
small_class_term = 5*(stu_df['class_size']=='SMALL')
teacher_ma_term = 7.5*(stu_df['teacher_has_ma'])
school_term = 1*(stu_df['school_effect'])
income_term = 10*(log_inc - log_inc.mean()) / (log_inc.max() - log_inc.min())

# Terms are summed left to right in this fixed order
stu_df['test_score'] = 60 + ability_term + small_class_term + teacher_ma_term + school_term + income_term

score_lo = stu_df['test_score'].min()
score_hi = stu_df['test_score'].max()
print(f"Test score range: {score_lo}-{score_hi}")


Test score range: 37.160602193812814-94.62523325748843


In [9]:
# Prepare output
stu_file = '../data/class_size/students.csv'
cls_file = '../data/class_size/classes.csv'

stu_cols = ['student_id', 'class_id', 'school_id', 'test_score', 'family_income', 'race', 'cohort']
cls_cols = ['class_id', 'school_id', 'class_size', 'teacher_has_ma']

# Shuffle the rows, convert log income back to whole currency units,
# then order by class before numbering the students
stu_df = (
    stu_df
    .sample(frac=1)
    .reset_index(drop=True)
    .assign(family_income=lambda d: np.round(np.exp(d['log_fam_inc'])))
    .sort_values(by='class_id', ascending=True)
)
stu_df['student_id'] = np.arange(n_stu)

# Write both tables with a header row and without the pandas index
csv_opts = dict(header=True, index=False)
stu_df[stu_cols].to_csv(stu_file, **csv_opts)
cls_df[cls_cols].to_csv(cls_file, **csv_opts)
