In [1]:
import numpy as np
import pandas as pd
# Seed NumPy's global random state so that every np.random.* draw in the
# cells below produces the same simulated data on each fresh run.
np.random.seed(100) 

In [2]:
# Simulation parameters

# Population sizes
n_students = 5000
n_schools = 50
n_classes_small, n_classes_large = 200, 50

# Student characteristics
fam_inc_min, fam_inc_max = 30000, 200000
race_codes = ['White', 'Black', 'Asian']
race_probs = [0.5, 0.2, 0.3]  # sampling probabilities, aligned with race_codes

# Weights of the class-assignment score, in order:
#   uniform random draw, log(family income), indicator race == 'Asian'
assign_score_coefs = [1, 0.5 / 12, 0.1]

# Probability that a classroom's teacher_ma flag is True, by class type
teacher_ma_prob_small = 0.5
teacher_ma_prob_large = 0.4

# Weights of the test-score equation, in order:
#   constant, log(family income), small-class indicator, teacher_ma indicator
test_coef = [70, 10 / 12, 4, 5]
test_sd = 2  # standard deviation of the normal noise added to each test score


In [3]:
# Simulate student-level variables

student_id = np.arange(n_students)
# np.random.randint excludes `high`, so incomes lie in [fam_inc_min, fam_inc_max - 1].
fam_inc = np.random.randint(low=fam_inc_min, high=fam_inc_max, size=n_students)
race = np.random.choice(race_codes, size=n_students, replace=True, p=race_probs)
rand_score = np.random.uniform(size=n_students)

# Assignment score: weighted sum of a uniform random draw, log family income
# and an Asian indicator, so class type is correlated with those covariates.
assign_score = (
    rand_score * assign_score_coefs[0]
    + np.log(fam_inc) * assign_score_coefs[1]
    + (race == 'Asian') * assign_score_coefs[2]
)

# Students strictly above the median assignment score go to small classrooms.
median_score = np.median(assign_score)
small = assign_score > median_score

n_small = np.sum(small)
n_large = n_students - n_small


In [4]:
# Assign students to classrooms

# class_id is an identifier, so allocate it as integers. np.zeros defaults to
# float64, which makes the ids show up as e.g. 10237.0 in the data frames and
# in the exported CSV files.
class_id = np.zeros(n_students, dtype=int)

# Small-class students are spread uniformly at random over the ids
# 10000 .. 10000 + n_classes_small - 1 (randint excludes `high`).
class_id[small] = 10000 + np.random.randint(low=0, high=n_classes_small, size=n_small)

# Large-class students are spread over the next n_classes_large ids.
class_id[~small] = 10000 + np.random.randint(low=n_classes_small, high=n_classes_small+n_classes_large, size=n_large)


In [5]:
# Create a data frame of students

# One row per student, built from the simulated arrays.
stu_df = pd.DataFrame({
    'student_id': student_id,
    'fam_inc': fam_inc,
    'race': race,
    'class_id': class_id,
})


In [6]:
# Create a data frame of classrooms

# One row per class_id that has at least one student, with its head count.
cls_df = (
    stu_df
    .groupby('class_id')
    .agg(class_size=('student_id', 'count'))
    .reset_index()
)
# Ids below 10000 + n_classes_small were handed out to small classrooms.
cls_df['small_class'] = cls_df['class_id'] < 10000 + n_classes_small

# teacher_ma is a Bernoulli draw per classroom whose success probability
# depends on the class type: compare one uniform draw per class against the
# matching threshold.
u = np.random.uniform(size=len(cls_df))
idx_small = cls_df['small_class']
idx_large = ~cls_df['small_class']
ma_threshold = np.where(idx_small, teacher_ma_prob_small, teacher_ma_prob_large)
cls_df['teacher_ma'] = u < ma_threshold

# Each classroom is placed in one of n_schools schools, uniformly at random.
cls_df['school_id'] = 7000 + np.random.randint(low=0, high=n_schools, size=len(cls_df))


In [7]:
# Create a data frame of schools

# Roll classrooms up to schools; school_id is kept as the index.
classes_by_school = cls_df.groupby('school_id')
sch_df = classes_by_school.agg(
    school_size=('class_size', 'sum'),  # total students over the school's classes
    n_classes=('class_id', 'count'),    # number of classrooms in the school
)
# School-level effect, one uniform draw on [-3, 3) per school.
sch_df['school_effect'] = np.random.uniform(low=-3, high=3, size=len(sch_df))


In [8]:
# Simulate test scores

# Start from the base student columns only, so this cell can be re-run safely:
# merging an already-merged stu_df again would create *_x / *_y duplicate
# columns and the 'small_class' lookup below would raise a KeyError.
base_cols = ['student_id', 'fam_inc', 'race', 'class_id']
stu_df = (
    stu_df[base_cols]
    # validate= guards against accidental row duplication: each student must
    # match at most one classroom, and each classroom at most one school.
    .merge(cls_df, on='class_id', how='left', validate='many_to_one')
    .merge(sch_df, on='school_id', how='left', validate='many_to_one')
)

# Test score = constant + effect of log family income + small-class effect
#              + teacher_ma effect + school effect + normal noise with sd test_sd.
log_fam_inc = np.log(stu_df['fam_inc'])
noise = np.random.normal(loc=0, scale=test_sd, size=len(stu_df))
stu_df['test_score'] = (
    test_coef[0]
    + test_coef[1] * log_fam_inc
    + test_coef[2] * stu_df['small_class']
    + test_coef[3] * stu_df['teacher_ma']
    + stu_df['school_effect']
    + noise
)



In [11]:
# Output datasets

# Only the columns listed here are written; the remaining columns of each
# frame (e.g. school_effect in stu_df) are left out of the files.
student_cols = ['student_id', 'test_score', 'fam_inc', 'race', 'class_id', 'school_id']
class_cols = ['class_id', 'class_size', 'small_class', 'teacher_ma', 'school_id']

stu_df[student_cols].to_csv('../data/class_size/students.csv', header=True, index=False)
cls_df[class_cols].to_csv('../data/class_size/classes.csv', header=True, index=False)


Unnamed: 0,student_id,fam_inc,race,class_id,class_size,small_class,teacher_ma,school_id,school_size,n_classes,school_effect,test_score
0,0,68408,Asian,10237.0,49,False,False,7007,139,4,-1.846901,75.215299
1,1,86088,Black,10208.0,46,False,False,7009,64,2,-1.890574,80.876199
2,2,107655,White,10226.0,63,False,True,7025,125,3,-0.623916,84.665381
3,3,95615,Asian,10216.0,53,False,False,7033,200,5,-1.177101,81.328627
4,4,112270,Black,10137.0,13,True,False,7030,107,5,-1.013392,83.572299
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4995,118119,White,10000.0,20,True,True,7040,162,7,-0.597064,86.528518
4996,4996,174869,Asian,10216.0,53,False,False,7033,200,5,-1.177101,78.486680
4997,4997,162156,Asian,10167.0,15,True,False,7047,88,8,-2.362541,81.240276
4998,4998,145564,Asian,10105.0,17,True,False,7026,280,12,1.783260,85.129243
