In [1]:
import numpy as np
import pandas as pd
# Seed NumPy's legacy global RNG so that the np.random.* draws in the cells
# below are reproducible on a fresh Restart & Run All.
np.random.seed(100)

In [2]:
# Simulation parameters and exogenous factors.
# NOTE: the order of the np.random.* calls in this cell (race, income,
# ability noise) determines the simulated data for a given seed.

# --- cohort sizes ---
n_reg = 4_000  # regular cohort
n_exp = 2_000  # experimental cohort
n_stu = n_reg + n_exp

# --- race: one categorical draw per student ---
race_codes = ['WHITE', 'BLACK', 'HISPANIC', 'ASIAN']
race_probs = [0.5, 0.15, 0.2, 0.15]  # aligned position-by-position with race_codes
race = np.random.choice(race_codes, size=n_stu, replace=True, p=race_probs)

# --- family income: uniform integers in [fam_inc_min, fam_inc_max) ---
fam_inc_min = 30_000
fam_inc_max = 150_000
incdenom = np.log(fam_inc_max) - np.log(fam_inc_min)
fam_inc = np.random.randint(low=fam_inc_min, high=fam_inc_max, size=n_stu)

# log-income distance from the minimum; divided by incdenom it lies in [0, 1)
log_inc_gap = np.log(fam_inc) - np.log(fam_inc_min)

# --- ability: income component (0-10) plus uniform noise (0-25) => 0-35 ---
ability_u = np.random.uniform(low=0, high=25, size=n_stu)
ability = 10*log_inc_gap/incdenom + ability_u

# --- probability of a small class ---
# Regular cohort: rises with income and with the ability noise term, and is
# lowered for WHITE and ASIAN students. The value is not clipped to [0, 1].
prob_small_reg = (
    0.1
    + 0.2*log_inc_gap/incdenom
    - 0.25*(race=='WHITE')
    - 0.3*(race=='ASIAN')
    + 0.02*ability_u
)
# Experimental cohort: the same probability for every student.
prob_small_exp = 0.5

# --- target class sizes ---
class_sz_small = 15.0
class_sz_large = 30.0

# --- probability that a class's teacher has an MA, by class size ---
ma_small = 0.5
ma_large = 0.4

# --- schools ---
n_schools = 20
school_effect_range = 5  # school effects are drawn from U[-range, +range]

# --- test scores ---
# test score = 45 + ability + 5*small + 10*ma + school_effect
# coefficient order: [intercept, ability, small class, teacher MA, school effect]
test_coefs = [45, 1, 5, 10, 1]

# --- output file locations ---
stu_file = '../data/class_size/students.csv'
cls_file = '../data/class_size/classes.csv'


In [3]:
# Build the student table, one row per student.
# The first n_reg rows form the REGULAR cohort; the remaining rows are
# EXPERIMENTAL.
in_regular_cohort = np.arange(n_stu) < n_reg

stu_df = pd.DataFrame({
    'family_income': fam_inc,
    'race': race,
    'ability': ability,
    'cohort': np.where(in_regular_cohort, 'REGULAR', 'EXPERIMENTAL'),
})


In [4]:
# Regular cohort: a student lands in a SMALL class when a uniform draw falls
# below prob_small_reg (which depends on income, race and the ability noise).
# Everyone else, including all experimental students for now, starts as LARGE.
draw_reg = np.random.uniform(size=n_stu)
small_reg = (draw_reg < prob_small_reg) & (stu_df['cohort']=='REGULAR')
stu_df['class_size'] = np.where(small_reg, 'SMALL', 'LARGE')


In [5]:
# Experimental cohort: every student has the same chance (prob_small_exp) of
# being switched to a SMALL class; regular-cohort rows are left untouched.
draw_exp = np.random.uniform(size=n_stu)
small_exp = (draw_exp < prob_small_exp) & (stu_df['cohort']=='EXPERIMENTAL')
stu_df['class_size'] = stu_df['class_size'].mask(small_exp, 'SMALL')


In [6]:
# Allocate students to classes
#
# Students of each class-size group are numbered 0..n-1 ('ord') and dealt
# round-robin into that group's classes. Small classes get ids 7001, 7002, ...
# and large classes get ids 8001, 8002, ...

n_stu_small = np.sum(stu_df['class_size']=='SMALL')
n_stu_large = np.sum(stu_df['class_size']=='LARGE')

# Number of classes is the student count divided by the target size, rounded
# down (leftover students are spread over existing classes). Keep at least one
# class per group so the modulo below never divides by zero.
n_classes_small = max(1, int(n_stu_small/class_sz_small))
n_classes_large = max(1, int(n_stu_large/class_sz_large))

small = stu_df['class_size']=='SMALL'

# Running index of each student within their class-size group.
stu_df['ord'] = 0
stu_df.loc[small, 'ord'] = np.arange(n_stu_small)
stu_df.loc[~small, 'ord'] = np.arange(n_stu_large)

# Build class_id in a single integer expression. Filling a not-yet-existing
# column through two partial .loc assignments would start it as NaN and leave
# the ids as floats (7001.0, ...), unlike the integer school ids.
stu_df['class_id'] = np.where(
    small,
    7001 + (stu_df['ord'] % n_classes_small),
    8001 + (stu_df['ord'] % n_classes_large),
)

print(f"SMALL: {n_stu_small} students, {n_classes_small} classes, class_id range: {stu_df.loc[small,'class_id'].min()}-{stu_df.loc[small,'class_id'].max()}")
print(f"LARGE: {n_stu_large} students, {n_classes_large} classes, class_id range: {stu_df.loc[~small,'class_id'].min()}-{stu_df.loc[~small,'class_id'].max()}")


SMALL: 2294 students, 152 classes, class_id range: 7001.0-7152.0
LARGE: 3706 students, 123 classes, class_id range: 8001.0-8123.0


In [7]:
# Create class dataframe: one row per (class_id, class_size) with its head count.

students_per_class = stu_df.groupby(['class_id','class_size'])['ord'].count()
cls_df = students_per_class.rename('n_students').reset_index()

# class_size now lives on the class table; remove it from the student table.
stu_df = stu_df.drop(columns='class_size')

# Teacher MA: one uniform draw per class, compared against the MA probability
# that applies to the class's size.
n_cls = len(cls_df)
is_small_cls = (cls_df['class_size']=='SMALL').to_numpy()
ma_draw = np.random.uniform(size=n_cls)
cls_df['teacher_has_ma'] = ma_draw < np.where(is_small_cls, ma_small, ma_large)

# Each class is placed in one of n_schools schools (ids 9001..9000+n_schools),
# chosen uniformly at random.
cls_df['school_id'] = 9001 + np.random.randint(low=0, high=n_schools, size=n_cls)


In [8]:
# School effects dataframe: one row per school that received at least one
# class, with its class count and a school effect drawn from
# U[-school_effect_range, +school_effect_range].

sch_df = cls_df.groupby('school_id').agg(n_classes = ('class_id','count')).reset_index()

# Draw exactly one effect per row of sch_df. A school that happened to get no
# classes has no row here, so sizing the draw by n_schools would raise a
# length-mismatch error in that case.
sch_df['school_effect'] = np.random.uniform(low=-school_effect_range, high=school_effect_range, size=len(sch_df))


In [9]:
# Test scores

# Attach class attributes (class_size, teacher_has_ma, school_id, ...) and
# then school attributes (school_effect, ...) to every student row.
stu_df = (
    stu_df
    .merge(cls_df, on='class_id', how='left')
    .merge(sch_df, on='school_id', how='left')
)

# test_coefs order: intercept, ability, small class, teacher MA, school effect
coef_const, coef_ability, coef_small, coef_ma, coef_school = test_coefs

stu_df['test_score'] = (
    coef_const
    + coef_ability*stu_df['ability']
    + coef_small*(stu_df['class_size']=='SMALL')
    + coef_ma*(stu_df['teacher_has_ma'])
    + coef_school*(stu_df['school_effect'])
)

print(f"Test score range: {stu_df['test_score'].min()}-{stu_df['test_score'].max()}")


Test score range: 41.197146772054566-97.66281600969505


In [10]:
# Prepare output

# Columns written to each file. ability and school_effect stay out of the
# student file.
student_cols = ['student_id', 'class_id', 'school_id', 'test_score', 'family_income', 'race', 'cohort']
class_cols = ['class_id', 'school_id', 'n_students', 'class_size', 'teacher_has_ma']

# Shuffle the rows, then order them by class and number the students in that
# final order.
stu_df = (
    stu_df
    .sample(frac=1)
    .reset_index(drop=True)
    .sort_values(by=['class_id'], ascending=True)
)
stu_df['student_id'] = np.arange(n_stu)

stu_df[student_cols].to_csv(stu_file, header=True, index=False)
cls_df[class_cols].to_csv(cls_file, header=True, index=False)
