In [53]:
import numpy as np
import pandas as pd
# Seed NumPy's global random generator so the random draws in the cells below are repeatable
np.random.seed(100) 

In [54]:
# Simulation parameters and exogenous factors

# Cohort sizes
n_reg = 4000              # students in the regular cohort
n_exp = 2000              # students in the experimental cohort
n_stu = n_reg + n_exp     # total number of simulated students

# Race categories and the probability of drawing each one
race_codes = ['WHITE', 'BLACK', 'HISPANIC', 'ASIAN']
race_probs = [0.5, 0.15, 0.2, 0.15]

# Family income bounds; incdenom is the width of the income range on the log scale
fam_inc_min = 30000
fam_inc_max = 150000
incdenom = np.log(fam_inc_max) - np.log(fam_inc_min)

# Random draws. The order (race, then income, then ability noise) is kept so the
# random stream is consumed exactly as before.
race = np.random.choice(race_codes, n_stu, p=race_probs)
fam_inc = np.random.randint(fam_inc_min, fam_inc_max, n_stu)  # upper bound is exclusive

# Ability score that depends on family income:
# ability = 10*(log(inc)-log(inc_min))/(log(inc_max)-log(inc_min)) + U[0,25]
log_inc_gap = np.log(fam_inc) - np.log(fam_inc_min)
ability = 10 * log_inc_gap / incdenom + np.random.uniform(0, 25, n_stu)


In [55]:
# Build the student-level dataframe from the simulated columns
stu_df = pd.DataFrame({
    'family_income': fam_inc,
    'race': race,
    'ability': ability,
})

# Rows 0 .. n_reg-1 form the regular cohort; every other row stays experimental
stu_df['cohort'] = 'EXPERIMENTAL'
in_regular_cohort = stu_df.index < n_reg
stu_df.loc[in_regular_cohort, 'cohort'] = 'REGULAR'


In [56]:
# Regular cohort: the chance of landing in a small class rises with (log) family
# income and is higher for WHITE and ASIAN students
prob_small = (
    0.25
    + 0.2 * (np.log(fam_inc) - np.log(fam_inc_min)) / incdenom
    + 0.1 * (race == 'WHITE')
    + 0.15 * (race == 'ASIAN')
)

# One uniform draw per student (all cohorts); only regular-cohort rows are updated below
assign = prob_small > np.random.uniform(size=n_stu)

# Everyone starts in a large class; selected regular students are moved to small
stu_df['class_size'] = 'LARGE'
regular_rows = stu_df['cohort'].eq('REGULAR')
stu_df.loc[regular_rows & assign, 'class_size'] = 'SMALL'

In [57]:
# Experimental cohort: small-class assignment is a fair coin flip, independent of
# student characteristics
prob_small = 0.5

# One uniform draw per student (all cohorts); only experimental rows are updated below
assign = prob_small > np.random.uniform(size=n_stu)

experimental_rows = stu_df['cohort'].eq('EXPERIMENTAL')
stu_df.loc[experimental_rows & assign, 'class_size'] = 'SMALL'

In [58]:
# Allocate students to classes
class_sz_small = 15  # students per small class
class_sz_large = 30  # students per large class

# Number of students assigned to each class-size group
n_stu_small = int(np.sum(stu_df['class_size'] == 'SMALL'))
n_stu_large = int(np.sum(stu_df['class_size'] == 'LARGE'))

# Number of completely filled classes per group (floor division).
# NOTE(review): students left over after the last full class are not counted
# in any class here -- confirm whether a partially filled class should be added.
n_classes_small = n_stu_small // class_sz_small
n_classes_large = n_stu_large // class_sz_large




In [61]:
n_classes_large

106

In [3]:
# Simulate student-level variables

student_id = np.arange(n_students)  # student ids

# Exogenous characteristics. Income is drawn before race; this order is kept
# so the random stream is consumed exactly as before.
fam_inc = np.random.randint(fam_inc_min, fam_inc_max, size=n_students)  # simulate family income
race = np.random.choice(race_codes, size=n_students, p=race_probs)  # simulate race

# Ability: intercept + slope * log(family income) + U[0, umax] noise
ability = (
    ability_coefs[0]
    + ability_coefs[1] * np.log(fam_inc)
    + np.random.uniform(low=0, high=umax, size=n_students)
)

# Initial assignment: SMALL with probability small_assign_prob, otherwise LARGE
init_assign = np.random.choice(
    ["SMALL", "LARGE"],
    size=n_students,
    p=[small_assign_prob, 1 - small_assign_prob],
)

# Probability of reassignment to small if initially assigned to large;
# depends on ability and on being ASIAN
reassign_prob = (
    reassign_coefs[0]
    + reassign_coefs[1] * ability
    + reassign_coefs[2] * (race == 'ASIAN')
)

# Whether the student will be reassigned to small if initially assigned to large
reassign = reassign_prob > np.random.uniform(size=n_students)

# Actual assignment to small: initially small, or moved there by reassignment
small = (init_assign == 'SMALL') | reassign

# Flip the assignment for a random subset of students selected with
# probability assignment_noise (XOR with the noise mask flips exactly those rows)
noise = np.random.uniform(size=n_students) < assignment_noise
small = small ^ noise


In [35]:
stu_df

Unnamed: 0,family_income,race,ability,cohort,class_size
0,48350,BLACK,12.567292,REGULAR,
1,53830,WHITE,17.768527,REGULAR,SMALL
2,48342,WHITE,27.078016,REGULAR,
3,148027,HISPANIC,24.698552,REGULAR,SMALL
4,32591,WHITE,13.166611,REGULAR,SMALL
...,...,...,...,...,...
5995,145237,WHITE,16.057479,EXPERIMENTAL,
5996,130838,WHITE,22.975937,EXPERIMENTAL,
5997,35761,ASIAN,1.316183,EXPERIMENTAL,
5998,144389,HISPANIC,14.011621,EXPERIMENTAL,


In [4]:
# Allocate students to classrooms
n_small = np.sum(small)
n_large = n_students - n_small

# Within-group sequence number: 0, 1, 2, ... counted separately for students in
# small classes and for students in large classes.
# Integer dtype is used on purpose: np.zeros defaults to float64, which would
# turn the ids into floats (written to CSV as e.g. 70001.0).
order_id = np.zeros(n_students, dtype=np.int64)
order_id[small] = np.arange(n_small)
order_id[~small] = np.arange(n_large)

# Consecutive runs of size_small / size_large students share one classroom.
# Small-class ids start at 70001 and large-class ids at 80001, so the two
# ranges can be told apart by comparing against 80000.
small_id_base = 70001
large_id_base = 80001

class_id = np.zeros(n_students, dtype=np.int64)
class_id[small] = order_id[small] // size_small + small_id_base
class_id[~small] = order_id[~small] // size_large + large_id_base


In [5]:
# Student level dataframe: one row per student, one column per simulated variable
student_columns = {
    'student_id': student_id,
    'family_income': fam_inc,
    'race': race,
    'ability': ability,
    'initial_assignment': init_assign,
    'class_id': class_id,
}
stu_df = pd.DataFrame(student_columns)


In [6]:
# Class level dataframe: one row per distinct class id found in the student table
class_id = sorted(stu_df['class_id'].unique())
cls_df = pd.DataFrame({'class_id': class_id})

# Class ids below 80000 denote small classes; all others are large
class_small = cls_df['class_id'] < 80000
cls_df['class_size'] = 'SMALL'
cls_df.loc[~class_small, 'class_size'] = 'LARGE'

# Class counts
n_class = len(cls_df)
n_class_small = class_small.sum()
n_class_large = n_class - n_class_small

# Simulate whether the class teacher has a masters degree: one uniform draw per
# class, compared against the probability that applies to that class size
u = np.random.uniform(size=n_class)
cls_df['teacher_has_ma'] = False
for size_mask, ma_prob in (
    (class_small, teacher_ma_prob_small),
    (~class_small, teacher_ma_prob_large),
):
    cls_df.loc[size_mask, 'teacher_has_ma'] = u[size_mask] < ma_prob


In [7]:
# Attach each student's classroom attributes (class_size, teacher_has_ma)
stu_df = pd.merge(stu_df, cls_df, how='left', on='class_id')

# Test score: intercept + ability effect + small-class effect + teacher-MA effect
in_small_class = stu_df['class_size'] == 'SMALL'
stu_df['test_score'] = (
    test_score_coefs[0]
    + test_score_coefs[1] * stu_df['ability']
    + test_score_coefs[2] * in_small_class
    + test_score_coefs[3] * stu_df['teacher_has_ma']
)

In [8]:
# Output datasets: write the selected student and class columns to CSV
# (header row included, dataframe index omitted)
student_out_cols = ['student_id', 'class_id', 'test_score', 'family_income', 'race', 'initial_assignment']
class_out_cols = ['class_id', 'class_size', 'teacher_has_ma']

stu_df.to_csv('../data/class_size/students.csv', columns=student_out_cols, header=True, index=False)
cls_df.to_csv('../data/class_size/classes.csv', columns=class_out_cols, header=True, index=False)
