In [23]:
import pandas as pd

def _read_columns(path, columns):
    """Load only `columns` from the CSV at `path`, in the order listed.

    `usecols` stops the parser from materialising columns that would be
    discarded straight away. `read_csv` ignores the order of `usecols`, so the
    trailing selection puts the columns back in the requested order.
    """
    return pd.read_csv(path, usecols=columns)[columns]


# Create edfacts dataframe (only the columns used downstream)
assessment_df = _read_columns(
    'data/schools_edfacts_assessments_2020.csv',
    ['ncessch_num', 'year', 'read_test_pct_prof_midpt', 'math_test_pct_prof_midpt',
     'grade_edfacts', 'race', 'sex', 'lep', 'homeless', 'migrant', 'disability',
     'econ_disadvantaged', 'foster_care', 'military_connected'],
)

# Create LISD dataframe. The raw multi-year frame keeps every column under its
# own name; `lisd` is the narrowed copy, with the school id renamed so it shares
# the `ncessch_num` join key used by the other tables.
lisd_2018_2022 = pd.read_csv('data/LISD_1.0_2018-2022.csv')
lisd = lisd_2018_2022[
    ['ncessch', 'year', 'perflunch', 'perrlunch', 'perfrlunch', 'pernam',
     'perasian', 'perhisp', 'perblack', 'pertr', 'perwhite', 'level']
].rename(columns={'ncessch': 'ncessch_num'})

# Create meps dataframe
meps = _read_columns(
    'data/schools_meps.csv',
    ['ncessch_num', 'year', 'meps_poverty_pct'],
).rename(columns={'meps_poverty_pct': 'perpoverty'})

# Create schools dataframe (mailing-address fields get shorter names)
schools = _read_columns(
    'data/schools_ccd_directory.csv',
    ['ncessch_num', 'year', 'school_name', 'city_mailing', 'state_mailing',
     'zip_mailing', 'latitude', 'longitude'],
).rename(columns={'city_mailing': 'city', 'state_mailing': 'state', 'zip_mailing': 'zip'})

# Create RUCA dataframe
ruca = _read_columns('data/RUCA2010zipcode.csv', ['ZIP_CODE', 'RUCA1', 'RUCA2'])

# Create zip code to ZCTA dataframe
zip_to_zcta = _read_columns('data/ZIP_Code_ZCTA_Crosswalk.csv', ['ZIP_CODE', 'zcta'])

# Create ACS Deprivation Index dataframe
acs = _read_columns(
    'data/ACS_deprivation_index_zcta.csv',
    ['ZCTA5', 'fraction_assisted_income', 'fraction_high_school_edu', 'median_income',
     'fraction_no_health_ins', 'fraction_poverty', 'fraction_vacant_housing', 'dep_index'],
)

# Filter the multi-year tables down to 2020 before joining
lisd = lisd[lisd['year'] == 2020]
meps = meps[meps['year'] == 2020]
schools = schools[schools['year'] == 2020]

# Merge dataframes: every join is a left join, so each assessment row is kept
# even when a lookup table has no match for it.
df = pd.merge(assessment_df, lisd, on=['ncessch_num', 'year'], how='left')
df = pd.merge(df, meps, on=['ncessch_num', 'year'], how='left')
df = pd.merge(df, schools, on=['ncessch_num', 'year'], how='left')
# Both zip lookups carry a ZIP_CODE column, so after the second of these merges
# the frame holds ZIP_CODE_x (from ruca) and ZIP_CODE_y (from zip_to_zcta).
df = pd.merge(df, ruca, left_on='zip', right_on='ZIP_CODE', how='left')
df = pd.merge(df, zip_to_zcta, left_on='zip', right_on='ZIP_CODE', how='left')
df = pd.merge(df, acs, left_on='zcta', right_on='ZCTA5', how='left')
# NOTE(review): none of the joins checks that the right-hand keys are unique;
# a duplicated key would silently multiply assessment rows — confirm against the data.

# Keep only rows where level is High/Secondary (rows with no LISD match have a
# null level and are dropped here as well).
df = df[df['level'] == 'High/Secondary']

print(df.head())
print(df.shape)



  schools = pd.read_csv('data/schools_ccd_directory.csv')


    ncessch_num  year  read_test_pct_prof_midpt  math_test_pct_prof_midpt  \
47  10000500871  2020                      22.0                      19.0   
48  10000500871  2020                      10.0                      10.0   
49  10000500871  2020                      12.0                       7.5   
50  10000500871  2020                      22.0                      22.0   
51  10000500871  2020                       5.0                       5.0   

    grade_edfacts  race  sex  lep  homeless  migrant  ...  ZIP_CODE_y  \
47             99    99   99   99        99       99  ...     35950.0   
48             99    99   99   99        99       99  ...     35950.0   
49             99    99   99   99        99       99  ...     35950.0   
50             99    99    2   99        99       99  ...     35950.0   
51             99    99   99    1        99       99  ...     35950.0   

       zcta    ZCTA5  fraction_assisted_income  fraction_high_school_edu  \
47  35950.0  35950.0  

In [24]:
# Count the missing values in each column of the merged frame.
nulls = df.isna().sum()
print(nulls)

ncessch_num                     0
year                            0
read_test_pct_prof_midpt    22313
math_test_pct_prof_midpt    21684
grade_edfacts                   0
race                            0
sex                             0
lep                             0
homeless                        0
migrant                         0
disability                      0
econ_disadvantaged              0
foster_care                     0
military_connected              0
perflunch                       0
perrlunch                       0
perfrlunch                      0
pernam                          0
perasian                        0
perhisp                         0
perblack                        0
pertr                           0
perwhite                        0
level                           0
perpoverty                  14922
school_name                     0
city                            0
state                           0
zip                             0
latitude      

In [25]:
# Drop the join-helper key columns that the zip / ZCTA merges left in the frame.
# errors='ignore' keeps this cell re-runnable: `df` is reassigned to the result,
# so a second execution would otherwise raise KeyError for columns already removed.
df = df.drop(columns=['ZIP_CODE_x', 'ZIP_CODE_y', 'ZCTA5'], errors='ignore')
print(df.head())
print(df.shape)

    ncessch_num  year  read_test_pct_prof_midpt  math_test_pct_prof_midpt  \
47  10000500871  2020                      22.0                      19.0   
48  10000500871  2020                      10.0                      10.0   
49  10000500871  2020                      12.0                       7.5   
50  10000500871  2020                      22.0                      22.0   
51  10000500871  2020                       5.0                       5.0   

    grade_edfacts  race  sex  lep  homeless  migrant  ...  RUCA1  RUCA2  \
47             99    99   99   99        99       99  ...    4.0    4.0   
48             99    99   99   99        99       99  ...    4.0    4.0   
49             99    99   99   99        99       99  ...    4.0    4.0   
50             99    99    2   99        99       99  ...    4.0    4.0   
51             99    99   99    1        99       99  ...    4.0    4.0   

       zcta  fraction_assisted_income  fraction_high_school_edu  \
47  35950.0        

In [26]:
# Recount per-column missing values now that the helper columns are gone.
nulls = df.isna().sum()
print(nulls)

ncessch_num                     0
year                            0
read_test_pct_prof_midpt    22313
math_test_pct_prof_midpt    21684
grade_edfacts                   0
race                            0
sex                             0
lep                             0
homeless                        0
migrant                         0
disability                      0
econ_disadvantaged              0
foster_care                     0
military_connected              0
perflunch                       0
perrlunch                       0
perfrlunch                      0
pernam                          0
perasian                        0
perhisp                         0
perblack                        0
pertr                           0
perwhite                        0
level                           0
perpoverty                  14922
school_name                     0
city                            0
state                           0
zip                             0
latitude      

In [27]:
# Save the merged frame to disk as CSV; the row index is left out of the file.
df.to_csv(path_or_buf='data/merged_2020_hs.csv', index=False)