In [158]:
## Import packages
library(aod)
library(plyr)
library(mgcv)
library(stringr)
library(stargazer)
library(dplyr)
library(corrplot)
library(purrr)
library(ggplot2)


## Prepare dataframe

### Load data sets

In [170]:
# --- SEDA achievement data -------------------------------------------------
# Read the district-level SEDA file and keep only the rows whose `stateabb`
# is "CA" and whose `subcat` is "all".
print('Import Stanford Educational Data, this contains info on school learning rates and demographics.
However, this data is missing census tract info.')
#seda_data <- read.csv("seda_county_pool_gcs_4.1.csv") ## TODO: which is the right dataset?
seda_data <- read.csv("seda_geodist_pool_gcs_4.1.csv") %>%
    filter(stateabb == "CA", subcat == "all")
print("'seda_data' head:")
head(seda_data)

# --- EPA Smart Location Database -------------------------------------------
# Provides the walkability index columns (e.g. NatWalkInd) used later.
print('Import smart location data, this contains walkability index info for census tracts')
location_data <- read.csv("EPA_SmartLocationDatabase_V3_Jan_2021_Final.csv")
print("'location_data' head:")
head(location_data)

# --- NCES district-to-tract relationship file ------------------------------
# Bridges school districts (LEAID / NAME_LEA21) to census tracts (TRACT).
print('To join seda data with location_data we need to match school district names to census tract info. 
This can be accomplished with the grf21_lea_tract data, which pairs school district names to census tracts
Source: https://nces.ed.gov/programs/edge/Geographic/RelationshipFiles')
grf21_lea_tract <- read.csv("grf21_lea_tract.csv")
print("'grf21_lea_tract' head:")
head(grf21_lea_tract)

[1] "Import Stanford Educational Data, this contains info on school learning rates and demographics.\nHowever, this data is missing census tract info."
[1] "'seda_data' head:"


sedalea,sedaleaname,fips,stateabb,subcat,subgroup,gradecenter,gap,tot_asmts,cellcount,...,gcs_mn_grd_ol_se,gcs_mn_mth_ol_se,gcs_mn_avg_eb,gcs_mn_coh_eb,gcs_mn_grd_eb,gcs_mn_mth_eb,gcs_mn_avg_eb_se,gcs_mn_coh_eb_se,gcs_mn_grd_eb_se,gcs_mn_mth_eb_se
600001,ACTON-AGUA DULCE UNIFIED,6,CA,all,all,5.5,0,10225,93,...,0.03825889,0.11524657,4.872226,0.01859092,1.0330263,-0.44674155,0.05714427,0.01851047,0.03689886,0.1103038
600006,ROSS VALLEY ELEMENTARY,6,CA,all,all,5.5,0,21479,92,...,0.03815837,0.10854442,7.342026,0.07784535,1.1493115,-0.75214852,0.05477293,0.01750866,0.03680237,0.10439633
600009,CUYAMA JOINT UNIFIED,6,CA,all,all,5.5,0,1386,70,...,0.07458805,0.20950904,3.172816,-0.01572943,0.9669531,0.06818565,0.11067105,0.0361308,0.06598205,0.18314775
600011,FORT SAGE UNIFIED,6,CA,all,all,5.5,0,2722,78,...,0.05885866,0.1675252,3.73623,0.09723531,1.0414415,-0.40530377,0.08571303,0.02626744,0.05433261,0.15325524
600012,TWIN RIDGES ELEMENTARY,6,CA,all,all,5.5,0,869,82,...,0.07909614,0.23289394,3.736791,0.10728047,1.0748518,-0.27237938,0.11572343,0.0360274,0.06880173,0.19866493
600013,ROCKLIN UNIFIED,6,CA,all,all,5.5,0,98286,98,...,0.03026514,0.09270408,6.691219,0.07240027,1.1825471,-0.17404206,0.04606905,0.01483906,0.02956309,0.09006892


[1] "Import smart location data, this contains walkability index info for census tracts"
[1] "'location_data' head:"


OBJECTID,GEOID10,GEOID20,STATEFP,COUNTYFP,TRACTCE,BLKGRPCE,CSA,CSA_Name,CBSA,...,D5DRI,D5DE,D5DEI,D2A_Ranked,D2B_Ranked,D3B_Ranked,D4A_Ranked,NatWalkInd,Shape_Length,Shape_Area
1,481130000000.0,481130000000.0,48,113,7825,4,206,"Dallas-Fort Worth, TX-OK",19100,...,0.1846967,0.000475598,0.1377067,6,14,15,17,14.0,3110.361,297836.1
2,481130000000.0,481130000000.0,48,113,7825,2,206,"Dallas-Fort Worth, TX-OK",19100,...,0.3232213,0.000800804,0.2318678,3,10,12,14,10.833333,3519.469,484945.1
3,481130000000.0,481130000000.0,48,113,7825,3,206,"Dallas-Fort Worth, TX-OK",19100,...,0.3146279,0.000736145,0.2131463,1,1,7,17,8.333333,1697.092,106705.9
4,481130000000.0,481130000000.0,48,113,7824,1,206,"Dallas-Fort Worth, TX-OK",19100,...,0.2298209,0.000708073,0.2050183,16,10,17,17,15.666667,2922.609,481828.4
5,481130000000.0,481130000000.0,48,113,7824,2,206,"Dallas-Fort Worth, TX-OK",19100,...,0.1648628,0.000432736,0.125296,4,7,11,14,10.166667,3731.972,687684.8
6,481130000000.0,481130000000.0,48,113,7827,1,206,"Dallas-Fort Worth, TX-OK",19100,...,0.189063,0.000467412,0.1353362,1,4,5,13,6.833333,3109.652,205126.8


[1] "To join seda data with location_data we need to match school district names to census tract info. \nThis can be accomplished with the grf21_lea_tract data, which pairs school district names to census tracts\nSource: https://nces.ed.gov/programs/edge/Geographic/RelationshipFiles"
[1] "'grf21_lea_tract' head:"


LEAID,NAME_LEA21,TRACT,COUNT,LANDAREA,WATERAREA
100001,Fort Rucker School District,1031010300,2,23.42836917,0.0
100001,Fort Rucker School District,1045020000,2,66.51340431,1.081745166
100003,Maxwell AFB School District,1101000900,2,3.356590455,0.143795261
100003,Maxwell AFB School District,1101001000,2,0.001525876,0.0
100005,Albertville City School District,1095030701,9,2.125781664,0.0
100005,Albertville City School District,1095030702,9,0.800889811,0.009929776


### Join grf21_lea_tract and location_data into new 'df'

In [171]:
print('As we see from the head printouts above, the "tract" column in grf21_lea_tract includes STATEFP and COUNTYFP.
To join this with location_data we split it into STATEFP, COUNTYFP and TRACTCE and join on all three. Since we\'re only looking at CA, we keep only the CA tracts.')
# TRACT is the census tract GEOID: 2-digit state + 3-digit county + 6-digit
# tract. read.csv() parses it as a number, so leading zeros are dropped and the
# value has 10 or 11 digits depending on the state. Fixed character positions
# (the old substr(TRACT, 5, 10)) are therefore unreliable, and they produced
# zero-padded strings that never matched the numeric TRACTCE in location_data.
# Splitting arithmetically is independent of the number of digits.
tract_geoid <- as.numeric(as.character(grf21_lea_tract$TRACT))
grf21_lea_tract$STATEFP <- as.integer(tract_geoid %/% 1e9)
grf21_lea_tract$COUNTYFP <- as.integer((tract_geoid %/% 1e6) %% 1e3)
grf21_lea_tract$TRACTCE <- as.integer(tract_geoid %% 1e6)

# Keep only California (state FIPS 06); which() drops rows with an NA TRACT.
ca_state_fips <- 6L
grf21_ca <- grf21_lea_tract[which(grf21_lea_tract$STATEFP == ca_state_fips), ]

# A tract code is only unique within a county, so the join key must be the
# full (state, county, tract) triple. Joining on TRACTCE alone pairs unrelated
# tracts from different counties and states.
# Note: location_data has one row per block group (BLKGRPCE), so each
# district/tract pair still expands to one row per block group in that tract.
df <- merge(grf21_ca, location_data, by = c("STATEFP", "COUNTYFP", "TRACTCE"))


[1] "As we see from the head printouts above, the \"tract\" column in grf21_lea_tract includes STATEFP and COUNTYFP.\nTo join this with location_data we have to remove this prepended info. Since we're only looking at CA, this is a valid thing to do."


### Join df with seda_data

In [172]:
print("There's a mismatch in how district names appear:")
head(df$NAME_LEA21)
head(seda_data$sedaleaname)
print("we can correct that by removing ' SCHOOL DISTRICT' from the df column:")
# Upper-case the NCES district name and strip a trailing " SCHOOL DISTRICT".
# The suffix is removed only when it is actually present; blindly cutting the
# last 16 characters would mangle names that do not end with that suffix.
district_names_upper <- trimws(toupper(as.character(df$NAME_LEA21)))
df$sedaleaname <- sub(" SCHOOL DISTRICT$", "", district_names_upper)
head(df$sedaleaname)

[1] "There's a mismatch in how district names appear:"


[1] "we can correct that by removing ' SCHOOL DISTRICT' from the df column:"


In [None]:
# For every SEDA district, pick the rows of `df` whose normalised district name
# (df$sedaleaname) contains the SEDA district name, tag them with that
# district's gcs_mn_grd_ol_se value, and stack all matches into `df_temp`.
#
# NOTE(review): seda_data$sedalea looks like it may be the same identifier as
# grf21_lea_tract$LEAID; if so, joining on that id would be more reliable than
# name matching. Confirm against the SEDA documentation.
num_rows <- nrow(seda_data)
matched_subsets <- vector("list", num_rows)
for (row in seq_len(num_rows)) {
    seda_row <- seda_data[row, ]
    school_district_name <- str_trim(as.character(seda_row$sedaleaname))
    # An empty pattern would match every row, and NA would propagate; skip both.
    if (is.na(school_district_name) || !nzchar(school_district_name)) {
        next
    }
    # fixed = TRUE: treat the district name as literal text, not as a regex
    # (names may contain characters such as "." or "(").
    df_subset <- df %>%
        filter(grepl(school_district_name, sedaleaname, fixed = TRUE))
    # TODO: is gcs_mn_grd_ol_se what we want? we need to figure out what the learning index value should be
    df_subset$gcs_mn_grd_ol_se <- rep(seda_row$gcs_mn_grd_ol_se,
                                      times = nrow(df_subset))
    print(df_subset)
    matched_subsets[[row]] <- df_subset
}
# Bind once at the end. The previous rbind() inside the loop discarded its
# result, so only the first district's rows were ever kept.
df_temp <- bind_rows(matched_subsets)
head(df_temp)

In [None]:
# Cleanup: from here on `df` refers to the SEDA-joined table.
df <- df_temp

In [None]:
# TODO: Do we need this even?
# Load California.csv and preview its first rows.
california_data <- read.csv(file = "California.csv")
head(california_data, n = 6L)

In [None]:
# TODO: Do we need this even?
# Load corsonetal.csv and preview its first rows.
corsonetal_data <- read.csv(file = "corsonetal.csv")
head(corsonetal_data, n = 6L)

In [None]:
# TODO: Do we need this even?
# Load marionetal.csv and preview its first rows.
marionetal_data <- read.csv(file = "marionetal.csv")
head(marionetal_data, n = 6L)