In [39]:
import boto3
import sys,os
sys.path.append("..")
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from swampy import structshape as ss

### Ingest school directory dataset from S3

In [40]:
# Fetch the school directory CSV from S3 only when no local copy exists yet,
# so re-running the notebook does not download the file again.
local_dir_filename = 'directory_all.csv.bak'
conn = boto3.client('s3')
already_downloaded = os.path.exists('./' + local_dir_filename)
if not already_downloaded:
    conn.download_file(
        'edu-data-bucket',
        'directory/schools_ccd_directory.csv',
        local_dir_filename,
    )

In [41]:
# Read the locally cached directory CSV into a DataFrame.
df0 = pd.read_csv(
    local_dir_filename,
    low_memory=False,
)

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

### Shape, datatype, gross feature analysis

In [None]:
# Display the frame's (rows, columns) shape together with each column's dtype.
(df0.shape, df0.dtypes)

In [None]:
# Preview the first three rows of the raw directory data.
df0.head(3)

Observations
* There are many years and schools in this dataset that are not in the graduation rate dataset
* `ncessch` and `ncessch_num` appear to be duplicates
* Features shared with the Adjusted Cohort dataset are:
    * ncessch
    * school_name
    * leaid
    * lea_name
    * state_location
    * fips

In [None]:

# Allow up to 99 columns in rendered output and show floats with one decimal
# place, then display summary statistics for the raw directory data.
pd.set_option('display.max_columns', 99)
pd.options.display.float_format = '{:.1f}'.format
df0.describe()

### Strategy
* Subset to our years of interest and potentially interesting columns
* Explore missingness, distributions, outliers 

In [None]:
# Keep only the rows whose `year` falls in 2010 through 2018
# (the range end, 2019, is exclusive).
years = [*range(2010, 2019)]
our_years = df0.query('year in @years')

# Allow up to 99 columns in the preview so no columns are elided.
pd.set_option('display.max_columns', 99)
our_years.head(3)

### Codebook
* variable	format	label
* year	numeric	Academic year (fall semester)
* ncessch	string	National Center for Education Statistics (NCES) identification number
* ncessch_num	numeric	National Center for Education Statistics (NCES) identification number (numeric)
* school_id	string	School identification number (NCES)
* school_name	string	School name
* leaid	string	Local education agency identification number (NCES)
* lea_name	string	Local education agency name
* state_leaid	string	Local education agency identification number (state)
* seasch	string	School identification number (state)
* street_mailing	string	Street of mailing address 
* city_mailing	string	City of mailing address
* state_mailing	string	State of mailing address 
* zip_mailing	string	Zip code of mailing address 
* street_location	string	Street of location
* city_location	string	City of location
* state_location	string	State of location
* zip_location	string	Zip code of location
* phone	string	Telephone number
* fips	fips	Federal Information Processing Standards state code
* latitude	numeric	Latitude of institution
* longitude	numeric	Longitude of institution
* csa	numeric	Combined statistical area
* cbsa	numeric	Core-based statistical area
* urban_centric_locale	urban_centric_locale	Degree of urbanization (urban-centric locale)
* county_code	numeric	County code
* congress_district_id	numeric	State and 114th congressional district identification number
* state_leg_district_lower	string	State legislative district—lower
* state_leg_district_upper	string	State legislative district—upper
* school_level	school_level	School level 
* school_type	school_type	School type 
* school_status	school_status	Status at start of school year
* lowest_grade_offered	grade_offered_ccd	Lowest grade offered
* highest_grade_offered	grade_offered_ccd	Highest grade offered 
* elem_cedp	yes_no	Elementary school indicator (Center on Education Data and Policy variable)
* middle_cedp	yes_no	Middle school indicator (Center on Education Data and Policy variable)
* high_cedp	yes_no	High school indicator (Center on Education Data and Policy variable)
* ungrade_cedp	yes_no	Ungraded school indicator (Center on Education Data and Policy variable)
* bureau_indian_education	yes_no	Bureau of Indian Education school
* title_i_status	title_i_status	Title I status
* title_i_eligible	yes_no	Title I eligibility
* title_i_schoolwide	yes_no	Schoolwide Title I eligibility
* charter	yes_no	Charter school
* magnet	yes_no	Magnet school
* shared_time	yes_no	Shared time
* virtual	virtual	Virtual school 
* teachers_fte	numeric	Number of full-time equivalent teachers
* lunch_program	lunch_program	National School Lunch Program (NSLP) status
* free_lunch	numeric	Number of students eligible for free lunch
* reduced_price_lunch	numeric	Number of students eligible for reduced-price lunch
* free_or_reduced_price_lunch	numeric	Number of students eligible for free or reduced-price lunch
* direct_certification	numeric	Number of students eligible for free lunch by direct certification
* enrollment	numeric	Student enrollment


In [None]:
# TODO: subset to potentially interesting columns (this cell is still empty)



Verify that `ncessch` and `ncessch_num` are duplicates before dropping `ncessch_num`, since we plan to merge the graduation dataset with this one.

In [None]:
# Confirm that ncessch_num carries no information beyond ncessch, then drop it.
# The membership guard keeps this cell safe to re-run: once the column is gone
# there is nothing left to verify or remove.
if 'ncessch_num' in our_years.columns:
    # Compare numerically. The codebook lists ncessch as a string and
    # ncessch_num as numeric, so a raw == could report a mismatch purely
    # because of differing dtypes. Values that cannot be parsed become NaN,
    # which never compares equal, so they correctly fail the check.
    ids_match = pd.to_numeric(our_years['ncessch'], errors='coerce') == our_years['ncessch_num']
    assert ids_match.all(), 'ncessch and ncessch_num differ for at least one row'

    # Rebind instead of mutating in place, and use the real column name
    # (the original targeted 'ncessch_nums', which does not exist).
    our_years = our_years.drop(columns=['ncessch_num'])

### Missingness

In [None]:
# Show the per-column null counts, keeping only columns with more than
# 5000 missing values.
our_years.isnull().sum().loc[lambda null_counts: null_counts.values > 5000]

In [42]:
# TODO: investigate columns with high missingness and columns containing negative values