### Ingest math and language arts state test participation 

In [2]:
# Look at columns in files one at a time before we upload to worm
import os,sys,pandas as pd, numpy as np,seaborn as sns

In [3]:
dataset_dir = "/home/bb/datasets/state_test_participation/"

os.chdir(dataset_dir)
math_files = []
rla_files = []
for root, dirs, files in os.walk('.',topdown=False):
    for f in files:
        if f.startswith("math"):
            math_files += [f]
        if f.startswith("rla"):
            rla_files += [f]
        print(f)

math-participation-sch-sy2018-19-wide.csv
rla-participation-sch-sy2012-13.csv
rla-participation-sch-sy2015-16.csv
math-participation-sch-sy2016-17.csv
math-participation-sch-sy2012-13.csv
math-participation-sch-sy2014-15.csv
math-participation-sch-sy2013-14.csv
rla-participation-sch-sy2018-19-wide.csv
rla-participation-sch-sy2017-18.csv
math-participation-sch-sy2015-16.csv
rla-participation-sch-sy2016-17.csv
rla-participation-sch-sy2013-14.csv
math-participation-sch-sy2017-18.csv
rla-participation-sch-sy2014-15.csv


In [4]:
# We are missing the years 2010-2011 and 2011-2012. Possibly we can impute the values.
# Lets look at the feature overlap

In [5]:
# We have 7 years of Math percent participated scores
len(math_files), len(rla_files)

(7, 7)

In [6]:
# Create raw math and rla dataframe lists
math_dfs = [pd.read_csv(dataset_dir + "/" +  i,low_memory=False) for i in math_files]
rla_dfs = [pd.read_csv(dataset_dir + "/" + i,low_memory=False) for i in rla_files]

In [7]:
# Example of the shape of one of the datrames
math_dfs[0].shape, math_dfs[0].columns.tolist()

((90320, 265),
 ['STNAM',
  'FIPST',
  'LEANM',
  'LEAID',
  'ST_LEAID',
  'NCESSCH',
  'ST_SCHID',
  'SCHNAM',
  'DATE_CUR',
  'ALL_MTH00numpart_1819',
  'ALL_MTH00pctpart_1819',
  'MAM_MTH00numpart_1819',
  'MAM_MTH00pctpart_1819',
  'MAS_MTH00numpart_1819',
  'MAS_MTH00pctpart_1819',
  'MBL_MTH00numpart_1819',
  'MBL_MTH00pctpart_1819',
  'MHI_MTH00numpart_1819',
  'MHI_MTH00pctpart_1819',
  'MTR_MTH00numpart_1819',
  'MTR_MTH00pctpart_1819',
  'MWH_MTH00numpart_1819',
  'MWH_MTH00pctpart_1819',
  'F_MTH00numpart_1819',
  'F_MTH00pctpart_1819',
  'M_MTH00numpart_1819',
  'M_MTH00pctpart_1819',
  'CWD_MTH00numpart_1819',
  'CWD_MTH00pctpart_1819',
  'ECD_MTH00numpart_1819',
  'ECD_MTH00pctpart_1819',
  'FCS_MTH00numpart_1819',
  'FCS_MTH00pctpart_1819',
  'LEP_MTH00numpart_1819',
  'LEP_MTH00pctpart_1819',
  'HOM_MTH00numpart_1819',
  'HOM_MTH00pctpart_1819',
  'MIG_MTH00numpart_1819',
  'MIG_MTH00pctpart_1819',
  'MIL_MTH00numpart_1819',
  'MIL_MTH00pctpart_1819',
  'ALL_MTH03numpar

In [11]:
# Inspecting the codebook, we see that there are only two columns relevant to our study.
# These are the percentage participation numbers in math and reading / language arts state assessments.
# Extract the total participation column names that are relevant to high schools 
list(map(lambda x : print(x.filter(like="ALL_MTHHSPCT").columns),math_dfs))
list(map(lambda x : print(x.filter(like="ALL_RLAHSPCT").columns),rla_dfs))
list(map(lambda x : print(x.filter(like="NCESSCH").columns),math_dfs))
list(map(lambda x : print(x.filter(like="NCESSCH").columns),rla_dfs))

Index([], dtype='object')
Index(['ALL_MTHHSPCTPART_1617'], dtype='object')
Index(['ALL_MTHHSPCTPART_1213'], dtype='object')
Index(['ALL_MTHHSPCTPART_1415'], dtype='object')
Index(['ALL_MTHHSPCTPART_1314'], dtype='object')
Index(['ALL_MTHHSPCTPART_1516'], dtype='object')
Index(['ALL_MTHHSPCTPART_1718'], dtype='object')
Index(['ALL_RLAHSPCTPART_1213'], dtype='object')
Index(['ALL_RLAHSPCTPART_1516'], dtype='object')
Index([], dtype='object')
Index(['ALL_RLAHSPCTPART_1718'], dtype='object')
Index(['ALL_RLAHSPCTPART_1617'], dtype='object')
Index(['ALL_RLAHSPCTPART_1314'], dtype='object')
Index(['ALL_RLAHSPCTPART_1415'], dtype='object')
Index(['NCESSCH'], dtype='object')
Index(['NCESSCH'], dtype='object')
Index(['NCESSCH'], dtype='object')
Index(['NCESSCH'], dtype='object')
Index(['NCESSCH'], dtype='object')
Index(['NCESSCH'], dtype='object')
Index(['NCESSCH'], dtype='object')
Index(['NCESSCH'], dtype='object')
Index(['NCESSCH'], dtype='object')
Index(['NCESSCH'], dtype='object')
Index(['NC

[None, None, None, None, None, None, None]

### Strategy:
* Subset the dataframes to our columns of interest e.g. (NCESSCH, ALL_MTHHSPCTPART_1213, ALL_RLAHSPCTPART_1213)
* Create one large math and one large language arts dataframe with all the years stacked, so that it matches our graduation rate and school directory dataframes
* Explore statistics and missing values
* Save these two dataframes to disk

In [9]:
def year_string(y: int):
    """Input an integer year and get a range that matches the column suffixes in the raw data.
    e.g. 2011 => 1112 and 2018 => 1819."""
    return str(y)[-2:] + str(int(str(y)[-2:]) + 1)

In [34]:
math_df = pd.DataFrame(columns=["Year", "NCESSCH", "PCT_PART"])
rla_df = pd.DataFrame(columns=["Year", "NCESSCH", "PCT_PART"])

new_dfs = []
for df in math_dfs:
    pct_part = df.loc[:,['ALL_MTHHSPCT' in i or 'ALL_MTHHSpct' in i for i in df.columns]]
    year = int("20" + pct_part.columns[0][-4:-2])
    year_df = pd.DataFrame()
    year_df['Year'] = np.array([year] * len(pct_part))
    ncessch = df.filter(like='NCESSCH')
    assert isinstance(pct_part,pd.DataFrame)
    assert isinstance(year_df,pd.DataFrame)
    assert isinstance(ncessch,pd.DataFrame)
    new_df = pd.concat([year_df,ncessch,pct_part])
    new_dfs += [new_df]

In [35]:
new_dfs[0]

Unnamed: 0,Year,NCESSCH,ALL_MTHHSpctpart_1819
0,2018.0,,
1,2018.0,,
2,2018.0,,
3,2018.0,,
4,2018.0,,
...,...,...,...
90315,,,GE90
90316,,,GE95
90317,,,
90318,,,
