# Explore missingness in charter schools data

Author: Jaren Haber<br>
Project Manager: Jaren Haber, PhD Candidate <br>
Contact: jhaber@berkeley.edu

Institution: University of California, Berkeley <br>
Program: Undergraduate Research Apprentice Program (URAP) <br>

Date created: 11-09-18<br>
Last modified: 11-09-18

Description: 

## Initialize

### Import packages

In [None]:
import pandas as pd # For working with DataFrames
import gc # For speeding up loading pickle files ('gc' = 'garbage collector')
import ast # For working with strings
import numpy as np # For numerical things
import re # For cleaning webtext

### Define file paths

In [1]:
# Input files:
charters_path = "../../nowdata/charters_2015.pkl"
pubschools_path = "../../nowdata/pubschools_2015.pkl"

In [3]:
# Output files:
#charters_storepath = "../../nowdata/backups/charters_full_2015_250_v2a.pkl"
#pubschools_storepath = "../../nowdata/backups/pubschools_full_2015_CRDC.pkl"

### Define helper functions 

In [6]:
def convert_df(df, ignore_list):
    """Makes a Pandas DataFrame more memory-efficient through intelligent use of Pandas data types: 
    specifically, by storing columns with repetitive Python strings not with the object dtype for unique values 
    (entirely stored in memory) but as categoricals, which are represented by repeated integer values. This is a 
    net gain in memory when the reduced memory size of the category type outweighs the added memory cost of storing 
    one more thing. As such, this function checks the degree of redundancy for a given column before converting it."""
    
    # Remove specified columns to avoid conversion errors, those that shouldn't have their dtype converted
    # e.g., columns that are large lists of tuples, like "WEBTEXT" or "CMO_WEBTEXT", should stay as 'object' dtype
    ignore_df = df[ignore_list]
    df.drop(ignore_list, axis=1, inplace=True)
    
    converted_df = pd.DataFrame() # Initialize DF for memory-efficient storage of strings (object types)
    df_obj = df.select_dtypes(include=['object']).copy() # Filter to only those columns of object data type

    # Loop through all columns that have 'object' dtype, b/c we especially want to convert these if possible:
    for col in df.columns: 
        if col in df_obj: 
            num_unique_values = len(df_obj[col].unique())
            num_total_values = len(df_obj[col])
            if (num_unique_values / num_total_values) < 0.5: # Only convert data types if at least half of values are duplicates
                converted_df.loc[:,col] = df[col].astype('category') # Store these columns as dtype "category"
            else: 
                converted_df.loc[:,col] = df[col]
        else:    
            converted_df.loc[:,col] = df[col]
                      
    # Downcast dtype to reduce memory drain
    converted_df.select_dtypes(include=['float']).apply(pd.to_numeric,downcast='float')
    converted_df.select_dtypes(include=['int']).apply(pd.to_numeric,downcast='signed')
    
    # Reintroduce ignored columns into resulting DF
    for col in ignore_list:
        converted_df[col] = ignore_df[col]
    
    return converted_df

## Load data

In [7]:
gc.disable() # disable garbage collector (to speed things up)
charterdf = pd.read_pickle(charters_path)
gc.enable() # enable garbage collector (for cleanliness)

In [8]:
print("#rows, #cols: ", charterdf.shape)
list(charterdf)

#rows, #cols:  (10965, 702)


['NCESSCH',
 'URL',
 'LAT1516',
 'LON1516',
 'AM',
 'AS',
 'BL',
 'HI',
 'HP',
 'TR',
 'TOTFRL',
 'CHARTER_TEXT',
 'WEBSITE',
 'MEMBER',
 'LEVEL',
 'LOCALE15',
 'LEAID',
 'LSTREET1',
 'LSTREET2',
 'LSTREET3',
 'LCITY',
 'LSTATE',
 'LZIP',
 'CMO_NAME',
 'CMO_MEMSUM',
 'SCH_NAME',
 'CMO_STATE',
 'CMO_SCHNUM',
 'CMO_URL',
 'CMO_NUMSTATES',
 'CMO_ALLSTATES',
 'CMO_SECTOR',
 'CMO_NUMSTUDENTS_CREDO17',
 'CMO_TYPE',
 'SURVYEAR',
 'FIPST',
 'STABR',
 'SEANAME',
 'ST_LEAID',
 'SCHID',
 'ST_SCHID',
 'MSTREET1',
 'MSTREET2',
 'MSTREET3',
 'MCITY',
 'MSTATE',
 'MZIP',
 'MZIP4',
 'PHONE',
 'LZIP4',
 'UNION',
 'OUT_OF_STATE_FLAG',
 'SCH_TYPE_TEXT',
 'SCH_TYPE',
 'RECON_STATUS',
 'GSLO',
 'GSHI',
 'VIRTUAL',
 'BIES',
 'SY_STATUS_TEXT',
 'SY_STATUS',
 'UPDATED_STATUS_TEXT',
 'UPDATED_STATUS',
 'EFFECTIVE_DATE',
 'G13OFFERED',
 'AEOFFERED',
 'UGOFFERED',
 'NOGRADES',
 'CHARTAUTH1',
 'CHARTAUTHN1',
 'CHARTAUTH2',
 'CHARTAUTHN2',
 'IGOFFERED',
 'FRELCH',
 'REDLCH',
 'AE',
 'TOTAL',
 'AMALM',
 'AMALF',
 '

In [9]:
#show wether there are duplicates or not for the target column of a dataframe

print("Number duplicates: ", sum(charterdf.duplicated(subset='NCESSCH', keep='first')))
print("# entries total: ", len(charterdf['NCESSCH']))
print("# unique entries: ", len(charterdf['NCESSCH'].unique()))

Number duplicates:  0
# entries total:  10965
# unique entries:  10965


In [11]:
# Define variables to keep:
# SE_T002_002 = "Population density per sq. mile"
# PCT_SE_T113_002 = "% of families whose income is below poverty level"
# PCTETH_SD = "% total population nonwhite"
keepvars = ['LEVEL', 'MEMBER', 'SE_T002_002', 'AGE', 'PCTETH', 'PCTFRL', 'PCTETH_SD', 'PCT_SE_T113_002', 'ESS_RATIO', 'constant', 'PROG_RATIO', 'INQ_RATIO', 'DISC_RATIO', 'STABR', 'LEAID']
charterdf = charterdf[keepvars]
list(charterdf)

KeyError: "['constant'] not in index"

In [12]:
charterdf.head(20)

Unnamed: 0,NCESSCH,URL,LAT1516,LON1516,AM,AS,BL,HI,HP,TR,...,ESS_VALID_STR,PROG_VALID_COUNT,PROG_VALID_STR,RIT_VALID_COUNT,RIT_VALID_STR,INQUIRY_RATIO,DISCIPLINE_RATIO,ESS_VALID_RATIO,PROG_VALID_RATIO,RIT_VALID_RATIO
0,10019700000.0,http://www.maef.net/,,,,,,,,,...,-3.049363,18.0,-2.997944,51.0,-2.545622,0.007424,0.000446,0.000893,0.001005,0.002847
1,20000100000.0,https://education.alaska.gov/DOE_Rolodex/Schoo...,60.796131,-161.765194,167.0,0.0,0.0,0.0,0.0,0.0,...,,0.0,,0.0,,,,,,
2,20015000000.0,https://www.kgbsd.org/ketchikancharter,55.347001,-131.641191,74.0,37.0,2.0,5.0,4.0,5.0,...,-6.0,1.0,-3.123525,0.0,-6.0,0.003762,0.000752,0.0,0.000752,0.0
3,20015000000.0,http://www.tongassschool.org/,55.347001,-131.641191,57.0,12.0,4.0,6.0,1.0,11.0,...,-3.673297,5.0,-3.752448,31.0,-2.96001,0.009768,0.000177,0.000212,0.000177,0.001096
4,20018000000.0,https://aquarian.asdk12.org/,61.192407,-149.916872,10.0,11.0,6.0,19.0,2.0,51.0,...,-6.0,0.0,-6.0,0.0,-6.0,0.0,0.0,0.0,0.0,0.0
5,20018000000.0,https://education.alaska.gov/DOE_Rolodex/Schoo...,61.1981,-149.876,30.0,16.0,18.0,43.0,18.0,69.0,...,,0.0,,0.0,,,,,,
6,20018000000.0,,,,,,,,,,...,,0.0,,0.0,,,,,,
7,20018000000.0,,,,,,,,,,...,,0.0,,0.0,,,,,,
8,20018000000.0,http://www.winterberrycharterschool.com/,61.19445,-149.791641,15.0,8.0,5.0,14.0,1.0,26.0,...,-3.492173,10.0,-3.190948,34.0,-2.659441,0.009728,0.000193,0.000322,0.000644,0.002191
9,20018000000.0,http://www.asdk12.org/aboutschools/eagleacademy/,61.319213,-149.579442,2.0,3.0,8.0,6.0,0.0,18.0,...,-6.0,0.0,-6.0,0.0,-6.0,0.0,0.0,0.0,0.0,0.0


## List missing values

In [13]:
len(charterdf)

10965

In [22]:
keepvars = ['LEVEL', 'MEMBER', 'SE_T002_002', 'AGE', 'PCTETH', 'PCTFRL', 'PCTETH_SD', 'PCT_SE_T113_002', 'ESS_RATIO', 'PROG_RATIO', 'INQ_RATIO', 'DISC_RATIO', 'STABR', 'LEAID']
'''
'LAT1516',
 'LON1516',
 'AM',
 'AS',
 'BL',
 'HI',
 'HP',
 'TR','''
for var in keepvars:
    print(sum(charterdf[var].isnull()))

0
19
4076
19
43
22
4017
4017
1007
1007
1007
1007
0
3


In [55]:
sum(charterdf[(charterdf['MEMBER'].isnull()==True) | charterdf['MEMBER'] == 0 |
          (charterdf['AGE'].isnull()==True) | 
          (charterdf['BL'].isnull()==True) | (charterdf['TOTFRL'].isnull()==True) | 
          (charterdf['LEAID'].isnull()==True)]['URL'].notnull())

34

In [49]:
(charterdf[(charterdf['MEMBER'].isnull()==False) | charterdf['MEMBER'] != 0][['MEMBER', 'AGE', 'AM', 'AS', 'BL', 'HI', 'HP', 'TR', 'TOTFRL', 'LEAID']])

Unnamed: 0,MEMBER,AGE,AM,AS,BL,HI,HP,TR,TOTFRL,LEAID
1,170.0,16.0,167.0,0.0,0.0,0.0,0.0,0.0,149.0,200001.0
2,190.0,17.0,74.0,37.0,2.0,5.0,4.0,5.0,102.0,200150.0
3,169.0,12.0,57.0,12.0,4.0,6.0,1.0,11.0,88.0,200150.0
4,377.0,17.0,10.0,11.0,6.0,19.0,2.0,51.0,0.0,200180.0
5,503.0,17.0,30.0,16.0,18.0,43.0,18.0,69.0,0.0,200180.0
8,273.0,10.0,15.0,8.0,5.0,14.0,1.0,26.0,0.0,200180.0
9,176.0,10.0,2.0,3.0,8.0,6.0,0.0,18.0,0.0,200180.0
10,270.0,12.0,27.0,8.0,1.0,16.0,4.0,20.0,0.0,200180.0
11,163.0,12.0,8.0,9.0,10.0,13.0,7.0,23.0,0.0,200180.0
12,468.0,8.0,8.0,9.0,1.0,23.0,3.0,48.0,0.0,200180.0


In [56]:
missingdf = charterdf[(charterdf['MEMBER'].isnull()==True) | charterdf['MEMBER'] == 0 |
          (charterdf['AGE'].isnull()==True) | 
          (charterdf['BL'].isnull()==True) | (charterdf['TOTFRL'].isnull()==True) | 
          (charterdf['LEAID'].isnull()==True)][['MEMBER', 'AGE', 'AM', 'AS', 'BL', 'HI', 'HP', 'TR', 'TOTFRL', 'LEAID', 'URL']]

In [57]:
print(len(missingdf))
missingdf

57


Unnamed: 0,MEMBER,AGE,AM,AS,BL,HI,HP,TR,TOTFRL,LEAID,URL
96,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,400053.0,http://www.hamesa.com/
689,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,400418.0,
805,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,400751.0,http://concordiacharter.org/
887,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,400826.0,
990,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,400932.0,
1306,,0.0,,,,,,,,601426.0,http://www.pathwaysacademy.education/
1320,,0.0,,,,,,,,601436.0,http://www.olivegrovecharter.org/
1467,,-1.0,,,,,,,,609620.0,http://www.todaysfreshstart.org/
1656,,0.0,,,,,,,,619890.0,http://www.kingsvalleyhs.org/
1967,298.0,,0.0,0.0,0.0,298.0,0.0,0.0,288.0,622710.0,


In [25]:
np.all(charterdf[['MEMBER', 'AGE', 'PCTETH', 'PCTFRL', 'LEAID']])

False

In [18]:
# Filter to only open schools
filtered_STATUS = (charterdf['SY_STATUS15'] == 1) | (charterdf['SY_STATUS15'] == 3) | (charterdf['SY_STATUS15'] == 4) | (charterdf['SY_STATUS15'] == 5) | (charterdf['SY_STATUS15'] == 8)

charterdf = charterdf.loc[filtered_STATUS]
print(charterdf.shape)
charterdf.head()

(6947, 702)


Unnamed: 0,NCESSCH,URL,LAT1516,LON1516,AM,AS,BL,HI,HP,TR,...,ESS_VALID_STR,PROG_VALID_COUNT,PROG_VALID_STR,RIT_VALID_COUNT,RIT_VALID_STR,INQUIRY_RATIO,DISCIPLINE_RATIO,ESS_VALID_RATIO,PROG_VALID_RATIO,RIT_VALID_RATIO
1,20000100000.0,https://education.alaska.gov/DOE_Rolodex/Schoo...,60.796131,-161.765194,167.0,0.0,0.0,0.0,0.0,0.0,...,,0.0,,0.0,,,,,,
2,20015000000.0,https://www.kgbsd.org/ketchikancharter,55.347001,-131.641191,74.0,37.0,2.0,5.0,4.0,5.0,...,-6.0,1.0,-3.123525,0.0,-6.0,0.003762,0.000752,0.0,0.000752,0.0
3,20015000000.0,http://www.tongassschool.org/,55.347001,-131.641191,57.0,12.0,4.0,6.0,1.0,11.0,...,-3.673297,5.0,-3.752448,31.0,-2.96001,0.009768,0.000177,0.000212,0.000177,0.001096
4,20018000000.0,https://aquarian.asdk12.org/,61.192407,-149.916872,10.0,11.0,6.0,19.0,2.0,51.0,...,-6.0,0.0,-6.0,0.0,-6.0,0.0,0.0,0.0,0.0,0.0
5,20018000000.0,https://education.alaska.gov/DOE_Rolodex/Schoo...,61.1981,-149.876,30.0,16.0,18.0,43.0,18.0,69.0,...,,0.0,,0.0,,,,,,


In [19]:
charterdf[charterdf["LEAID"].astype(object).apply(len) > 0]

TypeError: object of type 'float' has no len()