# Filter public/charter school data into team-specific data sets

Author(s): Jaren Haber<br>
Project Manager: Jaren Haber, PhD Candidate <br>
Contact: jhaber@berkeley.edu

Institution: University of California, Berkeley <br>
Program: Undergraduate Research Apprentice Program (URAP) <br>

Date created: Nov. 27, 2018<br>
Last modified: Nov. 27, 2018

Description: 

## Initialize

### Import packages

In [13]:
import pandas as pd # For working with DataFrames
import gc # For speeding up loading pickle files ('gc' = 'garbage collector')
import ast # For working with strings
import numpy as np # For numerical things
import re # For cleaning webtext
import _pickle as cPickle # Optimized version of pickle
import gc # For managing garbage collector

### Define file paths

In [14]:
# Input files:
charters_path = "../../nowdata/charters_2015.pkl"
pubschools_path = "../../nowdata/pubschools_2015.pkl"

charters_small_loc = "../../nowdata/backups/charters_parsed_03-04_no-text_SMALL.csv"
ACSsmall_loc = "../data/ACS_2016_sd-merged_SMALL.csv"

In [15]:
# Output files:
charters_storepath = "../../nowdata/backups/charters_full_2015_250_v2a.pkl"
pubschools_storepath = "../../nowdata/backups/pubschools_full_2015_CRDC.pkl"

geo_storepath = "../../nowdata/backups/charters_geo_2015_v2a.csv"
stats_storepath = "../../nowdata/backups/charters_stats_2015_v2a.csv"

### Define helper functions

In [16]:
def convert_df(df, ignore_list):
    """Makes a Pandas DataFrame more memory-efficient through intelligent use of Pandas data types: 
    specifically, by storing columns with repetitive Python strings not with the object dtype for unique values 
    (entirely stored in memory) but as categoricals, which are represented by repeated integer values. This is a 
    net gain in memory when the reduced memory size of the category type outweighs the added memory cost of storing 
    one more thing. As such, this function checks the degree of redundancy for a given column before converting it."""
    
    # Remove specified columns to avoid conversion errors, those that shouldn't have their dtype converted
    # e.g., columns that are large lists of tuples, like "WEBTEXT" or "CMO_WEBTEXT", should stay as 'object' dtype
    if len(ignore_list)>0:
        ignore_df = df[ignore_list]
        df.drop(ignore_list, axis=1, inplace=True)
    
    converted_df = pd.DataFrame() # Initialize DF for memory-efficient storage of strings (object types)
    df_obj = df.select_dtypes(include=['object']).copy() # Filter to only those columns of object data type

    # Loop through all columns that have 'object' dtype, b/c we especially want to convert these if possible:
    for col in df.columns: 
        if col in df_obj: 
            num_unique_values = len(df_obj[col].unique())
            num_total_values = len(df_obj[col])
            if (num_unique_values / num_total_values) < 0.5: # Only convert data types if at least half of values are duplicates
                converted_df.loc[:,col] = df[col].astype('category') # Store these columns as dtype "category"
            else: 
                converted_df.loc[:,col] = df[col]
        else:    
            converted_df.loc[:,col] = df[col]
                      
    # Downcast dtype to reduce memory drain
    converted_df.select_dtypes(include=['float']).apply(pd.to_numeric,downcast='float')
    converted_df.select_dtypes(include=['int']).apply(pd.to_numeric,downcast='signed')
    
    # Reintroduce ignored columns into resulting DF
    if len(ignore_list)>0:
        for col in ignore_list:
            converted_df[col] = ignore_df[col]
    
    return converted_df

In [17]:
def quickpickle_load(picklepath):
    '''Very time-efficient way to load pickle-formatted objects into Python.
    Uses C-based pickle (cPickle) and gc workarounds to facilitate speed. 
    Input: Filepath to pickled (*.pkl) object.
    Output: Python object (probably a list of sentences or something similar).'''

    with open(picklepath, 'rb') as loadfile:
        
        gc.disable() # disable garbage collector
        outputvar = cPickle.load(loadfile) # Load from picklepath into outputvar
        gc.enable() # enable garbage collector again
    
    return outputvar

In [18]:
def quickpickle_dump(dumpvar, picklepath):
    '''Very time-efficient way to dump pickle-formatted objects from Python.
    Uses C-based pickle (cPickle) and gc workarounds to facilitate speed. 
    Input: Python object (probably a list of sentences or something similar).
    Output: Filepath to pickled (*.pkl) object.'''

    with open(picklepath, 'wb') as destfile:
        
        gc.disable() # disable garbage collector
        cPickle.dump(dumpvar, destfile) # Dump dumpvar to picklepath
        gc.enable() # enable garbage collector again

In [58]:
def check_df(DF, colname):
    """Displays basic info about a dataframe in memory.
    Input: Pandas DataFrame object
    Output: printed basic stats:    # rows and columns, 
                                    # duplicates by colname, 
                                    column names and, if missing data, the # missing cases."""
    
    # Show DF info, including # duplicates by colname
    print("# rows and cols: ", str(DF.shape))
    print("# duplicates by " + str(colname) + ": " + str(sum(DF.duplicated(subset=colname, keep='first'))))

    print("\nColumns and # missing cases (if any): ")
    for col in list(DF):
        missed = sum(DF[colname].isnull())
        if missed > 0:
            print(col + ": " + str(missed) + " missing")
        else:
            print(col)
    
    #print("\nALL column names: ", list(DF))

In [70]:
def load_filtered_df(dfpath, keepcols):
    """Quickly loads a Pandas DataFrame from file (either .csv or .pkl format), 
    keeps only those variables in keepvars (if not an empty list), and makes the DF memory-efficient.
    Input: file path to DataFrame (.csv or .pkl), list of variables to keep from said DF (or empty list, to keep all cols)
    Output: DF with reduced variables and with memory-efficient dtypes."""
    
    if len(keepcols)>0:
        if dfpath.endswith(".csv"):
            newdf = pd.read_csv(dfpath, usecols=keepcols, low_memory=False)
        elif dfpath.endswith(".pkl"):
            newdf = quickpickle_load(dfpath)
            newdf = newdf[keepcols]
            
    else:
        if dfpath.endswith(".csv"):
            newdf = pd.read_csv(dfpath, low_memory=False)
        elif dfpath.endswith(".pkl"):
            newdf = quickpickle_load(dfpath)
    
    if "WEBTEXT" in list(newdf) or "CMO_WEBTEXT" in list(newdf):
        newdf = convert_df(newdf, ["WEBTEXT", "CMO_WEBTEXT"])
    else:
        newdf = convert_df(newdf, [])
    
    if "NCESSCH" in list(newdf):
        newdf["NCESSCH"] = newdf["NCESSCH"].astype(float)
        check_df(newdf, "NCESSCH")
    
    return newdf

In [107]:
def density_calc(somedf, largedf=None, groupvar, uniqueid):
    """Calculates total number of entities (rows) in a given DataFrame that share a given clustering/group variable.
    Uses uniqueid to identify number of independent entities. Finally merges the density with the given DF.
    Useful for calculating the density of charter/public schools in a given school district.
    Input: DataFrame, variable to group by, unique IDs (for each entity), variable name for density.
    Output: DataFrame with density variable added."""

    # Keep only relevant variables from somedf for finding density
    grouped = somedf[[groupvar, uniqueid]]
    
    # TO DO: Use pubdf (as largedf) to calculate density of both public schools and charter schools in somedf
    
    # Generate 2-element DF grouped by groupvar, identifying distinct entities using uniqueid:
    grouped = grouped.groupby([groupvar])[uniqueid].count().reset_index(name="Number_entities")
    
    # Merge density column into original DF
    densitycol = pd.merge(somedf, grouped, how='outer', on=[groupvar])["Number_entities"]
    
    #new_frame[densityvar] = new_frame['All_school_counts']/merge_frame[("Area (Land)", "Geo_AREALAND")]
    
    return densitycol

In [21]:
def write_list(file_path, textlist):
    """Writes textlist to file_path.
    Input: Path to file, list of strings
    Output: Nothing (saved to disk)"""
    
    with open(file_path, 'w') as file_handler:
        
        for elem in textlist:
            file_handler.write("{}\n".format(elem))
    
    return    

In [22]:
def load_list(file_path):
    """Loads list into memory. Must be assigned to object.
    Input: Path to file
    Output: List object"""
    
    textlist = []
    with open(file_path) as file_handler:
        line = file_handler.readline()
        while line:
            textlist.append(line)
            line = file_handler.readline()
    return textlist

### Load data

In [72]:
charters_smalldf = load_filtered_df(charters_small_loc, [])

# rows and cols:  (6972, 32)
# duplicates by NCESSCH: 0

Columns and # missing cases (if any): 
STATENAME
LEAID
LEA_NAME
NCESSCH
SCH_NAME
TOTFRL
AM
AS
BL
HI
HP
MEMBER
TR
TITLEI
FTE
YEAR_OPENED
YEAR_CLOSED
LATCODE
LONGCODE
LOCALE
ALL_MTH00PCTPROF_1415
ALL_RLA00PCTPROF_1415
ADDRESS14
ess_count
prog_count
rit_count
ess_strength
prog_strength
AGE
PCTETH
PLACE
TOTETH


In [77]:
# Define variables to keep from big data set
keepvars = ['LEVEL', 'MEMBER', 'SE_T002_002', 'AGE', 'PCTETH', 'PCTFRL', 'PCTETH_SD', 'PCT_SE_T113_002', 
            'ESS_VALID_RATIO', 'PROG_VALID_RATIO', 'INQUIRY_RATIO', 'DISCIPLINE_RATIO', 'STABR', 'LEAID', 'GEO_LEAID',
            'NCESSCH']

In [114]:
# Load full public school data set - just the cols needed to count density
pubdf_small = load_filtered_df(pubschools_path, ["NCESSCH", "GEO_LEAID"])

# rows and cols:  (136825, 2)
# duplicates by NCESSCH: 0

Columns and # missing cases (if any): 
NCESSCH
GEO_LEAID


In [78]:
# Load and filter charter data set
charterdf = load_filtered_df(charters_path, keepvars)

# rows and cols:  (10965, 16)
# duplicates by NCESSCH: 0

Columns and # missing cases (if any): 
LEVEL
MEMBER
SE_T002_002
AGE
PCTETH
PCTFRL
PCTETH_SD
PCT_SE_T113_002
ESS_VALID_RATIO
PROG_VALID_RATIO
INQUIRY_RATIO
DISCIPLINE_RATIO
STABR
LEAID
GEO_LEAID
NCESSCH


In [111]:
# Create new variable with density of charter schools
charterdf["CHARTER_DENSITY"] = density_calc(charterdf, pubdf_small, "GEO_LEAID", "NCESSCH")

In [None]:
# Drop identifier (not needed for analysis)
charterdf.drop(columns="NCESSCH", axis=1, inplace=True)

In [None]:
# Remove missing vars


In [None]:
'''# Rename variables
read_charterdf = charterdf.rename(
    index=str, columns={'LEVEL':, 'MEMBER':'Student_count', 'SE_T002_002':, 'AGE':'School_age', 
                        'PCTETH':, 'PCTFRL':, 'PCTETH_SD':, 'PCT_SE_T113_002':, 
                        'ESS_VALID_RATIO':, 'PROG_VALID_RATIO':, 'INQUIRY_RATIO':'IBL_emphasis',
                        'DISCIPLINE_RATIO':'Discipline_emphasis', 'STABR':'State', 'LEAID':'SD_formal', 
                        'GEO_LEAID':'SD_geog'})
                        '''

## Save data to file

In [None]:
# Save data to disk
charterdf.to_csv(stats_storepath, index=False)

In [None]:
#quickpickle_dump(charterdf, charters_storepath)

In [None]:
#df1.to_csv("../../nowdata/schooldf_filtered_Nov18.csv")