In [1]:
# Seed Python's built-in `random` module for reproducibility.
# NOTE(review): this does not seed NumPy's generator (np.random); seed that
# separately if any later step relies on it.
import random
random.seed(493)

# for manipulating dataframes
import pandas as pd
import numpy as np

# for statistical testing
from scipy import stats

# for working with timestamps
from datetime import datetime
from dateutil.parser import parse

# for visualizations
%matplotlib inline
import matplotlib.pyplot as plt

# Echo the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Lift pandas display truncation: None means no limit on columns, rows or
# cell width, so keep previews small (e.g. .head()) to avoid flooding output
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Executive Summary

> **TODO:** Summarize the key findings here once exploration and modeling are complete.

### PLANNING

- [X] Planning
- [X] Acquire data
- [X] Prepare the data
- [ ] Explore the data
- [ ] Modeling

In [2]:
def show_missing(df):
    """
    Summarize missing-looking values per column of a dataframe.

    Three kinds of values are counted for every column:
      * nulls, as reported by ``isnull``
      * blank strings, i.e. '' or a single space ' '
      * the literal strings 'nan' or 'NaN'

    Returns a dataframe indexed by column name holding each count together
    with its percentage of the total number of rows.
    """
    total_rows = df.shape[0]

    def as_percentage(counts):
        # share of all rows, expressed on a 0-100 scale
        return (counts / total_rows) * 100

    nulls = df.isnull().sum()
    blanks = ((df == ' ') | (df == '')).sum()
    nan_strings = ((df == 'nan') | (df == 'NaN')).sum()

    return pd.DataFrame({
        'num_missing': nulls,
        'missing_percentage': as_percentage(nulls),
        'num_empty': blanks,
        'empty_percentage': as_percentage(blanks),
        'nan_count': nan_strings,
        'nan_percentage': as_percentage(nan_strings),
    })

In [3]:
def get_values(df, columns):
    """
    Print a value-count table for each requested column.

    For every label in ``columns`` this prints the label, a divider line,
    the column's value counts (missing values included), and then blank
    lines as spacing. Nothing is returned.
    """
    for name in columns:
        print(name, '=====================================', sep='\n')
        print(df[name].value_counts(dropna=False), end='\n\n\n')

def show_values(df, param):
    """
    Print value counts for all columns or for a chosen subset.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are summarized.
    param : str or iterable of column labels
        The string 'all' selects every column of ``df``. Any other string
        is treated as one column name. Anything else is passed through as
        an iterable of column labels.

    Returns nothing; the output is printed by ``get_values``.
    """
    if isinstance(param, str):
        # A bare string would otherwise be iterated character by character.
        columns = df.columns if param == 'all' else [param]
    else:
        # Not compared with 'all': an Index/array would give an elementwise
        # result whose truth value is ambiguous.
        columns = param
    get_values(df, columns)

In [4]:
def convert_datatype(df, column, totype):
    """
    Cast ``df[column]`` to ``totype`` and write the result back into ``df``.

    The frame is modified in place and the same object is returned, so the
    call can be used either for its side effect or in an assignment.
    """
    converted = df[column].astype(totype)
    df[column] = converted
    return df

### ACQUIRE DATA

In [5]:
# Load the three STAAR performance-level extracts (approaches / meets / masters)
# and the PEIMS expenditure table. Paths are relative to the notebook's
# working directory.
staar_approaches = pd.read_csv('../data/inter/02 - staar_performance_districts_approaches.csv')
staar_meets = pd.read_csv('../data/inter/02 - staar_performance_districts_meets.csv')
staar_masters = pd.read_csv('../data/inter/02 - staar_performance_districts_masters.csv')
peims = pd.read_csv('../data/inter/02 - peims_expenditures_etc_df.csv')

In [6]:
# Tag every row with the file it came from, so the label survives concatenation
staar_approaches = staar_approaches.assign(CLASS='approaches')
staar_meets = staar_meets.assign(CLASS='meets')
staar_masters = staar_masters.assign(CLASS='masters')

In [7]:
# Stack the three labeled tables into one frame. ignore_index=True assigns a
# fresh 0..n-1 index; without it each file's own index is kept and the row
# labels repeat once per file.
staar = pd.concat([staar_approaches, staar_meets, staar_masters], ignore_index=True)

In [8]:
# CLASS label -> binary target: only 'masters' counts as the positive class
values_dict = {'approaches': 0, 'meets': 0, 'masters': 1}

In [9]:
# Translate CLASS into the binary TARGET via values_dict. A label missing from
# the dict maps to NaN on its own, so no extra fillna(np.nan) is needed.
staar['TARGET'] = staar['CLASS'].map(values_dict)

In [10]:
# CLASS is no longer needed once TARGET has been derived from it
staar = staar.drop(columns=['CLASS'])

In [11]:
# Preview the combined STAAR rows together with the new TARGET column
staar.head()

Unnamed: 0,release_year,test_year,district,numerator,denominator,rate,TARGET
0,2013,2012,1902,1030,1219,84.495488,0
1,2013,2012,1903,1796,2279,78.806494,0
2,2013,2012,1904,987,1243,79.404666,0
3,2013,2012,1906,649,762,85.170604,0
4,2013,2012,1907,3817,5396,70.737583,0


In [12]:
# (rows, columns) of both tables before the preparation steps
staar.shape
peims.shape

(19201, 7)

(18213, 116)

In [13]:
# Transposed previews: one line per column, so the wide PEIMS table reads top to bottom
staar.head().T
peims.head().T

Unnamed: 0,0,1,2,3,4
release_year,2013.0,2013.0,2013.0,2013.0,2013.0
test_year,2012.0,2012.0,2012.0,2012.0,2012.0
district,1902.0,1903.0,1904.0,1906.0,1907.0
numerator,1030.0,1796.0,987.0,649.0,3817.0
denominator,1219.0,2279.0,1243.0,762.0,5396.0
rate,84.495488,78.806494,79.404666,85.170604,70.737583
TARGET,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,0,1,2,3,4
DISTRICT NUMBER,1902,1902,1902,1902,1902
DISTRICT NAME,CAYUGA ISD,CAYUGA ISD,CAYUGA ISD,CAYUGA ISD,CAYUGA ISD
YEAR,2007,2008,2009,2010,2011
GEN FUNDS-TOTAL PAYROLL EXPENDITURES,3405211,3729195,3952356,4177424,4102439
ALL FUNDS-TOTAL PAYROLL EXPENDITURES,3611026,5509091,5716404,6152112,6134967
GEN FUNDS-TOTAL PROFESSIONAL & CONTRACTED SERVICES EXPENDITURES,444063,463617,472396,277583,283364
ALL FUNDS-TOTAL PROFESSIONAL & CONTRACTED SERVICES EXPENDITURES,456042,934387,941931,964779,984742
GEN FUNDS-TOTAL SUPPLIES & MATERIALS EXPENDITURES,287153,323340,341296,391572,311149
ALL FUNDS-TOTAL SUPPLIES & MATERIALS EXPENDITURES,492377,752505,754588,1117922,973794
GEN FUNDS-TOTAL OTHER OPERATING EXPENDITURES,128670,137667,165611,170952,154614


### PREPARATION

In [14]:
# Cast the test year to string; it becomes the YEAR merge key further down
staar = convert_datatype(staar, 'test_year', 'str')

In [15]:
# Cast the district number to string so it can be zero-padded and used as a merge key
staar = convert_datatype(staar, 'district', 'str')

In [16]:
# Same string cast for the PEIMS year key, so both sides of the merge share a dtype
peims = convert_datatype(peims, 'YEAR', 'str')

In [17]:
# Same string cast for the PEIMS district key
peims = convert_datatype(peims, 'DISTRICT NUMBER', 'str')

# Left-pad both district keys with zeros to six characters so they share one format
staar['district'] = staar['district'].str.zfill(6)
peims['DISTRICT NUMBER'] = peims['DISTRICT NUMBER'].str.zfill(6)

In [18]:
# Remove the enrollment column by reassignment rather than in-place mutation
peims = peims.drop(columns=['FALL SURVEY ENROLLMENT'])

In [19]:
# Check the PEIMS keys after the string cast and zero-padding
peims.head().T

Unnamed: 0,0,1,2,3,4
DISTRICT NUMBER,001902,001902,001902,001902,001902
DISTRICT NAME,CAYUGA ISD,CAYUGA ISD,CAYUGA ISD,CAYUGA ISD,CAYUGA ISD
YEAR,2007,2008,2009,2010,2011
GEN FUNDS-TOTAL PAYROLL EXPENDITURES,3405211,3729195,3952356,4177424,4102439
ALL FUNDS-TOTAL PAYROLL EXPENDITURES,3611026,5509091,5716404,6152112,6134967
GEN FUNDS-TOTAL PROFESSIONAL & CONTRACTED SERVICES EXPENDITURES,444063,463617,472396,277583,283364
ALL FUNDS-TOTAL PROFESSIONAL & CONTRACTED SERVICES EXPENDITURES,456042,934387,941931,964779,984742
GEN FUNDS-TOTAL SUPPLIES & MATERIALS EXPENDITURES,287153,323340,341296,391572,311149
ALL FUNDS-TOTAL SUPPLIES & MATERIALS EXPENDITURES,492377,752505,754588,1117922,973794
GEN FUNDS-TOTAL OTHER OPERATING EXPENDITURES,128670,137667,165611,170952,154614


In [20]:
# Check the STAAR keys after the string cast and zero-padding
staar.head().T

Unnamed: 0,0,1,2,3,4
release_year,2013.0,2013.0,2013.0,2013.0,2013.0
test_year,2012.0,2012.0,2012.0,2012.0,2012.0
district,1902.0,1903.0,1904.0,1906.0,1907.0
numerator,1030.0,1796.0,987.0,649.0,3817.0
denominator,1219.0,2279.0,1243.0,762.0,5396.0
rate,84.495488,78.806494,79.404666,85.170604,70.737583
TARGET,0.0,0.0,0.0,0.0,0.0


In [21]:
# Align the STAAR key names with the PEIMS column names ahead of the merge
staar = staar.rename(columns={'district': 'DISTRICT NUMBER', 'test_year': 'YEAR'})

In [22]:
# Inner join (the pd.merge default): only district/year pairs present in both
# tables are kept
df = staar.merge(peims, how='inner', on=['DISTRICT NUMBER', 'YEAR'])

In [23]:
# Preview the merged frame
df.head().T

Unnamed: 0,0,1,2,3,4
release_year,2013,2013,2013,2013,2013
YEAR,2012,2012,2012,2012,2012
DISTRICT NUMBER,001902,001902,001902,001903,001903
numerator,1030,471,471,1796,697
denominator,1219,1219,1219,2279,2279
rate,84.495488,38.638228,38.638228,78.806494,30.583589
TARGET,0,0,1,0,0
DISTRICT NAME,CAYUGA ISD,CAYUGA ISD,CAYUGA ISD,ELKHART ISD,ELKHART ISD
GEN FUNDS-TOTAL PAYROLL EXPENDITURES,3941590,3941590,3941590,6598154,6598154
ALL FUNDS-TOTAL PAYROLL EXPENDITURES,5962051,5962051,5962051,7092906,7092906


In [24]:
# Discard the release year, the raw counts behind `rate`, and the district name
df = df.drop(columns=['release_year', 'numerator', 'denominator', 'DISTRICT NAME'])

In [25]:
# Preview after the column drop above
df.head().T

Unnamed: 0,0,1,2,3,4
YEAR,2012.0,2012.0,2012.0,2012.0,2012.0
DISTRICT NUMBER,1902.0,1902.0,1902.0,1903.0,1903.0
rate,84.495488,38.638228,38.638228,78.806494,30.583589
TARGET,0.0,0.0,1.0,0.0,0.0
GEN FUNDS-TOTAL PAYROLL EXPENDITURES,3941590.0,3941590.0,3941590.0,6598154.0,6598154.0
ALL FUNDS-TOTAL PAYROLL EXPENDITURES,5962051.0,5962051.0,5962051.0,7092906.0,7092906.0
GEN FUNDS-TOTAL PROFESSIONAL & CONTRACTED SERVICES EXPENDITURES,410043.0,410043.0,410043.0,1112135.0,1112135.0
ALL FUNDS-TOTAL PROFESSIONAL & CONTRACTED SERVICES EXPENDITURES,874882.0,874882.0,874882.0,1536657.0,1536657.0
GEN FUNDS-TOTAL SUPPLIES & MATERIALS EXPENDITURES,518893.0,518893.0,518893.0,525857.0,525857.0
ALL FUNDS-TOTAL SUPPLIES & MATERIALS EXPENDITURES,846218.0,846218.0,846218.0,700794.0,700794.0


In [26]:
# Shorten the district key and upper-case `rate` to match the other column names
df = df.rename(columns={'DISTRICT NUMBER': 'DISTRICT', 'rate': 'RATE'})

In [27]:
# Confirm the renamed DISTRICT and RATE columns
df.head().T

Unnamed: 0,0,1,2,3,4
YEAR,2012.0,2012.0,2012.0,2012.0,2012.0
DISTRICT,1902.0,1902.0,1902.0,1903.0,1903.0
RATE,84.495488,38.638228,38.638228,78.806494,30.583589
TARGET,0.0,0.0,1.0,0.0,0.0
GEN FUNDS-TOTAL PAYROLL EXPENDITURES,3941590.0,3941590.0,3941590.0,6598154.0,6598154.0
ALL FUNDS-TOTAL PAYROLL EXPENDITURES,5962051.0,5962051.0,5962051.0,7092906.0,7092906.0
GEN FUNDS-TOTAL PROFESSIONAL & CONTRACTED SERVICES EXPENDITURES,410043.0,410043.0,410043.0,1112135.0,1112135.0
ALL FUNDS-TOTAL PROFESSIONAL & CONTRACTED SERVICES EXPENDITURES,874882.0,874882.0,874882.0,1536657.0,1536657.0
GEN FUNDS-TOTAL SUPPLIES & MATERIALS EXPENDITURES,518893.0,518893.0,518893.0,525857.0,525857.0
ALL FUNDS-TOTAL SUPPLIES & MATERIALS EXPENDITURES,846218.0,846218.0,846218.0,700794.0,700794.0


In [28]:
# Count merged rows per test year (missing values included)
show_values(df, ['YEAR'])

YEAR
2012    3639
2019    3590
2018    3576
2017    3543
2014    1218
2013    1216
2015    1203
2016    1195
Name: YEAR, dtype: int64




In [29]:
# Keep only the 2019 rows. YEAR was cast to string earlier, hence the string literal
df = df.loc[df['YEAR'] == '2019']

In [30]:
# Rows and columns remaining after the 2019 filter
df.shape

(3590, 116)

In [31]:
# Inspect column dtypes of the filtered frame
df.dtypes

YEAR                                                                object
DISTRICT                                                            object
RATE                                                               float64
TARGET                                                               int64
GEN FUNDS-TOTAL PAYROLL EXPENDITURES                                 int64
ALL FUNDS-TOTAL PAYROLL EXPENDITURES                                 int64
GEN FUNDS-TOTAL PROFESSIONAL & CONTRACTED SERVICES EXPENDITURES      int64
ALL FUNDS-TOTAL PROFESSIONAL & CONTRACTED SERVICES EXPENDITURES      int64
GEN FUNDS-TOTAL SUPPLIES & MATERIALS EXPENDITURES                    int64
ALL FUNDS-TOTAL SUPPLIES & MATERIALS EXPENDITURES                    int64
GEN FUNDS-TOTAL OTHER OPERATING EXPENDITURES                         int64
ALL FUNDS-TOTAL OTHER OPERATING EXPENDITURES                         int64
GEN FUNDS-TOTAL OPERATING EXPENDITURES BY OBJ                        int64
ALL FUNDS-TOTAL OPERATING

In [32]:
 df.to_csv('../data/inter/03 - combined_peims_staar_2019.csv', index=False)

In [33]:
# Shape of the frame that was just written to disk
df.shape

(3590, 116)

In [34]:
# Preview the 2019 rows; the index was not reset, so the original row labels remain
df.head().T

Unnamed: 0,15590,15591,15592,15593,15594
YEAR,2019.0,2019.0,2019.0,2019.0,2019.0
DISTRICT,1902.0,1902.0,1902.0,1903.0,1903.0
RATE,84.549356,61.480687,61.480687,83.927675,56.403817
TARGET,0.0,0.0,1.0,0.0,0.0
GEN FUNDS-TOTAL PAYROLL EXPENDITURES,4333355.0,4333355.0,4333355.0,8766303.0,8766303.0
ALL FUNDS-TOTAL PAYROLL EXPENDITURES,6025217.0,6025217.0,6025217.0,9093950.0,9093950.0
GEN FUNDS-TOTAL PROFESSIONAL & CONTRACTED SERVICES EXPENDITURES,598412.0,598412.0,598412.0,867517.0,867517.0
ALL FUNDS-TOTAL PROFESSIONAL & CONTRACTED SERVICES EXPENDITURES,1075904.0,1075904.0,1075904.0,1514689.0,1514689.0
GEN FUNDS-TOTAL SUPPLIES & MATERIALS EXPENDITURES,303944.0,303944.0,303944.0,451392.0,451392.0
ALL FUNDS-TOTAL SUPPLIES & MATERIALS EXPENDITURES,648206.0,648206.0,648206.0,784631.0,784631.0


In [35]:
# Work on an independent copy so the column trimming below does not alter df
dfx = df.copy()

In [36]:
# Keep every column whose name does not begin with "gen" (case-insensitive)
columns_to_keep = [name for name in dfx.columns if not name.lower().startswith('gen')]

dfx = dfx[columns_to_keep]

In [37]:
# Column count after removing the "gen"-prefixed columns
dfx.shape

(3590, 60)

In [38]:
# Preview the remaining columns
dfx.head().T

Unnamed: 0,15590,15591,15592,15593,15594
YEAR,2019.0,2019.0,2019.0,2019.0,2019.0
DISTRICT,1902.0,1902.0,1902.0,1903.0,1903.0
RATE,84.549356,61.480687,61.480687,83.927675,56.403817
TARGET,0.0,0.0,1.0,0.0,0.0
ALL FUNDS-TOTAL PAYROLL EXPENDITURES,6025217.0,6025217.0,6025217.0,9093950.0,9093950.0
ALL FUNDS-TOTAL PROFESSIONAL & CONTRACTED SERVICES EXPENDITURES,1075904.0,1075904.0,1075904.0,1514689.0,1514689.0
ALL FUNDS-TOTAL SUPPLIES & MATERIALS EXPENDITURES,648206.0,648206.0,648206.0,784631.0,784631.0
ALL FUNDS-TOTAL OTHER OPERATING EXPENDITURES,809559.0,809559.0,809559.0,303052.0,303052.0
ALL FUNDS-TOTAL OPERATING EXPENDITURES BY OBJ,8558886.0,8558886.0,8558886.0,11696322.0,11696322.0
ALL FUNDS-TOTAL DEBT SERVICE EXPEND BY OBJ,0.0,0.0,0.0,1034420.0,1034420.0


In [39]:
# Drop the contiguous run of columns between the two labels; a .loc label slice
# includes both endpoints
dfx = dfx.drop(columns=dfx.loc[:, 'ALL FUNDS-TOTAL OPERATING EXPENDITURES BY OBJ':'ALL FUNDS-TOTAL NON-OPER EXPENDITURES BY OBJ'].columns)

In [40]:
# Preview after the column-range drop above
dfx.head().T

Unnamed: 0,15590,15591,15592,15593,15594
YEAR,2019.0,2019.0,2019.0,2019.0,2019.0
DISTRICT,1902.0,1902.0,1902.0,1903.0,1903.0
RATE,84.549356,61.480687,61.480687,83.927675,56.403817
TARGET,0.0,0.0,1.0,0.0,0.0
ALL FUNDS-TOTAL PAYROLL EXPENDITURES,6025217.0,6025217.0,6025217.0,9093950.0,9093950.0
ALL FUNDS-TOTAL PROFESSIONAL & CONTRACTED SERVICES EXPENDITURES,1075904.0,1075904.0,1075904.0,1514689.0,1514689.0
ALL FUNDS-TOTAL SUPPLIES & MATERIALS EXPENDITURES,648206.0,648206.0,648206.0,784631.0,784631.0
ALL FUNDS-TOTAL OTHER OPERATING EXPENDITURES,809559.0,809559.0,809559.0,303052.0,303052.0
ALL FUNDS-TOTAL NON-OPER AND OPER OEXPENDITURES BY OBJ,8985224.0,8985224.0,8985224.0,12905901.0,12905901.0
"ALL FUNDS-INSTRUCTION + TRANSFER EXPEND-FCT11,95",4649118.0,4649118.0,4649118.0,7043892.0,7043892.0


In [41]:
# Drop the non-operating "BY FUNCTION" columns from the (65XX) label through the
# (66XX) label, both endpoints included
dfx = dfx.drop(columns=dfx.loc[:, 'ALL FUNDS-NON-OPER EXP BY FUNCTION(1X-9X)(65XX)':'ALL FUNDS-NON-OPER EXP BY FUNCTION(1X-9X)(66XX)'].columns)

In [42]:
# Preview after the column-range drop above
dfx.head().T

Unnamed: 0,15590,15591,15592,15593,15594
YEAR,2019.0,2019.0,2019.0,2019.0,2019.0
DISTRICT,1902.0,1902.0,1902.0,1903.0,1903.0
RATE,84.549356,61.480687,61.480687,83.927675,56.403817
TARGET,0.0,0.0,1.0,0.0,0.0
ALL FUNDS-TOTAL PAYROLL EXPENDITURES,6025217.0,6025217.0,6025217.0,9093950.0,9093950.0
ALL FUNDS-TOTAL PROFESSIONAL & CONTRACTED SERVICES EXPENDITURES,1075904.0,1075904.0,1075904.0,1514689.0,1514689.0
ALL FUNDS-TOTAL SUPPLIES & MATERIALS EXPENDITURES,648206.0,648206.0,648206.0,784631.0,784631.0
ALL FUNDS-TOTAL OTHER OPERATING EXPENDITURES,809559.0,809559.0,809559.0,303052.0,303052.0
ALL FUNDS-TOTAL NON-OPER AND OPER OEXPENDITURES BY OBJ,8985224.0,8985224.0,8985224.0,12905901.0,12905901.0
"ALL FUNDS-INSTRUCTION + TRANSFER EXPEND-FCT11,95",4649118.0,4649118.0,4649118.0,7043892.0,7043892.0


In [43]:
# Drop the prekindergarten expenditure columns from the BILINGUAL label through
# the SPECIAL ED label, both endpoints included
dfx = dfx.drop(columns=dfx.loc[:, 'ALL FUNDS-PREKINDERGARTEN EXPEND BILINGUAL--32':'ALL FUNDS-PREKINDERGARTEN EXPEND SPECIAL ED--32'].columns)

In [44]:
# Preview after the column-range drop above
dfx.head().T

Unnamed: 0,15590,15591,15592,15593,15594
YEAR,2019.0,2019.0,2019.0,2019.0,2019.0
DISTRICT,1902.0,1902.0,1902.0,1903.0,1903.0
RATE,84.549356,61.480687,61.480687,83.927675,56.403817
TARGET,0.0,0.0,1.0,0.0,0.0
ALL FUNDS-TOTAL PAYROLL EXPENDITURES,6025217.0,6025217.0,6025217.0,9093950.0,9093950.0
ALL FUNDS-TOTAL PROFESSIONAL & CONTRACTED SERVICES EXPENDITURES,1075904.0,1075904.0,1075904.0,1514689.0,1514689.0
ALL FUNDS-TOTAL SUPPLIES & MATERIALS EXPENDITURES,648206.0,648206.0,648206.0,784631.0,784631.0
ALL FUNDS-TOTAL OTHER OPERATING EXPENDITURES,809559.0,809559.0,809559.0,303052.0,303052.0
ALL FUNDS-TOTAL NON-OPER AND OPER OEXPENDITURES BY OBJ,8985224.0,8985224.0,8985224.0,12905901.0,12905901.0
"ALL FUNDS-INSTRUCTION + TRANSFER EXPEND-FCT11,95",4649118.0,4649118.0,4649118.0,7043892.0,7043892.0


In [45]:
# Drop everything from the first "BY PIC" column through TOTAL DISBURSEMENTS,
# both endpoints included
dfx = dfx.drop(columns=dfx.loc[:, 'ALL FUNDS-NON OPER EXP BY PIC(65XX)':'ALL FUNDS-TOTAL DISBURSEMENTS'].columns)

In [46]:
# Preview after the column-range drop above
dfx.head().T

Unnamed: 0,15590,15591,15592,15593,15594
YEAR,2019.0,2019.0,2019.0,2019.0,2019.0
DISTRICT,1902.0,1902.0,1902.0,1903.0,1903.0
RATE,84.549356,61.480687,61.480687,83.927675,56.403817
TARGET,0.0,0.0,1.0,0.0,0.0
ALL FUNDS-TOTAL PAYROLL EXPENDITURES,6025217.0,6025217.0,6025217.0,9093950.0,9093950.0
ALL FUNDS-TOTAL PROFESSIONAL & CONTRACTED SERVICES EXPENDITURES,1075904.0,1075904.0,1075904.0,1514689.0,1514689.0
ALL FUNDS-TOTAL SUPPLIES & MATERIALS EXPENDITURES,648206.0,648206.0,648206.0,784631.0,784631.0
ALL FUNDS-TOTAL OTHER OPERATING EXPENDITURES,809559.0,809559.0,809559.0,303052.0,303052.0
ALL FUNDS-TOTAL NON-OPER AND OPER OEXPENDITURES BY OBJ,8985224.0,8985224.0,8985224.0,12905901.0,12905901.0
"ALL FUNDS-INSTRUCTION + TRANSFER EXPEND-FCT11,95",4649118.0,4649118.0,4649118.0,7043892.0,7043892.0


In [47]:
# Two combined operating / non-operating total columns (per their names) that
# the next cell removes
columns_to_remove = [
    'ALL FUNDS-TOTAL NON-OPER AND OPER OEXPENDITURES BY OBJ',
    'ALL FUNDS-TOT OPER AND NON-OPER EXP BY FUNCTION',
]

In [48]:
# Remove the listed columns; drop raises KeyError if a label is absent
dfx = dfx.drop(columns=columns_to_remove)

In [49]:
# Shape once the column trimming above is done
dfx.shape

(3590, 37)

In [50]:
# Preview the trimmed feature set
dfx.head().T

Unnamed: 0,15590,15591,15592,15593,15594
YEAR,2019.0,2019.0,2019.0,2019.0,2019.0
DISTRICT,1902.0,1902.0,1902.0,1903.0,1903.0
RATE,84.549356,61.480687,61.480687,83.927675,56.403817
TARGET,0.0,0.0,1.0,0.0,0.0
ALL FUNDS-TOTAL PAYROLL EXPENDITURES,6025217.0,6025217.0,6025217.0,9093950.0,9093950.0
ALL FUNDS-TOTAL PROFESSIONAL & CONTRACTED SERVICES EXPENDITURES,1075904.0,1075904.0,1075904.0,1514689.0,1514689.0
ALL FUNDS-TOTAL SUPPLIES & MATERIALS EXPENDITURES,648206.0,648206.0,648206.0,784631.0,784631.0
ALL FUNDS-TOTAL OTHER OPERATING EXPENDITURES,809559.0,809559.0,809559.0,303052.0,303052.0
"ALL FUNDS-INSTRUCTION + TRANSFER EXPEND-FCT11,95",4649118.0,4649118.0,4649118.0,7043892.0,7043892.0
"ALL FUNDS-INSTRUC RESOURCE MEDIA SERVICE, FCT12",66490.0,66490.0,66490.0,117860.0,117860.0


In [51]:
# Remove RATE by reassignment rather than in-place mutation; TARGET stays in the frame
dfx = dfx.drop(columns=['RATE'])

In [52]:
# Save the trimmed 2019 frame (RATE already removed) without the index column
dfx.to_csv('../data/inter/03 - trimmed_combined_peims_staar_2019.csv', index=False)