In [1]:
# setting the random seed for reproducibility
import random
random.seed(493)

# for manipulating dataframes
import pandas as pd
import numpy as np

# for statistical testing
from scipy import stats

# for working with timestamps
from datetime import datetime
from dateutil.parser import parse

# for visualizations
%matplotlib inline
import matplotlib.pyplot as plt

# to print out all the outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Executive Summary

> **Something goes here.**

### PLANNING

- [X] Planning
- [X] Acquire data
- [ ] Prepare the data
- [ ] Explore the data
- [ ] Modeling

In [2]:
def show_missing(df):
    """
    Takes a dataframe and returns a dataframe with stats
    on missing and null values with their percentages.
    """
    null_count = df.isnull().sum()
    null_percentage = (null_count / df.shape[0]) * 100
    empty_count = pd.Series(((df == ' ') | (df == '')).sum())
    empty_percentage = (empty_count / df.shape[0]) * 100
    nan_count = pd.Series(((df == 'nan') | (df == 'NaN')).sum())
    nan_percentage = (nan_count / df.shape[0]) * 100
    dfx = pd.DataFrame({'num_missing': null_count, 'missing_percentage': null_percentage,
                         'num_empty': empty_count, 'empty_percentage': empty_percentage,
                         'nan_count': nan_count, 'nan_percentage': nan_percentage})
    return dfx

In [3]:
def get_values(df, columns):
    """
    Take a dataframe and a list of columns and
    returns the value counts for the columns.
    """
    for column in columns:
        print(column)
        print('=====================================')
        print(df[column].value_counts(dropna=False))
        print('\n')

def show_values(df, param):
    if param == 'all':
        get_values(df, df.columns)
    else:
        get_values(df, param)

### ACQUIRE DATA

In [4]:
# Read csv files
peims_df = pd.read_csv('../data/in/2007-2021-summaried-peims-financial-data.csv')
staar_df1 = pd.read_csv('../data/in/tidy_campstaar1_2012to2019.csv')
staar_df2 = pd.read_csv('../data/in/tidy_campstaar2_2013to2019.csv')

  staar_df1 = pd.read_csv('../data/in/tidy_campstaar1_2012to2019.csv')


In [5]:
peims_df.head()
staar_df1.head()
staar_df2.head()

Unnamed: 0,DISTRICT NUMBER,DISTRICT NAME,YEAR,GEN FUNDS-LOCAL TAX REVENUE FROM M&O,ALL FUNDS-LLOCAL TAX REVENUE FROM M&O,GEN FUNDS-STATE REVENUE,ALL FUNDS-STATE REVENUE,GEN FUNDS-FEDERAL REVENUE,ALL FUNDS-FEDERAL REVENUE,GEN FUNDS-OTHER LOCAL & INTERMEDIATE REVENUE,ALL FUNDS-OTHER LOCAL & INTERMEDIATE REVENUE,GEN FUNDS-TOTAL OPERATING REVENUE,ALL FUNDS-TOTAL OPERATING REVENUE,GEN FUNDS-LOCAL PROPERTY TAXES FROM I&S,ALL FUNDS-LOCAL PROPERTY TAXES FROM I&S,GEN FUNDS-STATE DEBT FUNDS,ALL FUNDS-STATE DEBT FUNDS,GEN FUNDS-OTHER RECEIPTS,ALL FUNDS-OTHER RECEIPTS,GEN FUNDS-OTHER REVENUE,ALL FUNDS-OTHER REVENUE,GEN FUNDS-TOTAL OPERATING REVENUE AND OTHER REVENUE,ALL FUNDS-TOTAL OPERATING REVENUE AND OTHER REVENUE,GEN FUNDS-EQUITY TRANSFERS,ALL FUNDS-EQUITY TRANSFERS,GEN FUNDS-TOT DEBT SERV FIN AND TRS EST REV,ALL FUNDS-TOT DEBT SERV FIN AND TRS EST REV,GEN FUNDS-TOTAL OPERATING REVENUE AND OTHER REVENUE AND RECPATUR,ALL FUNDS-TOTAL OPERATING REVENUE AND OTHER REVENUE AND RECPATUR,GEN FUNDS-DEBT SERVICE FINANCING RELATED REVENUE,ALL FUNDS-DEBT SERVICE FINANCING RELATED REVENUE,GEN FUNDS-ESTIMATED STATE TRS CONTRINUTIONS,ALL FUNDS-ESTIMATED STATE TRS CONTRINUTIONS,GEN FUNDS-TOTAL DEBT SERVICE FINANCING AND TRS ESTIMATE REVENUE,ALL FUNDS-TOTAL DEBT SERVICE FINANCING AND TRS ESTIMATE REVENUE,"GEN FUNDS-TOTAL OPERATING, OTR, DEBT SERV FIN, AND TRS EST REVEN","ALL FUNDS-TOTAL OPERATING, OTR, DEBT SERV FIN, AND TRS EST REVEN",GEN FUNDS-TOTAL PAYROLL EXPENDITURES,ALL FUNDS-TOTAL PAYROLL EXPENDITURES,GEN FUNDS-TOTAL PROFESSIONAL & CONTRACTED SERVICES EXPENDITURES,ALL FUNDS-TOTAL PROFESSIONAL & CONTRACTED SERVICES EXPENDITURES,GEN FUNDS-TOTAL SUPPLIES & MATERIALS EXPENDITURES,ALL FUNDS-TOTAL SUPPLIES & MATERIALS EXPENDITURES,GEN FUNDS-TOTAL OTHER OPERATING EXPENDITURES,ALL FUNDS-TOTAL OTHER OPERATING EXPENDITURES,GEN FUNDS-TOTAL OPERATING EXPENDITURES BY OBJ,ALL FUNDS-TOTAL OPERATING EXPENDITURES BY OBJ,GEN FUNDS-TOTAL DEBT SERVICE EXPEND BY OBJ,ALL FUNDS-TOTAL DEBT SERVICE EXPEND BY OBJ,GEN FUNDS-TOTAL CAPITAL OUTLAY EXPEND BY OBJ,ALL FUNDS-TOTAL CAPITAL OUTLAY EXPEND BY OBJ,GEN FUNDS-TOTAL NON-OPER EXPENDITURES BY OBJ,ALL FUNDS-TOTAL NON-OPER EXPENDITURES BY OBJ,GEN FUNDS-TOTAL NON-OPER AND OPER OEXPENDITURES BY OBJ,ALL FUNDS-TOTAL NON-OPER AND OPER OEXPENDITURES BY OBJ,"GEN FUNDS-INSTRUCTION + TRANSFER EXPEND-FCT11,95","ALL FUNDS-INSTRUCTION + TRANSFER EXPEND-FCT11,95","GEN FUNDS-INSTRUC RESOURCE MEDIA SERVICE EXP, FCT12","ALL FUNDS-INSTRUC RESOURCE MEDIA SERVICE, FCT12","GEN FUNDS-CURRICULUM/STAFF DEVELOPMENT EXP, FCT13","ALL FUNDS-CURRICULUM/STAFF DEVELOPMENT EXP, FCT13","GEN FUNDS-INSTRUC LEADERSHIP EXPEND, FCT21","ALL FUNDS-INSTRUC LEADERSHIP EXPEND, FCT21","GEN FUNDS-CAMPUS ADMINISTRATION EXPEND, FCT23","ALL FUNDS-CAMPUS ADMINISTRATION EXPEND, FCT23","GEN FUNDS-GUIDANCE & COUNSELING SERVICES EXP, FCT31","ALL FUNDS-GUIDANCE 7 COUNSELING SERVICES EXP, FCT31","GEN FUNDS-SOCIAL WORK SERVICES EXP, FCT32","ALL FUNDS-SOCIAL WORK SERVICES EXP, FCT32","GEN FUNDS-HEALTH SERVICES EXP, FCT33","ALL FUNDS-HEALTH SERVICES EXP, FCT33","GEN FUNDS-TRANSPORTATION EXPENDITURES, FCT34","ALL FUNDS-TRANSPORTATION EXPENDITURES, FCT34","GEN FUNDS-FOOD SERVICE EXPENDITURES, FCT35","ALL FUNDS-FOOD SERVICE EXPENDITURES, FCT35","GEN FUNDS-EXTRACURRICULAR EXPENDITURES, FCT36","ALL FUNDS-EXTRACURRICULAR EXPENDITURES,FCT36","GEN FUNDS-GENERAL ADMINISTRAT EXPEND-FCT41,80,92","ALL FUNDS-GENERAL ADMINISTRAT EXPEND-FCT41,80,92","GEN FUNDS-PLANT MAINTENANCE/OPERA EXPEND, FCT51","ALL FUNDS-PLANT MAINTENANCE/OPERA EXPEND, FCT51","GEN FUNDS-SECURITY/MONITORING SERVICE EXPEND, FCT52","ALL FUNDS-SECURITY/MONITORING SERVICE EXPEND, FCT5","GEN FUNDS-DATA PROCESSING SERVICES EXPEND, FCT53","ALL FUNDS-DATA PROCESSING SERVICES, FCT53","GEN FUNDS-COMMUNITY SERVICES, FCT61","ALL FUNDS-COMMUNITY SERVICES, FCT61",GEN FUNDS-TOTAL OPERATE EXPEND BY FUNCTION,ALL FUNDS-TOTAL OPERATE EXPEND BY FUNCTION,GEN FUNDS-NON-OPER EXP BY FUNCTION(1X-9X)(65XX),ALL FUNDS-NON-OPER EXP BY FUNCTION(1X-9X)(65XX),GEN FUNDS-NON-OPER EXP BY FUNCTION(1X-9X)(66XX),ALL FUNDS-NON-OPER EXP BY FUNCTION(1X-9X)(66XX),GEN FUNDS-TOT NON-OPER EXPEND BY FUNCTION,ALL FUNDS-TOT NON-OPER EXPEND BY FUNCTION,GEN FUNDS-TOT OPER AND NON-OPER EXP BY FUNCTION,ALL FUNDS-TOT OPER AND NON-OPER EXP BY FUNCTION,GEN FUNDS-REGULAR PROGRAM EXPEND--11,ALL FUNDS-REGULAR PROGRAM EXPEND--11,GEN FUNDS-GIFTED/TALENTED PROGRAM EXPEND--21,ALL FUNDS-GIFTED/TALENTED PROGRAM EXPEND--21,GEN FUNDS-CAREER & TECHNOLOGY PGM EXPEND--22,ALL FUNDS-CAREER & TECHNOLOGY PGM EXPEND--22,GEN FUNDS-STUDENTS WITH DISABILITIES PGM EXPEND--23,ALL FUNDS-STUDENTS WITH DISABILITIES PGM EXPEND--23,"GEN FUNDS-STATE COMPENSATORY ED EXPEND--24, 29, 30, 34","ALL FUNDS-STATE COMPENSATORY ED EXPEND--24, 29, 30, 34",GEN FUNDS-BILINGUAL PROGRAM EXPEND--25,ALL FUNDS-BILINGUAL PROGRAM EXP--25,GEN FUNDS-HIGH SCHOOL ALLOTMENT PROGRAM EXPEND--91,ALL FUNDS-HIGH SCHOOL ALLOTMENT PROGRAM--91,"GEN FUNDS-PREKINDERGARTEN EXPEND--32,35","ALL FUNDS-PREKINDERGARTEN--32,35",GEN FUNDS-PREKINDERGARTEN EXPEND BILINGUAL--32,GEN FUNDS-PREKINDERGARTEN EXPEND COMP ED--32,GEN FUNDS-PREKINDERGARTEN EXPEND REGULAR--32,GEN FUNDS-PREKINDERGARTEN EXPEND SPECIAL ED--32,ALL FUNDS-PREKINDERGARTEN EXPEND BILINGUAL--32,ALL FUNDS-PREKINDERGARTEN EXPEND COMP ED--32,ALL FUNDS-PREKINDERGARTEN EXPEND REGULAR--32,ALL FUNDS-PREKINDERGARTEN EXPEND SPECIAL ED--32,GEN FUNDS-ATHLETICS PROGRAM EXPEND--91,ALL FUNDS-ATHLETICS PROGRAM--91,GEN FUNDS-UNDISTRIBUTED PROGRAM EXP--99,ALL FUNDS-UNDISTRIBUTED PROGRAM EXP--99,GEN FUNDS-TOTAL PROGRAM OPERATING EXPENDITURES,ALL FUNDS-TOTAL PROGRAM OPERATING EXPENDITURES,GEN FUNDS-NON OPER EXP BY PIC(65XX),ALL FUNDS-NON OPER EXP BY PIC(65XX),GEN FUNDS-NON OPER EXP BY PIC(66XX),ALL FUNDS-NON OPER EXP BY PIC(66XX),GEN FUNDS-TOT NON-OPER EXPENDITURES BY PIC,ALL FUNDS-TOT NON-OPER EXPENDITURES BY PIC,GEN FUNDS-TOT OPER AND NON-OPER EXP BY PIC,ALL FUNDS-TOT OPER AND NON-OPER EXP BY PIC,GEN FUNDS-TOTAL OPER EXPENDITURES FOR TD,ALL FUNDS-TOTAL OPER EXPENDITURES FOR TD,GEN FUNDS-EQUITY TRANSFERS FOR TD,EINTRAN4,GEN FUNDS-TOTAL OTHER USES,ALL FUNDS-TOTAL OTHER USES,GEN FUNDS-INTERGOVERN CHARGES EXPEND,ALL FUNDS-INTERGOVERN CHARGES EXPEND,GEN FUNDS-DEBT SERVICE (OBJECT 6500) FOR TD,ALL FUNDS-DEBT SERVICE (OBJECT 6500) FOR TD,GEN FUNDS-CAPITAL PROJECTS(OBJECT 6600) FOR TD,ALL FUNDS-CAPITAL PROJECTS(OBJECT 6600) FOR TD,GEN FUNDS-TOTAL DISBURSEMENTS,ALL FUNDS-TOTAL DISBURSEMENTS,FALL SURVEY ENROLLMENT
0,'001902,CAYUGA ISD,2007,4122552,4122552,1114179,1176283,0,250168,215239,354535,5451970,5903538,0,0,0,0,1418,1418,1418,1418,5453388,5904956,0,0,0,0,5453388,5904956,0,0,198676,198676,198676,198676,5652064,6103632,3405211,3611026,444063,456042,287153,492377,128670,133649,4265097,4693094,421865,421865,145130,156886,566995,578751,4832092,5271845,2450934,2642749,76087,77503,820,4969,0,4000,255309,255309,140044,140044,0,0,46080,46080,188873,188873,0,190100,186577,223094,300100,300100,540789,540789,1857,1857,77627,77627,0,0,4265097,4693094,421865,421865,145130,156886,566995,578751,4832092,5271845,2368474,2382088,9815,9815,114517,114517,358842,358842,125940,313706,1197,1197,0,0,0,0,0,0,0,0,0,0,0,0,147213,183730,1139099,1329199,4265097,4693094,421865,421865,145130,156886,566995,578751,4832092,5271845,4265097,4693094,0,0,4411,5829,70703,70703,421865,421865,145130,156886,4907206,5348377,569
1,'001902,CAYUGA ISD,2008,3641351,3641351,1845232,2350131,0,1852221,183293,609434,5669876,8453137,0,0,0,0,0,953589,0,953589,5669876,9406726,0,0,0,0,5669876,9406726,0,0,226753,266516,226753,266516,5896629,9673242,3729195,5509091,463617,934387,323340,752505,137667,221087,4653819,7417070,0,0,647852,632398,647852,632398,5301671,8049468,2741830,4108584,54416,79604,4571,16968,0,302836,258301,258301,147403,877655,0,0,45898,45898,224996,224996,0,217411,219760,275561,310133,310133,553803,606415,1533,1533,91175,91175,0,0,4653819,7417070,0,0,647852,632398,647852,632398,5301671,8049468,2567178,2598844,10256,10256,113084,113084,389832,2689110,180245,339340,1246,1246,0,0,0,0,0,0,0,0,0,0,0,0,182724,182724,1209254,1482466,4653819,7417070,0,0,647852,632398,647852,632398,5301671,8049468,4653819,7417070,0,0,15000,15000,75304,210304,0,0,647852,632398,5391975,8274772,580
2,'001902,CAYUGA ISD,2009,3319164,3319164,1950169,2473373,0,1714988,123568,521780,5392901,8029305,0,0,0,0,0,0,0,0,5392901,8029305,185597,185597,185597,185597,5578498,8214902,0,0,245989,245989,245989,245989,5638890,8275294,3952356,5716404,472396,941931,341296,754588,165611,251355,4931659,7664278,0,0,722766,803313,722766,803313,5654425,8467591,2963259,4391883,60497,84600,1277,11375,0,217697,264999,264999,150041,845635,0,0,46497,46497,189287,189287,0,256598,268885,322772,258457,258457,633954,679972,870,870,93636,93636,0,0,4931659,7664278,0,0,722766,803313,722766,803313,5654425,8467591,2793147,2841002,10252,10252,133520,133520,376075,2562026,181981,364949,1261,1261,0,0,0,0,0,0,0,0,0,0,0,0,223733,277620,1211690,1473648,4931659,7664278,0,0,722766,803313,722766,803313,5654425,8467591,4931659,7664278,185597,185597,0,0,166251,166251,0,0,722766,803313,6006273,8819439,594
3,'001902,CAYUGA ISD,2010,3222688,3222688,2302369,2917268,0,2851372,161664,665811,5686721,9657139,0,0,0,0,34819,62162,34819,62162,5721540,9719301,0,0,0,0,5721540,9719301,0,0,253640,290497,253640,290497,5975180,10009798,4177424,6152112,277583,964779,391572,1117922,170952,422119,5017531,8656932,0,0,1498127,1582837,1498127,1582837,6515658,10239769,3150102,5139086,58224,84603,2273,14662,0,257802,281539,281539,151042,874259,0,0,49065,49065,197297,363286,0,251772,280267,322201,258943,258943,480017,650952,2596,2596,106166,106166,0,0,5017531,8656932,0,0,1498127,1582837,1498127,1582837,6515658,10239769,2894386,2926417,10303,10303,165656,179030,401915,3409136,181041,356797,1264,1264,49181,49181,0,0,0,0,0,0,0,0,0,0,234405,234405,1079380,1490399,5017531,8656932,0,0,1498127,1582837,1498127,1582837,6515658,10239769,5017531,8656932,0,0,28767,63586,90462,168462,0,0,1498127,1582837,6634887,10471817,628
4,'001902,CAYUGA ISD,2011,3152618,3152618,2439570,3091438,0,2687377,134887,603182,5727075,9534615,0,0,0,0,0,10089,0,10089,5727075,9544704,0,0,0,0,5727075,9544704,0,0,252603,292847,252603,292847,5979678,9837551,4102439,6134967,283364,984742,311149,973794,154614,318649,4851566,8412152,0,0,261902,272638,261902,272638,5113468,8684790,2950685,4982639,52170,78948,74,14768,0,308315,283396,283396,152784,752040,0,0,49250,49250,238060,328741,0,266258,255095,309954,288543,288543,466889,634680,729,729,113891,113891,0,0,4851566,8412152,0,0,261902,272638,261902,272638,5113468,8684790,2665401,2697739,9891,9891,175300,178400,404628,3292103,196290,395533,1216,1216,48529,48529,0,0,0,0,0,0,0,0,0,0,207921,207921,1142390,1580820,4851566,8412152,0,0,261902,272638,261902,272638,5113468,8684790,4851566,8412152,0,0,30213,30213,96141,168750,0,0,261902,272638,5239822,8883753,606


Unnamed: 0,data_release,data_category,data_level,release_year,test_year,campus_number,grade_level,subject,proficiency,demog,numerator,denominator,new_rate
0,tapr,CAMPSTAAR1.csv,campus,2014,2013,1902001,eoc,algebra_1,approaches,all_students,41,48,85.42
1,tapr,CAMPSTAAR1.csv,campus,2014,2014,1902001,eoc,algebra_1,approaches,all_students,51,57,89.47
2,tapr,CAMPSTAAR1.csv,campus,2014,2013,1902001,eoc,algebra_1,approaches,at_risk,11,17,64.71
3,tapr,CAMPSTAAR1.csv,campus,2014,2014,1902001,eoc,algebra_1,approaches,at_risk,32,38,84.21
4,tapr,CAMPSTAAR1.csv,campus,2014,2013,1902001,eoc,algebra_1,approaches,economic_disadvant,17,20,85.0


Unnamed: 0,data_release,data_category,data_level,release_year,test_year,campus_number,grade_level,subject,proficiency,demog,numerator,denominator,new_rate
0,tapr,CAMPSTAAR2.csv,campus,2019,2019,1902001,eoc,english_i,approaches,all_students,40,51,78.43
1,tapr,CAMPSTAAR2.csv,campus,2019,2019,1903001,eoc,english_i,approaches,all_students,88,127,69.29
2,tapr,CAMPSTAAR2.csv,campus,2019,2019,1904001,eoc,english_i,approaches,all_students,50,72,69.44
3,tapr,CAMPSTAAR2.csv,campus,2019,2019,1906002,eoc,english_i,approaches,all_students,21,28,75.0
4,tapr,CAMPSTAAR2.csv,campus,2019,2019,1907001,eoc,english_i,approaches,all_students,245,382,64.14


### PREPARATION

In [6]:
peims_df.shape
staar_df1.shape
staar_df2.shape

(18213, 150)

(6249982, 13)

(10024061, 13)

In [7]:
staar_df = pd.concat([staar_df1, staar_df2])

In [8]:
staar_df.head()

Unnamed: 0,data_release,data_category,data_level,release_year,test_year,campus_number,grade_level,subject,proficiency,demog,numerator,denominator,new_rate
0,tapr,CAMPSTAAR1.csv,campus,2014,2013,1902001,eoc,algebra_1,approaches,all_students,41,48,85.42
1,tapr,CAMPSTAAR1.csv,campus,2014,2014,1902001,eoc,algebra_1,approaches,all_students,51,57,89.47
2,tapr,CAMPSTAAR1.csv,campus,2014,2013,1902001,eoc,algebra_1,approaches,at_risk,11,17,64.71
3,tapr,CAMPSTAAR1.csv,campus,2014,2014,1902001,eoc,algebra_1,approaches,at_risk,32,38,84.21
4,tapr,CAMPSTAAR1.csv,campus,2014,2013,1902001,eoc,algebra_1,approaches,economic_disadvant,17,20,85.0


In [9]:
staar_df.shape

(16274043, 13)

In [10]:
staar_df = staar_df.drop_duplicates(keep='first')

In [11]:
staar_df.shape

(15116278, 13)

In [12]:
show_values(staar_df, ['data_release', 'data_category', 'data_level',
                       'release_year', 'test_year', 'grade_level', 'subject',
                       'proficiency', 'demog'
                      ])

data_release
tapr    15116278
Name: data_release, dtype: int64


data_category
CAMPSTAAR2.csv    8866296
CAMPSTAAR1.csv    6249982
Name: data_category, dtype: int64


data_level
campus    15116278
Name: data_level, dtype: int64


release_year
2019    6982315
2014    2596228
2018    1490175
2017    1409707
2013    1324368
2015    1313485
Name: release_year, dtype: int64


test_year
2019    3562483
2018    3419832
2014    1849170
2017    1490175
2016    1409707
2012    1324368
2015    1313485
2013     747058
Name: test_year, dtype: int64


grade_level
all    8100598
4      1428388
5      1381593
eoc    1023421
3       984805
8       910978
7       693002
6       527957
5        24651
4        24117
3        15955
6          588
8          141
7           84
Name: grade_level, dtype: int64


subject
reading           4175531
mathematics       3757577
science           2013890
writing           1739898
all_subjects      1510950
social_studies     895011
algebra_i          231189
biology   

In [13]:
columns_to_drop = ['data_release', 'data_category', 'data_level']

In [14]:
staar_df.drop(columns=columns_to_drop, inplace=True)

In [15]:
staar_df.shape

(15116278, 10)

In [16]:
show_missing(staar_df)

Unnamed: 0,num_missing,missing_percentage,num_empty,empty_percentage,nan_count,nan_percentage
release_year,0,0.0,0,0.0,0,0.0
test_year,0,0.0,0,0.0,0,0.0
campus_number,0,0.0,0,0.0,0,0.0
grade_level,0,0.0,0,0.0,0,0.0
subject,0,0.0,0,0.0,0,0.0
proficiency,0,0.0,0,0.0,0,0.0
demog,0,0.0,0,0.0,0,0.0
numerator,0,0.0,0,0.0,0,0.0
denominator,0,0.0,0,0.0,0,0.0
new_rate,0,0.0,0,0.0,0,0.0


In [17]:
show_missing(peims_df)

Unnamed: 0,num_missing,missing_percentage,num_empty,empty_percentage,nan_count,nan_percentage
DISTRICT NUMBER,0,0.0,0,0.0,0,0.0
DISTRICT NAME,0,0.0,0,0.0,0,0.0
YEAR,0,0.0,0,0.0,0,0.0
GEN FUNDS-LOCAL TAX REVENUE FROM M&O,0,0.0,0,0.0,0,0.0
ALL FUNDS-LLOCAL TAX REVENUE FROM M&O,0,0.0,0,0.0,0,0.0
GEN FUNDS-STATE REVENUE,0,0.0,0,0.0,0,0.0
ALL FUNDS-STATE REVENUE,0,0.0,0,0.0,0,0.0
GEN FUNDS-FEDERAL REVENUE,0,0.0,0,0.0,0,0.0
ALL FUNDS-FEDERAL REVENUE,0,0.0,0,0.0,0,0.0
GEN FUNDS-OTHER LOCAL & INTERMEDIATE REVENUE,0,0.0,0,0.0,0,0.0


In [18]:
staar_df.head()

Unnamed: 0,release_year,test_year,campus_number,grade_level,subject,proficiency,demog,numerator,denominator,new_rate
0,2014,2013,1902001,eoc,algebra_1,approaches,all_students,41,48,85.42
1,2014,2014,1902001,eoc,algebra_1,approaches,all_students,51,57,89.47
2,2014,2013,1902001,eoc,algebra_1,approaches,at_risk,11,17,64.71
3,2014,2014,1902001,eoc,algebra_1,approaches,at_risk,32,38,84.21
4,2014,2013,1902001,eoc,algebra_1,approaches,economic_disadvant,17,20,85.0


In [19]:
staar_df['campus_number'] = staar_df['campus_number'].apply(lambda x: '{0:0>9}'.format(x))

In [20]:
staar_df.head()

Unnamed: 0,release_year,test_year,campus_number,grade_level,subject,proficiency,demog,numerator,denominator,new_rate
0,2014,2013,1902001,eoc,algebra_1,approaches,all_students,41,48,85.42
1,2014,2014,1902001,eoc,algebra_1,approaches,all_students,51,57,89.47
2,2014,2013,1902001,eoc,algebra_1,approaches,at_risk,11,17,64.71
3,2014,2014,1902001,eoc,algebra_1,approaches,at_risk,32,38,84.21
4,2014,2013,1902001,eoc,algebra_1,approaches,economic_disadvant,17,20,85.0


In [21]:
staar_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15116278 entries, 0 to 10024060
Data columns (total 10 columns):
 #   Column         Dtype  
---  ------         -----  
 0   release_year   int64  
 1   test_year      int64  
 2   campus_number  object 
 3   grade_level    object 
 4   subject        object 
 5   proficiency    object 
 6   demog          object 
 7   numerator      int64  
 8   denominator    int64  
 9   new_rate       float64
dtypes: float64(1), int64(4), object(5)
memory usage: 1.2+ GB


In [22]:
staar_df.drop(columns=['new_rate'], inplace=True)

In [23]:
staar_df.head()

Unnamed: 0,release_year,test_year,campus_number,grade_level,subject,proficiency,demog,numerator,denominator
0,2014,2013,1902001,eoc,algebra_1,approaches,all_students,41,48
1,2014,2014,1902001,eoc,algebra_1,approaches,all_students,51,57
2,2014,2013,1902001,eoc,algebra_1,approaches,at_risk,11,17
3,2014,2014,1902001,eoc,algebra_1,approaches,at_risk,32,38
4,2014,2013,1902001,eoc,algebra_1,approaches,economic_disadvant,17,20


In [24]:
staar_df['district'] = staar_df['campus_number'].str[:6]

In [25]:
staar_df.head()

Unnamed: 0,release_year,test_year,campus_number,grade_level,subject,proficiency,demog,numerator,denominator,district
0,2014,2013,1902001,eoc,algebra_1,approaches,all_students,41,48,1902
1,2014,2014,1902001,eoc,algebra_1,approaches,all_students,51,57,1902
2,2014,2013,1902001,eoc,algebra_1,approaches,at_risk,11,17,1902
3,2014,2014,1902001,eoc,algebra_1,approaches,at_risk,32,38,1902
4,2014,2013,1902001,eoc,algebra_1,approaches,economic_disadvant,17,20,1902


In [26]:
staar_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15116278 entries, 0 to 10024060
Data columns (total 10 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   release_year   int64 
 1   test_year      int64 
 2   campus_number  object
 3   grade_level    object
 4   subject        object
 5   proficiency    object
 6   demog          object
 7   numerator      int64 
 8   denominator    int64 
 9   district       object
dtypes: int64(4), object(6)
memory usage: 1.2+ GB


In [27]:
df = staar_df.groupby(['release_year',
                  'test_year',
                  'district',
                  'grade_level',
                  'subject',
                  'proficiency',
                  'demog'
                 ])[["numerator", "denominator"]].sum().reset_index()

In [28]:
df['district'].nunique()

1258

In [29]:
df.head()

Unnamed: 0,release_year,test_year,district,grade_level,subject,proficiency,demog,numerator,denominator
0,2013,2012,1902,3,mathematics,approaches,all_students,33,42
1,2013,2012,1902,3,mathematics,approaches,at_risk,13,18
2,2013,2012,1902,3,mathematics,approaches,economic_disadvant,15,19
3,2013,2012,1902,3,mathematics,approaches,female,17,20
4,2013,2012,1902,3,mathematics,approaches,male,16,22


In [30]:
df.shape

(3676553, 9)

In [31]:
df.to_csv('../data/inter/01 - district_staar_df.csv', index=False)  

### EXPLORATION

### MODELING

# Conclusion