# Executive Summary

This notebook will attempt to answer the following research question:

    What's money got to do with it?

## PLANNING

- [X] Planning
    - [X] import libraries/packages
    - [X] configure notebook environment
    - [X] define helper functions
- [X] Acquire data
    - [X] get PEIMS financial data
    - [X] get STAAR performance data
- [X] Prepare the data
    - [X] prepare PEIMS data
        - [X] get rid of unwanted columns
        - [x] get rid of NaNs
    - [X] prepare STAAR data
        - [X] get rid of duplicates
        - [X] get rid of unwanted columns
        - [X] get rid of NaNs
        - [X] create new columns
        - [X] merge two datasets together
- [ ] Explore the data
    - [ ] examine for multicollinearity and determine truly independent variables
    - [ ] get to know the master dataset
    - [ ] feature engineering
- [ ] Modeling
    - [ ] create, fit, use XGBoost
    - [ ] gridsearch
    - [ ] retrain model using best hyperparameters
    - [ ] extract feature importances

In [1]:
# for manipulating dataframes
import pandas as pd

# to print out all the outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

## ACQUIRE DATA

In [2]:
# Get the PEIMS and STAAR datasets
peims_df = pd.read_csv('../data/in/2007-2021-summaried-peims-financial-data.csv')
staar_df1 = pd.read_csv('../data/in/tidy_campstaar1_2012to2019.csv')
staar_df2 = pd.read_csv('../data/in/tidy_campstaar2_2013to2019.csv')

  staar_df1 = pd.read_csv('../data/in/tidy_campstaar1_2012to2019.csv')


## PREPARATION

### STAAR Dataset

In [3]:
# Consolidate the two files into one dataframe
staar_df = pd.concat([staar_df1, staar_df2])

In [4]:
# Get rid of duplciates
staar_df = staar_df.drop_duplicates(keep='first')

In [5]:
columns_to_drop = ['data_release',
                   'data_category',
                   'data_level',
                   'release_year'
                  ]

In [6]:
# Get rid of unnecessary columns
staar_df.drop(columns=columns_to_drop, inplace=True)

In [7]:
# Prepend with 0's
staar_df['campus_number'] = staar_df['campus_number'].apply(lambda x: '{0:0>9}'.format(x))

In [8]:
# Drop 'new_rate'
staar_df.drop(columns=['new_rate'], inplace=True)

In [9]:
# Get the district number
staar_df['district'] = staar_df['campus_number'].str[:6]

In [10]:
# Turn dataset into district-level data
staar = staar_df.groupby(['test_year',
                          'district',
                          'grade_level',
                          'subject',
                          'proficiency',
                          'demog'
                         ])[["numerator", "denominator"]].sum().reset_index()

In [11]:
# Remove subsets
staar = staar[staar['grade_level'] == 'all']
staar = staar[staar['subject'] == 'all_subjects']
staar = staar[staar['demog'] == 'all_students']
staar = staar.drop(columns=['grade_level',
                            'subject',
                            'demog'
                           ])

In [12]:
# Create rate column
staar['rate'] = round(staar['numerator'] / staar['denominator'] * 100, 2)

# Remove raw columns
staar = staar.drop(columns=['numerator',
                            'denominator'
                           ])

In [13]:
# Isolate 2019 test year
staar = staar[staar['test_year'] == 2019]
staar = staar.drop(columns=['test_year'])

In [14]:
staar = staar.rename(columns={'district':'District',
                        'proficiency':'Proficiency',
                        'rate':'Rate'
                        })

In [15]:
staar.head()

Unnamed: 0,District,Proficiency,Rate
2682938,1902,approaches,84.55
2682951,1902,masters,28.86
2682964,1902,meets,61.48
2683797,1903,approaches,83.93
2683811,1903,masters,26.52


### PEIMS Dataset

In [16]:
peims_df['DISTRICT NUMBER'] = peims_df['DISTRICT NUMBER'].str[1:]

In [17]:
peims = peims_df.drop(columns=['DISTRICT NAME'])

In [18]:
# Remove all columns between column name 'B' to 'D'
peims = peims.drop(peims.loc[:, 'GEN FUNDS-LOCAL TAX REVENUE FROM M&O':'ALL FUNDS-TOTAL OPERATING, OTR, DEBT SERV FIN, AND TRS EST REVEN'].columns, axis=1)

In [19]:
# Isolate 2019 test year
peims = peims[peims['YEAR'] == 2019]
peims = peims.drop(columns=['YEAR'])

In [20]:
# Remove columns that starts with 'GEN'
columns_to_keep = [c for c in peims.columns if c.lower()[:3] != 'gen']
peims = peims[columns_to_keep]

In [21]:
# Remove Prefix 'all funds'
peims.columns = peims.columns.str.replace("ALL FUNDS-", "")

In [22]:
# Padd District numbers with 0's
peims['DISTRICT NUMBER'] = peims['DISTRICT NUMBER'].str.zfill(6)

In [23]:
peims = peims.drop(peims.loc[:, 'TOTAL OPERATING EXPENDITURES BY OBJ':'TOTAL NON-OPER AND OPER OEXPENDITURES BY OBJ'].columns, axis=1)

In [24]:
peims = peims.drop(peims.loc[:, 'TOTAL OPERATE EXPEND BY FUNCTION':'TOT OPER AND NON-OPER EXP BY FUNCTION'].columns, axis=1)

In [25]:
peims = peims.drop(peims.loc[:, 'TOTAL PROGRAM OPERATING EXPENDITURES':'EINTRAN4'].columns, axis=1)

In [26]:
peims = peims.drop(peims.loc[:, 'INTERGOVERN CHARGES EXPEND':'FALL SURVEY ENROLLMENT'].columns, axis=1)

In [27]:
peims = peims.rename(columns={'DISTRICT NUMBER':'DISTRICT'})

In [28]:
# Remove word 'EXPENDITURES'
peims.columns = peims.columns.str.replace("EXPENDITURES", "")

In [29]:
# Remove word 'EXPEND'
peims.columns = peims.columns.str.replace("EXPEND", "")

In [30]:
# Remove word 'EXP'
peims.columns = peims.columns.str.replace("EXP", "")

In [31]:
# Remove word 'TOTAL'
peims.columns = peims.columns.str.replace("TOTAL", "")

In [32]:
# Remove word 'FCT'
peims.columns = peims.columns.str.replace("FCT", "")

In [33]:
# Remove dougble dashes
peims.columns = peims.columns.str.replace("--", "-")

In [34]:
# Remove double space opposite comma
peims.columns = peims.columns.str.replace(" , ", ",")

In [35]:
# Remove leading and treiling spaces
peims.columns = peims.columns.str.strip()

In [36]:
peims = peims.rename(columns={'DISTRICT':'District',
                              'PAYROLL':'Payroll',
                              'PROFESSIONAL & CONTRACTED SERVICES':'Professional & Contracted',
                              'SUPPLIES & MATERIALS':'Supplies & Materials',
                              'OTHER OPERATING':'Other Operating',
                              'INSTRUCTION + TRANSFER -11,95':'Instruction & Transfer',
                              'INSTRUC RESOURCE MEDIA SERVICE, 12':'Instructional Resource Media',
                              'CURRICULUM/STAFF DEVELOPMENT,13':'Curriculum/Staff Development',
                              'INSTRUC LEADERSHIP,21':'Instructional Leadership',
                              'CAMPUS ADMINISTRATION,23':'Campus Administration',
                              'GUIDANCE 7 COUNSELING SERVICES,31':'Guidance & Counseling',
                              'SOCIAL WORK SERVICES,32':'Social Work',
                              'HEALTH SERVICES,33':'Health Services',
                              'TRANSPORTATION,34':'Transportation',
                              'FOOD SERVICE,35':'Food Service',
                              'EXTRACURRICULAR ,36':'Extracurricular',
                              'GENERAL ADMINISTRAT -41,80,92':'General Administration',
                              'PLANT MAINTENANCE/OPERA,51':'Plant Maintenance/Operation',
                              'SECURITY/MONITORING SERVICE,5':'Security & Monitoring',
                              'DATA PROCESSING SERVICES, 53':'Data Processing',
                              'COMMUNITY SERVICES, 61':'Community Services',
                              'REGULAR PROGRAM -11':'Regular Program',
                              'GIFTED/TALENTED PROGRAM -21':'Gifted & Talented Program',
                              'CAREER & TECHNOLOGY PGM -22':'Career & Technology Program',
                              'STUDENTS WITH DISABILITIES PGM -23':'Students with Disabilities',
                              'STATE COMPENSATORY ED -24, 29, 30, 34':'State Compensatory Education',
                              'BILINGUAL PROGRAM -25':'Bilingual Program',
                              'HIGH SCHOOL ALLOTMENT PROGRAM-91':'High School Allotment',
                              'PREKINDERGARTEN-32,35':'Pre-K',
                              'PREKINDERGARTEN  BILINGUAL-32':'Pre-K Bilingual',
                              'PREKINDERGARTEN  COMP ED-32':'Pre-K Comp Ed',
                              'PREKINDERGARTEN  REGULAR-32':'Pre-K Regular',
                              'PREKINDERGARTEN  SPECIAL ED-32':'Pre-K Special Education',
                              'ATHLETICS PROGRAM-91':'Athletics Program',
                              'UNDISTRIBUTED PROGRAM -99':'Undistributed Program',
                              'OTHER USES':'Other Uses'
                             })

In [37]:
peims['District'] = peims['District'].str[1:]

In [38]:
peims.head().T

Unnamed: 0,12,27,42,57,72
District,1902,1903,1904,1906,1907
Payroll,6025217,9093950,6659596,3134475,25587063
Professional & Contracted,1075904,1514689,927209,373513,5603896
Supplies & Materials,648206,784631,937810,408024,4134969
Other Operating,809559,303052,278109,105878,1048416
Instruction & Transfer,4649118,7043892,4611747,2087166,18807861
Instructional Resource Media,66490,117860,51126,19990,167823
Curriculum/Staff Development,4986,33175,157830,0,535649
Instructional Leadership,270353,66374,0,7905,1033275
Campus Administration,306385,574699,466345,379101,2201907


In [39]:
staar.shape
peims.shape

(3593, 3)

(1200, 36)

In [40]:
staar.to_csv('../data/inter/clean_staar_2019.csv', index=False)
peims.to_csv('../data/inter/clean_peims_2019.csv', index=False)