# Civil Rights Data Merge 

Merge school-level variables on discipline (from the Civil Rights Data Collection or 'CRDC' file below) with existing data sources. Match the schools between the data sources by using the 'COMBOKEY' variable in the CRDC data set and 'NCESSCH' in the other data sets.

Key Data Sets

• charters_2015.pkl


• pubschools_2015.pkl


Helpful Links

https://jakevdp.github.io/PythonDataScienceHandbook/03.07-merge-and-join.html

https://www.shanelynn.ie/merge-join-dataframes-python-pandas-index-1/




In [1]:
# Import packages
import pandas as pd 
import numpy as np
import pickle
import csv

In [None]:
# Define file paths
charter_output_path = '../data/charters_full_2015_250_new_counts_CRDC.pkl'
public_output_path = '../data/pubschools_full_2015_CRDC.pkl'

# CRDC + PUBSCHOOLS MERGE

In [2]:
with open('../../nowdata/pubschools_2015.pkl', 'rb') as f:
    pubschools = pickle.load(f)

In [4]:
print("Number of schools in pubschools DF:   ", len(pubschools)) 
print("Number of variables in pubschools DF: ", len(list(pubschools)))

Number of schools in pubschools DF:    137429
Number of variables in pubschools DF:  656


In [2]:
vars_to_keep = ['COMBOKEY','TOT_LEPENR_M', 'TOT_LEPENR_F', 'TOT_LEPPROGENR_M', 'TOT_LEPPROGENR_F', 'TOT_IDEAENR_M', 'TOT_IDEAENR_F', 'SCH_IDEAENR_LEP_M', 'SCH_IDEAENR_LEP_F', 'TOT_504ENR_M', 'TOT_504ENR_F', 'TOT_GTENR_M', 'TOT_GTENR_F', 'SCH_CREDITRECOVERYENR', 'SCH_APCOURSES', 'TOT_APENR_M', 'TOT_APENR_F', 'TOT_APEXAM_ONEORMORE_M', 'TOT_APEXAM_ONEORMORE_F', 'TOT_APEXAM_NONE_M', 'TOT_APEXAM_NONE_F', 'TOT_APPASS_ONEORMORE_M', 'TOT_APPASS_ONEORMORE_F', 'TOT_APPASS_NONE_M', 'TOT_APPASS_NONE_F', 'TOT_IBENR_M', 'TOT_IBENR_F', 'TOT_SATACT_M', 'TOT_SATACT_F', 'SCH_PSDISC_CORP_HI_M', 'SCH_PSDISC_CORP_HI_F', 'SCH_PSDISC_CORP_AM_M', 'SCH_PSDISC_CORP_AM_F', 'SCH_PSDISC_CORP_AS_M', 'SCH_PSDISC_CORP_AS_F', 'SCH_PSDISC_CORP_HP_M', 'SCH_PSDISC_CORP_HP_F', 'SCH_PSDISC_CORP_BL_M', 'SCH_PSDISC_CORP_BL_F', 'SCH_PSDISC_CORP_WH_M', 'SCH_PSDISC_CORP_WH_F', 'SCH_PSDISC_CORP_TR_M', 'SCH_PSDISC_CORP_TR_F', 'TOT_PSDISC_CORP_M', 'TOT_PSDISC_CORP_F', 'SCH_PSDISC_CORP_LEP_M', 'SCH_PSDISC_CORP_LEP_F', 'SCH_PSDISC_CORP_IDEA_M', 'SCH_PSDISC_CORP_IDEA_F', 'SCH_PSCORPINSTANCES_ALL', 'TOT_PSDISC_SINGOOS_M', 'TOT_PSDISC_SINGOOS_F', 'TOT_PSDISC_MULTOOS_M', 'TOT_PSDISC_MULTOOS_F', 'SCH_PSOOSINSTANCES_ALL', 'SCH_PSOOSINSTANCES_IDEA', 'SCH_OOSINSTANCES_WODIS', 'SCH_OOSINSTANCES_IDEA', 'SCH_OOSINSTANCES_504', 'SCH_DAYSMISSED_HI_M', 'SCH_DAYSMISSED_HI_F', 'SCH_DAYSMISSED_AM_M', 'SCH_DAYSMISSED_AM_F', 'SCH_DAYSMISSED_AS_M', 'SCH_DAYSMISSED_AS_F', 'SCH_DAYSMISSED_HP_M', 'SCH_DAYSMISSED_HP_F', 'SCH_DAYSMISSED_BL_M', 'SCH_DAYSMISSED_BL_F', 'SCH_DAYSMISSED_WH_M', 'SCH_DAYSMISSED_WH_F', 'SCH_DAYSMISSED_TR_M', 'SCH_DAYSMISSED_TR_F', 'TOT_DAYSMISSED_M', 'TOT_DAYSMISSED_F', 'SCH_DAYSMISSED_LEP_M', 'SCH_DAYSMISSED_LEP_F', 'SCH_DAYSMISSED_504_M', 'SCH_DAYSMISSED_504_F', 'SCH_DAYSMISSED_IDEA_M', 'SCH_DAYSMISSED_IDEA_F', 'SCH_PSDISC_EXP_HI_M', 'SCH_PSDISC_EXP_HI_F', 'SCH_PSDISC_EXP_AM_M', 'SCH_PSDISC_EXP_AM_F', 'SCH_PSDISC_EXP_AS_M', 'SCH_PSDISC_EXP_AS_F', 'SCH_PSDISC_EXP_HP_M', 'SCH_PSDISC_EXP_HP_F', 'SCH_PSDISC_EXP_BL_M', 'SCH_PSDISC_EXP_BL_F', 'SCH_PSDISC_EXP_WH_M', 'SCH_PSDISC_EXP_WH_F', 'SCH_PSDISC_EXP_TR_M', 'SCH_PSDISC_EXP_TR_F', 'TOT_PSDISC_EXP_M', 'TOT_PSDISC_EXP_F', 'SCH_PSDISC_EXP_LEP_M', 'SCH_PSDISC_EXP_LEP_F', 'SCH_PSDISC_EXP_IDEA_M', 'SCH_PSDISC_EXP_IDEA_F', 'TOT_DISCWODIS_EXPWE_M', 'TOT_DISCWODIS_EXPWE_F', 'TOT_DISCWODIS_EXPWOE_M', 'TOT_DISCWODIS_EXPWOE_F', 'SCH_DISCWODIS_EXPZT_HI_M', 'SCH_DISCWODIS_EXPZT_HI_F', 'SCH_DISCWODIS_EXPZT_AM_M', 'SCH_DISCWODIS_EXPZT_AM_F', 'SCH_DISCWODIS_EXPZT_AS_M', 'SCH_DISCWODIS_EXPZT_AS_F', 'SCH_DISCWODIS_EXPZT_HP_M', 'SCH_DISCWODIS_EXPZT_HP_F', 'SCH_DISCWODIS_EXPZT_BL_M', 'SCH_DISCWODIS_EXPZT_BL_F', 'SCH_DISCWODIS_EXPZT_WH_M', 'SCH_DISCWODIS_EXPZT_WH_F', 'SCH_DISCWODIS_EXPZT_TR_M', 'SCH_DISCWODIS_EXPZT_TR_F', 'TOT_DISCWODIS_EXPZT_M', 'TOT_DISCWODIS_EXPZT_F', 'SCH_DISCWODIS_EXPZT_LEP_M', 'SCH_DISCWODIS_EXPZT_LEP_F', 'TOT_DISCWDIS_EXPWE_IDEA_M', 'TOT_DISCWDIS_EXPWE_IDEA_F', 'TOT_DISCWDIS_EXPWOE_IDEA_M', 'TOT_DISCWDIS_EXPWOE_IDEA_F', 'SCH_DISCWDIS_EXPZT_IDEA_HI_M', 'SCH_DISCWDIS_EXPZT_IDEA_HI_F', 'SCH_DISCWDIS_EXPZT_IDEA_AM_M', 'SCH_DISCWDIS_EXPZT_IDEA_AM_F', 'SCH_DISCWDIS_EXPZT_IDEA_AS_M', 'SCH_DISCWDIS_EXPZT_IDEA_AS_F', 'SCH_DISCWDIS_EXPZT_IDEA_HP_M', 'SCH_DISCWDIS_EXPZT_IDEA_HP_F', 'SCH_DISCWDIS_EXPZT_IDEA_BL_M', 'SCH_DISCWDIS_EXPZT_IDEA_BL_F', 'SCH_DISCWDIS_EXPZT_IDEA_WH_M', 'SCH_DISCWDIS_EXPZT_IDEA_WH_F', 'SCH_DISCWDIS_EXPZT_IDEA_TR_M', 'SCH_DISCWDIS_EXPZT_IDEA_TR_F', 'TOT_DISCWDIS_EXPZT_IDEA_M', 'TOT_DISCWDIS_EXPZT_IDEA_F', 'SCH_DISCWDIS_EXPZT_LEP_M', 'SCH_DISCWDIS_EXPZT_LEP_F', 'SCH_DISCWDIS_EXPZT_504_M', 'SCH_DISCWDIS_EXPZT_504_F', 'TOT_DISCWODIS_TFRALT_M', 'TOT_DISCWODIS_TFRALT_F', 'TOT_DISCWDIS_TFRALT_IDEA_M', 'TOT_DISCWDIS_TFRALT_IDEA_F', 'TOT_DISCWODIS_REF_M', 'TOT_DISCWODIS_REF_F', 'TOT_DISCWDIS_REF_IDEA_M', 'TOT_DISCWDIS_REF_IDEA_F', 'TOT_DISCWODIS_ARR_M', 'TOT_DISCWODIS_ARR_F', 'TOT_DISCWDIS_ARR_IDEA_M', 'TOT_DISCWDIS_ARR_IDEA_F', 'SCH_OFFENSE_RAPE', 'SCH_OFFENSE_BATT', 'SCH_OFFENSE_ROBWW', 'SCH_OFFENSE_ROBWX', 'SCH_OFFENSE_ROBWOW', 'SCH_OFFENSE_ATTWW', 'SCH_OFFENSE_ATTWX', 'SCH_OFFENSE_ATTWOW', 'SCH_OFFENSE_THRWW', 'SCH_OFFENSE_THRWX', 'SCH_OFFENSE_THRWOW', 'SCH_OFFENSE_POSSWX', 'TOT_RS_NONIDEA_MECH_M', 'TOT_RS_NONIDEA_MECH_F', 'TOT_RS_NONIDEA_PHYS_M', 'TOT_RS_NONIDEA_PHYS_F', 'TOT_RS_NONIDEA_SECL_M', 'TOT_RS_NONIDEA_SECL_F', 'TOT_RS_IDEA_MECH_M', 'TOT_RS_IDEA_MECH_F', 'TOT_RS_IDEA_PHYS_M', 'TOT_RS_IDEA_PHYS_F', 'TOT_RS_IDEA_SECL_M', 'TOT_RS_IDEA_SECL_F', 'SCH_RSINSTANCES_MECH_WODIS', 'SCH_RSINSTANCES_MECH_IDEA', 'SCH_RSINSTANCES_MECH_504', 'SCH_RSINSTANCES_PHYS_WODIS', 'SCH_RSINSTANCES_PHYS_IDEA', 'SCH_RSINSTANCES_PHYS_504', 'SCH_RSINSTANCES_SECL_WODIS', 'SCH_RSINSTANCES_SECL_IDEA', 'SCH_RSINSTANCES_SECL_504', 'SCH_HBALLEGATIONS_SEX', 'SCH_HBALLEGATIONS_RAC', 'SCH_HBALLEGATIONS_DIS', 'SCH_HBALLEGATIONS_ORI', 'SCH_HBALLEGATIONS_REL', 'TOT_HBREPORTED_SEX_M', 'TOT_HBREPORTED_SEX_F', 'TOT_HBREPORTED_RAC_M', 'TOT_HBREPORTED_RAC_F', 'TOT_HBREPORTED_DIS_M', 'TOT_HBREPORTED_DIS_F', 'TOT_ABSENT_M', 'TOT_ABSENT_F', 'TOT_RET_KG_M', 'TOT_RET_KG_F', 'TOT_RET_G01_M', 'TOT_RET_G01_F', 'TOT_RET_G02_M', 'TOT_RET_G02_F', 'TOT_RET_G03_M', 'TOT_RET_G03_F', 'TOT_RET_G04_M', 'TOT_RET_G04_F', 'TOT_RET_G05_M', 'TOT_RET_G05_F', 'TOT_RET_G06_M', 'TOT_RET_G06_F', 'TOT_RET_G07_M', 'TOT_RET_G07_F', 'TOT_RET_G08_M', 'TOT_RET_G08_F', 'TOT_RET_G09_M', 'TOT_RET_G09_F', 'TOT_RET_G10_M', 'TOT_RET_G10_F', 'TOT_RET_G11_M', 'TOT_RET_G11_F', 'TOT_RET_G12_M', 'TOT_RET_G12_F', 'TOT_SSPART', 'SCH_FTE_TEACH_WOFED', 'SCH_NPE_WOFED', 'SCH_FTE_AID_WOFED', 'SCH_FTE_SUP_WOFED', 'SCH_FTE_ADM_WOFED', 'SCH_FTE_AID_WFED', 'SCH_FTE_SUP_WFED', 'SCH_FTE_ADM_WFED', 'SCH_FTETEACH_TOT', 'SCH_FTETEACH_CERT', 'SCH_FTETEACH_NOTCERT', 'SCH_FTETEACH_FY', 'SCH_FTETEACH_SY', 'SCH_TEACHERS_CURR_TOT', 'SCH_TEACHERS_PREV_TOT', 'SCH_FTECOUNSELORS', 'SCH_FTETEACH_ABSENT', 'SCH_FTESECURITY_LEO', 'SCH_FTESECURITY_GUA', 'SCH_FTESERVICES_NUR', 'SCH_FTESERVICES_PSY', 'SCH_FTESERVICES_SOC', 'SCH_CREDITRECOVERY_IND', 'SCH_SSCLASSES_IND', 'SCH_APENR_IND', 'SCH_IBENR_IND', 'SCH_CORPINSTANCES_IND', 'SCH_FIREARM_IND', 'SCH_HOMICIDE_IND', 'SCH_RET_KG_IND', 'SCH_RET_G01_IND', 'SCH_RET_G02_IND', 'SCH_RET_G03_IND', 'SCH_RET_G04_IND', 'SCH_RET_G05_IND', 'SCH_RET_G06_IND', 'SCH_RET_G07_IND', 'SCH_RET_G08_IND', 'SCH_RET_G09_IND', 'SCH_RET_G10_IND', 'SCH_RET_G11_IND', 'SCH_RET_G12_IND', 'SCH_SSATHLETICS_IND']

In [3]:
CRDC = pd.read_csv('CRDC_1516_school_data.csv', encoding = "Latin1", usecols = vars_to_keep)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
print("Number of schools in CRDC DF:   ", len(CRDC)) 
print("Number of variables in CRDC DF: ", len(list(CRDC)))

Number of schools in CRDC DF:    96360
Number of variables in CRDC DF:  275


In [8]:
print("Number of resulting variables should be = ", str(len(list(pubschools)) + len(list(CRDC))))
print("Number of resulting schools should be = ", str(len(pubschools)))

Number of resulting variables should be =  931
Number of resulting schools should be =  137429


In [9]:
# Modify the original DFs in place, to save memory (making copies of these huge DFs would eat up resources)
pubschools = pubschools.set_index('NCESSCH')
CRDC = CRDC.set_index('COMBOKEY')
pubschools

Unnamed: 0_level_0,CMO_NAME,CMO_MEMSUM,SCH_NAME,CMO_STATE,CMO_SCHNUM,CMO_URL,CMO_NUMSTATES,CMO_ALLSTATES,CMO_SECTOR,CMO_NUMSTUDENTS_CREDO17,...,POP517_S16,CHILDPOV_S16,TRUE_CHARTER,WEBTEXT,MEMBER,TOTFRL,TITLEI,FTE,YEAR_OPENED,YEAR_CLOSED
NCESSCH,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4.008280e+10,AAEC,1320.0,ARIZONA AGRIBUSINESS & EQUINE CENTER - ESTRELLA,AZ,6.0,https://www.aaechighschools.com/,1.0,AZ,,,...,,,1,[(https://www.aaechighschools.com/public-chart...,450.0,-1.0,1,-1.00,2010.0,
4.008280e+10,Arizona Agribusiness & Equine Center,1320.0,ARIZONA AGRIBUSINESS & EQUINE CENTER - ESTRELLA,AZ,6.0,https://www.aaechighschools.com/,1.0,AZ,CMO,1220.0,...,,,1,[(https://www.aaechighschools.com/public-chart...,450.0,-1.0,1,-1.00,2010.0,
4.004170e+10,AAEC,1320.0,AAEC - SMCC CAMPUS,AZ,6.0,https://www.aaechighschools.com/,1.0,AZ,,,...,,,1,[(https://www.aaechighschools.com/public-chart...,380.0,282.0,1,-1.00,2005.0,
4.004170e+10,Arizona Agribusiness & Equine Center,1320.0,AAEC - SMCC CAMPUS,AZ,6.0,https://www.aaechighschools.com/,1.0,AZ,CMO,1220.0,...,,,1,[(https://www.aaechighschools.com/public-chart...,380.0,282.0,1,-1.00,2005.0,
4.001060e+10,AAEC,1320.0,AAEC - PARADISE VALLEY,AZ,6.0,https://www.aaechighschools.com/,1.0,AZ,,,...,,,1,"[(https://www.aaechighschools.com/index.php, F...",386.0,-1.0,1,-1.00,1998.0,
4.001060e+10,Arizona Agribusiness & Equine Center,1320.0,AAEC - PARADISE VALLEY,AZ,6.0,https://www.aaechighschools.com/,1.0,AZ,CMO,1220.0,...,,,1,"[(https://www.aaechighschools.com/index.php, F...",386.0,-1.0,1,-1.00,1998.0,
4.800034e+11,ACADEMY OF SKILLS AND KNOWLEDGE,1161.0,CUMBERLAND ACADEMY MIDDLE,TX,4.0,http://www.cumberlandacademy.com/,1.0,TX,,,...,,,1,"[(https://ms.cumberlandacademy.com/, False, 0,...",528.0,193.0,1,40.82,2013.0,
4.800034e+11,ACADEMY OF SKILLS AND KNOWLEDGE,1161.0,CUMBERLAND H S,TX,4.0,http://www.cumberlandacademy.com/,1.0,TX,,,...,,,1,"[(https://hs.cumberlandacademy.com/, False, 0,...",,,,,2015.0,
4.800034e+11,ACADEMY OF SKILLS AND KNOWLEDGE,1161.0,CUMBERLAND ACADEMY,TX,4.0,http://www.cumberlandacademy.com/,1.0,TX,,,...,,,1,"[(https://elem.cumberlandacademy.com/, False, ...",502.0,204.0,1,36.23,1998.0,
4.800192e+11,ACCELERATED INTERMEDIATE ACADEMY,281.0,ACCELERATED INTERDISCIPLINARY ACAD,TX,3.0,http://www.aiacharterschools.org/,1.0,TX,,,...,,,1,"[(http://www.aiacharterschools.org/, False, 0,...",254.0,243.0,1,16.85,2004.0,


In [10]:
# Do a "left" in order to keep all the variables from the charters/public DF, 
# and only the vars for CRDC that occurred in matching schools. 
# This way keeps all schools and all variables, but didn't add new rows to accommodate cases 
# where CRDC had info on entities (schools?) we don't have
public_merged = pd.merge(pubschools, CRDC, how='left', left_index=True, right_index=True)

In [12]:
print("Number of schools of merged DF:\t  ", len(public_merged)) 
print("Number of variables of merged DF: ", len(list(public_merged)))
public_merged

Number of schools of merged DF:	   137429
Number of variables of merged DF:  929


Unnamed: 0,CMO_NAME,CMO_MEMSUM,SCH_NAME,CMO_STATE,CMO_SCHNUM,CMO_URL,CMO_NUMSTATES,CMO_ALLSTATES,CMO_SECTOR,CMO_NUMSTUDENTS_CREDO17,...,SCH_FTETEACH_SY,SCH_TEACHERS_CURR_TOT,SCH_TEACHERS_PREV_TOT,SCH_FTECOUNSELORS,SCH_FTETEACH_ABSENT,SCH_FTESECURITY_LEO,SCH_FTESECURITY_GUA,SCH_FTESERVICES_NUR,SCH_FTESERVICES_PSY,SCH_FTESERVICES_SOC
1.000020e+10,,,,,,,,,,,...,,,,,,,,,,
1.000020e+10,,,,,,,,,,,...,,,,,,,,,,
1.000020e+10,,,,,,,,,,,...,,,,,,,,,,
1.000020e+10,,,,,,,,,,,...,,,,,,,,,,
1.000020e+10,,,,,,,,,,,...,0.0,29.0,29.0,1.0,0.0,-9.00,2.0,0.00,2.0,0.0
1.000020e+10,,,,,,,,,,,...,0.0,16.0,16.0,1.0,0.0,-9.00,2.0,0.00,1.0,0.0
1.000020e+10,,,,,,,,,,,...,0.0,38.0,37.0,2.0,2.0,-9.00,2.0,0.00,0.0,0.0
1.000050e+10,,,,,,,,,,,...,1.0,41.0,33.0,1.0,11.0,1.00,0.0,1.00,0.0,0.0
1.000050e+10,,,,,,,,,,,...,4.0,72.0,71.0,4.5,27.0,1.00,1.0,1.00,0.0,0.0
1.000050e+10,,,,,,,,,,,...,3.0,40.0,31.0,1.0,12.0,1.00,0.0,1.00,0.0,0.0


In [13]:
# Convert NCESSCH from index back into variable:
public_merged.reset_index(level=0, inplace=True)
public_merged.head(10)

Unnamed: 0,index,CMO_NAME,CMO_MEMSUM,SCH_NAME,CMO_STATE,CMO_SCHNUM,CMO_URL,CMO_NUMSTATES,CMO_ALLSTATES,CMO_SECTOR,...,SCH_FTETEACH_SY,SCH_TEACHERS_CURR_TOT,SCH_TEACHERS_PREV_TOT,SCH_FTECOUNSELORS,SCH_FTETEACH_ABSENT,SCH_FTESECURITY_LEO,SCH_FTESECURITY_GUA,SCH_FTESERVICES_NUR,SCH_FTESERVICES_PSY,SCH_FTESERVICES_SOC
0,10000200000.0,,,,,,,,,,...,,,,,,,,,,
1,10000200000.0,,,,,,,,,,...,,,,,,,,,,
2,10000200000.0,,,,,,,,,,...,,,,,,,,,,
3,10000200000.0,,,,,,,,,,...,,,,,,,,,,
4,10000200000.0,,,,,,,,,,...,0.0,29.0,29.0,1.0,0.0,-9.0,2.0,0.0,2.0,0.0
5,10000200000.0,,,,,,,,,,...,0.0,16.0,16.0,1.0,0.0,-9.0,2.0,0.0,1.0,0.0
6,10000200000.0,,,,,,,,,,...,0.0,38.0,37.0,2.0,2.0,-9.0,2.0,0.0,0.0,0.0
7,10000500000.0,,,,,,,,,,...,1.0,41.0,33.0,1.0,11.0,1.0,0.0,1.0,0.0,0.0
8,10000500000.0,,,,,,,,,,...,4.0,72.0,71.0,4.5,27.0,1.0,1.0,1.0,0.0,0.0
9,10000500000.0,,,,,,,,,,...,3.0,40.0,31.0,1.0,12.0,1.0,0.0,1.0,0.0,0.0


In [14]:
# Rename NCESSCH from 'index'
public_merged.rename({"index":"NCESSCH"}, axis=1, inplace=True)
public_merged.head(10)

Unnamed: 0,NCESSCH,CMO_NAME,CMO_MEMSUM,SCH_NAME,CMO_STATE,CMO_SCHNUM,CMO_URL,CMO_NUMSTATES,CMO_ALLSTATES,CMO_SECTOR,...,SCH_FTETEACH_SY,SCH_TEACHERS_CURR_TOT,SCH_TEACHERS_PREV_TOT,SCH_FTECOUNSELORS,SCH_FTETEACH_ABSENT,SCH_FTESECURITY_LEO,SCH_FTESECURITY_GUA,SCH_FTESERVICES_NUR,SCH_FTESERVICES_PSY,SCH_FTESERVICES_SOC
0,10000200000.0,,,,,,,,,,...,,,,,,,,,,
1,10000200000.0,,,,,,,,,,...,,,,,,,,,,
2,10000200000.0,,,,,,,,,,...,,,,,,,,,,
3,10000200000.0,,,,,,,,,,...,,,,,,,,,,
4,10000200000.0,,,,,,,,,,...,0.0,29.0,29.0,1.0,0.0,-9.0,2.0,0.0,2.0,0.0
5,10000200000.0,,,,,,,,,,...,0.0,16.0,16.0,1.0,0.0,-9.0,2.0,0.0,1.0,0.0
6,10000200000.0,,,,,,,,,,...,0.0,38.0,37.0,2.0,2.0,-9.0,2.0,0.0,0.0,0.0
7,10000500000.0,,,,,,,,,,...,1.0,41.0,33.0,1.0,11.0,1.0,0.0,1.0,0.0,0.0
8,10000500000.0,,,,,,,,,,...,4.0,72.0,71.0,4.5,27.0,1.0,1.0,1.0,0.0,0.0
9,10000500000.0,,,,,,,,,,...,3.0,40.0,31.0,1.0,12.0,1.0,0.0,1.0,0.0,0.0


In [16]:
public_merged.to_pickle(public_output_path)

In [22]:
public_final = pd.read_pickle(public_output_path)
print("Number of schools of resulting DF:   ", len(public_final)) 
print("Number of variables of resulting DF: ", len(list(public_final)))
public_final

Number of schools of resulting DF:    137429
Number of variables of resulting DF:  930


Unnamed: 0,NCESSCH,CMO_NAME,CMO_MEMSUM,SCH_NAME,CMO_STATE,CMO_SCHNUM,CMO_URL,CMO_NUMSTATES,CMO_ALLSTATES,CMO_SECTOR,...,SCH_FTETEACH_SY,SCH_TEACHERS_CURR_TOT,SCH_TEACHERS_PREV_TOT,SCH_FTECOUNSELORS,SCH_FTETEACH_ABSENT,SCH_FTESECURITY_LEO,SCH_FTESECURITY_GUA,SCH_FTESERVICES_NUR,SCH_FTESERVICES_PSY,SCH_FTESERVICES_SOC
0,1.000020e+10,,,,,,,,,,...,,,,,,,,,,
1,1.000020e+10,,,,,,,,,,...,,,,,,,,,,
2,1.000020e+10,,,,,,,,,,...,,,,,,,,,,
3,1.000020e+10,,,,,,,,,,...,,,,,,,,,,
4,1.000020e+10,,,,,,,,,,...,0.0,29.0,29.0,1.0,0.0,-9.00,2.0,0.00,2.0,0.0
5,1.000020e+10,,,,,,,,,,...,0.0,16.0,16.0,1.0,0.0,-9.00,2.0,0.00,1.0,0.0
6,1.000020e+10,,,,,,,,,,...,0.0,38.0,37.0,2.0,2.0,-9.00,2.0,0.00,0.0,0.0
7,1.000050e+10,,,,,,,,,,...,1.0,41.0,33.0,1.0,11.0,1.00,0.0,1.00,0.0,0.0
8,1.000050e+10,,,,,,,,,,...,4.0,72.0,71.0,4.5,27.0,1.00,1.0,1.00,0.0,0.0
9,1.000050e+10,,,,,,,,,,...,3.0,40.0,31.0,1.0,12.0,1.00,0.0,1.00,0.0,0.0


# CRDC + CHARTERS MERGE

In [6]:
with open('../../nowdata/charters_2015.pkl', 'rb') as x:
    charters = pickle.load(x)

In [7]:
print("Number of schools in charters DF:   ", len(charters)) 
print("Number of variables in charters DF: ", len(list(charters)))

Number of schools in charters DF:    12877
Number of variables in charters DF:  400


In [8]:
charters

Unnamed: 0,CMO_NAME,CMO_MEMSUM,SCH_NAME,CMO_STATE,CMO_SCHNUM,CMO_URL,CMO_NUMSTATES,CMO_ALLSTATES,CMO_SECTOR,CMO_NUMSTUDENTS_CREDO17,...,STR,WEBTEXT_METHOD,ess_strength_y,prog_strength_y,PROG_COUNT,RIT_COUNT,ESS_COUNT,ESS_STR,PROG_STR,RIT_STR
0,AAEC,1320.0,ARIZONA AGRIBUSINESS & EQUINE CENTER - ESTRELLA,AZ,6.0,https://www.aaechighschools.com/,1.0,AZ,,,...,-480.000000,0,0.087500,0.037500,1,7,0,-6.000000,-2.977266,-2.132168
1,AAEC,1320.0,ARIZONA AGRIBUSINESS & EQUINE CENTER - ESTRELLA,AZ,6.0,https://www.aaechighschools.com/,1.0,AZ,,,...,-480.000000,0,0.087500,0.037500,1,7,0,-6.000000,-2.977266,-2.132168
2,Arizona Agribusiness & Equine Center,1320.0,ARIZONA AGRIBUSINESS & EQUINE CENTER - ESTRELLA,AZ,6.0,https://www.aaechighschools.com/,1.0,AZ,CMO,1220.0,...,-480.000000,0,0.087500,0.037500,1,7,0,-6.000000,-2.977266,-2.132168
3,Arizona Agribusiness & Equine Center,1320.0,ARIZONA AGRIBUSINESS & EQUINE CENTER - ESTRELLA,AZ,6.0,https://www.aaechighschools.com/,1.0,AZ,CMO,1220.0,...,-480.000000,0,0.087500,0.037500,1,7,0,-6.000000,-2.977266,-2.132168
4,AAEC,1320.0,AAEC - SMCC CAMPUS,AZ,6.0,https://www.aaechighschools.com/,1.0,AZ,,,...,-434.000000,0,0.092505,0.097047,1,8,0,-6.000000,-2.941014,-2.037426
5,AAEC,1320.0,AAEC - SMCC CAMPUS,AZ,6.0,https://www.aaechighschools.com/,1.0,AZ,,,...,-434.000000,0,0.092505,0.097047,1,8,0,-6.000000,-2.941014,-2.037426
6,Arizona Agribusiness & Equine Center,1320.0,AAEC - SMCC CAMPUS,AZ,6.0,https://www.aaechighschools.com/,1.0,AZ,CMO,1220.0,...,-434.000000,0,0.092505,0.097047,1,8,0,-6.000000,-2.941014,-2.037426
7,Arizona Agribusiness & Equine Center,1320.0,AAEC - SMCC CAMPUS,AZ,6.0,https://www.aaechighschools.com/,1.0,AZ,CMO,1220.0,...,-434.000000,0,0.092505,0.097047,1,8,0,-6.000000,-2.941014,-2.037426
8,AAEC,1320.0,AAEC - PARADISE VALLEY,AZ,6.0,https://www.aaechighschools.com/,1.0,AZ,,,...,-406.000000,0,0.083333,0.020833,0,1,0,-6.000000,-6.000000,-2.664642
9,AAEC,1320.0,AAEC - PARADISE VALLEY,AZ,6.0,https://www.aaechighschools.com/,1.0,AZ,,,...,-406.000000,0,0.083333,0.020833,0,1,0,-6.000000,-6.000000,-2.664642


In [9]:
print("Number of resulting variables should be = ", str(len(list(charters)) + len(list(CRDC))))
print("Number of resulting schools should be = ", str(len(charters)))

Number of resulting variables should be =  675
Number of resulting schools should be =  12877


In [10]:
charter = charters.set_index('NCESSCH')
CRDC = CRDC.set_index('COMBOKEY')


In [11]:
# Do a "left" in order to keep all the variables from the charters/public DF, 
# and only the vars for CRDC that occurred in matching schools. 
# This way keeps all schools and all variables, but didn't add new rows to accommodate cases 
# where CRDC had info on entities (schools?) we don't have
charter_merged = pd.merge(charter, CRDC, how="left", left_index=True, right_index=True)

In [12]:
print("Number of schools of merged DF:\t  ", len(charter_merged)) 
print("Number of variables of merged DF: ", len(list(charter_merged)))
charter_merged

Number of schools of merged DF:	   12877
Number of variables of merged DF:  673


Unnamed: 0,CMO_NAME,CMO_MEMSUM,SCH_NAME,CMO_STATE,CMO_SCHNUM,CMO_URL,CMO_NUMSTATES,CMO_ALLSTATES,CMO_SECTOR,CMO_NUMSTUDENTS_CREDO17,...,SCH_FTETEACH_SY,SCH_TEACHERS_CURR_TOT,SCH_TEACHERS_PREV_TOT,SCH_FTECOUNSELORS,SCH_FTETEACH_ABSENT,SCH_FTESECURITY_LEO,SCH_FTESECURITY_GUA,SCH_FTESERVICES_NUR,SCH_FTESERVICES_PSY,SCH_FTESERVICES_SOC
1.001970e+10,,,,,,,,,,,...,,,,,,,,,,
2.000010e+10,,,,,,,,,,,...,0.0,14.0,12.0,0.0,3.0,-9.0,0.0,0.00,0.0,0.0
2.001500e+10,,,,,,,,,,,...,0.0,12.0,11.0,0.0,7.0,-9.0,0.0,0.00,0.0,0.0
2.001500e+10,,,,,,,,,,,...,2.0,0.0,0.0,1.0,10.0,-9.0,0.0,0.00,0.0,0.0
2.001800e+10,,,,,,,,,,,...,0.0,21.0,21.0,0.0,8.0,1.0,0.0,0.70,0.0,0.0
2.001800e+10,,,,,,,,,,,...,0.0,16.0,14.0,0.0,0.0,1.0,0.0,0.00,0.0,0.0
2.001800e+10,,,,,,,,,,,...,,,,,,,,,,
2.001800e+10,,,,,,,,,,,...,,,,,,,,,,
2.001800e+10,,,,,,,,,,,...,2.0,18.0,13.0,0.0,3.5,1.0,0.0,0.00,0.0,0.0
2.001800e+10,,,,,,,,,,,...,0.0,12.0,9.0,0.0,0.0,1.0,0.0,0.80,0.0,0.0


In [13]:
# Convert NCESSCH from index back into variable:
charter_merged.reset_index(level=0, inplace=True)
charter_merged.head(10)

Unnamed: 0,index,CMO_NAME,CMO_MEMSUM,SCH_NAME,CMO_STATE,CMO_SCHNUM,CMO_URL,CMO_NUMSTATES,CMO_ALLSTATES,CMO_SECTOR,...,SCH_FTETEACH_SY,SCH_TEACHERS_CURR_TOT,SCH_TEACHERS_PREV_TOT,SCH_FTECOUNSELORS,SCH_FTETEACH_ABSENT,SCH_FTESECURITY_LEO,SCH_FTESECURITY_GUA,SCH_FTESERVICES_NUR,SCH_FTESERVICES_PSY,SCH_FTESERVICES_SOC
0,10019700000.0,,,,,,,,,,...,,,,,,,,,,
1,20000100000.0,,,,,,,,,,...,0.0,14.0,12.0,0.0,3.0,-9.0,0.0,0.0,0.0,0.0
2,20015000000.0,,,,,,,,,,...,0.0,12.0,11.0,0.0,7.0,-9.0,0.0,0.0,0.0,0.0
3,20015000000.0,,,,,,,,,,...,2.0,0.0,0.0,1.0,10.0,-9.0,0.0,0.0,0.0,0.0
4,20018000000.0,,,,,,,,,,...,0.0,21.0,21.0,0.0,8.0,1.0,0.0,0.7,0.0,0.0
5,20018000000.0,,,,,,,,,,...,0.0,16.0,14.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6,20018000000.0,,,,,,,,,,...,,,,,,,,,,
7,20018000000.0,,,,,,,,,,...,,,,,,,,,,
8,20018000000.0,,,,,,,,,,...,2.0,18.0,13.0,0.0,3.5,1.0,0.0,0.0,0.0,0.0
9,20018000000.0,,,,,,,,,,...,0.0,12.0,9.0,0.0,0.0,1.0,0.0,0.8,0.0,0.0


In [14]:
# Rename NCESSCH from 'index'
charter_merged.rename({"index":"NCESSCH"}, axis=1, inplace=True)
charter_merged.head(10)

Unnamed: 0,NCESSCH,CMO_NAME,CMO_MEMSUM,SCH_NAME,CMO_STATE,CMO_SCHNUM,CMO_URL,CMO_NUMSTATES,CMO_ALLSTATES,CMO_SECTOR,...,SCH_FTETEACH_SY,SCH_TEACHERS_CURR_TOT,SCH_TEACHERS_PREV_TOT,SCH_FTECOUNSELORS,SCH_FTETEACH_ABSENT,SCH_FTESECURITY_LEO,SCH_FTESECURITY_GUA,SCH_FTESERVICES_NUR,SCH_FTESERVICES_PSY,SCH_FTESERVICES_SOC
0,10019700000.0,,,,,,,,,,...,,,,,,,,,,
1,20000100000.0,,,,,,,,,,...,0.0,14.0,12.0,0.0,3.0,-9.0,0.0,0.0,0.0,0.0
2,20015000000.0,,,,,,,,,,...,0.0,12.0,11.0,0.0,7.0,-9.0,0.0,0.0,0.0,0.0
3,20015000000.0,,,,,,,,,,...,2.0,0.0,0.0,1.0,10.0,-9.0,0.0,0.0,0.0,0.0
4,20018000000.0,,,,,,,,,,...,0.0,21.0,21.0,0.0,8.0,1.0,0.0,0.7,0.0,0.0
5,20018000000.0,,,,,,,,,,...,0.0,16.0,14.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6,20018000000.0,,,,,,,,,,...,,,,,,,,,,
7,20018000000.0,,,,,,,,,,...,,,,,,,,,,
8,20018000000.0,,,,,,,,,,...,2.0,18.0,13.0,0.0,3.5,1.0,0.0,0.0,0.0,0.0
9,20018000000.0,,,,,,,,,,...,0.0,12.0,9.0,0.0,0.0,1.0,0.0,0.8,0.0,0.0


In [15]:
charter_merged.to_pickle(charter_output_path)

In [17]:
charter_final = pd.read_pickle(charter_output_path)
print("Number of schools of resulting DF:   ", len(charter_final)) 
print("Number of variables of resulting DF: ", len(list(charter_final)))
charter_final

Number of schools of resulting DF:    12877
Number of variables of resulting DF:  674


Unnamed: 0,NCESSCH,CMO_NAME,CMO_MEMSUM,SCH_NAME,CMO_STATE,CMO_SCHNUM,CMO_URL,CMO_NUMSTATES,CMO_ALLSTATES,CMO_SECTOR,...,SCH_FTETEACH_SY,SCH_TEACHERS_CURR_TOT,SCH_TEACHERS_PREV_TOT,SCH_FTECOUNSELORS,SCH_FTETEACH_ABSENT,SCH_FTESECURITY_LEO,SCH_FTESECURITY_GUA,SCH_FTESERVICES_NUR,SCH_FTESERVICES_PSY,SCH_FTESERVICES_SOC
0,1.001970e+10,,,,,,,,,,...,,,,,,,,,,
1,2.000010e+10,,,,,,,,,,...,0.0,14.0,12.0,0.0,3.0,-9.0,0.0,0.00,0.0,0.0
2,2.001500e+10,,,,,,,,,,...,0.0,12.0,11.0,0.0,7.0,-9.0,0.0,0.00,0.0,0.0
3,2.001500e+10,,,,,,,,,,...,2.0,0.0,0.0,1.0,10.0,-9.0,0.0,0.00,0.0,0.0
4,2.001800e+10,,,,,,,,,,...,0.0,21.0,21.0,0.0,8.0,1.0,0.0,0.70,0.0,0.0
5,2.001800e+10,,,,,,,,,,...,0.0,16.0,14.0,0.0,0.0,1.0,0.0,0.00,0.0,0.0
6,2.001800e+10,,,,,,,,,,...,,,,,,,,,,
7,2.001800e+10,,,,,,,,,,...,,,,,,,,,,
8,2.001800e+10,,,,,,,,,,...,2.0,18.0,13.0,0.0,3.5,1.0,0.0,0.00,0.0,0.0
9,2.001800e+10,,,,,,,,,,...,0.0,12.0,9.0,0.0,0.0,1.0,0.0,0.80,0.0,0.0
