# Merging scraped data into full public school data, through 2016-17

Author: Jaren Haber

Date: 07-18-18

In [1]:
import pandas as pd

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
def convert_df(df):
    """Return a more memory-efficient copy of a Pandas DataFrame.

    Three conversions are applied; the input `df` itself is not modified:

    1. Object-dtype columns with repetitive values are stored as categoricals,
       which represent each distinct value once plus repeated integer codes.
       This only saves memory when values are redundant enough, so a column is
       converted only if fewer than half of its values are unique.
    2. Float columns are downcast to the smallest float dtype that pd.to_numeric
       accepts for their values (float32 at minimum).
    3. Integer columns are downcast to the smallest signed integer dtype that
       holds their values.

    Object columns holding unhashable values (e.g. lists of text chunks) cannot
    be checked for uniqueness and are left unchanged rather than raising
    TypeError. Zero-row frames are returned without any category conversion.

    Args:
        df: The DataFrame to convert.

    Returns:
        A new DataFrame with the same columns, column order and index as `df`.
    """

    converted_df = df.copy() # Work on a copy so the caller's DataFrame is untouched
    num_total_values = len(df)

    # Only columns of object data type are candidates for the category dtype
    for col in df.select_dtypes(include=['object']).columns:
        try:
            num_unique_values = len(df[col].unique())
        except TypeError:
            # Unhashable cells (e.g. lists) can't be deduplicated; keep column as-is
            continue
        # The truthiness check avoids dividing by zero on an empty DataFrame;
        # only convert data types if at least half of values are duplicates
        if num_total_values and (num_unique_values / num_total_values) < 0.5:
            converted_df[col] = df[col].astype('category')

    # pd.to_numeric returns a new object, so the result must be assigned back
    for col in converted_df.select_dtypes(include=['float']).columns:
        converted_df[col] = pd.to_numeric(converted_df[col], downcast='float')
    for col in converted_df.select_dtypes(include=['int']).columns:
        converted_df[col] = pd.to_numeric(converted_df[col], downcast='signed')

    return converted_df

In [7]:
# Load the input tables
# Charter scrape data; the scraped content is held in the 'data' column
df_scraped = pd.read_pickle("../../scrapy_cluster_data/new_processed_df.pkl")
# All public schools data, including CMO info and CMO scraped data
df_cmos = pd.read_csv(
    "../data/pub_schools_CMO_2015.csv",
    encoding="latin1",
    low_memory=False,
)
# Year opened and year closed
df_openclose = pd.read_csv(
    "../data/pubschools_openclose_2015.csv",
    low_memory=False,
)
#df_r100 = pd.read_csv("../../text_analysis/data/random_sample_100.csv", sep="\t") # 100 schools, 25 each randomly selected from high prog, high ess, low both, high both

In [9]:
# Output locations for the files saved by this notebook

# All public schools, with all text data for both CMOs and schools
full_path = "../data/pubschools_full_2015.pkl"

# Charter schools only (regardless if open or not), with all text data
charters_path = "../../charters_full_2015.pkl"

# Charters without URLs only (still need to be collected and cleaned)
missing_path = "../data/charters_noURL.csv"

#r100_path = "../../text_analysis/data/random_sample_100.csv" # Revised 100 random schools

In [10]:
# Remove unnecessary columns from scraped DF, keeping only 'NCESSCH' and 'data'.
# .copy() makes the result an independent DataFrame, so assigning to its
# 'NCESSCH' column later does not trigger pandas' SettingWithCopyWarning.
df_scraped = df_scraped[["NCESSCH", "data"]].copy()
#df_scraped = df_scraped[["NCESSCH", "URL"]]
df_scraped.head(10) # Preview the first 10 rows

Unnamed: 0,NCESSCH,URL
0,130123003896,http://099.clayton.k12.ga.us/
1,130123003687,http://198.clayton.k12.ga.us/
2,350006000811,http://21stcenturypa.com/wp/
3,220019300944,http://42charterschool.org/
4,411167001641,http://a3school.org/
5,40010601892,http://aaechighschools.com/
6,490018601483,http://aaiutah.org/
7,80345001896,http://aak8.org/
8,62703013064,http://ablecharter.net/
9,550039002880,http://abs.aasd.k12.wi.us/


In [11]:
# Show the column names of the opened/closed DF in sorted order
sorted(df_openclose.columns)

['AE',
 'AEOFFERED',
 'AM',
 'AM01F',
 'AM01M',
 'AM02F',
 'AM02M',
 'AM03F',
 'AM03M',
 'AM04F',
 'AM04M',
 'AM05F',
 'AM05M',
 'AM06F',
 'AM06M',
 'AM07F',
 'AM07M',
 'AM08F',
 'AM08M',
 'AM09F',
 'AM09M',
 'AM10F',
 'AM10M',
 'AM11F',
 'AM11M',
 'AM12F',
 'AM12M',
 'AM13F',
 'AM13M',
 'AMAEF',
 'AMAEM',
 'AMALF',
 'AMALM',
 'AMKGF',
 'AMKGM',
 'AMPKF',
 'AMPKM',
 'AMUGF',
 'AMUGM',
 'AS',
 'AS01F',
 'AS01M',
 'AS02F',
 'AS02M',
 'AS03F',
 'AS03M',
 'AS04F',
 'AS04M',
 'AS05F',
 'AS05M',
 'AS06F',
 'AS06M',
 'AS07F',
 'AS07M',
 'AS08F',
 'AS08M',
 'AS09F',
 'AS09M',
 'AS10F',
 'AS10M',
 'AS11F',
 'AS11M',
 'AS12F',
 'AS12M',
 'AS13F',
 'AS13M',
 'ASAEF',
 'ASAEM',
 'ASALF',
 'ASALM',
 'ASKGF',
 'ASKGM',
 'ASPKF',
 'ASPKM',
 'ASUGF',
 'ASUGM',
 'BIES',
 'BL',
 'BL01F',
 'BL01M',
 'BL02F',
 'BL02M',
 'BL03F',
 'BL03M',
 'BL04F',
 'BL04M',
 'BL05F',
 'BL05M',
 'BL06F',
 'BL06M',
 'BL07F',
 'BL07M',
 'BL08F',
 'BL08M',
 'BL09F',
 'BL09M',
 'BL10F',
 'BL10M',
 'BL11F',
 'BL11M',
 'BL12F',

In [8]:
# Remove unnecessary columns from year opened/closed DF.
# .copy() makes the result an independent DataFrame, so assigning to its
# 'NCESSCH' column later does not trigger pandas' SettingWithCopyWarning.
df_openclose = df_openclose[["NCESSCH", "MEMBER", "TOTFRL", "TITLEI", "YEAR_OPENED", "YEAR_CLOSED"]].copy()
df_openclose.head(10) # Preview the first 10 rows

Unnamed: 0,NCESSCH,MEMBER,TOTFRL,TITLEI,YEAR_OPENED,YEAR_CLOSED
0,1.000020e+10,-1.0,-1.0,M,1998.0,
1,1.000020e+10,,,,1998.0,2014.0
2,1.000020e+10,-1.0,-1.0,M,2003.0,
3,1.000020e+10,-1.0,-1.0,M,2003.0,
4,1.000020e+10,-1.0,-1.0,M,1998.0,
5,1.000020e+10,-1.0,-1.0,M,1998.0,
6,1.000020e+10,-1.0,-1.0,M,2006.0,
7,1.000050e+10,677.0,280.0,1,1998.0,
8,1.000050e+10,1171.0,370.0,1,1998.0,
9,1.000050e+10,748.0,326.0,1,1998.0,


In [13]:
# Give the 'NCESSCH' join key the same dtype (float64) in every DF before merging
for frame in (df_scraped, df_cmos, df_openclose):
    frame["NCESSCH"] = frame["NCESSCH"].astype(float)
#df_r100["NCESSCH"] = df_r100["NCESSCH"].astype(float)

KeyError: 'NCESSCH'

In [11]:
# Outer-join df_scraped onto df_cmos by 'NCESSCH', bringing in the 'data' column
df_full = df_cmos.merge(df_scraped, how="outer", on="NCESSCH")

In [12]:
# Outer-join df_openclose onto df_full by 'NCESSCH', bringing in its columns
df_full = df_full.merge(df_openclose, how="outer", on="NCESSCH")

In [13]:
# Reduce memory load by converting dtypes:
#df_full = convert_df(df_full)

TypeError: unhashable type: 'list'

In [14]:
# Preview the first 10 rows of the merged DF
df_full.head(n=10)

Unnamed: 0.1,Unnamed: 0,CMO_NAME,CMO_MEMSUM,SCH_NAME,CMO_STATE,CMO_SCHNUM,CMO_URL,CMO_NUMSTATES,CMO_ALLSTATES,CMO_SECTOR,...,POPTOT_S16,POP517_S16,CHILDPOV_S16,TRUE_CHARTER,data,MEMBER,TOTFRL,TITLEI,YEAR_OPENED,YEAR_CLOSED
0,0,AAEC,1320.0,ARIZONA AGRIBUSINESS & EQUINE CENTER - ESTRELLA,AZ,6.0,https://www.aaechighschools.com/,1.0,AZ,,...,,,,1,[(https://www.aaechighschools.com/public-chart...,450.0,-1.0,1.0,2010.0,
1,295,Arizona Agribusiness & Equine Center,1320.0,ARIZONA AGRIBUSINESS & EQUINE CENTER - ESTRELLA,AZ,6.0,https://www.aaechighschools.com/,1.0,AZ,CMO,...,,,,1,[(https://www.aaechighschools.com/public-chart...,450.0,-1.0,1.0,2010.0,
2,1,AAEC,1320.0,AAEC - SMCC CAMPUS,AZ,6.0,https://www.aaechighschools.com/,1.0,AZ,,...,,,,1,[(https://www.aaechighschools.com/public-chart...,380.0,282.0,1.0,2005.0,
3,296,Arizona Agribusiness & Equine Center,1320.0,AAEC - SMCC CAMPUS,AZ,6.0,https://www.aaechighschools.com/,1.0,AZ,CMO,...,,,,1,[(https://www.aaechighschools.com/public-chart...,380.0,282.0,1.0,2005.0,
4,2,AAEC,1320.0,AAEC - PARADISE VALLEY,AZ,6.0,https://www.aaechighschools.com/,1.0,AZ,,...,,,,1,"[(https://www.aaechighschools.com/index.php, F...",386.0,-1.0,1.0,1998.0,
5,297,Arizona Agribusiness & Equine Center,1320.0,AAEC - PARADISE VALLEY,AZ,6.0,https://www.aaechighschools.com/,1.0,AZ,CMO,...,,,,1,"[(https://www.aaechighschools.com/index.php, F...",386.0,-1.0,1.0,1998.0,
6,3,ACADEMY OF SKILLS AND KNOWLEDGE,1161.0,CUMBERLAND ACADEMY MIDDLE,TX,4.0,http://www.cumberlandacademy.com/,1.0,TX,,...,,,,1,"[(https://ms.cumberlandacademy.com/, False, 0,...",528.0,193.0,1.0,2013.0,
7,4,ACADEMY OF SKILLS AND KNOWLEDGE,1161.0,CUMBERLAND H S,TX,4.0,http://www.cumberlandacademy.com/,1.0,TX,,...,,,,1,"[(https://hs.cumberlandacademy.com/, False, 0,...",,,,2015.0,
8,5,ACADEMY OF SKILLS AND KNOWLEDGE,1161.0,CUMBERLAND ACADEMY,TX,4.0,http://www.cumberlandacademy.com/,1.0,TX,,...,,,,1,"[(https://elem.cumberlandacademy.com/, False, ...",502.0,204.0,1.0,1998.0,
9,6,ACCELERATED INTERMEDIATE ACADEMY,281.0,ACCELERATED INTERDISCIPLINARY ACAD,TX,3.0,http://www.aiacharterschools.org/,1.0,TX,,...,,,,1,"[(http://www.aiacharterschools.org/, False, 0,...",254.0,243.0,1.0,2004.0,


In [15]:
# Show the column names of the merged DF, in their current order
df_full.columns.tolist()

['Unnamed: 0',
 'CMO_NAME',
 'CMO_MEMSUM',
 'SCH_NAME',
 'CMO_STATE',
 'CMO_SCHNUM',
 'CMO_URL',
 'CMO_NUMSTATES',
 'CMO_ALLSTATES',
 'CMO_SECTOR',
 'CMO_NUMSTUDENTS_CREDO17',
 'CMO_TYPE',
 'CMO_WEBTEXT',
 'SURVYEAR',
 'FIPST',
 'STABR',
 'SEANAME',
 'LEAID',
 'ST_LEAID',
 'SCHID',
 'ST_SCHID',
 'NCESSCH',
 'MSTREET1',
 'MSTREET2',
 'MSTREET3',
 'MCITY',
 'MSTATE',
 'MZIP',
 'MZIP4',
 'PHONE',
 'LSTREET1',
 'LSTREET2',
 'LSTREET3',
 'LCITY',
 'LSTATE',
 'LZIP',
 'LZIP4',
 'UNION',
 'OUT_OF_STATE_FLAG',
 'SCH_TYPE_TEXT',
 'SCH_TYPE',
 'RECON_STATUS',
 'GSLO',
 'GSHI',
 'LEVEL',
 'VIRTUAL',
 'BIES',
 'SY_STATUS_TEXT',
 'SY_STATUS',
 'UPDATED_STATUS_TEXT',
 'UPDATED_STATUS',
 'EFFECTIVE_DATE',
 'CHARTER_TEXT',
 'PKOFFERED',
 'KGOFFERED',
 'G1OFFERED',
 'G2OFFERED',
 'G3OFFERED',
 'G4OFFERED',
 'G5OFFERED',
 'G6OFFERED',
 'G7OFFERED',
 'G8OFFERED',
 'G9OFFERED',
 'G10OFFERED',
 'G11OFFERED',
 'G12OFFERED',
 'G13OFFERED',
 'AEOFFERED',
 'UGOFFERED',
 'NOGRADES',
 'CHARTAUTH1',
 'CHARTAUTHN1

In [16]:
# Tidy the merged DF's columns
# The scraped text column 'data' becomes 'WEBTEXT'
df_full.rename(columns={'data':'WEBTEXT'}, inplace=True)

# Drop the two unexplained "Unnamed" columns
df_full = df_full.drop(columns=["Unnamed: 0", "Unnamed: 0.1"])

In [17]:
# Pickle the merged DF to the location given by full_path
df_full.to_pickle(path=full_path)