In [1]:
import pandas as pd
import numpy as np
import pickle
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

## Create address column 

In [2]:
# Load the tab-delimited address file.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning.
address_data = pd.read_csv('../data/sc122a.txt', delimiter='\t', header=0, encoding='latin-1', low_memory=False)

# Keep only schools that were open in the 2012-13 school year (so they probably have a working URL).
# NOTE(review): STATUS codes 1, 3, 4, 5 and 8 are treated as "open" here - confirm against the data dictionary.
open_status_codes = [1, 3, 4, 5, 8]
filtered_STATUS12 = address_data['STATUS'].isin(open_status_codes)

# reset_index() renumbers the rows 0..n-1 and keeps the pre-filter labels in an 'index' column
address_data = address_data.loc[filtered_STATUS12].reset_index()
print(address_data.shape)
address_data.head()

  interactivity=interactivity, compiler=compiler, result=result)


(100413, 310)


Unnamed: 0,index,SURVYEAR,NCESSCH,FIPST,LEAID,SCHNO,STID,SEASCH,LEANM,SCHNAM,...,WHITE,WHALM,WHALF,PACIFIC,HPALM,HPALF,TR,TRALM,TRALF,TOTETH
0,0,2012,10000200277,1,100002,277,210,20,ALABAMA YOUTH SERVICES,SEQUOYAH SCH - CHALKVILLE CAMPUS,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,1,2012,10000201402,1,100002,1402,210,25,ALABAMA YOUTH SERVICES,EUFAULA SCH - EUFAULA CAMPUS,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,2,2012,10000201667,1,100002,1667,210,50,ALABAMA YOUTH SERVICES,CAMPS,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,3,2012,10000201670,1,100002,1670,210,60,ALABAMA YOUTH SERVICES,DET CTR,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,4,2012,10000201705,1,100002,1705,210,30,ALABAMA YOUTH SERVICES,WALLACE SCH - MT MEIGS CAMPUS,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [3]:
# Address components used to build the full address string
street = address_data['LSTREE']
city = address_data['LCITY']
state = address_data['LSTATE']

# Convert LZIP to strings so it can be concatenated with the other parts.
# ZIP codes parsed as integers lose their leading zeros (02134 becomes 2134),
# so all-digit values are left-padded back to five characters; anything else
# is passed through unchanged.
# NOTE(review): assumes LZIP holds 5-digit ZIP codes - confirm against the file layout.
zipcode = [
    text.zfill(5) if text.isdigit() else text
    for text in map(str, address_data['LZIP'])
]

In [4]:
def make_address(street, city, state, zipcode):
    '''Return a list of full addresses built from parallel address parts.

    Element i of the result is "<street> <city> <state> <zipcode>" for the
    i-th item of each input, with every part lowercased.

    Parameters
    ----------
    street, city, state, zipcode : sequences of str (list, tuple, pandas Series, ...)
        Parallel sequences. Items are paired by position, so a pandas Series
        whose index is not 0..n-1 is handled correctly. `street` sets the
        number of addresses produced.

    Returns
    -------
    list of str

    Raises
    ------
    IndexError
        If city, state or zipcode has fewer items than street.
    AttributeError
        If an item has no .lower() method (for example a NaN float).
    '''
    parts_by_field = [list(street), list(city), list(state), list(zipcode)]
    expected = len(parts_by_field[0])
    if any(len(field) < expected for field in parts_by_field[1:]):
        raise IndexError('city, state and zipcode must each have at least as many items as street')
    # zip pairs items by position and stops after the last street
    return [
        ' '.join(part.lower() for part in parts)
        for parts in zip(*parts_by_field)
    ]

In [5]:
# Build one full-address string per school from the four component sequences
full_address = make_address(street=street, city=city, state=state, zipcode=zipcode)

In [6]:
# Attach the full addresses to the frame as the ADDRESS13 column
address_data = address_data.assign(ADDRESS13=full_address)

In [7]:
# Keep only NCESSCH (the join key) and ADDRESS13 (the only value we want to add to charters_2015.pkl)
join_columns = ['NCESSCH', 'ADDRESS13']
address_NCESSCH_data = address_data.loc[:, join_columns]
address_NCESSCH_data.shape[0]

100413

## Match formatting of addresses and names in CER dataframe to formatting of new address data 

In [8]:
# Read the CER file (latin-1 encoded CSV)
cer_csv_path = '../data/CER_2012-13.csv'
CER = pd.read_csv(cer_csv_path, encoding='latin-1')

In [9]:
# Normalise every CER address: strip commas and periods, then lowercase.
# Done in one pass over the column rather than one .loc write per row.
CER['CER_ADDRESS'] = [
    raw_address.replace(',', '').replace('.', '').lower()
    for raw_address in CER['CER_ADDRESS']
]


In [10]:
# Uniform formatting of names: cast every value to str and lowercase it.
# Punctuation is left untouched; str() turns a missing name into the text 'nan'.
CER['CER_NAME'] = CER['CER_NAME'].map(lambda name: str(name).lower())

In [11]:
# Keep only rows with a usable URL: not missing, not empty, not the '0' placeholder.
# The conditions are combined into one mask because chaining two boolean
# filters (df[a][b]) makes pandas reindex the second mask and emit a UserWarning.
# NOTE(review): the URLs displayed below all end in '#', and one row is just '#';
# such rows pass this filter - confirm whether they should be dropped too.
has_url = CER['CER_URL'].notnull() & (CER['CER_URL'] != '') & (CER['CER_URL'] != '0')

# Renumber rows 0..n-1 so positions and labels agree; the fuzzy-matching cell
# looks CER rows up using the position returned by argmax.
CER = CER.loc[has_url].reset_index(drop=True)
len(CER)

6171

In [12]:
# Preview the first URLs instead of dumping the whole column
CER['CER_URL'].head(20)

0                             http://www.100academy.com/#
1                          http://www.the100schools.com/#
2              http://www.100lacs.org/Pages/default.aspx#
3                             http://www.21ccharter.org/#
4                                 http://www.21cccs.org/#
5                         http://www.21stprepschool.org/#
6                          http://www.21stcenturypa.org/#
7                         http://www.8pointscharter.org/#
8               http://www.atjonesacademy.com/flash.html#
9                                                       #
10                             http://www.aplusarts.com/#
11                 http://www.apluschildrensacademy.org/#
12                http://dixoncharter.com/Site/Home.html#
13                                 http://www.cfshc.org/#
14                             http://www.manoogian.org/#
15                               http://www.awbrown.org/#
16                               http://www.awbrown.org/#
17            

## Add address column to charter dataframe

In [13]:
# Load the charter data frame from its pickle.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
charters_pickle_path = '../../nowdata/charters_2015.pkl'
charters = pd.read_pickle(charters_pickle_path)

In [14]:
# Left-join the addresses onto the charter rows by NCESSCH;
# charters with no row in the address data get NaN for ADDRESS13.
charters_with_address = charters.merge(address_NCESSCH_data, how='left', on='NCESSCH')
charters_with_address.shape[0]

10965

In [15]:
# Match the CER formatting: names become lowercase strings, addresses become strings.
# str() turns missing values into the literal text 'nan', which later cells test for.
charters_with_address['SCHNAM12'] = charters_with_address['SCHNAM12'].map(lambda name: str(name).lower())

charters_with_address['ADDRESS13'] = charters_with_address['ADDRESS13'].map(lambda address: str(address))

## Create new column of combined name+address to use as comparison with fuzzy ratio

In [16]:
# Build the charter-side matching key: "<name> <address>"
chartername = charters_with_address['SCHNAM12']
charteraddress = charters_with_address['ADDRESS13']
charters_with_address['address_name_combined'] = [
    ' '.join(name_and_address) for name_and_address in zip(chartername, charteraddress)
]

                                                               


In [17]:
# Build the CER-side matching key: "<name> <address>"
CERname = CER['CER_NAME']
CERaddress = CER['CER_ADDRESS']
CER['address_name_combined'] = list(map(' '.join, zip(CERname, CERaddress)))

## Using fuzzy ratio 

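1. Compute the fuzzy ratio between each charter's name+address string and every CER name+address string.
2. Keep the CER entry with the highest fuzzy ratio.
3. If that ratio is at least 70, replace the charter's name+address string with the matched CER string, so the two data frames can be joined on it.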

In [18]:
# Minimum fuzz.ratio score (0-100) for a CER entry to count as a match
MATCH_THRESHOLD = 70

# Materialise both key columns once as plain lists. Looking CER keys up by list
# position keeps the argmax position and the lookup consistent even when CER's
# index labels have gaps after filtering (CER.loc[position] would then fetch the
# wrong row or raise KeyError). It also avoids re-reading the column on every pass.
cer_keys = CER['address_name_combined'].tolist()
charter_keys = charters_with_address['address_name_combined'].tolist()

nummatch = 0
resolved_keys = []  # final join key for each charter row, in row order
for charter_key in charter_keys:
    # fuzzy ratio between this charter's name+address and every CER name+address
    ratio = [fuzz.ratio(charter_key, cer_key) for cer_key in cer_keys]
    greatest_match_index = int(np.argmax(ratio))  # position of the best-scoring CER key (first one on ties)
    if ratio[greatest_match_index] >= MATCH_THRESHOLD:
        nummatch += 1
        # adopt the matched CER key so this charter row can be joined to CER on it
        resolved_keys.append(cer_keys[greatest_match_index])
    else:
        resolved_keys.append(charter_key)  # no match: keep the charter's own key

# Write the keys back in one assignment instead of mutating the column while iterating over it
charters_with_address['address_name_combined'] = resolved_keys
print(nummatch)

5472


## Merge URL with charter data 

In [19]:
# Left-join the CER columns onto the charter rows using the shared name+address key.
# NOTE(review): the recorded outputs show the merged frame has more rows than
# charters_with_address (10972 vs 10965), presumably because some CER keys are
# duplicated - confirm, and deduplicate CER if that is unintended.

charters_merge_CER = charters_with_address.merge(CER, how='left', on='address_name_combined')



In [20]:
# Every CER column label except CER_URL
CER_columns = CER.columns.tolist()
CER_columns.remove('CER_URL')

In [21]:
# Discard every column that came from CER, keeping only CER_URL
charters_merge_CER = charters_merge_CER.drop(CER_columns, axis='columns')

In [22]:
# Row count of the merged data
charters_merge_CER.shape[0]

10972

In [23]:
# Preview the merged frame instead of dumping every row
charters_merge_CER.head(10)

Unnamed: 0,NCESSCH,URL,LAT1516,LON1516,AM,AS,BL,HI,HP,TR,...,RIT_VALID_STR,INQUIRY_RATIO,DISCIPLINE_RATIO,ESS_VALID_RATIO,PROG_VALID_RATIO,RIT_VALID_RATIO,in_acs,GEO_LEAID,ADDRESS13,CER_URL
0,1.001970e+10,http://www.maef.net/,,,,,,,,,...,-2.545622,0.007424,0.000446,0.000893,0.001005,0.002847,,,,
1,2.000010e+10,https://education.alaska.gov/DOE_Rolodex/Schoo...,60.796131,-161.765194,167.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,False,200001.0,1010 fourth ave bethel ak 99559,http://www.yupik.org/#
2,2.001500e+10,https://www.kgbsd.org/ketchikancharter,55.347001,-131.641191,74.0,37.0,2.0,5.0,4.0,5.0,...,-6.000000,0.003762,0.000752,0.000000,0.000752,0.000000,False,200150.0,410 schoenbar ketchikan ak 99901,http://kcs.kgbsd.org/#
3,2.001500e+10,http://www.tongassschool.org/,55.347001,-131.641191,57.0,12.0,4.0,6.0,1.0,11.0,...,-2.960010,0.009768,0.000177,0.000212,0.000177,0.001096,False,200150.0,410 schoenbar rd ketchikan ak 99901,http://www.tda.treca.org/#
4,2.001800e+10,https://aquarian.asdk12.org/,61.192407,-149.916872,10.0,11.0,6.0,19.0,2.0,51.0,...,-6.000000,0.000000,0.000000,0.000000,0.000000,0.000000,False,200180.0,1705 west 32nd ave anchorage ak 99517,http://www.aquariancharterschool.com/#
5,2.001800e+10,https://education.alaska.gov/DOE_Rolodex/Schoo...,61.198100,-149.876000,30.0,16.0,18.0,43.0,18.0,69.0,...,,,,,,,False,200180.0,401 east fireweed ln anchorage ak 99503,http://www.fpcs.net/#
6,2.001800e+10,,,,,,,,,,...,,,,,,,,,,
7,2.001800e+10,,,,,,,,,,...,,,,,,,,,,
8,2.001800e+10,http://www.winterberrycharterschool.com/,61.194450,-149.791641,15.0,8.0,5.0,14.0,1.0,26.0,...,-2.659441,0.009728,0.000193,0.000322,0.000644,0.002191,False,200180.0,4802 bryn mawr court anchorage ak 99508,http://wingscharterschool.org/index.html#
9,2.001800e+10,http://www.asdk12.org/aboutschools/eagleacademy/,61.319213,-149.579442,2.0,3.0,8.0,6.0,0.0,18.0,...,-6.000000,0.000000,0.000000,0.000000,0.000000,0.000000,False,200180.0,10901 mausel st eagle river ak 99577,http://www.eagleacademycharterschool.com/#


## Investigating schools without a URL match

In [24]:
# Keep only the merged rows that received a CER URL.
# Empty and '0' URLs were already removed from CER before the merge, so
# dropping NaN is sufficient here.
charters_merge_CER_cleaned = charters_merge_CER.dropna(subset = ['CER_URL'])

# Charter rows that have a school name. SCHNAM12 was converted to strings
# earlier, so a missing name is the literal text 'nan' rather than NaN.
# NOTE(review): this previously read from `charters_nonduplicate_with_address`,
# which no visible cell defines (NameError on a fresh kernel);
# `charters_with_address` is the closest available frame - confirm intent.
non_null_charter = charters_with_address[charters_with_address['SCHNAM12'] != 'nan']

#number of non-NaN URL values in merged data
len(charters_merge_CER_cleaned)


5479

In [25]:
# Number of charter schools that were open in 2012-13, i.e. rows whose ADDRESS13 is not the 'nan' placeholder
was_open_2013 = charters_with_address['ADDRESS13'] != 'nan'
int(was_open_2013.sum())

6175

In [26]:
# Rows without a CER URL that represent schools open in 2012-13.
# Both conditions are combined into a single mask: chaining two boolean
# filters (df[a][b]) makes pandas reindex the second mask and emit a UserWarning.
missing_url = charters_merge_CER['CER_URL'].isnull()
was_open = charters_merge_CER['ADDRESS13'] != 'nan'  # 'nan' text marks rows with no 2012-13 address
charters_merge_CER_opened = charters_merge_CER[missing_url & was_open]
len(charters_merge_CER_opened)

  


723

In [31]:
# Preview name and address of open schools that did not get a CER URL
charters_merge_CER_opened[['SCHNAM12', 'ADDRESS13']].head(10)

Unnamed: 0,SCHNAM12,ADDRESS13
12,rilke schule charter school,650 west international airport anchorage ak 99507
22,american charter academy,7362 west parks hwy 723 wasilla ak 99623
35,horseshoe trails elementary school,5405 east pinnacle vista dr phoenix az 85085
58,bennett academy - venture site,1535 west dunlap ave phoenix az 85020
157,alternative computerized education (ace) chart...,1929 north stone ave tucson az 85705
158,youth works charter high school,1915 east 36th st tucson az 85713
214,ombudsman - charter west,3624 west bell rd glendale az 85308
243,ala mesa,4507 south mountain rd mesa az 85212
311,kaizen education foundation dba el dorado high...,2200 north arizona ave chandler az 85224
319,new west school,98 north oak dr benson az 85602


In [36]:
# Schools from the charter data that have an address in the address column but no CER URL
# (preview only; use len() on the frame for the full count)
charters_merge_CER_opened.head(10)

Unnamed: 0,NCESSCH,URL,LAT1516,LON1516,AM,AS,BL,HI,HP,TR,...,RIT_VALID_STR,INQUIRY_RATIO,DISCIPLINE_RATIO,ESS_VALID_RATIO,PROG_VALID_RATIO,RIT_VALID_RATIO,in_acs,GEO_LEAID,ADDRESS13,CER_URL
12,2.001800e+10,https://education.alaska.gov/DOE_Rolodex/Schoo...,61.173154,-149.895616,8.0,9.0,1.0,23.0,3.0,48.0,...,,,,,,,False,200180.0,650 west international airport anchorage ak 99507,
22,2.005100e+10,http://www.matsuk12.us/amc,61.581840,-149.634801,21.0,0.0,3.0,3.0,4.0,12.0,...,-6.000000,0.005198,0.000000,0.000000,0.000000,0.000000,False,200510.0,7362 west parks hwy 723 wasilla ak 99623,
35,4.000010e+10,http://www.ccusd93.org/horseshoetrailses,33.733012,-111.966341,2.0,25.0,2.0,48.0,2.0,14.0,...,-6.000000,0.000000,0.000000,0.000000,0.000000,0.000000,False,400001.0,5405 east pinnacle vista dr phoenix az 85085,
58,4.000370e+10,http://www.bennettacademy.com/,33.567150,-112.092451,2.0,6.0,15.0,32.0,0.0,4.0,...,-2.617699,0.010463,0.001205,0.000803,0.000402,0.002412,False,409060.0,1535 west dunlap ave phoenix az 85020,
157,4.000790e+10,http://www.acehs.org/,32.245757,-110.972279,3.0,0.0,7.0,112.0,0.0,2.0,...,-3.430800,0.006308,0.001854,0.001299,0.000371,0.000371,False,408800.0,1929 north stone ave tucson az 85705,
158,4.000790e+10,http://www.youthworkshs.org/,32.193170,-110.942221,2.0,0.0,1.0,26.0,0.0,0.0,...,-3.700098,0.006788,0.001995,0.001797,0.000399,0.000199,False,408800.0,1915 east 36th st tucson az 85713,
214,4.001030e+10,https://az.ombudsman.com/charter-west/,33.640450,-112.136371,4.0,2.0,6.0,39.0,0.0,0.0,...,-2.389363,0.009524,0.000000,0.000000,0.000000,0.004080,False,407750.0,3624 west bell rd glendale az 85308,
243,4.001120e+10,http://www.alaschools.org/arizona/az-schools/m...,33.333600,-111.591301,0.0,10.0,7.0,60.0,5.0,26.0,...,-3.460998,0.002652,0.000231,0.000000,0.000000,0.000346,False,406810.0,4507 south mountain rd mesa az 85212,
311,4.001490e+10,http://www.edhswolverines.com/,33.340481,-111.842773,3.0,1.0,22.0,87.0,0.0,0.0,...,-2.901704,0.005141,0.000762,0.000320,0.000246,0.001254,False,404970.0,2200 north arizona ave chandler az 85224,
319,4.001520e+10,,,,,,,,,,...,,,,,,,,,98 north oak dr benson az 85602,


In [None]:
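# NOTE: disabled export. `to_csv` writes CSV text, so the `.pkl` extension below is misleading;
# use a `.csv` filename (or `to_pickle`) when re-enabling it.
#charters_merge_CER.to_csv('../../nowdata/backups/charters_full_250_CER_URLs.pkl')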