In [1]:
""" Script to take planning applications and match to OSAddressBase addresses

Written by: Christine Langston, April 2024
"""
import pandas as pd
import time
import numpy as np
import copy
import re

In [2]:
#create a function to iterate through csv. use for OSAddressBase file
def read_csv(file_name, columns, chunksize=10000):
    """Lazily read a large CSV (e.g. the OS AddressBase extract) in chunks.

    Parameters
    ----------
    file_name : str or path-like
        CSV file to read.
    columns : list
        Columns to load; passed to ``pd.read_csv`` as ``usecols``.
    chunksize : int, default 10000
        Number of rows per yielded chunk.

    Yields
    ------
    pandas.DataFrame
        One chunk of at most ``chunksize`` rows.
    """
    # UPRNs are identifiers, not quantities: read them as text so leading zeros survive.
    column_types = {
        'uprn': str,
        'parent_uprn': str,
        'class': str,
        'latitude': float,
        'longitude': float,
        'country': str,
    }
    for chunk in pd.read_csv(file_name, chunksize=chunksize, usecols=columns, dtype=column_types):
        yield chunk


In [3]:
#takes a merged dataset and separates matched addresses and not matched
def separate_matches(dataset, column_name, match_strategy):
    """Split a merged dataset into matched and unmatched rows.

    A row counts as matched when ``column_name`` is not null.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Result of a left merge.
    column_name : str
        Column whose non-null values mark a successful match.
    match_strategy : str
        Label written to a ``match_strategy`` column on the matched rows.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(matched_rows, unmatched_rows)``; the input frame is not modified.
    """
    # A boolean mask selects by position, so it stays correct when index labels repeat
    # (dropping by index label would remove every row sharing a label).
    is_matched = dataset[column_name].notna()
    dataset_match = dataset[is_matched].copy()
    dataset_match['match_strategy'] = match_strategy
    dataset_no_match = dataset[~is_matched].copy()
    return dataset_match, dataset_no_match

In [4]:
#function to merge and then split based on if an address was matched  
def my_merge(left, right, left_on, right_on, match_column='uprn'):
    """Left-merge ``left`` onto ``right`` and split the result by match status.

    Parameters
    ----------
    left, right : pandas.DataFrame
        Frames to join; every row of ``left`` is kept.
    left_on, right_on : str or list of str
        Join keys in ``left`` and ``right`` respectively.
    match_column : str, default 'uprn'
        Column (coming from ``right``) whose non-null value marks a matched row.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(matched_rows, unmatched_rows)``.

    Raises
    ------
    KeyError
        If ``match_column`` is not a column of the merged frame, for example
        because both inputs had it and pandas suffixed it to ``_x`` / ``_y``.
    """
    merged = left.merge(right, how='left', left_on=left_on, right_on=right_on)
    if match_column not in merged.columns:
        raise KeyError(
            f"'{match_column}' not found after merge; it may have been suffixed "
            "because both inputs contain it"
        )
    is_matched = merged[match_column].notna()
    merged_match = merged[is_matched].copy()
    merged_no_match = merged[~is_matched].copy()
    return merged_match, merged_no_match

In [5]:
%%time
#### READ IN THE DATA IF EXPORTED 
resi_AB = pd.read_csv('data/resi_AB_cleaned_parsed_110524.csv',na_values = '0', low_memory = False)


CPU times: user 4min 50s, sys: 11min 48s, total: 16min 38s
Wall time: 29min 58s


In [273]:
%%time
other_AB = pd.read_csv('data/other_AB_cleaned_parsed_110524.csv', na_values = '0',low_memory = False)



CPU times: user 42.9 s, sys: 18.2 s, total: 1min 1s
Wall time: 1min 18s


In [7]:
#load in the data that hasn't been matched yet
#london_data = pd.read_csv('data/London2_no_match_2603.csv',low_memory = False)
#load in the batch 1 no match rate...
london_data = pd.read_csv('data/London1_no_match_1903.csv',low_memory = False)


Data cleaning steps to tidy up the NaN and numeric values read in from the CSV files.

In [290]:
### ------ DATA CLEANING  ----------
#edit the parent uprn from OSAddress base to fill out to be strings with 12 digits and leading zeros
#replace nan with empty string

# Fill missing values with 0 so the UPRN columns can be cast to nullable integers.
# Filling with '' instead makes the casts below fail with
# "invalid literal for int() with base 10: ''".
resi_AB = resi_AB.replace([np.nan, -np.inf], 0)


#cast as integer
resi_AB['parent_uprn'] = resi_AB['parent_uprn'].astype('Int64')
resi_AB['uprn'] = resi_AB['uprn'].astype('Int64')


KeyboardInterrupt: 

In [9]:
### ------ DATA CLEANING  ----------
#cast as string 
resi_AB['parent_uprn'] = resi_AB['parent_uprn'].astype(str)
resi_AB['uprn'] = resi_AB['uprn'].astype(str)

#fill in with left side padding zeros 
resi_AB['parent_uprn'] = resi_AB['parent_uprn'].apply(lambda x: '{0:0>12}'.format(x))
resi_AB['uprn'] = resi_AB['uprn'].apply(lambda x: '{0:0>12}'.format(x))


In [326]:
#do the same cleaning as above but for the OTHER addresses
other_AB = other_AB.replace([np.nan, -np.inf], 0)
#other_AB = other_AB.replace([np.nan, -np.inf], '')


#cast as integer
other_AB['parent_uprn'] = other_AB['parent_uprn'].astype('Int64')
other_AB['uprn'] = other_AB['uprn'].astype('Int64')

ValueError: invalid literal for int() with base 10: ''

In [11]:
### ------ DATA CLEANING  ----------
#cast as string 
other_AB['parent_uprn'] = other_AB['parent_uprn'].astype(str)
other_AB['uprn'] = other_AB['uprn'].astype(str)

#fill in with left side padding zeros 
other_AB['parent_uprn'] = other_AB['parent_uprn'].apply(lambda x: '{0:0>12}'.format(x))
other_AB['uprn'] = other_AB['uprn'].apply(lambda x: '{0:0>12}'.format(x))


In [12]:
### ------ DATA CLEANING On RESI AB ----------

#Address matching data clean, make building number into a string 
resi_AB['building_number'] = resi_AB['building_number'].astype('Int64').astype('str') 

In [13]:
resi_AB['pao_start_number'] = resi_AB['pao_start_number'].astype('Int64').astype('str') 


In [None]:
# create new column with the pao_start_number plus pao_start_suffix
# Missing values can show up as NaN, <NA> or the 0 fill value depending on which
# cleaning cells ran (TODO confirm); all of them are treated as blank so that a
# missing suffix does not wipe out the number.
resi_pao_blank_tokens = ['0', '0.0', 'nan', '<NA>', 'None', '']
resi_pao_number_text = resi_AB['pao_start_number'].astype(str)
resi_pao_suffix_text = resi_AB['pao_start_suffix'].astype(str)
resi_AB['pao_start_num_suffix'] = (
    resi_pao_number_text.mask(resi_pao_number_text.isin(resi_pao_blank_tokens), '')
    + resi_pao_suffix_text.mask(resi_pao_suffix_text.isin(resi_pao_blank_tokens), '')
)

In [14]:
other_AB['building_number'] = other_AB['building_number'].astype('Int64').astype('str') 

In [323]:
other_AB['pao_start_number'] = other_AB['pao_start_number'].astype('Int64').astype('str') 

ValueError: invalid literal for int() with base 10: ''

ValueError: invalid literal for int() with base 10: ''

In [307]:
# create new column with the pao_start_number plus pao_start_suffix
# Vectorised instead of a row-wise apply over the whole AddressBase frame.
# The 0 fill value (numeric or text) and missing markers are blanked before joining.
other_pao_blank_tokens = ['0', '0.0', 'nan', '<NA>', 'None', '']
other_pao_number_text = other_AB['pao_start_number'].astype(str)
other_pao_suffix_text = other_AB['pao_start_suffix'].astype(str)
other_AB['pao_start_num_suffix'] = (
    other_pao_number_text.mask(other_pao_number_text.isin(other_pao_blank_tokens), '')
    + other_pao_suffix_text.mask(other_pao_suffix_text.isin(other_pao_blank_tokens), '')
)

In [309]:
other_AB['pao_start_num_suffix'].head()

0    
1    
2    
3    
4    
Name: pao_start_num_suffix, dtype: object

In [322]:
other_AB[other_AB['postcode_locator'] == 'HA0 4QS']['pao_start_number'].astype('Int64')

ValueError: invalid literal for int() with base 10: ''

In [294]:
other_AB.head()

Unnamed: 0,uprn,class,parent_uprn,latitude,longitude,country,legal_name,sub_building_name,building_name,building_number,...,locality,town_name,administrative_area,post_town,postcode,postcode_locator,postcode_sector,parsed_address1,parsed_address2,pao_start_num_suffix
0,10013268085,LU01,,51.628915,0.471228,E,,,,,...,RAMSDEN HEATH,CHELMSFORD,ESSEX,,,CM11 1NN,CM11 1,"MEEPSWOOD, PARK LANE",PARK LANE,0 \n1 \n2 ...
1,10014205932,PS,,51.590568,0.600675,E,,,,,...,,RAYLEIGH,ESSEX,,,SS6 9HJ,SS6 9,"STREET RECORD, FOOTPATH FROM GLEBE DRIVE TO TH...",FOOTPATH FROM GLEBE DRIVE TO THE APPROACH,0 \n1 \n2 ...
2,10014456341,LD01,,53.262298,-2.152675,E,,,,,...,,MACCLESFIELD,CHESHIRE EAST,,,SK10 3GF,SK10 3,"UNIT D5, PAVILION WAY",PAVILION WAY,0 \n1 \n2 ...
3,10015338472,OR01,,50.703129,-4.411257,E,,,,,...,BOYTON,LAUNCESTON,CORNWALL,,,PL15 8NR,PL15 8,POST BOX 12M FROM 2 BENNACOTT COTTAGES ON B325...,B3254 FROM DOLSDON FARM TO BENNACOTT LAKE,0 \n1 \n2 ...
4,10015351527,OI10,,54.781583,-2.355289,E,,,,,...,,ALSTON,WESTMORLAND AND FURNESS,,,CA9 3NP,CA9 3,SHAFT 644M FROM FIDDLERS COTTAGE 117M FROM UNN...,NENTHEAD ON THE A689 TO THE C3039 CROSSING THE...,0 \n1 \n2 ...


------ DATA CLEANING PLANNING APPLICATION DATA  ----------


In [20]:
london_data['uprn_x']

0           <NA>
1           <NA>
2           <NA>
3           <NA>
4           <NA>
         ...    
323    207182849
324    207188743
325    207188743
326         <NA>
327         <NA>
Name: uprn_x, Length: 328, dtype: Int64

In [18]:

#BATCH 1 ONLY london_data has extra quotation marks, need to remove
#london_data['uprn'] = london_data['uprn'].apply(lambda x: x.strip("''") if not pd.isna(x) else x)


In [19]:
### ------ DATA CLEANING  ----------
london_data['uprn_x'] = london_data['uprn_x'].astype('Int64')



In [21]:
#cast to strings, left-padded with zeros to 12 characters
# Missing UPRNs stay missing: the mask is taken before the string cast, so no
# '00000000<NA>' placeholder strings are created and re-running the cell is safe.
london_uprn_present = london_data['uprn_x'].notna()
london_data['uprn_x'] = london_data['uprn_x'].astype('str').str.rjust(12, '0').where(london_uprn_present)

In [22]:
### ------ DATA CLEANING  ----------

london_data = london_data.replace('00000000<NA>',np.NaN)

Skip the column-creation cells below when matching a batch for the second, third, etc. time.

In [20]:
### ------ DATA CLEANING  ----------
#street address from site_name_LPA
def _find_street_lpa(site_name):
    """Return re.findall matches of (street text, street type) in an LPA site name; [] when missing."""
    if pd.isna(site_name):
        return []
    if '-' in site_name:
        # house-number range such as "12 - 14 Some Road"
        return re.findall(r"[0-9]+.-?.[0-9]+?\s(.+)(Road|Lane|Avenue|Parade|Courtyard|Street|Gardens|Drive)", site_name)
    return re.findall(r"[0-9]+\s(.+)(Road|Lane|Avenue|Parade|Courtyard|Street|Gardens|Drive)", site_name)


london_data['parsed_street_LPA'] = london_data['site_name_LPA'].apply(_find_street_lpa)


In [21]:
london_data['parsed_street_LPA'] =  london_data['parsed_street_LPA'].apply(lambda x: x[0][0] + x[0][1]  if len(x) > 0 else None)

In [22]:
#cleaning - street address from site_name_GLA
def _find_street_gla(site_name):
    """Return re.findall matches of (street text, street type) in a GLA site name; [] when missing."""
    if pd.isna(site_name):
        return []
    if '-' in site_name:
        # house-number range such as "12 - 14 Some Road"
        return re.findall(r"[0-9]+.-?.[0-9]+?\s(.+)(Road|Lane|Avenue|Parade|Courtyard|Street|Gardens|Drive)", site_name)
    return re.findall(r"[0-9]+\s(.+)(Road|Lane|Avenue|Parade|Courtyard|Street|Gardens|Drive)", site_name)


london_data['parsed_street_GLA'] = london_data['site_name_GLA'].apply(_find_street_gla)


In [23]:
london_data['parsed_street_GLA'] =  london_data['parsed_street_GLA'].apply(lambda x: x[0][0] + x[0][1]  if len(x) > 0 else None)

In [24]:
#if the original street was empty, then we want to use the LPA Or GLA parsed name as street_name 

london_data['street_name'] = np.where(london_data['street_name'].isnull(), np.where(london_data['parsed_street_LPA'].isnull(), london_data['parsed_street_GLA'], london_data['parsed_street_LPA']), london_data['street_name'] )

In [27]:
# DATA EXPLORATION - EXPORT CSV IF NEEDED 
#london_data.to_csv('batch2_cleaned.csv', index = False)

Continue here to clean data for all batches

In [23]:
# create new column with the number, street description, and the site name 
london_data['concat_addr'] = np.where(london_data['site_name_clean'].isnull(), '', london_data['site_name_clean'] +  ', ' ) + london_data['site_number_clean'] + ', ' +  london_data['street_name']

In [24]:
london_data['concat_addr'] = london_data['concat_addr'].str.upper()

In [25]:
#make london_data all into capitals to regularize
london_data['street_name'] = london_data['street_name'].str.upper()

In [26]:
#remove unwanted characters
# regex=True makes this a substring replacement; without it Series.replace only
# swaps values that equal the token exactly. The token becomes a space here and
# is trimmed by the whitespace strip applied to postcode_clean afterwards.
london_data['postcode_clean'] = london_data['postcode_clean'].replace(r'_x000[Dd]__x000[Dd]_\n', ' ', regex = True)

In [27]:
london_data = london_data.replace({'_x000D__x000D_\n': ' ', '_x000d__x000d_\n': ' ' }, regex = True)

In [28]:
#remove white space
london_data['postcode_clean'] = london_data['postcode_clean'].apply(lambda x: str(x).strip())

In [29]:
london_data['postcode_sector'] = london_data['postcode_clean'].apply(lambda x: x[:-2])

In [30]:
#substring of site_name_GLA without the postcode .... 
london_data['site_name_GLA_no_pc'] = london_data.apply(lambda row: str(row['site_name_GLA']).upper().replace(', ' + row['postcode_clean'], ''), axis = 1)

In [31]:
london_data['site_name_LPA_no_pc'] = london_data.apply(lambda row: str(row['site_name_LPA']).upper().replace(row['postcode_clean'], ''), axis = 1)


UPRN MATCHING: If reprocessing data that has already been matched, skip the UPRN / parent UPRN matching cells in this section and go straight to address matching.

In [32]:
#### -------- DATA MERGING --------
#join the london data with the RESIDENTIAL AddressBase dataset on UPRN
# Left join: every planning application row is kept; AddressBase columns are NaN where
# the application's UPRN does not equal any residential parent_uprn.
# NOTE(review): the no-match file loaded earlier in this notebook exposes the UPRN as
# 'uprn_x' rather than 'uprn'; this cell presumably only applies to a first-time run
# where london_data still has a 'uprn' column -- confirm before running.
merged = london_data.merge(resi_AB, how = 'left', left_on = 'uprn', right_on = 'parent_uprn')

#merged['UCL_ID'].nunique()

In [33]:
#separate merged into no match and match 
merged_match, merged_no_match = separate_matches(merged, 'parent_uprn', 'parent_uprn')

merged_no_match = merged_no_match.dropna(axis=1, how='all')

In [34]:
#merge 2 on uprn not parent_uprn 
merged_2 = merged_no_match.merge(resi_AB, how = 'left', left_on = 'uprn_x', right_on = 'uprn')

In [35]:
#separate the merge 2 into two datasets for match v not match 
merged_2_match, merged_2_no_match = separate_matches(merged_2, 'uprn', 'uprn')

merged_2_no_match = merged_2_no_match.dropna(axis=1, how='all')      

In [36]:
merged_2_match = merged_2_match.rename(columns={"uprn": "uprn_OSAB"})
merged_match = merged_match.rename(columns={"uprn_y": "uprn_OSAB"})

#merged_2_match.count()

In [37]:
all_matched = pd.concat([merged_2_match, merged_match])


ADDRESS MATCHING: Address matching starts here for every run (first-time matching and re-matching).

In [33]:
# Feed the whole planning dataset into the address strategies.
# This rebinds merged_2_no_match, replacing any value set by the UPRN matching cells,
# and it is an alias of london_data, not a copy.
merged_2_no_match = london_data

In [40]:
london_data.count()

UCL_ID                         328
planning_application_number    328
lpa_name                       328
application_type               328
application_type_full          231
description                    308
number_of_units                250
site_number_clean              296
street_name                    279
postcode_clean                 328
ward                           255
site_name_clean                 75
site_name_GLA                  255
site_name_LPA                   28
uprn_x                          90
decision                       305
status                         282
application_date                27
decision_date                    4
PD_type                        328
parsed_street_LPA               24
parsed_street_GLA               63
short_site_name_LPA]           255
concat_addr                    265
postcode_sector                328
site_name_GLA_no_pc            328
site_name_LPA_no_pc            328
dtype: int64

-------- DATA MERGING --------  STRATEGY 1 ADDRESS MATCH

In [34]:
## ADDRESS STRATEGY ONE 
#use the site name GLA no pc with the parsed address 1 
left_columns = ['site_name_GLA_no_pc', 'postcode_clean']  #['site_number_clean', 'street_name', 'postcode_clean']
right_columns = ['parsed_address1', 'postcode_locator'] #['building_number', 'street_description', 'postcode_locator']

# this is a very strict conservative join 
merged_on_address = merged_2_no_match.merge(resi_AB, how = 'left', left_on = left_columns, right_on = right_columns)

In [35]:
merged_on_address_match,merged_on_address_no_match =  separate_matches(merged_on_address, 'uprn', 'address_1')


In [37]:
#how many unique ids were matched?
merged_on_address_match['UCL_ID'].nunique()

0

In [38]:
## Add the merged_on_address_match to the matched
merged_on_address_match = merged_on_address_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address_match['match_strategy'] = 'address_1'

all_matched = merged_on_address_match

# Use this if UPRN was used for matching above
#all_matched = pd.concat([all_matched, merged_on_address_match])

merged_on_address_no_match = merged_on_address_no_match.dropna(axis=1, how='all')


In [39]:
#### ADDRESS Strategy 2 - 
## use the site name LPA without post code ...
left_columns_2 = ['site_name_LPA_no_pc', 'postcode_clean'] #London Data 
right_columns_2 =  ['parsed_address1', 'postcode_locator'] #AB

merged_on_address2 = merged_on_address_no_match.merge(resi_AB, how = 'left', left_on = left_columns_2, right_on = right_columns_2)

In [41]:
merged_on_address2_match = merged_on_address2.drop(merged_on_address2[pd.isna(merged_on_address2['uprn']) == True].index) 
merged_on_address2_no_match = merged_on_address2.drop(merged_on_address2[pd.isna(merged_on_address2['uprn']) == False].index) 


In [42]:
merged_on_address2_match['UCL_ID'].nunique()

0

In [43]:
merged_on_address2_match = merged_on_address2_match.rename(columns={"uprn": "uprn_OSAB"})
# The stray ']' is part of the column name as loaded. errors='ignore' keeps this cell
# from failing when the column is absent, e.g. after an earlier
# dropna(axis=1, how='all') removed it for being entirely empty.
merged_on_address2_match = merged_on_address2_match.drop(columns = ['short_site_name_LPA]'], errors = 'ignore')
merged_on_address2_match['match_strategy'] = 'address_2'

In [44]:
all_matched = pd.concat([all_matched, merged_on_address2_match])

In [45]:
merged_on_address2_no_match = merged_on_address2_no_match.dropna(axis=1, how='all')


-------- DATA MERGING --------  STRATEGY 3 ADDRESS MATCH

In [46]:
# #-------- DATA MERGING --------  STRATEGY 3 ADDRESS MATCH 
 # use the site name GLA no pc with the parsed address 2 
left_columns_3 = ['site_name_GLA_no_pc', 'postcode_clean'] 
right_columns_3 = ['parsed_address2', 'postcode_locator'] 


merged_on_address3_match, merged_on_address3_no_match = my_merge(merged_on_address2_no_match, resi_AB, left_columns_3, right_columns_3)

In [43]:
#for batch 2, gets 0 results
#merged_on_address3_no_match.count()

In [47]:
# add the matched into all matched
merged_on_address3_match = merged_on_address3_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address3_match['match_strategy'] = 'address_3'

frames = [all_matched, merged_on_address3_match]

all_matched = pd.concat(frames)

In [48]:
merged_on_address3_no_match = merged_on_address3_no_match.dropna(axis=1, how='all')

In [49]:
#### ADDRESS Strategy 4 
left_columns_4 = ['site_name_LPA_no_pc', 'postcode_clean']  #London Data 
right_columns_4 = ['parsed_address2', 'postcode_locator'] #AB

merged_on_address4 = merged_on_address3_no_match.merge(resi_AB, how = 'left', left_on = left_columns_4, right_on = right_columns_4)

In [50]:
merged_on_address4_match = merged_on_address4.drop(merged_on_address4[pd.isna(merged_on_address4['uprn']) == True].index) 
merged_on_address4_no_match = merged_on_address4.drop(merged_on_address4[pd.isna(merged_on_address4['uprn']) == False].index) 


In [51]:
merged_on_address4_match['UCL_ID'].nunique()

0

In [52]:
merged_on_address4_match = merged_on_address4_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address4_match['match_strategy'] = 'address_4'

frames = [all_matched, merged_on_address4_match]

all_matched = pd.concat(frames)
merged_on_address4_no_match = merged_on_address4_no_match.dropna(axis=1, how='all')

In [93]:
## Strategy 5 
#match on the street number, street name, postcode 

left_columns_5 = ['concat_addr', 'postcode_clean']
right_columns_5 = ['parsed_address1', 'postcode_locator']

merged_on_address5_match, merged_on_address5_no_match = my_merge(merged_on_address4_no_match, resi_AB, left_columns_5, right_columns_5)

In [94]:
merged_on_address5_match['UCL_ID'].nunique()

43

In [96]:
merged_on_address5_match = merged_on_address5_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address5_match['match_strategy'] = 'address_5'

frames = [all_matched, merged_on_address5_match]

all_matched = pd.concat(frames)
merged_on_address5_no_match = merged_on_address5_no_match.dropna(axis=1, how='all')

In [97]:
## Strategy 6
left_columns_6 = ['concat_addr', 'postcode_clean']
right_columns_6 = ['parsed_address2', 'postcode_locator']

merged_on_address6_match, merged_on_address6_no_match = my_merge(merged_on_address5_no_match, resi_AB, left_columns_6, right_columns_6)

In [98]:
merged_on_address6_match['UCL_ID'].nunique()

2

In [99]:
merged_on_address6_match = merged_on_address6_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address6_match['match_strategy'] = 'address_6'

frames = [all_matched, merged_on_address6_match]

all_matched = pd.concat(frames)
merged_on_address6_no_match = merged_on_address6_no_match.dropna(axis=1, how='all')

In [56]:
# pao_start_number + street_description

In [100]:
## Strategy 7
left_columns_7 = ['site_number_clean', 'street_name', 'postcode_clean']
right_columns_7 = ['pao_start_number', 'street_description', 'postcode_locator']

merged_on_address7_match, merged_on_address7_no_match = my_merge(merged_on_address6_no_match, resi_AB, left_columns_7, right_columns_7)

In [101]:
merged_on_address7_match['UCL_ID'].nunique()

44

In [102]:
merged_on_address7_match = merged_on_address7_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address7_match['match_strategy'] = 'address_7'

frames = [all_matched, merged_on_address7_match]

all_matched = pd.concat(frames)
merged_on_address7_no_match = merged_on_address7_no_match.dropna(axis=1, how='all')

In [103]:
### strategy 8 
left_columns_8 = ['site_number_clean', 'street_name', 'postcode_sector_x']
right_columns_8 = ['pao_start_number', 'street_description', 'postcode_sector']

merged_on_address8_match, merged_on_address8_no_match = my_merge(merged_on_address7_no_match, resi_AB, left_columns_8, right_columns_8)


In [104]:
merged_on_address8_match['UCL_ID'].nunique()

8

In [105]:
merged_on_address8_match = merged_on_address8_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address8_match['match_strategy'] = 'address_8'

frames = [all_matched, merged_on_address8_match]

all_matched = pd.concat(frames)
merged_on_address8_no_match = merged_on_address8_no_match.dropna(axis=1, how='all')

In [106]:
### strategy 9 
left_columns_9 = ['site_name_GLA_no_pc',  'postcode_sector_x']
right_columns_9 = ['parsed_address1',  'postcode_sector']

merged_on_address9_match, merged_on_address9_no_match = my_merge(merged_on_address8_no_match, resi_AB, left_columns_9, right_columns_9)


In [107]:
merged_on_address9_match['UCL_ID'].nunique()

0

In [108]:
merged_on_address9_match = merged_on_address9_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address9_match['match_strategy'] = 'address_9'

frames = [all_matched, merged_on_address9_match]

all_matched = pd.concat(frames)
merged_on_address9_no_match = merged_on_address9_no_match.dropna(axis=1, how='all')

In [109]:
### strategy 10 
left_columns_10 = ['site_name_LPA_no_pc',  'postcode_sector_x']
right_columns_10 = ['parsed_address1',  'postcode_sector']

merged_on_address10_match, merged_on_address10_no_match = my_merge(merged_on_address9_no_match, resi_AB, left_columns_10, right_columns_10)


In [110]:
merged_on_address10_match['UCL_ID'].nunique()

0

In [111]:
merged_on_address10_match = merged_on_address10_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address10_match['match_strategy'] = 'address_10'

frames = [all_matched, merged_on_address10_match]

all_matched = pd.concat(frames)
merged_on_address10_no_match = merged_on_address10_no_match.dropna(axis=1, how='all')

In [112]:
### strategy 11 
left_columns_11 = ['site_name_GLA_no_pc',  'postcode_sector_x']
right_columns_11 = ['parsed_address2',  'postcode_sector']

merged_on_address11_match, merged_on_address11_no_match = my_merge(merged_on_address10_no_match, resi_AB, left_columns_11, right_columns_11)


In [113]:
merged_on_address11_match['UCL_ID'].nunique()

2

In [114]:
merged_on_address11_match = merged_on_address11_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address11_match['match_strategy'] = 'address_11'

frames = [all_matched, merged_on_address11_match]

all_matched = pd.concat(frames)
merged_on_address11_no_match = merged_on_address11_no_match.dropna(axis=1, how='all')

In [115]:
### strategy 12 
left_columns_12 = ['site_name_LPA_no_pc',  'postcode_sector_x']
right_columns_12 = ['parsed_address2',  'postcode_sector']

merged_on_address12_match, merged_on_address12_no_match = my_merge(merged_on_address11_no_match, resi_AB, left_columns_12, right_columns_12)


In [116]:
merged_on_address12_match['UCL_ID'].nunique()

0

In [117]:
merged_on_address12_match = merged_on_address12_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address12_match['match_strategy'] = 'address_12'

frames = [all_matched, merged_on_address12_match]

all_matched = pd.concat(frames)
merged_on_address12_no_match = merged_on_address12_no_match.dropna(axis=1, how='all')

In [118]:
### strategy 13 
# site number + street name + postcode sector against pao_start_number + street_description
# NOTE(review): these join columns are the same as strategy 8 -- confirm whether a
# different AddressBase column was intended here.
left_columns_13 = ['site_number_clean', 'street_name',  'postcode_sector_x']
right_columns_13 = ['pao_start_number', 'street_description',   'postcode_sector']


# Continue from the rows strategy 12 left unmatched (previously this re-used the
# strategy 11 leftovers, so anything strategy 12 matched would be processed again).
merged_on_address13_match, merged_on_address13_no_match = my_merge(merged_on_address12_no_match, resi_AB, left_columns_13, right_columns_13)


In [119]:
merged_on_address13_match['UCL_ID'].nunique()

0

In [120]:
merged_on_address13_match = merged_on_address13_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address13_match['match_strategy'] = 'address_13'

frames = [all_matched, merged_on_address13_match]

all_matched = pd.concat(frames)
merged_on_address13_no_match = merged_on_address13_no_match.dropna(axis=1, how='all')

In [178]:
### strategy 14
left_columns_14 = ['concat_addr', 'lpa_name']
right_columns_14 = ['parsed_address1', 'administrative_area']

merged_on_address14_match, merged_on_address14_no_match = my_merge(merged_on_address13_no_match, resi_AB, left_columns_14, right_columns_14)


In [179]:
merged_on_address14_match['UCL_ID'].nunique()

3

In [180]:
merged_on_address14_match = merged_on_address14_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address14_match['match_strategy'] = 'address_14'

frames = [all_matched, merged_on_address14_match]

all_matched = pd.concat(frames)
merged_on_address14_no_match = merged_on_address14_no_match.dropna(axis=1, how='all')

In [181]:
### strategy 15
# Same keys as strategy 14 but against parsed_address2, mirroring how strategies 5 and 6
# pair up. With the identical strategy 14 columns this step could never match anything,
# because it only sees the rows strategy 14 failed on.
left_columns_15 = ['concat_addr', 'lpa_name']
right_columns_15 = ['parsed_address2', 'administrative_area']

merged_on_address15_match, merged_on_address15_no_match = my_merge(merged_on_address14_no_match, resi_AB, left_columns_15, right_columns_15)


In [182]:
merged_on_address15_match['UCL_ID'].nunique()

0

In [183]:
merged_on_address15_match = merged_on_address15_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address15_match['match_strategy'] = 'address_15'

frames = [all_matched, merged_on_address15_match]

all_matched = pd.concat(frames)
merged_on_address15_no_match = merged_on_address15_no_match.dropna(axis=1, how='all')

In [None]:
## Strategy 16 
# TODO: choose the join columns for this strategy; both lists are still empty.
left_columns_16 = []
right_columns_16 = []

if left_columns_16 and right_columns_16:
    merged_on_address16_match, merged_on_address16_no_match = my_merge(merged_on_address15_no_match, resi_AB, left_columns_16, right_columns_16)
else:
    # No join keys yet: nothing can match, so every remaining row stays unmatched.
    merged_on_address16_match = merged_on_address15_no_match.iloc[0:0].copy()
    merged_on_address16_no_match = merged_on_address15_no_match

# how many unique ids did strategy 16 match?
merged_on_address16_match['UCL_ID'].nunique()

In [None]:
merged_on_address16_match = merged_on_address16_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address16_match['match_strategy'] = 'address_16'

frames = [all_matched, merged_on_address16_match]

all_matched = pd.concat(frames)
merged_on_address16_no_match = merged_on_address16_no_match.dropna(axis=1, how='all')

In [None]:
## Strategy 14: No postcode sector, match on administrative area but replace the & with the 'AND' for BARKING & DAGENHAM

In [184]:
# Percentage of planning applications (distinct UCL_ID) with at least one residential match
matched_id_count = all_matched['UCL_ID'].nunique()
total_id_count = london_data['UCL_ID'].nunique()
resi_match_rate = 100 * matched_id_count / total_id_count

print('Resi Match rate: ', resi_match_rate, ' from record count: ', matched_id_count)

Resi Match rate:  31.70731707317073  from record count:  104


------ specific address analysis ----- 

In [142]:
merged_on_address15_no_match[merged_on_address15_no_match['UCL_ID'] == 246]

Unnamed: 0,UCL_ID,planning_application_number,lpa_name,application_type,application_type_full,description,number_of_units,site_number_clean,street_name,postcode_clean,...,dependent_locality,locality,town_name,administrative_area,post_town,postcode,postcode_locator,postcode_sector,parsed_address1,parsed_address2
35,246,20/01567/PRIOFF,BARKING & DAGENHAM,Prior Approval,,Application for prior approval: Notification o...,6.0,7,BACK LANE,RM6 4BP,...,,,,,,,,,,


In [236]:
non_resi_address_merge_no_match15.to_csv('no_match_batch1b_may30.csv', index = False) #	005300 082025	

In [163]:
resi_AB[resi_AB['parent_uprn']== '000202220323']

Unnamed: 0,uprn,class,parent_uprn,latitude,longitude,country,legal_name,sub_building_name,building_name,building_number,...,dependent_locality,locality,town_name,administrative_area,post_town,postcode,postcode_locator,postcode_sector,parsed_address1,parsed_address2


In [237]:
resi_AB[resi_AB['postcode_locator'] == 'HA0 4QG']

Unnamed: 0,uprn,class,parent_uprn,latitude,longitude,country,legal_name,sub_building_name,building_name,building_number,...,dependent_locality,locality,town_name,administrative_area,post_town,postcode,postcode_locator,postcode_sector,parsed_address1,parsed_address2
441140,202102268,RD06,0,51.54444,-0.297093,E,0.0,0,0,0,...,0,0,WEMBLEY,BRENT,0,0,HA0 4QG,HA0 4,"212, EALING ROAD",EALING ROAD
632847,202214037,RD06,0,51.54435,-0.297097,E,0.0,0,0,0,...,0,0,WEMBLEY,BRENT,0,0,HA0 4QG,HA0 4,"216, EALING ROAD",EALING ROAD
684087,202015134,RD06,0,51.544296,-0.297099,E,0.0,0,0,0,...,0,0,WEMBLEY,BRENT,0,0,HA0 4QG,HA0 4,"218, EALING ROAD",EALING ROAD
963738,202102266,RD06,0,51.544395,-0.297095,E,0.0,0,0,0,...,0,0,WEMBLEY,BRENT,0,0,HA0 4QG,HA0 4,"214, EALING ROAD",EALING ROAD
1792768,202102286,RD06,0,51.544919,-0.297249,E,0.0,0,194A,0,...,0,0,WEMBLEY,BRENT,WEMBLEY,HA0 4QG,HA0 4QG,HA0 4,"194, EALING ROAD","194A, EALING ROAD"
3420019,202102271,RD06,0,51.544539,-0.297119,E,0.0,0,208A,0,...,0,0,WEMBLEY,BRENT,WEMBLEY,HA0 4QG,HA0 4QG,HA0 4,"208, EALING ROAD","208A, EALING ROAD"
4468894,202102273,RD06,0,51.544602,-0.297116,E,0.0,0,206A,0,...,0,0,WEMBLEY,BRENT,WEMBLEY,HA0 4QG,HA0 4QG,HA0 4,"206, EALING ROAD","206A, EALING ROAD"
5162622,202102275,RD06,0,51.544656,-0.297143,E,0.0,0,204A,0,...,0,0,WEMBLEY,BRENT,WEMBLEY,HA0 4QG,HA0 4QG,HA0 4,"204, EALING ROAD","204A, EALING ROAD"
6561429,202148301,RD06,202102270,51.544494,-0.297106,E,0.0,0,210A,0,...,0,0,WEMBLEY,BRENT,WEMBLEY,HA0 4QG,HA0 4QG,HA0 4,"FLAT 1, 210, EALING ROAD","210A, EALING ROAD"
9480478,202102282,RD06,0,51.544801,-0.297224,E,0.0,0,198A,0,...,0,0,WEMBLEY,BRENT,WEMBLEY,HA0 4QG,HA0 4QG,HA0 4,"198, EALING ROAD","198A, EALING ROAD"


In [246]:
other_AB[(other_AB['postcode_locator'] == 'HA0 4QS') & (other_AB['street_description'] == 'BOWRONS AVENUE')]

Unnamed: 0,uprn,class,parent_uprn,latitude,longitude,country,legal_name,sub_building_name,building_name,building_number,...,dependent_locality,locality,town_name,administrative_area,post_town,postcode,postcode_locator,postcode_sector,parsed_address1,parsed_address2
1029703,10025177788,CU01,0,51.544389,-0.302875,E,0.0,0,0,0,...,0,0,WEMBLEY,BRENT,0,0,HA0 4QS,HA0 4,ELECTRICITY SUB STATION 66M FROM 95 NORTON ROA...,BOWRONS AVENUE
1309948,202228481,CR,0,51.54493,-0.29745,E,0.0,0,0,0,...,0,0,WEMBLEY,BRENT,0,0,HA0 4QS,HA0 4,"UNIT 1, 1, BOWRONS AVENUE",BOWRONS AVENUE
1561284,202238960,PP,0,51.544557,-0.302108,E,0.0,0,0,0,...,0,0,WEMBLEY,BRENT,0,0,HA0 4QS,HA0 4,"SHELL FOR STUDIO FLATS A TO F AND LOFT D, 39, ...",BOWRONS AVENUE
1569192,202228482,CR10,0,51.544931,-0.297508,E,0.0,0,0,0,...,0,0,WEMBLEY,BRENT,0,0,HA0 4QS,HA0 4,"UNIT 2, 1, BOWRONS AVENUE",BOWRONS AVENUE
3301459,202128619,PS,0,51.544703,-0.30227,E,0.0,0,0,0,...,0,0,WEMBLEY,BRENT,0,0,HA0 4QS,HA0 4,"STREET RECORD, BOWRONS AVENUE",BOWRONS AVENUE
3471091,202221907,PP,0,51.544579,-0.301732,E,0.0,0,0,85,...,0,0,WEMBLEY,BRENT,WEMBLEY,HA0 4QS,HA0 4QS,HA0 4,"85, BOWRONS AVENUE","85, BOWRONS AVENUE"
3641012,202220578,PP,0,51.544845,-0.297742,E,0.0,0,0,3,...,0,0,WEMBLEY,BRENT,WEMBLEY,HA0 4QS,HA0 4QS,HA0 4,"3, BOWRONS AVENUE","3, BOWRONS AVENUE"


In [241]:
resi_AB[(resi_AB['postcode_sector'] == 'HA0 4') & (resi_AB['street_description'] == 'BOWRONS AVENUE')].to_csv('output_HA04.csv', index  = False) # & (resi_AB['building_name'] == 'Apollo House') ] #& (other_AB['street_description'] == 'EAST STREET')]

In [242]:
other_AB[(other_AB['postcode_sector']== 'HA0 4') & (other_AB['street_description'] == 'BOWRONS AVENUE')].to_csv('other_HA04.csv', index = False)

In [171]:
seven_sisters.to_csv('seven_sisters.csv', index=False) 

-------- DATA MERGING --------  COMMERCIAL PROPERTIES --- Reproduce the process with commercial properties 


In [60]:
#Merge on Parent UPRN 
# Left-join the still-unmatched planning rows to the non-residential (other_AB) records
# whose parent_uprn equals the application's UPRN.
# NOTE(review): the input is the strategy 3 leftovers, while the commercial address
# matching further down starts from merged_on_address15_no_match -- confirm which
# residential no-match frame should feed this step.
non_resi_merged = merged_on_address3_no_match.merge(other_AB,how = 'left', left_on = 'uprn_x', right_on = 'parent_uprn')

# rows with a non-null parent_uprn are matches and get match_strategy = 'parent_uprn'
non_resi_match, non_resi_no_match =  separate_matches(non_resi_merged, 'parent_uprn', 'parent_uprn')

# drop the all-empty AddressBase columns so the next merge does not produce _x/_y duplicates
non_resi_no_match = non_resi_no_match.dropna(axis=1, how='all')

In [61]:
#merge on UPRN
non_resi_merged_2 = non_resi_no_match.merge(other_AB, how = 'left', left_on = 'uprn_x', right_on = 'uprn')

# A row is matched when the join column from other_AB ('uprn') is present. Testing
# 'parent_uprn' instead would misclassify matched records that have no parent.
non_resi_match_2, non_resi_no_match_2 =  separate_matches(non_resi_merged_2, 'uprn', 'uprn')

non_resi_no_match_2 = non_resi_no_match_2.dropna(axis=1, how='all')

In [62]:
non_resi_match_2 = non_resi_match_2.rename(columns={"uprn": "uprn_OSAB"})
#non_resi_match_2['match_strategy'] = 'uprn'
non_resi_match_2.count()

non_resi_match = non_resi_match.rename(columns={"uprn": "uprn_OSAB"})
#non_resi_match['match_strategy'] = 'parent_uprn'

In [63]:
#union them together 
non_resi_all_matched = pd.concat([non_resi_match_2, non_resi_match])

---- Commercial address matching --- 

In [185]:
#now join on addresses strat 1 
left_columns = ['site_name_GLA_no_pc', 'postcode_clean']  #['site_number_clean', 'street_name', 'postcode_clean']
right_columns = ['parsed_address1', 'postcode_locator'] #['building_number', 'street_description', 'postcode_locator']

#non_resi_address_merge_match, non_resi_address_merge_no_match = my_merge(non_resi_no_match_2, other_AB, left_columns, right_columns)

In [186]:
# Strategy 1 merge for runs that skip the UPRN stage: start directly from
# merged_on_address15_no_match.
(
    non_resi_address_merge_match,
    non_resi_address_merge_no_match,
) = my_merge(
    left=merged_on_address15_no_match,
    right=other_AB,
    left_on=left_columns,
    right_on=right_columns,
)


In [187]:
# Remove columns that are empty for every unmatched row before the next merge.
non_resi_address_merge_no_match = non_resi_address_merge_no_match.dropna(axis='columns', how='all')

# Label the strategy-1 matches and rename the AddressBase UPRN.
non_resi_address_merge_match = (
    non_resi_address_merge_match
    .rename(columns={"uprn": "uprn_OSAB"})
    .assign(match_strategy='address_1')
)

# This rebinds (does not append to) the running match table: whatever was already in
# non_resi_all_matched is replaced by the strategy-1 matches.
non_resi_all_matched = non_resi_address_merge_match

In [188]:
# Distinct applications (UCL_ID) matched by commercial address strategy 1.
non_resi_address_merge_match['UCL_ID'].nunique(dropna=True)

0

In [96]:
#USE THIS FIRST TIME THRU 
# Appends the strategy-1 matches to the existing non_resi_all_matched instead of
# replacing it. NOTE(review): presumably intended for runs that include the UPRN
# stage, so those matches are kept — confirm before enabling.
# non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match])

In [189]:
# Address strategy 2 (commercial): site_name_LPA_no_pc + postcode_clean against
# parsed_address1 + postcode_locator, run on what strategy 1 left unmatched.
left_columns_2 = [
    'site_name_LPA_no_pc',
    'postcode_clean',
]
right_columns_2 = [
    'parsed_address1',
    'postcode_locator',
]

(
    non_resi_address_merge_match2,
    non_resi_address_merge_no_match2,
) = my_merge(
    left=non_resi_address_merge_no_match,
    right=other_AB,
    left_on=left_columns_2,
    right_on=right_columns_2,
)

In [190]:
# Distinct applications (UCL_ID) matched by commercial address strategy 2.
non_resi_address_merge_match2['UCL_ID'].nunique(dropna=True)

0

In [191]:
# Remove columns that are empty for every unmatched row before the next merge.
non_resi_address_merge_no_match2 = non_resi_address_merge_no_match2.dropna(axis='columns', how='all')

# Label the strategy-2 matches, then append them to the running commercial match table.
non_resi_address_merge_match2 = (
    non_resi_address_merge_match2
    .rename(columns={"uprn": "uprn_OSAB"})
    .assign(match_strategy='address_2')
)
non_resi_all_matched = pd.concat(objs=[non_resi_all_matched, non_resi_address_merge_match2])

In [192]:
# Address strategy 3 (commercial): site_name_GLA_no_pc + postcode_clean against
# parsed_address2 + postcode_locator, run on what strategy 2 left unmatched.
left_columns_3 = [
    'site_name_GLA_no_pc',
    'postcode_clean',
]
right_columns_3 = [
    'parsed_address2',
    'postcode_locator',
]

(
    non_resi_address_merge_match3,
    non_resi_address_merge_no_match3,
) = my_merge(
    left=non_resi_address_merge_no_match2,
    right=other_AB,
    left_on=left_columns_3,
    right_on=right_columns_3,
)

In [193]:
# Distinct applications (UCL_ID) matched by commercial address strategy 3.
non_resi_address_merge_match3['UCL_ID'].nunique(dropna=True)

0

In [194]:
# Remove columns that are empty for every unmatched row before the next merge.
non_resi_address_merge_no_match3 = non_resi_address_merge_no_match3.dropna(axis='columns', how='all')

# Label the strategy-3 matches, then append them to the running commercial match table.
non_resi_address_merge_match3 = (
    non_resi_address_merge_match3
    .rename(columns={"uprn": "uprn_OSAB"})
    .assign(match_strategy='address_3')
)
non_resi_all_matched = pd.concat(objs=[non_resi_all_matched, non_resi_address_merge_match3])

------- ADDRESS ----- Strategy 4 (commercial)

In [195]:
# Address strategy 4 (commercial): site_name_LPA_no_pc + postcode_clean against
# parsed_address2 + postcode_locator, run on what strategy 3 left unmatched.
left_columns_4 = [
    'site_name_LPA_no_pc',
    'postcode_clean',
]
right_columns_4 = [
    'parsed_address2',
    'postcode_locator',
]

(
    non_resi_address_merge_match4,
    non_resi_address_merge_no_match4,
) = my_merge(
    left=non_resi_address_merge_no_match3,
    right=other_AB,
    left_on=left_columns_4,
    right_on=right_columns_4,
)

In [196]:
# Distinct applications (UCL_ID) matched by commercial address strategy 4.
non_resi_address_merge_match4['UCL_ID'].nunique(dropna=True)

0

In [197]:
# Remove columns that are empty for every unmatched row before the next merge.
non_resi_address_merge_no_match4 = non_resi_address_merge_no_match4.dropna(axis='columns', how='all')

# Label the strategy-4 matches, then append them to the running commercial match table.
non_resi_address_merge_match4 = (
    non_resi_address_merge_match4
    .rename(columns={"uprn": "uprn_OSAB"})
    .assign(match_strategy='address_4')
)
non_resi_all_matched = pd.concat(objs=[non_resi_all_matched, non_resi_address_merge_match4])

In [199]:
# Address strategy 5 (commercial): concat_addr + postcode_clean against
# parsed_address1 + postcode_locator, run on what strategy 4 left unmatched.
left_columns_5 = [
    'concat_addr',
    'postcode_clean',
]
right_columns_5 = [
    'parsed_address1',
    'postcode_locator',
]

(
    non_resi_address_merge_match5,
    non_resi_address_merge_no_match5,
) = my_merge(
    left=non_resi_address_merge_no_match4,
    right=other_AB,
    left_on=left_columns_5,
    right_on=right_columns_5,
)


In [200]:
# Remove columns that are empty for every unmatched row before the next merge.
non_resi_address_merge_no_match5 = non_resi_address_merge_no_match5.dropna(axis='columns', how='all')

# Label the strategy-5 matches, then append them to the running commercial match table.
non_resi_address_merge_match5 = (
    non_resi_address_merge_match5
    .rename(columns={"uprn": "uprn_OSAB"})
    .assign(match_strategy='address_5')
)
non_resi_all_matched = pd.concat(objs=[non_resi_all_matched, non_resi_address_merge_match5])

Strategy 6 (commercial) --- same approach as strategy 6 for residential ---

In [201]:
# Address strategy 6 (commercial): concat_addr + postcode_clean against
# parsed_address2 + postcode_locator, run on what strategy 5 left unmatched.
left_columns_6 = [
    'concat_addr',
    'postcode_clean',
]
right_columns_6 = [
    'parsed_address2',
    'postcode_locator',
]

(
    non_resi_address_merge_match6,
    non_resi_address_merge_no_match6,
) = my_merge(
    left=non_resi_address_merge_no_match5,
    right=other_AB,
    left_on=left_columns_6,
    right_on=right_columns_6,
)

In [203]:
# Distinct applications (UCL_ID) matched by commercial address strategy 6.
non_resi_address_merge_match6['UCL_ID'].nunique(dropna=True)

9

In [204]:
# Remove columns that are empty for every unmatched row before the next merge.
non_resi_address_merge_no_match6 = non_resi_address_merge_no_match6.dropna(axis='columns', how='all')

# Label the strategy-6 matches, then append them to the running commercial match table.
non_resi_address_merge_match6 = (
    non_resi_address_merge_match6
    .rename(columns={"uprn": "uprn_OSAB"})
    .assign(match_strategy='address_6')
)
non_resi_all_matched = pd.concat(objs=[non_resi_all_matched, non_resi_address_merge_match6])

In [310]:
# Address strategy 7 (commercial): site_number_clean + street_name + postcode_clean against
# pao_start_num_suffix + street_description + postcode_locator.
# Earlier AddressBase key set, kept for reference:
#   ['pao_start_number', 'street_description', 'postcode_locator']
left_columns_7 = [
    'site_number_clean',
    'street_name',
    'postcode_clean',
]
right_columns_7 = [
    'pao_start_num_suffix',
    'street_description',
    'postcode_locator',
]

(
    non_resi_address_merge_match7,
    non_resi_address_merge_no_match7,
) = my_merge(
    left=non_resi_address_merge_no_match6,
    right=other_AB,
    left_on=left_columns_7,
    right_on=right_columns_7,
)


In [311]:
# Distinct applications (UCL_ID) matched by commercial address strategy 7 (earlier note: was 88).
non_resi_address_merge_match7['UCL_ID'].nunique(dropna=True)

0

In [312]:
# Remove columns that are empty for every unmatched row before the next merge.
non_resi_address_merge_no_match7 = non_resi_address_merge_no_match7.dropna(axis='columns', how='all')

# Label the strategy-7 matches, then append them to the running commercial match table.
non_resi_address_merge_match7 = (
    non_resi_address_merge_match7
    .rename(columns={"uprn": "uprn_OSAB"})
    .assign(match_strategy='address_7')
)
non_resi_all_matched = pd.concat(objs=[non_resi_all_matched, non_resi_address_merge_match7])

  non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match7])


In [313]:
# Address strategy 8 (commercial): as strategy 7 but keyed on postcode sector —
# site_number_clean + street_name + postcode_sector_x against
# pao_start_num_suffix + street_description + postcode_sector.
# Earlier AddressBase key set, kept for reference:
#   ['pao_start_number', 'street_description', 'postcode_sector']
left_columns_8 = [
    'site_number_clean',
    'street_name',
    'postcode_sector_x',
]
right_columns_8 = [
    'pao_start_num_suffix',
    'street_description',
    'postcode_sector',
]

(
    non_resi_address_merge_match8,
    non_resi_address_merge_no_match8,
) = my_merge(
    left=non_resi_address_merge_no_match7,
    right=other_AB,
    left_on=left_columns_8,
    right_on=right_columns_8,
)

In [314]:
# Distinct applications (UCL_ID) matched by commercial address strategy 8 (earlier note: was 88).
non_resi_address_merge_match8['UCL_ID'].nunique(dropna=True)

0

In [317]:
# Inspect the non-residential AddressBase rows whose postcode_locator is HA0 4QS.
other_AB.loc[other_AB['postcode_locator'] == 'HA0 4QS']

Unnamed: 0,uprn,class,parent_uprn,latitude,longitude,country,legal_name,sub_building_name,building_name,building_number,...,locality,town_name,administrative_area,post_town,postcode,postcode_locator,postcode_sector,parsed_address1,parsed_address2,pao_start_num_suffix
1029703,10025177788,CU01,,51.544389,-0.302875,E,,,,,...,,WEMBLEY,BRENT,,,HA0 4QS,HA0 4,ELECTRICITY SUB STATION 66M FROM 95 NORTON ROA...,BOWRONS AVENUE,
1309948,202228481,CR,,51.54493,-0.29745,E,,,,,...,,WEMBLEY,BRENT,,,HA0 4QS,HA0 4,"UNIT 1, 1, BOWRONS AVENUE",BOWRONS AVENUE,1.0A
1561284,202238960,PP,,51.544557,-0.302108,E,,,,,...,,WEMBLEY,BRENT,,,HA0 4QS,HA0 4,"SHELL FOR STUDIO FLATS A TO F AND LOFT D, 39, ...",BOWRONS AVENUE,39.0
1569192,202228482,CR10,,51.544931,-0.297508,E,,,,,...,,WEMBLEY,BRENT,,,HA0 4QS,HA0 4,"UNIT 2, 1, BOWRONS AVENUE",BOWRONS AVENUE,1.0A
3301459,202128619,PS,,51.544703,-0.30227,E,,,,,...,,WEMBLEY,BRENT,,,HA0 4QS,HA0 4,"STREET RECORD, BOWRONS AVENUE",BOWRONS AVENUE,
3471091,202221907,PP,,51.544579,-0.301732,E,,,,85.0,...,,WEMBLEY,BRENT,WEMBLEY,HA0 4QS,HA0 4QS,HA0 4,"85, BOWRONS AVENUE","85, BOWRONS AVENUE",85.0
3641012,202220578,PP,,51.544845,-0.297742,E,,,,3.0,...,,WEMBLEY,BRENT,WEMBLEY,HA0 4QS,HA0 4QS,HA0 4,"3, BOWRONS AVENUE","3, BOWRONS AVENUE",3.0


In [210]:
# Remove columns that are empty for every unmatched row before the next merge.
non_resi_address_merge_no_match8 = non_resi_address_merge_no_match8.dropna(axis='columns', how='all')

# Label the strategy-8 matches, then append them to the running commercial match table.
non_resi_address_merge_match8 = (
    non_resi_address_merge_match8
    .rename(columns={"uprn": "uprn_OSAB"})
    .assign(match_strategy='address_8')
)
non_resi_all_matched = pd.concat(objs=[non_resi_all_matched, non_resi_address_merge_match8])

In [211]:
# Address strategy 9 (commercial): site_name_GLA_no_pc + postcode_sector_x against
# parsed_address1 + postcode_sector, run on what strategy 8 left unmatched.
left_columns_9 = [
    'site_name_GLA_no_pc',
    'postcode_sector_x',
]
right_columns_9 = [
    'parsed_address1',
    'postcode_sector',
]

(
    non_resi_address_merge_match9,
    non_resi_address_merge_no_match9,
) = my_merge(
    left=non_resi_address_merge_no_match8,
    right=other_AB,
    left_on=left_columns_9,
    right_on=right_columns_9,
)

In [212]:
# Distinct applications (UCL_ID) matched by commercial address strategy 9 (earlier note: was 88).
non_resi_address_merge_match9['UCL_ID'].nunique(dropna=True)

0

In [213]:
# Remove columns that are empty for every unmatched row before the next merge.
non_resi_address_merge_no_match9 = non_resi_address_merge_no_match9.dropna(axis='columns', how='all')

# Label the strategy-9 matches, then append them to the running commercial match table.
non_resi_address_merge_match9 = (
    non_resi_address_merge_match9
    .rename(columns={"uprn": "uprn_OSAB"})
    .assign(match_strategy='address_9')
)
non_resi_all_matched = pd.concat(objs=[non_resi_all_matched, non_resi_address_merge_match9])

In [214]:
# Address strategy 10 (commercial): site_name_LPA_no_pc + postcode_sector_x against
# parsed_address1 + postcode_sector, run on what strategy 9 left unmatched.
left_columns_10 = [
    'site_name_LPA_no_pc',
    'postcode_sector_x',
]
right_columns_10 = [
    'parsed_address1',
    'postcode_sector',
]

(
    non_resi_address_merge_match10,
    non_resi_address_merge_no_match10,
) = my_merge(
    left=non_resi_address_merge_no_match9,
    right=other_AB,
    left_on=left_columns_10,
    right_on=right_columns_10,
)

In [215]:
# Distinct applications (UCL_ID) matched by commercial address strategy 10 (earlier note: was 88).
non_resi_address_merge_match10['UCL_ID'].nunique(dropna=True)

0

In [216]:
#STRATEGY 11
# Fold strategy 10's results in first. Strategy 11 used to be run on
# non_resi_address_merge_no_match9 again, so anything strategy 10 matched was never
# tagged or added to non_resi_all_matched, and its leftovers were never carried forward.
non_resi_address_merge_no_match10 = non_resi_address_merge_no_match10.dropna(axis=1, how='all')

non_resi_address_merge_match10 = non_resi_address_merge_match10.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match10['match_strategy'] = 'address_10'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match10])

# Strategy 11 keys: site_name_GLA_no_pc + postcode_sector_x against
# parsed_address2 + postcode_sector.
left_columns_11 = ['site_name_GLA_no_pc',  'postcode_sector_x']
right_columns_11 = ['parsed_address2',  'postcode_sector']

# Continue the chain from strategy 10's leftovers rather than strategy 9's.
non_resi_address_merge_match11, non_resi_address_merge_no_match11 = my_merge(non_resi_address_merge_no_match10, other_AB, left_columns_11, right_columns_11)

In [217]:
# Distinct applications (UCL_ID) matched by commercial address strategy 11 (earlier note: was 88).
non_resi_address_merge_match11['UCL_ID'].nunique(dropna=True)

0

In [218]:
# Remove columns that are empty for every unmatched row before the next merge.
non_resi_address_merge_no_match11 = non_resi_address_merge_no_match11.dropna(axis='columns', how='all')

# Label the strategy-11 matches, then append them to the running commercial match table.
non_resi_address_merge_match11 = (
    non_resi_address_merge_match11
    .rename(columns={"uprn": "uprn_OSAB"})
    .assign(match_strategy='address_11')
)
non_resi_all_matched = pd.concat(objs=[non_resi_all_matched, non_resi_address_merge_match11])

In [219]:
# Address strategy 12 (commercial): site_name_LPA_no_pc + postcode_sector_x against
# parsed_address2 + postcode_sector, run on what strategy 11 left unmatched.
left_columns_12 = [
    'site_name_LPA_no_pc',
    'postcode_sector_x',
]
right_columns_12 = [
    'parsed_address2',
    'postcode_sector',
]

(
    non_resi_address_merge_match12,
    non_resi_address_merge_no_match12,
) = my_merge(
    left=non_resi_address_merge_no_match11,
    right=other_AB,
    left_on=left_columns_12,
    right_on=right_columns_12,
)

In [220]:
# Distinct applications (UCL_ID) matched by commercial address strategy 12 (earlier note: was 88).
non_resi_address_merge_match12['UCL_ID'].nunique(dropna=True)

0

In [221]:
# Remove columns that are empty for every unmatched row before the next merge.
non_resi_address_merge_no_match12 = non_resi_address_merge_no_match12.dropna(axis='columns', how='all')

# Label the strategy-12 matches, then append them to the running commercial match table.
non_resi_address_merge_match12 = (
    non_resi_address_merge_match12
    .rename(columns={"uprn": "uprn_OSAB"})
    .assign(match_strategy='address_12')
)
non_resi_all_matched = pd.concat(objs=[non_resi_all_matched, non_resi_address_merge_match12])

In [222]:
# Address strategy 13 (commercial): site_number_clean + street_name + postcode_sector_x
# against pao_start_number + street_description + postcode_sector.
left_columns_13 = [
    'site_number_clean',
    'street_name',
    'postcode_sector_x',
]
right_columns_13 = [
    'pao_start_number',
    'street_description',
    'postcode_sector',
]

(
    non_resi_address_merge_match13,
    non_resi_address_merge_no_match13,
) = my_merge(
    left=non_resi_address_merge_no_match12,
    right=other_AB,
    left_on=left_columns_13,
    right_on=right_columns_13,
)

In [223]:
# Distinct applications (UCL_ID) matched by commercial address strategy 13 (earlier note: was 88).
non_resi_address_merge_match13['UCL_ID'].nunique(dropna=True)

0

In [224]:
# Remove columns that are empty for every unmatched row before the next merge.
non_resi_address_merge_no_match13 = non_resi_address_merge_no_match13.dropna(axis='columns', how='all')

# Label the strategy-13 matches, then append them to the running commercial match table.
non_resi_address_merge_match13 = (
    non_resi_address_merge_match13
    .rename(columns={"uprn": "uprn_OSAB"})
    .assign(match_strategy='address_13')
)
non_resi_all_matched = pd.concat(objs=[non_resi_all_matched, non_resi_address_merge_match13])

In [225]:
# Address strategy 14 (commercial): concat_addr + lpa_name against
# parsed_address1 + administrative_area, run on what strategy 13 left unmatched.
left_columns_14 = [
    'concat_addr',
    'lpa_name',
]
right_columns_14 = [
    'parsed_address1',
    'administrative_area',
]

(
    non_resi_address_merge_match14,
    non_resi_address_merge_no_match14,
) = my_merge(
    left=non_resi_address_merge_no_match13,
    right=other_AB,
    left_on=left_columns_14,
    right_on=right_columns_14,
)


In [226]:
# Distinct applications (UCL_ID) matched by commercial address strategy 14 (earlier note: was 88).
non_resi_address_merge_match14['UCL_ID'].nunique(dropna=True)

0

In [227]:
# Remove columns that are empty for every unmatched row before the next merge.
non_resi_address_merge_no_match14 = non_resi_address_merge_no_match14.dropna(axis='columns', how='all')

# Label the strategy-14 matches, then append them to the running commercial match table.
non_resi_address_merge_match14 = (
    non_resi_address_merge_match14
    .rename(columns={"uprn": "uprn_OSAB"})
    .assign(match_strategy='address_14')
)
non_resi_all_matched = pd.concat(objs=[non_resi_all_matched, non_resi_address_merge_match14])

In [228]:
# Address strategy 15 (commercial): concat_addr + lpa_name against
# parsed_address2 + administrative_area, run on what strategy 14 left unmatched.
left_columns_15 = [
    'concat_addr',
    'lpa_name',
]
right_columns_15 = [
    'parsed_address2',
    'administrative_area',
]

(
    non_resi_address_merge_match15,
    non_resi_address_merge_no_match15,
) = my_merge(
    left=non_resi_address_merge_no_match14,
    right=other_AB,
    left_on=left_columns_15,
    right_on=right_columns_15,
)


In [229]:
# Distinct applications (UCL_ID) matched by commercial address strategy 15 (earlier note: was 88).
non_resi_address_merge_match15['UCL_ID'].nunique(dropna=True)

2

In [230]:
# Remove columns that are empty for every row that is still unmatched.
non_resi_address_merge_no_match15 = non_resi_address_merge_no_match15.dropna(axis='columns', how='all')

# Label the strategy-15 matches, then append them to the running commercial match table.
non_resi_address_merge_match15 = (
    non_resi_address_merge_match15
    .rename(columns={"uprn": "uprn_OSAB"})
    .assign(match_strategy='address_15')
)
non_resi_all_matched = pd.concat(objs=[non_resi_all_matched, non_resi_address_merge_match15])

----- specific address analysis  ---- 

In [71]:
## ------------------------------- Post Match  -------------------------------

In [231]:
# Percentage of distinct applications (UCL_ID) in london_data that appear in all_matched.
resi_match_rate = (
    100 * all_matched['UCL_ID'].nunique(dropna=True)
) / london_data['UCL_ID'].nunique(dropna=True)

print('Resi Match rate: ', resi_match_rate)

Resi Match rate:  31.70731707317073


In [233]:
# Distinct applications (UCL_ID) present in all_matched.
all_matched['UCL_ID'].nunique(dropna=True)

104

In [235]:
# Percentage of distinct applications (UCL_ID) in london_data that appear in
# non_resi_all_matched.
other_match_rate = (
    100 * non_resi_all_matched['UCL_ID'].nunique(dropna=True)
) / london_data['UCL_ID'].nunique(dropna=True)

print('Other Match rate: ', other_match_rate)

Other Match rate:  8.536585365853659


In [125]:
# Distinct values in the 'ID' column of non_resi_all_matched.
# NOTE(review): the other count cells use 'UCL_ID' — confirm 'ID' is still the right column.
non_resi_all_matched['ID'].nunique(dropna=True)

310

In [126]:
# Before export, show placeholder zeros in the address text columns as blanks.
zero_to_blank_columns = [
    'street_name',
    'legal_name',
    'sub_building_name',
    'building_name',
    'building_number',
    'street_description',
    'dependent_locality',
    'locality',
    'post_town',
]
for column in zero_to_blank_columns:
    all_matched[column] = all_matched[column].replace(0, '')

# Fall back to the application's cleaned postcode when AddressBase has none.
# The AddressBase extracts are read with na_values='0', so a missing postcode arrives
# as NaN rather than 0; the previous `== 0` test alone never fired for those rows.
# NOTE(review): assumes all_matched's 'postcode' comes from those extracts — confirm.
postcode_missing = all_matched['postcode'].isna() | (all_matched['postcode'] == 0)
all_matched['postcode'] = np.where(postcode_missing, all_matched['postcode_clean'], all_matched['postcode'])


In [127]:
# Export both match tables as CSV, leaving out the row index.
all_matched.to_csv(path_or_buf='London2b_address_matched_1705.csv', index=False)
non_resi_all_matched.to_csv(path_or_buf='London2b_non_resi_match_1705.csv', index=False)

In [128]:
# Export the records still unmatched after the LAST commercial strategy (15).
# This used to write non_resi_address_merge_no_match6, which still contains records
# that strategies 7-15 went on to match.
non_resi_address_merge_no_match15.to_csv('London2b_no_match_1705.csv', index = False)

In [129]:
# Look back at an earlier batch's outputs:
#   London2_no_match_2603.csv
#   London2_non_resi_match_2603.csv
#   London2_address_matched_2603.csv
# Start with the unmatched records.
batch2_no_match = pd.read_csv(filepath_or_buffer='data/London2_no_match_2603.csv')

In [130]:
# Load the earlier batch's non-residential and residential match files.
batch2_non_resi = pd.read_csv(filepath_or_buffer='data/London2_non_resi_match_2603.csv')
batch2_resi = pd.read_csv(filepath_or_buffer='data/London2_address_matched_2603.csv')

In [133]:
# Distinct values per column in the earlier batch's residential matches.
# Earlier note: 1752 no match, 676 non resi, 729 resi.
batch2_resi.nunique(axis=0, dropna=True)

ID                              729
planning_application_number     990
lpa_name                         37
application_type                  1
application_type_full            17
description                     943
number_of_units                  59
site_number_clean               330
street_name                     537
postcode_clean                  695
site_name_clean                 156
site_name_GLA                   747
site_name_LPA                   839
uprn_x                          228
decision                         57
status                           10
application_date                585
decision_date                   667
parsed_street_LPA               345
parsed_street_GLA                17
uprn_OSAB                      5560
class                            10
parent_uprn                     477
latitude                        980
longitude                       979
country                           1
legal_name                        0
sub_building_name           