In [1]:
""" Script to match planning applications to OSAddressBase addresses

This script first loads the OSAddress Base data (separated into residential and other uses in separate script)
and then loads the planning application data scraped from local planning authority websites.

The matching methodology is described in an additional document, and uses multiple strategies to match the planning
application data records to residential addresses first, and then to other uses. It outputs a match rate that summarizes
how many records matched to each type of address. 

Written by: Christine Langston, April - June 2024
"""
import pandas as pd
import time
import numpy as np
import copy
import re

# Define functions

In [3]:
#takes a merged dataset and separates matched addresses and not matched
def separate_matches(dataset, column_name, match_strategy):
    dataset_match = dataset.drop(dataset[pd.isna(dataset[column_name]) == True].index)
    dataset_no_match = dataset.drop(dataset[pd.isna(dataset[column_name]) == False].index)
    dataset_match['match_strategy'] = match_strategy
    return dataset_match, dataset_no_match

In [4]:
#function to merge and then split based on if an address was matched  
def my_merge(left, right, left_on, right_on): 
    merged = left.merge(right, how = 'left', left_on = left_on, right_on = right_on)
    merged_match = merged.drop(merged[pd.isna(merged['uprn']) == True].index) 
    merged_no_match = merged.drop(merged[pd.isna(merged['uprn']) == False].index) 
    return merged_match, merged_no_match

# Load Data

In [5]:
%%time
#### READ IN THE DATA IF EXPORTED 
resi_AB = pd.read_csv('data/resi_AB_cleaned_parsed_110524.csv',na_values = '0', low_memory = False)


CPU times: user 4min 44s, sys: 11min 24s, total: 16min 9s
Wall time: 27min 36s


In [6]:
%%time
other_AB = pd.read_csv('data/other_AB_cleaned_parsed_110524.csv', na_values = '0',low_memory = False)



CPU times: user 39.1 s, sys: 7.03 s, total: 46.2 s
Wall time: 52.9 s


In [7]:
#load in the data that hasn't been matched yet
#london_data = pd.read_csv('data/London2_no_match_2603.csv',low_memory = False)
#load in the batch 1 no match rate...
#london_data = pd.read_csv('data/London1_no_match_1903.csv',low_memory = False)
#london_data = pd.read_csv('London1b_no_match_0406_manual_clean.csv',low_memory = False)
england_data = pd.read_csv('data/Batch3_no_match_1405.csv', low_memory = False)

# Data Cleaning

Data cleaning steps to clean up the Nan and numerical data coming from the CSV

In [8]:
#edit the parent uprn from OSAddress base to fill out to be strings with 12 digits and leading zeros
#replace nan with empty string

resi_AB = resi_AB.replace([np.nan, -np.inf], 0)

#cast as integer
resi_AB['parent_uprn'] = resi_AB['parent_uprn'].astype('Int64')
resi_AB['uprn'] = resi_AB['uprn'].astype('Int64')

In [9]:
#cast as string 
resi_AB['parent_uprn'] = resi_AB['parent_uprn'].astype(str)
resi_AB['uprn'] = resi_AB['uprn'].astype(str)

#fill in with left side padding zeros 
resi_AB['parent_uprn'] = resi_AB['parent_uprn'].apply(lambda x: '{0:0>12}'.format(x))
resi_AB['uprn'] = resi_AB['uprn'].apply(lambda x: '{0:0>12}'.format(x))


In [10]:
#do the same cleaning as above but for the OTHER addresses
other_AB = other_AB.replace([np.nan, -np.inf], 0)

#cast as integer
other_AB['parent_uprn'] = other_AB['parent_uprn'].astype('Int64')
other_AB['uprn'] = other_AB['uprn'].astype('Int64')

In [11]:
#cast as string 
other_AB['parent_uprn'] = other_AB['parent_uprn'].astype(str)
other_AB['uprn'] = other_AB['uprn'].astype(str)

#fill in with left side padding zeros 
other_AB['parent_uprn'] = other_AB['parent_uprn'].apply(lambda x: '{0:0>12}'.format(x))
other_AB['uprn'] = other_AB['uprn'].apply(lambda x: '{0:0>12}'.format(x))


Recast data columns as integers and then strings where needed to eliminate decimals

In [12]:
#Address matching data clean, make building number into a string 
resi_AB['building_number'] = resi_AB['building_number'].astype('Int64').astype('str') 

In [13]:
resi_AB['pao_start_number'] = resi_AB['pao_start_number'].astype('Int64').astype('str') 


In [14]:
resi_AB['pao_start_suffix'] = resi_AB.apply(lambda x: np.where(x['pao_start_suffix']== 0, '', x['pao_start_suffix']), axis = 1)

In [15]:
resi_AB['pao_start_number'] = resi_AB.apply(lambda x: np.where(x['pao_start_number']== '0', '', x['pao_start_number'] )  , axis = 1)

In [16]:
resi_AB['pao_start_number'] = resi_AB['pao_start_number'].replace('0','')

In [17]:
resi_AB['pao_start_number'] = resi_AB['pao_start_number'].astype(str)

In [18]:
resi_AB['pao_start_suffix'] = resi_AB['pao_start_suffix'].astype(str)

In [19]:
# create new column 'pao_start_num_suffix' with the pao_start_number plus pa_start_suffix 

resi_AB['pao_start_num_suffix'] = resi_AB['pao_start_number'].astype(str) + resi_AB['pao_start_suffix'].astype(str)

Same data cleaning for Other AB

In [20]:
other_AB['building_number'] = other_AB['building_number'].astype('Int64').astype('str') 

In [21]:
other_AB['pao_start_number'] = other_AB['pao_start_number'].astype('Int64').astype('str') 

In [22]:
other_AB['pao_start_suffix'] = other_AB.apply(lambda x: np.where(x['pao_start_suffix']== 0, '', x['pao_start_suffix']), axis = 1)

In [23]:
other_AB['pao_start_suffix'] = other_AB['pao_start_suffix'].astype(str)

In [24]:
other_AB['pao_start_number'] = other_AB.apply(lambda x: np.where(x['pao_start_number']== '0', '', x['pao_start_number'] )  , axis = 1)

In [25]:
other_AB['pao_start_number'] = other_AB['pao_start_number'].astype(str)

In [26]:
# create new column 'pao_start_num_suffix' with the pao_start_number plus pa_start_suffix 

other_AB['pao_start_num_suffix'] = other_AB['pao_start_number'].astype(str) + other_AB['pao_start_suffix'].astype(str)

Data cleaning for planning application data

In [27]:
england_data.head()

Unnamed: 0,UCL ID,planning_application_number,lpa_name,application_type,application_type_full,description,number_of_units,postcode_clean,site_number_clean,site_name_clean,...,decision,status,application_date,decision_date,PD_type,Full Address on Planning Application,FPP_PA_Mix,parsed_street_LPA,concat_addr,site_name_LPA_no_pc
0,5004,2/2014/0391,Allerdale,Prior Approval,,Prior approval of proposed change of use of ag...,,CA7 8NQ,,Clea Green,...,Approved,,,2014-06-04 00:00:00,Agricultural to resi,,,,,"CLEA GREEN, WESTWARD, WIGTON, CUMBRIA,"
1,5006,2/2014/0749,Allerdale,Prior Approval,,Notification for Prior Approval under Class MB...,,CA7 0AB,,Church Rigg Farm,...,Prior Approval Not Required,,,2014-10-14 00:00:00,Agricultural to resi,,,,,"CHURCH RIGG FARM, WIGTON, CUMBRIA,"
2,5007,2/2014/0832,Allerdale,Prior Approval,,Prior approval of proposed change of use of ag...,,CA7 8AS,,Brackenridge Farm,...,Approved,,,2014-11-24 00:00:00,Agricultural to resi,,,,,"BARN 2, BRACKENRIDGE FARM, BRACKENTHWAITE, WIG..."
3,5008,2/2014/0896,Allerdale,Prior Approval,,Prior approval of proposed change of use from ...,,CA7 2RA,,Greengill Farm,...,Approved,,,2014-12-19 00:00:00,Agricultural to resi,,,,,"GREENGILL FARM, GREENGILL, ASPATRIA, WIGTON, C..."
4,5009,2/2015/0001,Allerdale,Prior Approval,,Prior approval of proposed change of use agric...,,CA7 5LF,,Powhill Farm Cottage,...,Prior Approval Not Required,,,2015-01-05 00:00:00,Agricultural to resi,,,,,"POWHILL FARM COTTAGE, POWHILL FARM, KIRKBRIDE,..."


In [41]:
#check that uprn_x has NaNs and integers
england_data['uprn_x']

0       000000000nan
1       000000000nan
2       000000000nan
3       000000000nan
4       000000000nan
            ...     
5950    000000000nan
5951    000000000nan
5952    000000000nan
5953    000000000nan
5954    000000000nan
Name: uprn_x, Length: 5955, dtype: object

In [18]:
#BATCH 1 ONLY london_data has extra quotation marks, need to remove
#london_data['uprn'] = london_data['uprn'].apply(lambda x: x.strip("''") if not pd.isna(x) else x)


In [38]:
#clean the uprn_x column
england_data['uprn_x'] = england_data['uprn_x'].astype('Int64', errors = 'ignore')

In [40]:
#cast the strings and make sure it looks correct 
england_data['uprn_x'] = england_data['uprn_x'].astype('str') #apply(lambda x: str(x) if not pd.isna(x) else x)

england_data['uprn_x'] = england_data['uprn_x'].apply(lambda x: '{0:0>12}'.format(x) if not pd.isna(x) else x)

In [42]:
#replace strings with Nan values
england_data = england_data.replace('000000000nan',np.NaN)

In [47]:
england_data.count()

UCL ID                                  5955
planning_application_number             5955
lpa_name                                5955
application_type                        5955
application_type_full                   2566
description                             5815
number_of_units                          229
postcode_clean                          5279
site_number_clean                       2495
site_name_clean                         3468
street_name                             4653
site_name_LPA                           5953
uprn_x                                   385
decision                                4884
status                                   740
application_date                        5070
decision_date                           5141
PD_type                                 5955
Full Address on Planning Application     405
FPP_PA_Mix                                58
parsed_street_LPA                       1485
concat_addr                             4653
site_name_

Ignore the creation of the columns below if columns already exist (such as when matching a batch for a second, third, etc. time)

In [273]:
#create street address from site_name_LPA
london_data['parsed_street_LPA'] = london_data['site_name_LPA'].apply(lambda x: re.findall("[0-9]+.-?.[0-9]+?\s(.+)(Road|Lane|Avenue|Parade|Courtyard|Street|Gardens|Drive)", x) if not pd.isna(x) and '-' in x
                                                                  else (re.findall("[0-9]+\s(.+)(Road|Lane|Avenue|Parade|Courtyard|Street|Gardens|Drive)", x) if not pd.isna(x) else []))


In [274]:
london_data['parsed_street_LPA'] =  london_data['parsed_street_LPA'].apply(lambda x: x[0][0] + x[0][1]  if len(x) > 0 else None)

Only execute fields with GLA if for Batch 1 or 2 (London Data)

In [275]:
#cleaning - street address
london_data['parsed_street_GLA'] = london_data['site_name_GLA'].apply(lambda x: re.findall("[0-9]+.-?.[0-9]+?\s(.+)(Road|Lane|Avenue|Parade|Courtyard|Street|Gardens|Drive)", x) if not pd.isna(x) and '-' in x
                                                                  else (re.findall("[0-9]+\s(.+)(Road|Lane|Avenue|Parade|Courtyard|Street|Gardens|Drive)", x) if not pd.isna(x) else []))


In [276]:
london_data['parsed_street_GLA'] =  london_data['parsed_street_GLA'].apply(lambda x: x[0][0] + x[0][1]  if len(x) > 0 else None)

In [277]:
#if the original street was empty, then we want to use the LPA Or GLA parsed name as street_name 

london_data['street_name'] = np.where(london_data['street_name'].isnull(), np.where(london_data['parsed_street_LPA'].isnull(), london_data['parsed_street_GLA'], london_data['parsed_street_LPA']), london_data['street_name'] )

Continue here to clean data for all batches

In [48]:
# create new column with the number, street description, and the site name 
england_data['concat_addr'] = np.where(england_data['site_name_clean'].isnull(), '', england_data['site_name_clean'] +  ', ' ) + england_data['site_number_clean'] + ', ' +  england_data['street_name']

In [49]:
england_data['concat_addr'] = england_data['concat_addr'].str.upper()

In [50]:
#make street names all into capitals to regularize
england_data['street_name'] = england_data['street_name'].str.upper()

In [51]:
#remove unwanted characters
england_data['postcode_clean'] = england_data['postcode_clean'].replace('x000D__x000D_\n', '')

In [52]:
england_data = england_data.replace({'_x000D__x000D_\n': ' ', '_x000d__x000d_\n': ' ' }, regex = True)

In [53]:
#remove white space
england_data['postcode_clean'] = england_data['postcode_clean'].apply(lambda x: str(x).strip())

In [54]:
#add column with postcode sector
england_data['postcode_sector'] = england_data['postcode_clean'].apply(lambda x: x[:-2])

In [285]:
#substring of site_name_GLA without the postcode .... 
london_data['site_name_GLA_no_pc'] = london_data.apply(lambda row: str(row['site_name_GLA']).upper().replace(', ' + row['postcode_clean'], ''), axis = 1)

In [55]:
england_data['site_name_LPA_no_pc'] = england_data.apply(lambda row: str(row['site_name_LPA']).upper().replace(row['postcode_clean'], ''), axis = 1)


# Data matching

UPRN MATCHING: If reprocessing data that has already been matched, Ignore the Data matching on the UPRN / Parent UPRN

In [32]:
#### -------- DATA MERGING --------
#join the london data with the RESIDENTIAL AddressBase dataset on UPRN
merged = england_data.merge(resi_AB, how = 'left', left_on = 'uprn', right_on = 'parent_uprn')

#merged['UCL_ID'].nunique()

In [33]:
#separate merged into no match and match 
merged_match, merged_no_match = separate_matches(merged, 'parent_uprn', 'parent_uprn')

merged_no_match = merged_no_match.dropna(axis=1, how='all')

In [34]:
#merge 2 on uprn not parent_uprn 
merged_2 = merged_no_match.merge(resi_AB, how = 'left', left_on = 'uprn_x', right_on = 'uprn')

In [35]:
#separate the merge 2 into two datasets for match v not match 
merged_2_match, merged_2_no_match = separate_matches(merged_2, 'uprn', 'uprn')

merged_2_no_match = merged_2_no_match.dropna(axis=1, how='all')      

In [36]:
merged_2_match = merged_2_match.rename(columns={"uprn": "uprn_OSAB"})
merged_match = merged_match.rename(columns={"uprn_y": "uprn_OSAB"})

#merged_2_match.count()

In [37]:
all_matched = pd.concat([merged_2_match, merged_match])


ADDRESS MATCHING: For all matching versions: Address matching starts here

In [63]:
england_data.count()

#england_data['UCL ID'].nunique()

UCL ID                                  5955
planning_application_number             5955
lpa_name                                5955
application_type                        5955
application_type_full                   2566
description                             5815
number_of_units                          229
postcode_clean                          5955
site_number_clean                       2495
site_name_clean                         3468
street_name                             4653
site_name_LPA                           5953
uprn_x                                   385
decision                                4884
status                                   740
application_date                        5070
decision_date                           5141
PD_type                                 5955
Full Address on Planning Application     405
FPP_PA_Mix                                58
parsed_street_LPA                       1485
concat_addr                             2117
site_name_

In [291]:
# drop the extra column postcode_sector_x
#england_data = england_data.drop(columns=['postcode_sector_x'])

In [64]:
merged_2_no_match = england_data

-------- DATA MERGING --------  STRATEGY 1 ADDRESS MATCH

In [65]:
## ADDRESS STRATEGY ONE 
#use the site name GLA no pc with the parsed address 1 
left_columns = ['site_name_GLA_no_pc', 'postcode_clean']  #['site_number_clean', 'street_name', 'postcode_clean']
right_columns = ['parsed_address1', 'postcode_locator'] #['building_number', 'street_description', 'postcode_locator']

# this is a very strict conservative join 
merged_on_address = merged_2_no_match.merge(resi_AB, how = 'left', left_on = left_columns, right_on = right_columns)

KeyError: 'site_name_GLA_no_pc'

In [296]:
merged_on_address_match,merged_on_address_no_match =  separate_matches(merged_on_address, 'uprn', 'address_1')


In [297]:
#how many unique ids were matched?
merged_on_address_match['UCL_ID'].nunique()

0

In [430]:
## Add the merged_on_address_match to the matched
merged_on_address_match = merged_on_address_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address_match['match_strategy'] = 'address_1'

all_matched = merged_on_address_match

# Use this if UPRN was used for matching above
#all_matched = pd.concat([all_matched, merged_on_address_match])

merged_on_address_no_match = merged_on_address_no_match.dropna(axis=1, how='all')


In [431]:
#### ADDRESS Strategy 2 - 
## use the site name LPA without post code ...
left_columns_2 = ['site_name_LPA_no_pc', 'postcode_clean'] #London Data 
right_columns_2 =  ['parsed_address1', 'postcode_locator'] #AB

merged_on_address2 = merged_on_address_no_match.merge(resi_AB, how = 'left', left_on = left_columns_2, right_on = right_columns_2)

In [432]:
merged_on_address2_match = merged_on_address2.drop(merged_on_address2[pd.isna(merged_on_address2['uprn']) == True].index) 
merged_on_address2_no_match = merged_on_address2.drop(merged_on_address2[pd.isna(merged_on_address2['uprn']) == False].index) 


In [433]:
merged_on_address2_match['UCL_ID'].nunique()

0

In [434]:
merged_on_address2_match = merged_on_address2_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address2_match = merged_on_address2_match.drop(columns = ['short_site_name_LPA]']) 
merged_on_address2_match['match_strategy'] = 'address_2'

In [435]:
all_matched = pd.concat([all_matched, merged_on_address2_match])

In [436]:
merged_on_address2_no_match = merged_on_address2_no_match.dropna(axis=1, how='all')


-------- DATA MERGING --------  STRATEGY 3 ADDRESS MATCH

In [437]:
# #-------- DATA MERGING --------  STRATEGY 3 ADDRESS MATCH 
 # use the site name GLA no pc with the parsed address 2 
left_columns_3 = ['site_name_GLA_no_pc', 'postcode_clean'] 
right_columns_3 = ['parsed_address2', 'postcode_locator'] 


merged_on_address3_match, merged_on_address3_no_match = my_merge(merged_on_address2_no_match, resi_AB, left_columns_3, right_columns_3)

In [438]:
#for batch 2, gets 0 results
#merged_on_address3_no_match.count()

In [439]:
# add the matched into all matched
merged_on_address3_match = merged_on_address3_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address3_match['match_strategy'] = 'address_3'

frames = [all_matched, merged_on_address3_match]

all_matched = pd.concat(frames)

In [440]:
merged_on_address3_no_match = merged_on_address3_no_match.dropna(axis=1, how='all')

In [441]:
#### ADDRESS Strategy 4 
left_columns_4 = ['site_name_LPA_no_pc', 'postcode_clean']  #London Data 
right_columns_4 = ['parsed_address2', 'postcode_locator'] #AB

merged_on_address4 = merged_on_address3_no_match.merge(resi_AB, how = 'left', left_on = left_columns_4, right_on = right_columns_4)

In [442]:
merged_on_address4_match = merged_on_address4.drop(merged_on_address4[pd.isna(merged_on_address4['uprn']) == True].index) 
merged_on_address4_no_match = merged_on_address4.drop(merged_on_address4[pd.isna(merged_on_address4['uprn']) == False].index) 


In [443]:
merged_on_address4_match['UCL_ID'].nunique()

0

In [444]:
merged_on_address4_match = merged_on_address4_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address4_match['match_strategy'] = 'address_4'

frames = [all_matched, merged_on_address4_match]

all_matched = pd.concat(frames)
merged_on_address4_no_match = merged_on_address4_no_match.dropna(axis=1, how='all')

In [445]:
## Strategy 5 
#match on the street number, street name, postcode 

left_columns_5 = ['concat_addr', 'postcode_clean']
right_columns_5 = ['parsed_address1', 'postcode_locator']

merged_on_address5_match, merged_on_address5_no_match = my_merge(merged_on_address4_no_match, resi_AB, left_columns_5, right_columns_5)

In [446]:
merged_on_address5_match['UCL_ID'].nunique()

19

In [447]:
merged_on_address5_match = merged_on_address5_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address5_match['match_strategy'] = 'address_5'

frames = [all_matched, merged_on_address5_match]

all_matched = pd.concat(frames)
merged_on_address5_no_match = merged_on_address5_no_match.dropna(axis=1, how='all')

In [448]:
## Strategy 6
left_columns_6 = ['concat_addr', 'postcode_clean']
right_columns_6 = ['parsed_address2', 'postcode_locator']

merged_on_address6_match, merged_on_address6_no_match = my_merge(merged_on_address5_no_match, resi_AB, left_columns_6, right_columns_6)

In [449]:
merged_on_address6_match['UCL_ID'].nunique()

1

In [450]:
merged_on_address6_match = merged_on_address6_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address6_match['match_strategy'] = 'address_6'

frames = [all_matched, merged_on_address6_match]

all_matched = pd.concat(frames)
merged_on_address6_no_match = merged_on_address6_no_match.dropna(axis=1, how='all')

In [451]:
# pao_start_number + street_description

In [452]:
## Strategy 7
left_columns_7 = ['site_number_clean', 'street_name', 'postcode_clean']
right_columns_7 = ['pao_start_num_suffix', 'street_description', 'postcode_locator']

merged_on_address7_match, merged_on_address7_no_match = my_merge(merged_on_address6_no_match, resi_AB, left_columns_7, right_columns_7)

In [453]:
merged_on_address7_match['UCL_ID'].nunique()

36

In [454]:
resi_AB['pao_start_num_suffix'][0]

''

In [455]:
merged_on_address7_match = merged_on_address7_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address7_match['match_strategy'] = 'address_7'

frames = [all_matched, merged_on_address7_match]

all_matched = pd.concat(frames)
merged_on_address7_no_match = merged_on_address7_no_match.dropna(axis=1, how='all')

In [456]:
### strategy 8 
left_columns_8 = ['site_number_clean', 'street_name', 'postcode_sector_x']
right_columns_8 = ['pao_start_num_suffix', 'street_description', 'postcode_sector']

merged_on_address8_match, merged_on_address8_no_match = my_merge(merged_on_address7_no_match, resi_AB, left_columns_8, right_columns_8)


In [457]:
merged_on_address8_match['UCL_ID'].nunique()

13

In [458]:
merged_on_address8_match = merged_on_address8_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address8_match['match_strategy'] = 'address_8'

frames = [all_matched, merged_on_address8_match]

all_matched = pd.concat(frames)
merged_on_address8_no_match = merged_on_address8_no_match.dropna(axis=1, how='all')

In [459]:
left_columns_9 = ['site_number_clean', 'street_name', 'postcode_clean']
right_columns_9 = ['pao_start_number', 'street_description', 'postcode_locator']


merged = merged_on_address8_no_match.merge(resi_AB, how = 'left', left_on = left_columns_9, right_on = right_columns_9)


In [460]:
### strategy 9 
left_columns_9 = ['site_number_clean', 'street_name', 'postcode_clean']
right_columns_9 = ['pao_start_number', 'street_description', 'postcode_locator']

merged_on_address9_match, merged_on_address9_no_match = my_merge(merged_on_address8_no_match, resi_AB, left_columns_9, right_columns_9)


In [461]:
merged_on_address9_match['UCL_ID'].nunique()

4

In [462]:
merged_on_address9_match = merged_on_address9_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address9_match['match_strategy'] = 'address_9'

frames = [all_matched, merged_on_address9_match]

all_matched = pd.concat(frames)
merged_on_address9_no_match = merged_on_address9_no_match.dropna(axis=1, how='all')

In [463]:
### strategy 10 
left_columns_10 = ['site_number_clean', 'street_name', 'postcode_sector_x']
right_columns_10 = ['pao_start_number', 'street_description', 'postcode_sector']

merged_on_address10_match, merged_on_address10_no_match = my_merge(merged_on_address9_no_match, resi_AB, left_columns_10, right_columns_10)


In [464]:
merged_on_address10_match['UCL_ID'].nunique()

4

In [465]:
merged_on_address10_match = merged_on_address10_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address10_match['match_strategy'] = 'address_10'

frames = [all_matched, merged_on_address10_match]

all_matched = pd.concat(frames)
merged_on_address10_no_match = merged_on_address10_no_match.dropna(axis=1, how='all')

In [466]:
### strategy 11 
left_columns_11 = ['site_name_GLA_no_pc',  'postcode_sector_x']
right_columns_11 = ['parsed_address1',  'postcode_sector']

merged_on_address11_match, merged_on_address11_no_match = my_merge(merged_on_address10_no_match, resi_AB, left_columns_11, right_columns_11)


In [467]:
merged_on_address11_match['UCL_ID'].nunique()

0

In [468]:
merged_on_address11_match = merged_on_address11_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address11_match['match_strategy'] = 'address_11'

frames = [all_matched, merged_on_address11_match]

all_matched = pd.concat(frames)
merged_on_address11_no_match = merged_on_address11_no_match.dropna(axis=1, how='all')

In [469]:
### strategy 12 
left_columns_12 = ['site_name_LPA_no_pc',  'postcode_sector_x']
right_columns_12 = ['parsed_address1',  'postcode_sector']

merged_on_address12_match, merged_on_address12_no_match = my_merge(merged_on_address11_no_match, resi_AB, left_columns_12, right_columns_12)


In [470]:
merged_on_address12_match['UCL_ID'].nunique()

0

In [471]:
merged_on_address12_match = merged_on_address12_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address12_match['match_strategy'] = 'address_12'

frames = [all_matched, merged_on_address12_match]

all_matched = pd.concat(frames)
merged_on_address12_no_match = merged_on_address12_no_match.dropna(axis=1, how='all')

In [472]:
### strategy 13 
left_columns_13 = ['site_name_GLA_no_pc',  'postcode_sector_x']
right_columns_13 = ['parsed_address2',  'postcode_sector']


merged_on_address13_match, merged_on_address13_no_match = my_merge(merged_on_address11_no_match, resi_AB, left_columns_13, right_columns_13)


In [473]:
merged_on_address13_match['UCL_ID'].nunique()

0

In [474]:
merged_on_address13_match = merged_on_address13_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address13_match['match_strategy'] = 'address_13'

frames = [all_matched, merged_on_address13_match]

all_matched = pd.concat(frames)
merged_on_address13_no_match = merged_on_address13_no_match.dropna(axis=1, how='all')

In [475]:
### strategy 14
left_columns_14 = ['site_name_LPA_no_pc',  'postcode_sector_x']
right_columns_14 = ['parsed_address2',  'postcode_sector']

merged_on_address14_match, merged_on_address14_no_match = my_merge(merged_on_address13_no_match, resi_AB, left_columns_14, right_columns_14)


In [476]:
merged_on_address14_match['UCL_ID'].nunique()

0

In [477]:
merged_on_address14_match = merged_on_address14_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address14_match['match_strategy'] = 'address_14'

frames = [all_matched, merged_on_address14_match]

all_matched = pd.concat(frames)
merged_on_address14_no_match = merged_on_address14_no_match.dropna(axis=1, how='all')

In [478]:
### strategy 15
left_columns_15 = ['site_number_clean', 'street_name',  'postcode_sector_x']
right_columns_15 = ['pao_start_number', 'street_description',   'postcode_sector']

merged_on_address15_match, merged_on_address15_no_match = my_merge(merged_on_address14_no_match, resi_AB, left_columns_15, right_columns_15)


In [479]:
merged_on_address15_match['UCL_ID'].nunique()

0

In [480]:
merged_on_address15_match = merged_on_address15_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address15_match['match_strategy'] = 'address_15'

frames = [all_matched, merged_on_address15_match]

all_matched = pd.concat(frames)
merged_on_address15_no_match = merged_on_address15_no_match.dropna(axis=1, how='all')

In [481]:
## Strategy 16 
left_columns_16 = ['concat_addr', 'lpa_name']
right_columns_16 = ['parsed_address1', 'administrative_area']

merged_on_address16_match, merged_on_address16_no_match = my_merge(merged_on_address15_no_match, resi_AB, left_columns_16, right_columns_16)

merged_on_address16_match['UCL_ID'].nunique()

3

In [482]:
merged_on_address16_match = merged_on_address16_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address16_match['match_strategy'] = 'address_16'

frames = [all_matched, merged_on_address16_match]

all_matched = pd.concat(frames)
merged_on_address16_no_match = merged_on_address16_no_match.dropna(axis=1, how='all')

In [483]:
## Strategy 17 
left_columns_17 = ['concat_addr', 'lpa_name']
right_columns_17 = ['parsed_address1', 'administrative_area']

merged_on_address17_match, merged_on_address17_no_match = my_merge(merged_on_address16_no_match, resi_AB, left_columns_17, right_columns_17)

merged_on_address17_match['UCL_ID'].nunique()

0

In [484]:
merged_on_address17_match = merged_on_address17_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address17_match['match_strategy'] = 'address_17'

frames = [all_matched, merged_on_address17_match]

all_matched = pd.concat(frames)
merged_on_address17_no_match = merged_on_address17_no_match.dropna(axis=1, how='all')

In [485]:
merged_on_address17_no_match = merged_on_address17_no_match.dropna(axis=1, how='all')

In [486]:
#Strategy 18 
left_columns_18 = ['site_name_clean', 'street_name','postcode_clean']
right_columns_18 = ['pao_text', 'street_description','postcode_locator']

merged_on_address18_match, merged_on_address18_no_match = my_merge(merged_on_address17_no_match, resi_AB, left_columns_18, right_columns_18)

merged_on_address18_match['UCL_ID'].nunique()

2

In [487]:
merged_on_address18_match = merged_on_address18_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address18_match['match_strategy'] = 'address_18'

frames = [all_matched, merged_on_address18_match]

all_matched = pd.concat(frames)
merged_on_address18_no_match = merged_on_address18_no_match.dropna(axis=1, how='all')

In [488]:
#Strategy 19 
left_columns_19 = ['site_number_clean', 'site_name_clean','postcode_clean']
right_columns_19 = ['pao_start_number', 'building_name','postcode_locator']

merged_on_address19_match, merged_on_address19_no_match = my_merge(merged_on_address18_no_match, resi_AB, left_columns_19, right_columns_19)

merged_on_address19_match['UCL_ID'].nunique()

0

In [489]:
merged_on_address19_match = merged_on_address19_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address19_match['match_strategy'] = 'address_19'

frames = [all_matched, merged_on_address19_match]

all_matched = pd.concat(frames)
merged_on_address19_no_match = merged_on_address19_no_match.dropna(axis=1, how='all')

In [490]:
#Strategy 20 
left_columns_20 = ['site_number_clean', 'site_name_clean','postcode_clean']
right_columns_20 = ['building_number', 'building_name','postcode_locator']

merged_on_address20_match, merged_on_address20_no_match = my_merge(merged_on_address19_no_match, resi_AB, left_columns_20, right_columns_20)

merged_on_address20_match['UCL_ID'].nunique()

0

In [494]:
merged_on_address20_match = merged_on_address20_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address20_match['match_strategy'] = 'address_20'

frames = [all_matched, merged_on_address20_match]

all_matched = pd.concat(frames)
merged_on_address20_no_match = merged_on_address20_no_match.dropna(axis=1, how='all')

In [None]:
## Strategy 14: No postcode sector, match on administrative area but replace the & with the 'AND' for BARKING & DAGENHAM

In [495]:
#print match rate
resi_match_rate = 100 * all_matched['UCL_ID'].nunique() / london_data['UCL_ID'].nunique()

print('Resi Match rate: ', resi_match_rate, ' from record count: ', all_matched['UCL_ID'].nunique()) 

Resi Match rate:  35.78431372549019  from record count:  73


------ specific address analysis ----- 

In [142]:
merged_on_address15_no_match[merged_on_address15_no_match['UCL_ID'] == 246]

Unnamed: 0,UCL_ID,planning_application_number,lpa_name,application_type,application_type_full,description,number_of_units,site_number_clean,street_name,postcode_clean,...,dependent_locality,locality,town_name,administrative_area,post_town,postcode,postcode_locator,postcode_sector,parsed_address1,parsed_address2
35,246,20/01567/PRIOFF,BARKING & DAGENHAM,Prior Approval,,Application for prior approval: Notification o...,6.0,7,BACK LANE,RM6 4BP,...,,,,,,,,,,


In [236]:
non_resi_address_merge_no_match15.to_csv('no_match_batch1b_may30.csv', index = False) #	005300 082025	

In [163]:
resi_AB[resi_AB['parent_uprn']== '000202220323']

Unnamed: 0,uprn,class,parent_uprn,latitude,longitude,country,legal_name,sub_building_name,building_name,building_number,...,dependent_locality,locality,town_name,administrative_area,post_town,postcode,postcode_locator,postcode_sector,parsed_address1,parsed_address2


In [263]:
resi_AB[(resi_AB['postcode_sector'] == 'TW5 9') & (resi_AB['street_description'] == 'BATH ROAD')].to_csv()

Unnamed: 0,uprn,class,parent_uprn,latitude,longitude,country,legal_name,sub_building_name,building_name,building_number,...,locality,town_name,administrative_area,post_town,postcode,postcode_locator,postcode_sector,parsed_address1,parsed_address2,pao_start_num_suffix
1217,100021539712,RD06,100023405589,51.477920,-0.405469,E,0.0,0,0,0,...,0,HOUNSLOW,HOUNSLOW,0,0,TW5 9TL,TW5 9,"FLAT, 624, BATH ROAD",BATH ROAD,624
38040,010093768587,RD06,010093768585,51.479750,-0.412405,E,0.0,0,0,0,...,0,HOUNSLOW,HOUNSLOW,0,0,TW5 9TY,TW5 9,"FLAT 2, 768, BATH ROAD",BATH ROAD,768B
102316,200003978414,RD06,200003985976,51.477969,-0.405638,E,0.0,0,0,0,...,0,HOUNSLOW,HOUNSLOW,0,0,TW5 9TL,TW5 9,"FLAT, 628, BATH ROAD",BATH ROAD,628A
124228,010091693743,RD06,100021539703,51.477123,-0.403297,E,0.0,0,0,0,...,0,HOUNSLOW,HOUNSLOW,0,0,TW5 9UX,TW5 9,"FLAT FIRST FLOOR, 568, BATH ROAD",BATH ROAD,568
180211,010091069046,RD06,100021539736,51.478403,-0.405332,E,0.0,0,0,0,...,0,HOUNSLOW,HOUNSLOW,0,0,TW5 9SR,TW5 9,"FLAT FIRST FLOOR RIGHT, 687, BATH ROAD",BATH ROAD,687
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17000434,100021539674,RD03,000000000000,51.476151,-0.399416,E,0.0,0,0,506,...,0,HOUNSLOW,HOUNSLOW,HOUNSLOW,TW5 9UP,TW5 9UP,TW5 9,"506, BATH ROAD","506, BATH ROAD",506
17223772,100021539908,RD06,100023661721,51.478795,-0.406723,E,0.0,0,13 TREFUSIS COURT,0,...,0,HOUNSLOW,HOUNSLOW,HOUNSLOW,TW5 9SY,TW5 9SY,TW5 9,"TREFUSIS COURT, BATH ROAD","13 TREFUSIS COURT, BATH ROAD",
17238277,100021539902,RD06,100023661721,51.478795,-0.406723,E,0.0,0,7 TREFUSIS COURT,0,...,0,HOUNSLOW,HOUNSLOW,HOUNSLOW,TW5 9SY,TW5 9SY,TW5 9,"TREFUSIS COURT, BATH ROAD","7 TREFUSIS COURT, BATH ROAD",
17640205,010014073678,RD06,100023405715,51.479726,-0.412021,E,0.0,0,758B,0,...,0,HOUNSLOW,HOUNSLOW,HOUNSLOW,TW5 9TY,TW5 9TY,TW5 9,"FLAT B, 758, BATH ROAD","758B, BATH ROAD",758


In [None]:
resi_AB[other_AB['postcode_locator'] == 'TW5 9AT']

In [246]:
other_AB[(other_AB['postcode_locator'] == 'HA0 4QS') & (other_AB['street_description'] == 'BOWRONS AVENUE')]

Unnamed: 0,uprn,class,parent_uprn,latitude,longitude,country,legal_name,sub_building_name,building_name,building_number,...,dependent_locality,locality,town_name,administrative_area,post_town,postcode,postcode_locator,postcode_sector,parsed_address1,parsed_address2
1029703,10025177788,CU01,0,51.544389,-0.302875,E,0.0,0,0,0,...,0,0,WEMBLEY,BRENT,0,0,HA0 4QS,HA0 4,ELECTRICITY SUB STATION 66M FROM 95 NORTON ROA...,BOWRONS AVENUE
1309948,202228481,CR,0,51.54493,-0.29745,E,0.0,0,0,0,...,0,0,WEMBLEY,BRENT,0,0,HA0 4QS,HA0 4,"UNIT 1, 1, BOWRONS AVENUE",BOWRONS AVENUE
1561284,202238960,PP,0,51.544557,-0.302108,E,0.0,0,0,0,...,0,0,WEMBLEY,BRENT,0,0,HA0 4QS,HA0 4,"SHELL FOR STUDIO FLATS A TO F AND LOFT D, 39, ...",BOWRONS AVENUE
1569192,202228482,CR10,0,51.544931,-0.297508,E,0.0,0,0,0,...,0,0,WEMBLEY,BRENT,0,0,HA0 4QS,HA0 4,"UNIT 2, 1, BOWRONS AVENUE",BOWRONS AVENUE
3301459,202128619,PS,0,51.544703,-0.30227,E,0.0,0,0,0,...,0,0,WEMBLEY,BRENT,0,0,HA0 4QS,HA0 4,"STREET RECORD, BOWRONS AVENUE",BOWRONS AVENUE
3471091,202221907,PP,0,51.544579,-0.301732,E,0.0,0,0,85,...,0,0,WEMBLEY,BRENT,WEMBLEY,HA0 4QS,HA0 4QS,HA0 4,"85, BOWRONS AVENUE","85, BOWRONS AVENUE"
3641012,202220578,PP,0,51.544845,-0.297742,E,0.0,0,0,3,...,0,0,WEMBLEY,BRENT,WEMBLEY,HA0 4QS,HA0 4QS,HA0 4,"3, BOWRONS AVENUE","3, BOWRONS AVENUE"


In [241]:
resi_AB[(resi_AB['postcode_sector'] == 'HA0 4') & (resi_AB['street_description'] == 'BOWRONS AVENUE')].to_csv('output_HA04.csv', index  = False) # & (resi_AB['building_name'] == 'Apollo House') ] #& (other_AB['street_description'] == 'EAST STREET')]

In [242]:
other_AB[(other_AB['postcode_sector']== 'HA0 4') & (other_AB['street_description'] == 'BOWRONS AVENUE')].to_csv('other_HA04.csv', index = False)

In [171]:
seven_sisters.to_csv('seven_sisters.csv', index=False) 

-------- DATA MERGING --------  COMMERCIAL PROPERTIES --- Reproduce the process with commercial properties 


In [60]:
#Merge on Parent UPRN 
non_resi_merged = merged_on_address3_no_match.merge(other_AB,how = 'left', left_on = 'uprn_x', right_on = 'parent_uprn')

non_resi_match, non_resi_no_match =  separate_matches(non_resi_merged, 'parent_uprn', 'parent_uprn')

non_resi_no_match = non_resi_no_match.dropna(axis=1, how='all')

In [61]:
#merge on UPRN
non_resi_merged_2 = non_resi_no_match.merge(other_AB, how = 'left', left_on = 'uprn_x', right_on = 'uprn')

non_resi_match_2, non_resi_no_match_2 =  separate_matches(non_resi_merged_2, 'parent_uprn', 'uprn')

non_resi_no_match_2 = non_resi_no_match_2.dropna(axis=1, how='all')

In [62]:
non_resi_match_2 = non_resi_match_2.rename(columns={"uprn": "uprn_OSAB"})
#non_resi_match_2['match_strategy'] = 'uprn'
non_resi_match_2.count()

non_resi_match = non_resi_match.rename(columns={"uprn": "uprn_OSAB"})
#non_resi_match['match_strategy'] = 'parent_uprn'

In [63]:
#union them together 
non_resi_all_matched = pd.concat([non_resi_match_2, non_resi_match])

---- Commercial address matching --- 

In [496]:
#now join on addresses strat 1 
left_columns = ['site_name_GLA_no_pc', 'postcode_clean']  #['site_number_clean', 'street_name', 'postcode_clean']
right_columns = ['parsed_address1', 'postcode_locator'] #['building_number', 'street_description', 'postcode_locator']

#non_resi_address_merge_match, non_resi_address_merge_no_match = my_merge(non_resi_no_match_2, other_AB, left_columns, right_columns)

In [497]:
#use this one if skipping UPRN 
non_resi_address_merge_match, non_resi_address_merge_no_match = my_merge(merged_on_address20_no_match, other_AB, left_columns, right_columns)


In [498]:
non_resi_address_merge_no_match = non_resi_address_merge_no_match.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match = non_resi_address_merge_match.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match['match_strategy'] = 'address_1'

non_resi_all_matched = non_resi_address_merge_match

In [499]:
non_resi_address_merge_match['UCL_ID'].nunique()

0

In [None]:
#USE THIS FIRST TIME THRU 
# non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match])

In [500]:
#join addresses strat 2
left_columns_2 =  ['site_name_LPA_no_pc', 'postcode_clean'] 
right_columns_2 =  ['parsed_address1', 'postcode_locator']
 
non_resi_address_merge_match2, non_resi_address_merge_no_match2 = my_merge(non_resi_address_merge_no_match, other_AB, left_columns_2,right_columns_2 )

In [501]:
non_resi_address_merge_match2['UCL_ID'].nunique()

0

In [502]:
non_resi_address_merge_no_match2 = non_resi_address_merge_no_match2.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match2 = non_resi_address_merge_match2.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match2['match_strategy'] = 'address_2'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match2])

In [503]:
# STRATEGY 3 ADDRESS MATCH 
left_columns_3 = ['site_name_GLA_no_pc', 'postcode_clean'] 
right_columns_3 = ['parsed_address2', 'postcode_locator']


non_resi_address_merge_match3, non_resi_address_merge_no_match3 = my_merge(non_resi_address_merge_no_match2, other_AB, left_columns_3, right_columns_3)

In [504]:
non_resi_address_merge_match3['UCL_ID'].nunique()

0

In [505]:
non_resi_address_merge_no_match3 = non_resi_address_merge_no_match3.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match3 = non_resi_address_merge_match3.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match3['match_strategy'] = 'address_3'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match3])

------- ADDRESS ----- Strategy 4 COMM 

In [506]:
# STRATEGY 4 ADDRESS MATCH 
left_columns_4 = ['site_name_LPA_no_pc', 'postcode_clean'] 
right_columns_4 = ['parsed_address2', 'postcode_locator']


non_resi_address_merge_match4, non_resi_address_merge_no_match4 = my_merge(non_resi_address_merge_no_match3, other_AB, left_columns_4, right_columns_4)

In [507]:
non_resi_address_merge_match4['UCL_ID'].nunique()

0

In [508]:
non_resi_address_merge_no_match4 = non_resi_address_merge_no_match4.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match4 = non_resi_address_merge_match4.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match4['match_strategy'] = 'address_4'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match4])

In [509]:
#strategy 5 ADDRESS MATCH 
left_columns_5 = ['concat_addr', 'postcode_clean'] 
right_columns_5 = ['parsed_address1', 'postcode_locator']


non_resi_address_merge_match5, non_resi_address_merge_no_match5 = my_merge(non_resi_address_merge_no_match4, other_AB, left_columns_5, right_columns_5)


In [510]:
non_resi_address_merge_no_match5 = non_resi_address_merge_no_match5.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match5 = non_resi_address_merge_match5.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match5['match_strategy'] = 'address_5'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match5])

Strategy 6 other

In [511]:
## Strategy 6
left_columns_6 = ['concat_addr', 'postcode_clean']  
right_columns_6 = ['parsed_address2', 'postcode_locator'] 

non_resi_address_merge_match6, non_resi_address_merge_no_match6 = my_merge(non_resi_address_merge_no_match5, other_AB, left_columns_6, right_columns_6)

In [512]:
non_resi_address_merge_match6['UCL_ID'].nunique()

5

In [513]:
non_resi_address_merge_no_match6 = non_resi_address_merge_no_match6.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match6 = non_resi_address_merge_match6.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match6['match_strategy'] = 'address_6'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match6])

Strategy 7 Other

In [514]:
#strategy 7 ADDRESS MATCH 
left_columns_7 = ['site_number_clean', 'street_name', 'postcode_clean']
right_columns_7 = ['pao_start_num_suffix', 'street_description', 'postcode_locator'] 

non_resi_address_merge_match7, non_resi_address_merge_no_match7 = my_merge(non_resi_address_merge_no_match6, other_AB, left_columns_7, right_columns_7)


In [515]:
non_resi_address_merge_match7['UCL_ID'].nunique()

7

In [516]:
non_resi_address_merge_no_match7 = non_resi_address_merge_no_match7.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match7 = non_resi_address_merge_match7.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match7['match_strategy'] = 'address_7'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match7])

In [517]:
#strategy 8 ADDRESS MATCH 
left_columns_8 = ['site_number_clean', 'street_name', 'postcode_sector_x']
right_columns_8 = ['pao_start_num_suffix','street_description', 'postcode_sector']

non_resi_address_merge_match8, non_resi_address_merge_no_match8 = my_merge(non_resi_address_merge_no_match7, other_AB, left_columns_8, right_columns_8)

In [518]:
non_resi_address_merge_match8['UCL_ID'].nunique() #was 88

6

In [393]:
#other_AB[other_AB['postcode_locator'] == 'HA0 4QS']

In [519]:
non_resi_address_merge_no_match8 = non_resi_address_merge_no_match8.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match8 = non_resi_address_merge_match8.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match8['match_strategy'] = 'address_8'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match8])

In [520]:
#strategy 9 ADDRESS MATCH 
left_columns_9 = ['site_number_clean', 'street_name', 'postcode_clean']
right_columns_9 = ['pao_start_number', 'street_description', 'postcode_locator']

non_resi_address_merge_match9, non_resi_address_merge_no_match9 = my_merge(non_resi_address_merge_no_match8, other_AB, left_columns_9, right_columns_9)

In [521]:
non_resi_address_merge_match9['UCL_ID'].nunique() #was 88

0

In [522]:
non_resi_address_merge_no_match9 = non_resi_address_merge_no_match9.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match9 = non_resi_address_merge_match9.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match9['match_strategy'] = 'address_9'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match9])

In [523]:
#STRATEGY 10
left_columns_10 = ['site_number_clean', 'street_name', 'postcode_sector_x']
right_columns_10 = ['pao_start_number', 'street_description', 'postcode_sector']

non_resi_address_merge_match10, non_resi_address_merge_no_match10 = my_merge(non_resi_address_merge_no_match9, other_AB, left_columns_10, right_columns_10)

In [524]:
non_resi_address_merge_match10['UCL_ID'].nunique() #was 88

0

In [525]:
#STRATEGY 11
left_columns_11 = ['site_name_GLA_no_pc',  'postcode_sector_x']
right_columns_11 = ['parsed_address1',  'postcode_sector']

non_resi_address_merge_match11, non_resi_address_merge_no_match11 = my_merge(non_resi_address_merge_no_match9, other_AB, left_columns_11, right_columns_11)

In [526]:
non_resi_address_merge_match11['UCL_ID'].nunique() #was 88

0

In [527]:
non_resi_address_merge_no_match11 = non_resi_address_merge_no_match11.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match11 = non_resi_address_merge_match11.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match11['match_strategy'] = 'address_11'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match11])

In [528]:
#STRATEGY 12
left_columns_12 = ['site_name_LPA_no_pc',  'postcode_sector_x']
right_columns_12 = ['parsed_address1',  'postcode_sector']

non_resi_address_merge_match12, non_resi_address_merge_no_match12 = my_merge(non_resi_address_merge_no_match11, other_AB, left_columns_12, right_columns_12)

In [529]:
non_resi_address_merge_match12['UCL_ID'].nunique() #was 88

0

In [530]:
non_resi_address_merge_no_match12 = non_resi_address_merge_no_match12.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match12 = non_resi_address_merge_match12.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match12['match_strategy'] = 'address_12'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match12])

In [531]:
#STRATEGY 13
left_columns_13 = ['site_name_GLA_no_pc',  'postcode_sector_x']
right_columns_13 = ['parsed_address2',  'postcode_sector']

non_resi_address_merge_match13, non_resi_address_merge_no_match13 = my_merge(non_resi_address_merge_no_match12, other_AB, left_columns_13, right_columns_13)

In [532]:
non_resi_address_merge_match13['UCL_ID'].nunique() #was 88

0

In [533]:
non_resi_address_merge_no_match13 = non_resi_address_merge_no_match13.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match13 = non_resi_address_merge_match13.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match13['match_strategy'] = 'address_13'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match13])

In [534]:
#STRATEGY 14
left_columns_14 = ['site_name_LPA_no_pc',  'postcode_sector_x'] 
right_columns_14 = ['parsed_address2',  'postcode_sector']

non_resi_address_merge_match14, non_resi_address_merge_no_match14 = my_merge(non_resi_address_merge_no_match13, other_AB, left_columns_14, right_columns_14)


In [535]:
non_resi_address_merge_match14['UCL_ID'].nunique() #was 88

0

In [536]:
non_resi_address_merge_no_match14 = non_resi_address_merge_no_match14.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match14 = non_resi_address_merge_match14.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match14['match_strategy'] = 'address_14'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match14])

In [537]:
#STRATEGY 15
left_columns_15 = ['site_number_clean', 'street_name',  'postcode_sector_x']
right_columns_15 = ['pao_start_number', 'street_description',   'postcode_sector']

non_resi_address_merge_match15, non_resi_address_merge_no_match15 = my_merge(non_resi_address_merge_no_match14, other_AB, left_columns_15, right_columns_15)


In [538]:
non_resi_address_merge_match15['UCL_ID'].nunique() #was 88

0

In [539]:
non_resi_address_merge_no_match15 = non_resi_address_merge_no_match15.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match15 = non_resi_address_merge_match15.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match15['match_strategy'] = 'address_15'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match15])

In [540]:
#Strategy 16
left_columns_16 = ['concat_addr', 'lpa_name'] 
right_columns_16 = ['parsed_address1', 'administrative_area']

non_resi_address_merge_match16, non_resi_address_merge_no_match16 = my_merge(non_resi_address_merge_no_match15, other_AB, left_columns_16, right_columns_16)
non_resi_address_merge_match16['UCL_ID'].nunique()

3

In [541]:
non_resi_address_merge_no_match16 = non_resi_address_merge_no_match16.dropna(axis=1, how='all')
non_resi_address_merge_match16 = non_resi_address_merge_match16.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match16['match_strategy'] = 'address_16'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match16])

In [542]:
#strategy 17 
left_columns_17 = ['concat_addr', 'lpa_name'] 
right_columns_17 = ['parsed_address2', 'administrative_area']

non_resi_address_merge_match17, non_resi_address_merge_no_match17 = my_merge(non_resi_address_merge_no_match16, other_AB, left_columns_17, right_columns_17)


In [543]:
non_resi_address_merge_match17['UCL_ID'].nunique()

3

In [544]:
non_resi_address_merge_no_match17 = non_resi_address_merge_no_match17.dropna(axis=1, how='all')
non_resi_address_merge_match17 = non_resi_address_merge_match17.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match17['match_strategy'] = 'address_17'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match17])

In [545]:
#strategy 18
left_columns_18 = ['site_name_clean', 'street_name','postcode_clean']
right_columns_18 = ['pao_text', 'street_description','postcode_locator']

non_resi_address_merge_match18, non_resi_address_merge_no_match18 = my_merge(non_resi_address_merge_no_match17, other_AB, left_columns_18, right_columns_18)
non_resi_address_merge_match18['UCL_ID'].nunique()

3

In [546]:
non_resi_address_merge_no_match18 = non_resi_address_merge_no_match18.dropna(axis=1, how='all')
non_resi_address_merge_match18 = non_resi_address_merge_match18.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match18['match_strategy'] = 'address_18'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match18])

In [547]:
#strategy 19
left_columns_19 = ['site_number_clean', 'site_name_clean','postcode_clean']
right_columns_19 = ['pao_start_number', 'building_name','postcode_locator']

non_resi_address_merge_match19, non_resi_address_merge_no_match19 = my_merge(non_resi_address_merge_no_match18, other_AB, left_columns_19, right_columns_19)
non_resi_address_merge_match19['UCL_ID'].nunique()

0

In [548]:
non_resi_address_merge_no_match19 = non_resi_address_merge_no_match19.dropna(axis=1, how='all')
non_resi_address_merge_match19 = non_resi_address_merge_match19.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match19['match_strategy'] = 'address_19'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match19])

In [549]:
non_resi_address_merge_no_match17.to_csv('no_match_june4.csv', index = False)

----- specific address analysis  ---- 

In [71]:
## ------------------------------- Post Match  -------------------------------

In [550]:
#print match rate
resi_match_rate = 100 * all_matched['UCL_ID'].nunique() / london_data['UCL_ID'].nunique()

print('Resi Match rate: ', resi_match_rate) 

Resi Match rate:  35.78431372549019


In [551]:
all_matched['UCL_ID'].nunique()

73

In [552]:
other_match_rate = 100 * non_resi_all_matched['UCL_ID'].nunique() / london_data['UCL_ID'].nunique()

print('Other Match rate: ', other_match_rate) 

Other Match rate:  22.058823529411764


In [553]:
non_resi_all_matched['UCL_ID'].nunique() 

45

In [555]:
non_resi_address_merge_no_match19['UCL_ID'].nunique()

136

In [556]:
#before export, make all these columns blank instead of 0 
#'legal_name', 'sub_building_name', 'building_name','building_number','street_description', 'dependent_locality	locality
all_matched['street_name'] = all_matched['street_name'].replace(0,'')
all_matched['legal_name'] = all_matched['legal_name'].replace(0,'')
all_matched['sub_building_name'] = all_matched['sub_building_name'].replace(0,'')
all_matched['building_name'] = all_matched['building_name'].replace(0,'')
all_matched['building_number'] = all_matched['building_number'].replace(0,'')
all_matched['street_description'] = all_matched['street_description'].replace(0,'')
all_matched['dependent_locality'] = all_matched['dependent_locality'].replace(0,'')
all_matched['locality'] = all_matched['locality'].replace(0,'')
all_matched['post_town'] = all_matched['post_town'].replace(0,'')
all_matched['postcode'] = np.where(all_matched['postcode'] == 0, all_matched['postcode_clean'], all_matched['postcode'])


In [557]:
#write out to csv 

all_matched.to_csv('London1c_address_matched_0406.csv', index=False) 

non_resi_all_matched.to_csv('London1c_non_resi_match_0406.csv', index=False)  

In [558]:
non_resi_address_merge_no_match6.to_csv('London1c_no_match_0406.csv', index = False)

In [129]:
#### some analysis on past records
#London2_no_match_2603.csv
#London2_non_resi_match_2603.csv
#London2_address_matched_2603.csv

batch2_no_match = pd.read_csv('data/London2_no_match_2603.csv')

In [130]:
batch2_non_resi = pd.read_csv('data/London2_non_resi_match_2603.csv')
batch2_resi = pd.read_csv('data/London2_address_matched_2603.csv')

In [133]:
batch2_resi.nunique() #1752 no match , 676 non resi , 729 resi

ID                              729
planning_application_number     990
lpa_name                         37
application_type                  1
application_type_full            17
description                     943
number_of_units                  59
site_number_clean               330
street_name                     537
postcode_clean                  695
site_name_clean                 156
site_name_GLA                   747
site_name_LPA                   839
uprn_x                          228
decision                         57
status                           10
application_date                585
decision_date                   667
parsed_street_LPA               345
parsed_street_GLA                17
uprn_OSAB                      5560
class                            10
parent_uprn                     477
latitude                        980
longitude                       979
country                           1
legal_name                        0
sub_building_name           