In [1]:
""" Script to match planning applications to OSAddressBase addresses

This script first loads the OSAddress Base data (separated into residential and other uses in separate script)
and then loads the planning application data scraped from local planning authority websites.

The matching methodology is described in an additional document, and uses multiple strategies to match the planning
application data records to residential addresses first, and then to other uses. It outputs a match rate that summarizes
how many records matched to each type of address. 

Written by: Christine Langston, April - June 2024
"""
import pandas as pd
import time
import numpy as np
import copy
import re

# Define functions

In [2]:
#takes a merged dataset and separates matched addresses and not matched
def separate_matches(dataset, column_name, match_strategy):
    dataset_match = dataset.drop(dataset[pd.isna(dataset[column_name]) == True].index)
    dataset_no_match = dataset.drop(dataset[pd.isna(dataset[column_name]) == False].index)
    dataset_match['match_strategy'] = match_strategy
    return dataset_match, dataset_no_match

In [3]:
#function to merge and then split based on if an address was matched  
def my_merge(left, right, left_on, right_on): 
    merged = left.merge(right, how = 'left', left_on = left_on, right_on = right_on)
    merged_match = merged.drop(merged[pd.isna(merged['uprn']) == True].index) 
    merged_no_match = merged.drop(merged[pd.isna(merged['uprn']) == False].index) 
    return merged_match, merged_no_match

# Load Data

In [4]:
%%time
#### READ IN THE DATA IF EXPORTED 
resi_AB = pd.read_csv('data/resi_AB_cleaned_parsed_140624.csv',na_values = '0', low_memory = False)


CPU times: user 5min 25s, sys: 15min 23s, total: 20min 49s
Wall time: 50min 37s


In [5]:
%%time
other_AB = pd.read_csv('data/other_AB_cleaned_parsed_140624.csv', na_values = '0',low_memory = False)



CPU times: user 40.9 s, sys: 9.68 s, total: 50.6 s
Wall time: 59.5 s


In [196]:
#load in the data that hasn't been matched yet
#london_data = pd.read_csv('data/London2_no_match_2603.csv',low_memory = False)
#london_data = pd.read_csv('data/London1_no_match_1903.csv',low_memory = False)
#london_data = pd.read_csv('London1b_no_match_0406_manual_clean.csv',low_memory = False)
england_data = pd.read_csv('data/Batch3b_no_match_1306_manual.csv', low_memory = False)

# Data Cleaning

Data cleaning steps to clean up the Nan and numerical data coming from the CSV

In [7]:
#edit the parent uprn from OSAddress base to fill out to be strings with 12 digits and leading zeros
#replace nan with empty string

resi_AB = resi_AB.replace([np.nan, -np.inf], 0)

#cast as integer
resi_AB['parent_uprn'] = resi_AB['parent_uprn'].astype('Int64')
resi_AB['uprn'] = resi_AB['uprn'].astype('Int64')

In [8]:
#cast as string 
resi_AB['parent_uprn'] = resi_AB['parent_uprn'].astype(str)
resi_AB['uprn'] = resi_AB['uprn'].astype(str)

#fill in with left side padding zeros 
resi_AB['parent_uprn'] = resi_AB['parent_uprn'].apply(lambda x: '{0:0>12}'.format(x))
resi_AB['uprn'] = resi_AB['uprn'].apply(lambda x: '{0:0>12}'.format(x))


In [9]:
#do the same cleaning as above but for the OTHER addresses
other_AB = other_AB.replace([np.nan, -np.inf], 0)

#cast as integer
other_AB['parent_uprn'] = other_AB['parent_uprn'].astype('Int64')
other_AB['uprn'] = other_AB['uprn'].astype('Int64')

In [10]:
#cast as string 
other_AB['parent_uprn'] = other_AB['parent_uprn'].astype(str)
other_AB['uprn'] = other_AB['uprn'].astype(str)

#fill in with left side padding zeros 
other_AB['parent_uprn'] = other_AB['parent_uprn'].apply(lambda x: '{0:0>12}'.format(x))
other_AB['uprn'] = other_AB['uprn'].apply(lambda x: '{0:0>12}'.format(x))


Recast data columns as integers and then strings where needed to eliminate decimals

In [11]:
#Address matching data clean, make building number into a string 
resi_AB['building_number'] = resi_AB['building_number'].astype('Int64').astype('str') 

In [12]:
resi_AB['pao_start_number'] = resi_AB['pao_start_number'].astype('Int64').astype('str') 


In [13]:
resi_AB['pao_start_suffix'] = resi_AB.apply(lambda x: np.where(x['pao_start_suffix']== 0, '', x['pao_start_suffix']), axis = 1)

In [14]:
resi_AB['pao_start_number'] = resi_AB.apply(lambda x: np.where(x['pao_start_number']== '0', '', x['pao_start_number'] )  , axis = 1)

In [15]:
resi_AB['pao_start_number'] = resi_AB['pao_start_number'].replace('0','')

In [16]:
resi_AB['pao_start_number'] = resi_AB['pao_start_number'].astype(str)

In [17]:
resi_AB['pao_start_suffix'] = resi_AB['pao_start_suffix'].astype(str)

In [18]:
# create new column 'pao_start_num_suffix' with the pao_start_number plus pa_start_suffix 
resi_AB['pao_start_num_suffix'] = resi_AB['pao_start_number'].astype(str) + resi_AB['pao_start_suffix'].astype(str)

In [19]:
#make all nan into 0
#then cast into string 
resi_AB['building_name'] = resi_AB['building_name'].replace([np.nan, -np.inf], 0).astype(str)

In [20]:
#then make all 0 string into nan 
resi_AB['building_name'] = resi_AB['building_name'].replace('0',np.nan)

In [21]:
#make all nan into 0
#then cast into string 
resi_AB['town_name'] = resi_AB['town_name'].replace([np.nan, -np.inf], 0).astype(str)

In [22]:
resi_AB['town_name'] = resi_AB['town_name'].replace('0',np.nan)

In [23]:
#make all nan into 0
#then cast into string 
resi_AB['post_town'] = resi_AB['post_town'].replace([np.nan, -np.inf], 0).astype(str)

In [24]:
resi_AB['post_town'] = resi_AB['post_town'].replace('0',np.nan)

In [26]:
resi_AB.head()

Unnamed: 0,uprn,class,parent_uprn,latitude,longitude,country,legal_name,sub_building_name,building_name,building_number,...,town_name,administrative_area,post_town,postcode,postcode_locator,parsed_address1,parsed_address2,parsed_address3,postcode_sector,pao_start_num_suffix
0,10012778289,RD02,0,54.036005,-2.340833,E,0.0,0,,0,...,SETTLE,NORTH YORKSHIRE,,0,BD24 0LP,"SCOUTBER END FARM, OLD OLIVER LANE TO RAGGED HALL",OLD OLIVER LANE TO RAGGED HALL,", SETTLE,",BD24 0,
1,10013819934,RG02,10013819576,54.780687,-1.510651,E,0.0,0,,0,...,SHERBURN,DURHAM,,0,DH6 1JH,"GARAGE SITE 53, DOWSEY ROAD",DOWSEY ROAD,", SHERBURN,",DH6 1,
2,10014309637,RD04,200003655499,51.26651,0.497553,E,0.0,0,,0,...,MAIDSTONE,KENT,,0,ME16 8LD,"ROOM 1, 85, MILTON STREET",MILTON STREET,", MAIDSTONE,",ME16 8,85.0
3,10033213520,RD01,10002821654,53.903252,-0.158754,E,0.0,0,,0,...,HORNSEA,EAST RIDING OF YORKSHIRE,,0,HU18 1TL,"33 SOUTHFIELD, LONGBEACH LEISURE PARK, HORNSEA...",HORNSEA BURTON ROAD,", HORNSEA,",HU18 1,
4,10033545757,RD06,100023479131,51.514983,-0.17926,E,0.0,0,,0,...,LONDON,CITY OF WESTMINSTER,,0,W2 3UJ,"SECOND FLOOR, 58, WESTBOURNE TERRACE",WESTBOURNE TERRACE,", LONDON,",W2 3,58.0


Same data cleaning for Other AB

In [27]:
other_AB['building_number'] = other_AB['building_number'].astype('Int64').astype('str') 

In [28]:
other_AB['pao_start_number'] = other_AB['pao_start_number'].astype('Int64').astype('str') 

In [29]:
other_AB['pao_start_suffix'] = other_AB.apply(lambda x: np.where(x['pao_start_suffix']== 0, '', x['pao_start_suffix']), axis = 1)

In [30]:
other_AB['pao_start_suffix'] = other_AB['pao_start_suffix'].astype(str)

In [31]:
other_AB['pao_start_number'] = other_AB.apply(lambda x: np.where(x['pao_start_number']== '0', '', x['pao_start_number'] )  , axis = 1)

In [32]:
other_AB['pao_start_number'] = other_AB['pao_start_number'].astype(str)

In [33]:
# create new column 'pao_start_num_suffix' with the pao_start_number plus pa_start_suffix 

other_AB['pao_start_num_suffix'] = other_AB['pao_start_number'].astype(str) + other_AB['pao_start_suffix'].astype(str)

Data cleaning for planning application data

In [198]:
england_data.count()

UCL ID                                  5229
planning_application_number             5229
lpa_name                                5229
application_type                        5229
application_type_full                   2299
description                             5092
number_of_units                          186
manual_uprn                                2
postcode_clean                          4534
site_number_clean                       1830
site_name_clean                         3344
street_name                             3851
site_name_LPA                           5227
uprn_x                                   400
decision                                4292
status                                   707
application_date                        4466
decision_date                           4466
PD_type                                 5229
Full Address on Planning Application     388
FPP_PA_Mix                                51
parsed_street_LPA                        913
concat_add

In [35]:
#check that uprn_x has NaNs and integers
england_data['uprn_x']

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
       ... 
4665    NaN
4666    NaN
4667    NaN
4668    NaN
4669    NaN
Name: uprn_x, Length: 4670, dtype: object

In [36]:
#BATCH 1 ONLY london_data has extra quotation marks, need to remove
#london_data['uprn'] = london_data['uprn'].apply(lambda x: x.strip("''") if not pd.isna(x) else x)


In [199]:
#clean the uprn_x column
england_data['uprn_x'] = england_data['uprn_x'].astype('Int64', errors = 'ignore')

In [200]:
#cast the strings and make sure it looks correct 
england_data['uprn_x'] = england_data['uprn_x'].astype('str') #apply(lambda x: str(x) if not pd.isna(x) else x)

england_data['uprn_x'] = england_data['uprn_x'].apply(lambda x: '{0:0>12}'.format(x) if not pd.isna(x) else x)

In [201]:
#replace strings with Nan values
england_data = england_data.replace('000000000nan',np.NaN)

Ignore the creation of the columns below if columns already exist (such as when matching a batch for a second, third, etc. time)

In [273]:
#create street address from site_name_LPA
london_data['parsed_street_LPA'] = london_data['site_name_LPA'].apply(lambda x: re.findall("[0-9]+.-?.[0-9]+?\s(.+)(Road|Lane|Avenue|Parade|Courtyard|Street|Gardens|Drive)", x) if not pd.isna(x) and '-' in x
                                                                  else (re.findall("[0-9]+\s(.+)(Road|Lane|Avenue|Parade|Courtyard|Street|Gardens|Drive)", x) if not pd.isna(x) else []))


In [274]:
london_data['parsed_street_LPA'] =  london_data['parsed_street_LPA'].apply(lambda x: x[0][0] + x[0][1]  if len(x) > 0 else None)

Only execute fields with GLA if for Batch 1 or 2 (London Data)

In [275]:
#cleaning - street address
london_data['parsed_street_GLA'] = london_data['site_name_GLA'].apply(lambda x: re.findall("[0-9]+.-?.[0-9]+?\s(.+)(Road|Lane|Avenue|Parade|Courtyard|Street|Gardens|Drive)", x) if not pd.isna(x) and '-' in x
                                                                  else (re.findall("[0-9]+\s(.+)(Road|Lane|Avenue|Parade|Courtyard|Street|Gardens|Drive)", x) if not pd.isna(x) else []))


In [276]:
london_data['parsed_street_GLA'] =  london_data['parsed_street_GLA'].apply(lambda x: x[0][0] + x[0][1]  if len(x) > 0 else None)

In [277]:
#if the original street was empty, then we want to use the LPA Or GLA parsed name as street_name 

london_data['street_name'] = np.where(london_data['street_name'].isnull(), np.where(london_data['parsed_street_LPA'].isnull(), london_data['parsed_street_GLA'], london_data['parsed_street_LPA']), london_data['street_name'] )

Continue here to clean data for all batches

In [41]:
# create new column with the number, street description, and the site name 
england_data['concat_addr'] = np.where(england_data['site_name_clean'].isnull(), '', england_data['site_name_clean'] +  ', ' ) + england_data['site_number_clean'] + ', ' +  england_data['street_name']

In [42]:
england_data['concat_addr'] = england_data['concat_addr'].str.upper()

In [43]:
#make street names all into capitals to regularize
england_data['street_name'] = england_data['street_name'].str.upper()

In [44]:
#remove unwanted characters
england_data['postcode_clean'] = england_data['postcode_clean'].replace('x000D__x000D_\n', '')

In [45]:
england_data = england_data.replace({'_x000D__x000D_\n': ' ', '_x000d__x000d_\n': ' ' }, regex = True)

In [212]:
#remove white space
england_data['postcode_clean'] = england_data['postcode_clean'].apply(lambda x: str(x).strip())

In [213]:
#add column with postcode sector
england_data['postcode_sector'] = england_data['postcode_clean'].apply(lambda x: x[:-2])

In [48]:
#substring of site_name_GLA without the postcode .... 
london_data['site_name_GLA_no_pc'] = london_data.apply(lambda row: str(row['site_name_GLA']).upper().replace(', ' + row['postcode_clean'], ''), axis = 1)

NameError: name 'london_data' is not defined

In [49]:
england_data['site_name_LPA_no_pc'] = england_data.apply(lambda row: str(row['site_name_LPA']).replace(row['postcode_clean'], '').rstrip(', ').upper(), axis = 1)


# Data matching

### Residential Class matching - Use only the addresses in AddressBase that are classified as Residential Uses

UPRN MATCHING: If reprocessing data that has already been matched, Ignore the Data matching on the UPRN / Parent UPRN

In [206]:
#### -------- DATA MERGING --------
#join the london data with the RESIDENTIAL AddressBase dataset on UPRN
merged = england_data.merge(resi_AB, how = 'left', left_on = 'uprn_x', right_on = 'parent_uprn')

In [207]:
merged['UCL ID'].nunique()

4670

In [33]:
#separate merged into no match and match 
merged_match, merged_no_match = separate_matches(merged, 'parent_uprn', 'parent_uprn')

merged_no_match = merged_no_match.dropna(axis=1, how='all')

In [34]:
#merge 2 on uprn not parent_uprn 
merged_2 = merged_no_match.merge(resi_AB, how = 'left', left_on = 'uprn_x', right_on = 'uprn')

In [35]:
#separate the merge 2 into two datasets for match v not match 
merged_2_match, merged_2_no_match = separate_matches(merged_2, 'uprn', 'uprn')

merged_2_no_match = merged_2_no_match.dropna(axis=1, how='all')      

In [36]:
merged_2_match = merged_2_match.rename(columns={"uprn": "uprn_OSAB"})
merged_match = merged_match.rename(columns={"uprn_y": "uprn_OSAB"})

#merged_2_match.count()

In [37]:
all_matched = pd.concat([merged_2_match, merged_match])


ADDRESS MATCHING: For all matching versions: Address matching starts here

In [215]:
# drop the extra column postcode_sector_x
england_data = england_data.drop(columns=['postcode_sector_x'])

In [231]:
merged_2_no_match = england_data

-------- DATA MERGING --------  STRATEGY 1 ADDRESS MATCH

Skipped for Batch 3 because it uses a field that Batch 3 doesn't have, the site name GLA for London data only  
'site_name_GLA_no_pc' <> 'parsed_address1' and 'postcode_clean' <> 'postcode_locator'. 

In [232]:
#### ADDRESS Strategy 2 - use the site name LPA without post code ...
left_columns_2 = ['site_name_LPA_no_pc', 'postcode_clean'] #London Data 
right_columns_2 = ['parsed_address1', 'postcode_locator'] #AB

merged_on_address2_match, merged_on_address2_no_match = my_merge(merged_2_no_match, resi_AB, left_columns_2, right_columns_2)

merged_on_address2_match['UCL ID'].nunique()

0

In [233]:
merged_on_address2_match = merged_on_address2_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address2_match['match_strategy'] = 'address_2'

all_matched =  merged_on_address2_match

merged_on_address2_no_match = merged_on_address2_no_match.dropna(axis=1, how='all')


-------- DATA MERGING: STRATEGY 3 ADDRESS MATCH

Skipped for Batch 3 because it uses a field that Batch 3 doesn't have, the site name GLA for London data only  
'site_name_GLA_no_pc' <> 'parsed_address2' and 'postcode_clean' <> 'postcode_locator'. 

In [234]:
#### ADDRESS Strategy 4 
left_columns_4 = ['site_name_LPA_no_pc', 'postcode_clean']  #London Data 
right_columns_4 = ['parsed_address2', 'postcode_locator'] #AB

merged_on_address4_match, merged_on_address4_no_match = my_merge(merged_on_address2_no_match, resi_AB, left_columns_4, right_columns_4)
merged_on_address4_match['UCL ID'].nunique()

0

In [235]:
merged_on_address4_match = merged_on_address4_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address4_match['match_strategy'] = 'address_4'

frames = [all_matched, merged_on_address4_match]

all_matched = pd.concat(frames)
merged_on_address4_no_match = merged_on_address4_no_match.dropna(axis=1, how='all')

In [236]:
## Strategy 5 
#match on the street number, street name, postcode 

left_columns_5 = ['concat_addr', 'postcode_clean']
right_columns_5 = ['parsed_address1', 'postcode_locator']

merged_on_address5_match, merged_on_address5_no_match = my_merge(merged_on_address4_no_match, resi_AB, left_columns_5, right_columns_5)
merged_on_address5_match['UCL ID'].nunique()

0

In [237]:
merged_on_address5_match = merged_on_address5_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address5_match['match_strategy'] = 'address_5'

frames = [all_matched, merged_on_address5_match]

all_matched = pd.concat(frames)
merged_on_address5_no_match = merged_on_address5_no_match.dropna(axis=1, how='all')

In [238]:
## Strategy 6
left_columns_6 = ['concat_addr', 'postcode_clean']
right_columns_6 = ['parsed_address2', 'postcode_locator']

merged_on_address6_match, merged_on_address6_no_match = my_merge(merged_on_address5_no_match, resi_AB, left_columns_6, right_columns_6)

In [239]:
merged_on_address6_match['UCL ID'].nunique()

0

In [240]:
merged_on_address6_match = merged_on_address6_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address6_match['match_strategy'] = 'address_6'

frames = [all_matched, merged_on_address6_match]

all_matched = pd.concat(frames)
merged_on_address6_no_match = merged_on_address6_no_match.dropna(axis=1, how='all')

In [241]:
## Strategy 7
left_columns_7 = ['site_number_clean', 'street_name', 'postcode_clean']
right_columns_7 = ['pao_start_num_suffix', 'street_description', 'postcode_locator']

merged_on_address7_match, merged_on_address7_no_match = my_merge(merged_on_address6_no_match, resi_AB, left_columns_7, right_columns_7)

In [242]:
merged_on_address7_match['UCL ID'].nunique()

156

In [243]:
merged_on_address7_match = merged_on_address7_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address7_match['match_strategy'] = 'address_7'

frames = [all_matched, merged_on_address7_match]

all_matched = pd.concat(frames)
merged_on_address7_no_match = merged_on_address7_no_match.dropna(axis=1, how='all')

In [244]:
### strategy 8 
left_columns_8 = ['site_number_clean', 'street_name', 'postcode_sector_x']
right_columns_8 = ['pao_start_num_suffix', 'street_description', 'postcode_sector']

merged_on_address8_match, merged_on_address8_no_match = my_merge(merged_on_address7_no_match, resi_AB, left_columns_8, right_columns_8)


In [245]:
merged_on_address8_match['UCL ID'].nunique()

39

In [247]:
merged_on_address8_match = merged_on_address8_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address8_match['match_strategy'] = 'address_8'

frames = [all_matched, merged_on_address8_match]

all_matched = pd.concat(frames)
merged_on_address8_no_match = merged_on_address8_no_match.dropna(axis=1, how='all')

In [248]:
### strategy 9 
left_columns_9 = ['site_number_clean', 'street_name', 'postcode_clean']
right_columns_9 = ['pao_start_number', 'street_description', 'postcode_locator']

merged_on_address9_match, merged_on_address9_no_match = my_merge(merged_on_address8_no_match, resi_AB, left_columns_9, right_columns_9)


In [249]:
merged_on_address9_match['UCL ID'].nunique()

26

In [250]:
merged_on_address9_match = merged_on_address9_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address9_match['match_strategy'] = 'address_9'

frames = [all_matched, merged_on_address9_match]

all_matched = pd.concat(frames)
merged_on_address9_no_match = merged_on_address9_no_match.dropna(axis=1, how='all')

In [251]:
### strategy 10 
left_columns_10 = ['site_number_clean', 'street_name', 'postcode_sector_x']
right_columns_10 = ['pao_start_number', 'street_description', 'postcode_sector']

merged_on_address10_match, merged_on_address10_no_match = my_merge(merged_on_address9_no_match, resi_AB, left_columns_10, right_columns_10)


In [252]:
merged_on_address10_match['UCL ID'].nunique()

9

In [253]:
merged_on_address10_match = merged_on_address10_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address10_match['match_strategy'] = 'address_10'

frames = [all_matched, merged_on_address10_match]

all_matched = pd.concat(frames)
merged_on_address10_no_match = merged_on_address10_no_match.dropna(axis=1, how='all')

Strategy 11: Skip for Batch 3 because London specific. Uses site_name_GLA field 

 'site_name_GLA_no_pc' <> 'parsed_address1'
 'postcode_sector_x' <> 'postcode_sector'


In [254]:
### strategy 12 
left_columns_12 = ['site_name_LPA_no_pc',  'postcode_sector_x']
right_columns_12 = ['parsed_address1',  'postcode_sector']

merged_on_address12_match, merged_on_address12_no_match = my_merge(merged_on_address10_no_match, resi_AB, left_columns_12, right_columns_12)


In [255]:
merged_on_address12_match['UCL ID'].nunique()

0

In [256]:
merged_on_address12_match = merged_on_address12_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address12_match['match_strategy'] = 'address_12'

frames = [all_matched, merged_on_address12_match]

all_matched = pd.concat(frames)
merged_on_address12_no_match = merged_on_address12_no_match.dropna(axis=1, how='all')

Strategy 13: Skip for Batch 3 because London specific. Uses site_name_GLA field 

 'site_name_GLA_no_pc' <> 'parsed_address2'
 'postcode_sector_x' <> 'postcode_sector'


In [257]:
### strategy 14
left_columns_14 = ['site_name_LPA_no_pc',  'postcode_sector_x']
right_columns_14 = ['parsed_address2',  'postcode_sector']

merged_on_address14_match, merged_on_address14_no_match = my_merge(merged_on_address12_no_match, resi_AB, left_columns_14, right_columns_14)


In [258]:
merged_on_address14_match['UCL ID'].nunique()

0

In [259]:
merged_on_address14_match = merged_on_address14_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address14_match['match_strategy'] = 'address_14'

frames = [all_matched, merged_on_address14_match]

all_matched = pd.concat(frames)
merged_on_address14_no_match = merged_on_address14_no_match.dropna(axis=1, how='all')

 Realized Strategy 15 is redundant 

### strategy 15
left_columns_15 = ['site_number_clean', 'street_name',  'postcode_sector_x']
right_columns_15 = ['pao_start_number', 'street_description',   'postcode_sector']

merged_on_address15_match, merged_on_address15_no_match = my_merge(merged_on_address14_no_match, resi_AB, left_columns_15, right_columns_15)


merged_on_address15_match['UCL ID'].nunique()

merged_on_address15_match = merged_on_address15_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address15_match['match_strategy'] = 'address_15'

frames = [all_matched, merged_on_address15_match]

all_matched = pd.concat(frames)
merged_on_address15_no_match = merged_on_address15_no_match.dropna(axis=1, how='all')

In [285]:
## Strategy 16 
left_columns_16 = ['concat_addr', 'lpa_name']
right_columns_16 = ['parsed_address1', 'administrative_area']

merged_on_address14_no_match['lpa_name'] = merged_on_address14_no_match['lpa_name'].str.upper()

merged_on_address16_match, merged_on_address16_no_match = my_merge(merged_on_address14_no_match, resi_AB, left_columns_16, right_columns_16)

merged_on_address16_match['UCL ID'].nunique()

22

In [286]:
merged_on_address16_match = merged_on_address16_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address16_match['match_strategy'] = 'address_16'

frames = [all_matched, merged_on_address16_match]

all_matched = pd.concat(frames)
merged_on_address16_no_match = merged_on_address16_no_match.dropna(axis=1, how='all')

In [287]:
## Strategy 17 
left_columns_17 = ['concat_addr', 'lpa_name']
right_columns_17 = ['parsed_address2', 'administrative_area']

merged_on_address17_match, merged_on_address17_no_match = my_merge(merged_on_address16_no_match, resi_AB, left_columns_17, right_columns_17)

merged_on_address17_match['UCL ID'].nunique()

0

In [288]:
merged_on_address17_match = merged_on_address17_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address17_match['match_strategy'] = 'address_17'

frames = [all_matched, merged_on_address17_match]

all_matched = pd.concat(frames)
merged_on_address17_no_match = merged_on_address17_no_match.dropna(axis=1, how='all')

In [289]:
#Strategy 18 
left_columns_18 = ['site_name_clean', 'street_name','postcode_clean']
right_columns_18 = ['pao_text', 'street_description','postcode_locator']

merged_on_address18_match, merged_on_address18_no_match = my_merge(merged_on_address17_no_match, resi_AB, left_columns_18, right_columns_18)

merged_on_address18_match['UCL ID'].nunique()

0

In [290]:
merged_on_address18_match = merged_on_address18_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address18_match['match_strategy'] = 'address_18'

frames = [all_matched, merged_on_address18_match]

all_matched = pd.concat(frames)
merged_on_address18_no_match = merged_on_address18_no_match.dropna(axis=1, how='all')

In [291]:
#Strategy 19 
left_columns_19 = ['site_number_clean', 'site_name_clean','postcode_clean']
right_columns_19 = ['pao_start_number', 'building_name','postcode_locator']

merged_on_address19_match, merged_on_address19_no_match = my_merge(merged_on_address18_no_match, resi_AB, left_columns_19, right_columns_19)

merged_on_address19_match['UCL ID'].nunique()

151

In [292]:
merged_on_address19_match = merged_on_address19_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address19_match['match_strategy'] = 'address_19'

frames = [all_matched, merged_on_address19_match]

all_matched = pd.concat(frames)
merged_on_address19_no_match = merged_on_address19_no_match.dropna(axis=1, how='all')

In [293]:
#Strategy 20 
left_columns_20 = ['site_number_clean', 'site_name_clean','postcode_clean']
right_columns_20 = ['building_number', 'building_name','postcode_locator']

merged_on_address20_match, merged_on_address20_no_match = my_merge(merged_on_address19_no_match, resi_AB, left_columns_20, right_columns_20)

merged_on_address20_match['UCL ID'].nunique()

5

In [294]:
merged_on_address20_match = merged_on_address20_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address20_match['match_strategy'] = 'address_20'

frames = [all_matched, merged_on_address20_match]

all_matched = pd.concat(frames)
merged_on_address20_no_match = merged_on_address20_no_match.dropna(axis=1, how='all')

In [295]:
#Strategy 21 
left_columns_21 = ['site_name_LPA_no_pc','postcode_clean']
right_columns_21 = ['parsed_address3','postcode_locator']
#'building_name', 'town_name', 'post_town'

merged_on_address21_match, merged_on_address21_no_match = my_merge(merged_on_address20_no_match, resi_AB, left_columns_21, right_columns_21)

merged_on_address21_match['UCL ID'].nunique()

0

In [296]:
merged_on_address21_match = merged_on_address21_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address21_match['match_strategy'] = 'address_21'

frames = [all_matched, merged_on_address21_match]

all_matched = pd.concat(frames)
merged_on_address21_no_match = merged_on_address21_no_match.dropna(axis=1, how='all')

In [297]:
#Strategy 22  - match the site name to building name 
left_columns_22 = ['site_name_clean', 'postcode_clean'] #['site_name_LPA_no_pc', 'postcode_clean'] #London Data 
right_columns_22 = ['building_name', 'postcode_locator'] # ['parsed_address1', 'postcode_locator'] #AB

#separate data on if the site_name_clean is empty 
no_site_name = merged_on_address21_no_match.drop(merged_on_address21_no_match[pd.isna(merged_on_address21_no_match['site_name_clean']) == False].index)
has_site_name = merged_on_address21_no_match.drop(merged_on_address21_no_match[pd.isna(merged_on_address21_no_match['site_name_clean']) == True].index)

has_site_name['site_name_clean'] = has_site_name['site_name_clean'].str.upper()

merged_on_address22_match, merged_on_address22_no_match = my_merge(has_site_name, resi_AB, left_columns_22, right_columns_22)

merged_on_address22_match['UCL ID'].nunique()

746

In [298]:
#add the no_site_name and no_match together.. 
merged_on_address22_match = merged_on_address22_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address22_match['match_strategy'] = 'address_22'

frames = [all_matched, merged_on_address22_match]

all_matched = pd.concat(frames)


In [299]:
#the no match will have the addresses without site name excluded at the start
merged_on_address22_no_match = pd.concat([no_site_name, merged_on_address22_no_match])

merged_on_address22_no_match = merged_on_address22_no_match.dropna(axis=1, how='all')

In [300]:
#Strategy 23  - match the site name to building name 
left_columns_23 = ['site_name_clean', 'postcode_clean'] #['site_name_LPA_no_pc', 'postcode_clean'] #London Data 
right_columns_23 = ['pao_text', 'postcode_locator'] # ['parsed_address1', 'postcode_locator'] #AB

#separate data on if the site_name_clean is empty 
no_site_name = merged_on_address22_no_match.drop(merged_on_address22_no_match[pd.isna(merged_on_address22_no_match['site_name_clean']) == False].index)
has_site_name = merged_on_address22_no_match.drop(merged_on_address22_no_match[pd.isna(merged_on_address22_no_match['site_name_clean']) == True].index)
has_site_name['site_name_clean'] = has_site_name['site_name_clean'].str.upper()


merged_on_address23_match, merged_on_address23_no_match = my_merge(has_site_name, resi_AB, left_columns_23, right_columns_23)

merged_on_address23_match['UCL ID'].nunique()

141

In [301]:
#add the no_site_name and no_match together.. 
merged_on_address23_match = merged_on_address23_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address23_match['match_strategy'] = 'address_23'

frames = [all_matched, merged_on_address23_match]

all_matched = pd.concat(frames)


In [302]:
#the no match will have the addresses without site name excluded at the start
merged_on_address23_no_match = pd.concat([no_site_name, merged_on_address23_no_match])

merged_on_address23_no_match = merged_on_address23_no_match.dropna(axis=1, how='all')

In [303]:
#print match rate
resi_match_rate = 100 * all_matched['UCL ID'].nunique() / england_data['UCL ID'].nunique()

print('Resi Match rate: ', resi_match_rate, ' from record count: ', all_matched['UCL ID'].nunique()) 

Resi Match rate:  27.301927194860813  from record count:  1275


In [None]:
#Resi Match rate:  21.798715203426124  from record count:  1018

------ specific address analysis ----- 

In [179]:
resi_AB[(resi_AB['postcode_sector'] == 'CA14 1') & (resi_AB['street_description'] == 'MAIN ROAD')].to_csv('main_CA141.csv')

In [180]:
#NG19 6PH
resi_AB[(resi_AB['postcode_locator'] == 'NG19 6PH')]

Unnamed: 0,uprn,class,parent_uprn,latitude,longitude,country,legal_name,sub_building_name,building_name,building_number,...,town_name,administrative_area,post_town,postcode,postcode_locator,parsed_address1,parsed_address2,parsed_address3,postcode_sector,pao_start_num_suffix
12722460,200003313031,RD02,0,53.153104,-1.241189,E,0.0,0,PENNIMENT HOUSE FARM,0,...,SUTTON IN ASHFIELD,NOTTINGHAMSHIRE,MANSFIELD,NG19 6PH,NG19 6PH,"PENNIMENT HOUSE FARM, PENNIMENT LANE","PENNIMENT HOUSE FARM, PENNIMENT LANE","PENNIMENT HOUSE FARM, SUTTON IN ASHFIELD, MANS...",NG19 6,
18906713,10012813264,RD02,0,53.15343,-1.240197,E,0.0,0,PENNIMENT LODGE FARM,0,...,MANSFIELD,NOTTINGHAMSHIRE,MANSFIELD,NG19 6PH,NG19 6PH,"PENNIMENT LODGE FARM, PENNIMENT LANE","PENNIMENT LODGE FARM, PENNIMENT LANE","PENNIMENT LODGE FARM, MANSFIELD, MANSFIELD",NG19 6,
19765929,10023934555,RD02,0,53.153353,-1.240856,E,0.0,0,CROFTERS COTTAGE,0,...,MANSFIELD,NOTTINGHAMSHIRE,MANSFIELD,NG19 6PH,NG19 6PH,"CROFTERS COTTAGE, PENNIMENT LANE","CROFTERS COTTAGE, PENNIMENT LANE","CROFTERS COTTAGE, MANSFIELD, MANSFIELD",NG19 6,
19928388,10023932725,RD02,0,53.153441,-1.240555,E,0.0,0,PENNIMENT BARN,0,...,MANSFIELD,NOTTINGHAMSHIRE,MANSFIELD,NG19 6PH,NG19 6PH,"PENNIMENT BARN, PENNIMENT LANE","PENNIMENT BARN, PENNIMENT LANE","PENNIMENT BARN, MANSFIELD, MANSFIELD",NG19 6,
20069712,10023931171,RD02,0,53.153481,-1.241123,E,0.0,0,OLD FORGE,0,...,MANSFIELD,NOTTINGHAMSHIRE,MANSFIELD,NG19 6PH,NG19 6PH,"THE OLD FORGE, PENNIMENT LANE","OLD FORGE, PENNIMENT LANE","OLD FORGE, MANSFIELD, MANSFIELD",NG19 6,


In [174]:
other_AB[other_AB['administrative_area'] == 'ALLERDALE']

Unnamed: 0,uprn,class,parent_uprn,latitude,longitude,country,legal_name,sub_building_name,building_name,building_number,...,town_name,administrative_area,post_town,postcode,postcode_locator,parsed_address1,parsed_address2,parsed_address3,postcode_sector,pao_start_num_suffix


In [246]:
other_AB[(other_AB['postcode_locator'] == 'HA0 4QS') & (other_AB['street_description'] == 'BOWRONS AVENUE')]

Unnamed: 0,uprn,class,parent_uprn,latitude,longitude,country,legal_name,sub_building_name,building_name,building_number,...,dependent_locality,locality,town_name,administrative_area,post_town,postcode,postcode_locator,postcode_sector,parsed_address1,parsed_address2
1029703,10025177788,CU01,0,51.544389,-0.302875,E,0.0,0,0,0,...,0,0,WEMBLEY,BRENT,0,0,HA0 4QS,HA0 4,ELECTRICITY SUB STATION 66M FROM 95 NORTON ROA...,BOWRONS AVENUE
1309948,202228481,CR,0,51.54493,-0.29745,E,0.0,0,0,0,...,0,0,WEMBLEY,BRENT,0,0,HA0 4QS,HA0 4,"UNIT 1, 1, BOWRONS AVENUE",BOWRONS AVENUE
1561284,202238960,PP,0,51.544557,-0.302108,E,0.0,0,0,0,...,0,0,WEMBLEY,BRENT,0,0,HA0 4QS,HA0 4,"SHELL FOR STUDIO FLATS A TO F AND LOFT D, 39, ...",BOWRONS AVENUE
1569192,202228482,CR10,0,51.544931,-0.297508,E,0.0,0,0,0,...,0,0,WEMBLEY,BRENT,0,0,HA0 4QS,HA0 4,"UNIT 2, 1, BOWRONS AVENUE",BOWRONS AVENUE
3301459,202128619,PS,0,51.544703,-0.30227,E,0.0,0,0,0,...,0,0,WEMBLEY,BRENT,0,0,HA0 4QS,HA0 4,"STREET RECORD, BOWRONS AVENUE",BOWRONS AVENUE
3471091,202221907,PP,0,51.544579,-0.301732,E,0.0,0,0,85,...,0,0,WEMBLEY,BRENT,WEMBLEY,HA0 4QS,HA0 4QS,HA0 4,"85, BOWRONS AVENUE","85, BOWRONS AVENUE"
3641012,202220578,PP,0,51.544845,-0.297742,E,0.0,0,0,3,...,0,0,WEMBLEY,BRENT,WEMBLEY,HA0 4QS,HA0 4QS,HA0 4,"3, BOWRONS AVENUE","3, BOWRONS AVENUE"


In [241]:
resi_AB[(resi_AB['postcode_sector'] == 'HA0 4') & (resi_AB['street_description'] == 'BOWRONS AVENUE')].to_csv('output_HA04.csv', index  = False) # & (resi_AB['building_name'] == 'Apollo House') ] #& (other_AB['street_description'] == 'EAST STREET')]

In [242]:
other_AB[(other_AB['postcode_sector']== 'HA0 4') & (other_AB['street_description'] == 'BOWRONS AVENUE')].to_csv('other_HA04.csv', index = False)

In [171]:
seven_sisters.to_csv('seven_sisters.csv', index=False) 

### Commercial Class matching - Use only the addresses in AddressBase that are classified as Commercial or other uses

Reproduce the process for Residential with commercial properties 


In [60]:
#Merge on Parent UPRN 
non_resi_merged = merged_on_address3_no_match.merge(other_AB,how = 'left', left_on = 'uprn_x', right_on = 'parent_uprn')

non_resi_match, non_resi_no_match =  separate_matches(non_resi_merged, 'parent_uprn', 'parent_uprn')

non_resi_no_match = non_resi_no_match.dropna(axis=1, how='all')

In [61]:
#merge on UPRN
non_resi_merged_2 = non_resi_no_match.merge(other_AB, how = 'left', left_on = 'uprn_x', right_on = 'uprn')

non_resi_match_2, non_resi_no_match_2 =  separate_matches(non_resi_merged_2, 'parent_uprn', 'uprn')

non_resi_no_match_2 = non_resi_no_match_2.dropna(axis=1, how='all')

In [62]:
non_resi_match_2 = non_resi_match_2.rename(columns={"uprn": "uprn_OSAB"})
#non_resi_match_2['match_strategy'] = 'uprn'
non_resi_match_2.count()

non_resi_match = non_resi_match.rename(columns={"uprn": "uprn_OSAB"})
#non_resi_match['match_strategy'] = 'parent_uprn'

In [63]:
#union them together 
non_resi_all_matched = pd.concat([non_resi_match_2, non_resi_match])

---- Commercial address matching --- 

Strategy 1: Skip for Batch 3 because it depends on data field only used for London: 'site_name_GLA'


'site_name_GLA_no_pc' <> 'parsed_address1' and
'postcode_clean' <> 'postcode_locator'


In [89]:
#join addresses strategy 2
left_columns_2 =  ['site_name_LPA_no_pc', 'postcode_clean'] 
right_columns_2 =  ['parsed_address1', 'postcode_locator']
 
non_resi_address_merge_match2, non_resi_address_merge_no_match2 = my_merge(merged_on_address20_no_match, other_AB, left_columns_2,right_columns_2 )
non_resi_address_merge_match2['UCL ID'].nunique()

NameError: name 'merged_on_address20_no_match' is not defined

In [125]:
non_resi_address_merge_no_match2 = non_resi_address_merge_no_match2.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match2 = non_resi_address_merge_match2.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match2['match_strategy'] = 'address_2'

non_resi_all_matched = non_resi_address_merge_match2

Strategy 3: Skip for Batch 3 because it depends on data field only used for London: 'site_name_GLA'
'site_name_GLA_no_pc' <> 'parsed_address2' and
'postcode_clean' <> 'postcode_locator'

In [126]:
# STRATEGY 4 ADDRESS MATCH 
left_columns_4 = ['site_name_LPA_no_pc', 'postcode_clean'] 
right_columns_4 = ['parsed_address2', 'postcode_locator']


non_resi_address_merge_match4, non_resi_address_merge_no_match4 = my_merge(non_resi_address_merge_no_match2, other_AB, left_columns_4, right_columns_4)

In [127]:
non_resi_address_merge_match4['UCL ID'].nunique()

0

In [128]:
non_resi_address_merge_no_match4 = non_resi_address_merge_no_match4.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match4 = non_resi_address_merge_match4.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match4['match_strategy'] = 'address_4'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match4])

In [129]:
#strategy 5 ADDRESS MATCH 
left_columns_5 = ['concat_addr', 'postcode_clean'] 
right_columns_5 = ['parsed_address1', 'postcode_locator']


non_resi_address_merge_match5, non_resi_address_merge_no_match5 = my_merge(non_resi_address_merge_no_match4, other_AB, left_columns_5, right_columns_5)


In [130]:
non_resi_address_merge_no_match5 = non_resi_address_merge_no_match5.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match5 = non_resi_address_merge_match5.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match5['match_strategy'] = 'address_5'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match5])

In [131]:
## Strategy 6
left_columns_6 = ['concat_addr', 'postcode_clean']  
right_columns_6 = ['parsed_address2', 'postcode_locator'] 

non_resi_address_merge_match6, non_resi_address_merge_no_match6 = my_merge(non_resi_address_merge_no_match5, other_AB, left_columns_6, right_columns_6)

In [132]:
non_resi_address_merge_match6['UCL ID'].nunique()

48

In [133]:
non_resi_address_merge_no_match6 = non_resi_address_merge_no_match6.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match6 = non_resi_address_merge_match6.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match6['match_strategy'] = 'address_6'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match6])

Strategy 7 Other

In [134]:
#strategy 7 ADDRESS MATCH 
left_columns_7 = ['site_number_clean', 'street_name', 'postcode_clean']
right_columns_7 = ['pao_start_num_suffix', 'street_description', 'postcode_locator'] 

non_resi_address_merge_match7, non_resi_address_merge_no_match7 = my_merge(non_resi_address_merge_no_match6, other_AB, left_columns_7, right_columns_7)


In [135]:
non_resi_address_merge_match7['UCL ID'].nunique()

38

In [136]:
non_resi_address_merge_no_match7 = non_resi_address_merge_no_match7.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match7 = non_resi_address_merge_match7.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match7['match_strategy'] = 'address_7'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match7])

In [137]:
#strategy 8 ADDRESS MATCH 
left_columns_8 = ['site_number_clean', 'street_name', 'postcode_sector_x']
right_columns_8 = ['pao_start_num_suffix','street_description', 'postcode_sector']

non_resi_address_merge_match8, non_resi_address_merge_no_match8 = my_merge(non_resi_address_merge_no_match7, other_AB, left_columns_8, right_columns_8)

In [138]:
non_resi_address_merge_match8['UCL ID'].nunique()

33

In [139]:
non_resi_address_merge_no_match8 = non_resi_address_merge_no_match8.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match8 = non_resi_address_merge_match8.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match8['match_strategy'] = 'address_8'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match8])

In [140]:
#strategy 9 ADDRESS MATCH 
left_columns_9 = ['site_number_clean', 'street_name', 'postcode_clean']
right_columns_9 = ['pao_start_number', 'street_description', 'postcode_locator']

non_resi_address_merge_match9, non_resi_address_merge_no_match9 = my_merge(non_resi_address_merge_no_match8, other_AB, left_columns_9, right_columns_9)

In [141]:
non_resi_address_merge_match9['UCL ID'].nunique() #was 88

0

In [142]:
non_resi_address_merge_no_match9 = non_resi_address_merge_no_match9.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match9 = non_resi_address_merge_match9.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match9['match_strategy'] = 'address_9'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match9])

In [143]:
#STRATEGY 10
left_columns_10 = ['site_number_clean', 'street_name', 'postcode_sector_x']
right_columns_10 = ['pao_start_number', 'street_description', 'postcode_sector']

non_resi_address_merge_match10, non_resi_address_merge_no_match10 = my_merge(non_resi_address_merge_no_match9, other_AB, left_columns_10, right_columns_10)

In [144]:
non_resi_address_merge_match10['UCL ID'].nunique() #was 88

1

In [146]:
non_resi_address_merge_no_match10 = non_resi_address_merge_no_match10.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match10 = non_resi_address_merge_match10.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match10['match_strategy'] = 'address_10'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match10])

In [147]:
#STRATEGY 12
left_columns_12 = ['site_name_LPA_no_pc',  'postcode_sector_x']
right_columns_12 = ['parsed_address1',  'postcode_sector']

non_resi_address_merge_match12, non_resi_address_merge_no_match12 = my_merge(non_resi_address_merge_no_match10, other_AB, left_columns_12, right_columns_12)

In [148]:
non_resi_address_merge_match12['UCL ID'].nunique() #was 88

0

In [149]:
non_resi_address_merge_no_match12 = non_resi_address_merge_no_match12.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match12 = non_resi_address_merge_match12.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match12['match_strategy'] = 'address_12'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match12])

In [150]:
#STRATEGY 14
left_columns_14 = ['site_name_LPA_no_pc',  'postcode_sector_x'] 
right_columns_14 = ['parsed_address2',  'postcode_sector']

non_resi_address_merge_match14, non_resi_address_merge_no_match14 = my_merge(non_resi_address_merge_no_match12, other_AB, left_columns_14, right_columns_14)


In [151]:
non_resi_address_merge_match14['UCL ID'].nunique() #was 88

0

In [152]:
non_resi_address_merge_no_match14 = non_resi_address_merge_no_match14.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match14 = non_resi_address_merge_match14.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match14['match_strategy'] = 'address_14'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match14])

In [153]:
#STRATEGY 15
left_columns_15 = ['site_number_clean', 'street_name',  'postcode_sector_x']
right_columns_15 = ['pao_start_number', 'street_description',   'postcode_sector']

non_resi_address_merge_match15, non_resi_address_merge_no_match15 = my_merge(non_resi_address_merge_no_match14, other_AB, left_columns_15, right_columns_15)


In [154]:
non_resi_address_merge_match15['UCL ID'].nunique() #was 88

0

In [155]:
non_resi_address_merge_no_match15 = non_resi_address_merge_no_match15.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match15 = non_resi_address_merge_match15.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match15['match_strategy'] = 'address_15'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match15])

In [156]:
#Strategy 16
left_columns_16 = ['concat_addr', 'lpa_name'] 
right_columns_16 = ['parsed_address1', 'administrative_area']

non_resi_address_merge_match16, non_resi_address_merge_no_match16 = my_merge(non_resi_address_merge_no_match15, other_AB, left_columns_16, right_columns_16)
non_resi_address_merge_match16['UCL ID'].nunique()

0

In [157]:
non_resi_address_merge_no_match16 = non_resi_address_merge_no_match16.dropna(axis=1, how='all')
non_resi_address_merge_match16 = non_resi_address_merge_match16.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match16['match_strategy'] = 'address_16'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match16])

In [158]:
#strategy 17 
left_columns_17 = ['concat_addr', 'lpa_name'] 
right_columns_17 = ['parsed_address2', 'administrative_area']

non_resi_address_merge_match17, non_resi_address_merge_no_match17 = my_merge(non_resi_address_merge_no_match16, other_AB, left_columns_17, right_columns_17)


In [159]:
non_resi_address_merge_match17['UCL ID'].nunique()

0

In [160]:
non_resi_address_merge_no_match17 = non_resi_address_merge_no_match17.dropna(axis=1, how='all')
non_resi_address_merge_match17 = non_resi_address_merge_match17.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match17['match_strategy'] = 'address_17'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match17])

In [161]:
#strategy 18
left_columns_18 = ['site_name_clean', 'street_name','postcode_clean']
right_columns_18 = ['pao_text', 'street_description','postcode_locator']

non_resi_address_merge_match18, non_resi_address_merge_no_match18 = my_merge(non_resi_address_merge_no_match17, other_AB, left_columns_18, right_columns_18)
non_resi_address_merge_match18['UCL ID'].nunique()

31

In [162]:
non_resi_address_merge_no_match18 = non_resi_address_merge_no_match18.dropna(axis=1, how='all')
non_resi_address_merge_match18 = non_resi_address_merge_match18.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match18['match_strategy'] = 'address_18'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match18])

In [163]:
#strategy 19
left_columns_19 = ['site_number_clean', 'site_name_clean','postcode_clean']
right_columns_19 = ['pao_start_number', 'building_name','postcode_locator']

non_resi_address_merge_match19, non_resi_address_merge_no_match19 = my_merge(non_resi_address_merge_no_match18, other_AB, left_columns_19, right_columns_19)
non_resi_address_merge_match19['UCL ID'].nunique()

0

In [164]:
non_resi_address_merge_no_match19 = non_resi_address_merge_no_match19.dropna(axis=1, how='all')
non_resi_address_merge_match19 = non_resi_address_merge_match19.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match19['match_strategy'] = 'address_19'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match19])

In [167]:
#Strategy 20 
left_columns_20 = ['site_number_clean', 'site_name_clean','postcode_clean']
right_columns_20 = ['building_number', 'building_name','postcode_locator']

non_resi_address_merge_match20, non_resi_address_merge_no_match20 = my_merge(merged_on_address19_no_match, other_AB, left_columns_20, right_columns_20)

non_resi_address_merge_match20['UCL ID'].nunique()

2

In [168]:
non_resi_address_merge_no_match20 = non_resi_address_merge_no_match20.dropna(axis=1, how='all')
non_resi_address_merge_match20 = non_resi_address_merge_match20.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match20['match_strategy'] = 'address_20'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match20])

In [549]:
non_resi_address_merge_no_match17.to_csv('no_match_june4.csv', index = False)

#  Post Match

Calculate the match rates, clean data, and save as CSVs

In [170]:
#print match rate
resi_match_rate = 100 * all_matched['UCL ID'].nunique() / england_data['UCL ID'].nunique()

print('Resi Match rate: ', resi_match_rate) 

Resi Match rate:  17.346767422334175


In [171]:
all_matched['UCL ID'].nunique()

1033

In [172]:
other_match_rate = 100 * non_resi_all_matched['UCL ID'].nunique() / england_data['UCL ID'].nunique()

print('Other Match rate: ', other_match_rate) 

Other Match rate:  5.9613769941225865


In [173]:
non_resi_all_matched['UCL ID'].nunique() 

355

In [304]:
#before export, make all these columns blank instead of 0 
#'legal_name', 'sub_building_name', 'building_name','building_number','street_description', 'dependent_locality	locality
all_matched['street_name'] = all_matched['street_name'].replace(0,'')
all_matched['legal_name'] = all_matched['legal_name'].replace(0,'')
all_matched['sub_building_name'] = all_matched['sub_building_name'].replace(0,'')
all_matched['building_name'] = all_matched['building_name'].replace(0,'')
all_matched['building_number'] = all_matched['building_number'].replace(0,'')
all_matched['street_description'] = all_matched['street_description'].replace(0,'')
all_matched['dependent_locality'] = all_matched['dependent_locality'].replace(0,'')
all_matched['locality'] = all_matched['locality'].replace(0,'')
all_matched['post_town'] = all_matched['post_town'].replace(0,'')
all_matched['postcode'] = np.where(all_matched['postcode'] == 0, all_matched['postcode_clean'], all_matched['postcode'])


In [305]:
#write out to csv 

all_matched.to_csv('Batch3c_address_matched_2006.csv', index=False) 



In [None]:
non_resi_all_matched.to_csv('Batch3b_non_resi_match_1306.csv', index=False)  

In [176]:
non_resi_address_merge_no_match6.to_csv('Batch3b_no_match_1306.csv', index = False)

In [307]:
merged_on_address23_no_match.to_csv('Batch3c_no_match_2006.csv', index = False)