In [1]:
""" Script to take planning applications and match to OSAddressBase addresses

Written by: Christine Langston, May 2024
"""
import pandas as pd
import time
import numpy as np
import copy
import re

In [2]:
#create a function to iterate through csv. use for OSAddressBase file
def read_csv(file_name, columns):
    for chunk in pd.read_csv(file_name, chunksize=10000, usecols=columns, 
                            dtype={'urpn':str, 'parent_urpn': str, 'class': str, 'latitude': float, 
                                                  'longitude': float, 'country': str}):
    
        #if chunk['country'] == 'E':
        yield chunk


In [3]:
#takes a merged dataset and separates matched addresses and not matched
def separate_matches(dataset, column_name, match_strategy):
    dataset_match = dataset.drop(dataset[pd.isna(dataset[column_name]) == True].index)
    dataset_no_match = dataset.drop(dataset[pd.isna(dataset[column_name]) == False].index)
    dataset_match['match_strategy'] = match_strategy
    return dataset_match, dataset_no_match

In [4]:
#function to merge and then split based on if an address was matched  
def my_merge(left, right, left_on, right_on): 
    merged = left.merge(right, how = 'left', left_on = left_on, right_on = right_on)
    merged_match = merged.drop(merged[pd.isna(merged['uprn']) == True].index) 
    merged_no_match = merged.drop(merged[pd.isna(merged['uprn']) == False].index) 
    return merged_match, merged_no_match

In [5]:
%%time
#### READ IN THE DATA IF EXPORTED 
resi_AB = pd.read_csv('data/resi_AB_cleaned_parsed_110524.csv', low_memory = False)


CPU times: user 4min 30s, sys: 9min 43s, total: 14min 13s
Wall time: 23min 27s


In [6]:
%%time
other_AB = pd.read_csv('data/other_AB_cleaned_parsed_110524.csv',low_memory = False)



CPU times: user 41.6 s, sys: 9.82 s, total: 51.4 s
Wall time: 1min 30s


In [78]:
#load in the data that hasn't been matched yet
#9092 ID 
london_data = pd.read_csv('data/Batch_3_filtered_out_unwanted_IDs_from_Glenigan&FPP_PA_matching.csv',low_memory = False)

Some data cleaning steps to clean up the Nan and numerical data coming from the CSV

In [9]:
### ------ DATA CLEANING  ----------
#edit the parent uprn from OSAddress base to fill out to be strings with 12 digits and leading zeros
#replace nan
resi_AB = resi_AB.replace([np.nan, -np.inf], 0)

#cast as integer
resi_AB['parent_uprn'] = resi_AB['parent_uprn'].astype('Int64')
resi_AB['uprn'] = resi_AB['uprn'].astype('Int64')


In [10]:
### ------ DATA CLEANING  ----------
#cast as string 
resi_AB['parent_uprn'] = resi_AB['parent_uprn'].astype(str)
resi_AB['uprn'] = resi_AB['uprn'].astype(str)

#fill in with left side padding zeros 
resi_AB['parent_uprn'] = resi_AB['parent_uprn'].apply(lambda x: '{0:0>12}'.format(x))
resi_AB['uprn'] = resi_AB['uprn'].apply(lambda x: '{0:0>12}'.format(x))


In [11]:
#do the same cleaning as above but for the OTHER addresses
other_AB = other_AB.replace([np.nan, -np.inf], 0)

#cast as integer
other_AB['parent_uprn'] = other_AB['parent_uprn'].astype('Int64')
other_AB['uprn'] = other_AB['uprn'].astype('Int64')

In [12]:
### ------ DATA CLEANING  ----------
#cast as string 
other_AB['parent_uprn'] = other_AB['parent_uprn'].astype(str)
other_AB['uprn'] = other_AB['uprn'].astype(str)

#fill in with left side padding zeros 
other_AB['parent_uprn'] = other_AB['parent_uprn'].apply(lambda x: '{0:0>12}'.format(x))
other_AB['uprn'] = other_AB['uprn'].apply(lambda x: '{0:0>12}'.format(x))


In [13]:
resi_AB.head()

Unnamed: 0,uprn,class,parent_uprn,latitude,longitude,country,legal_name,sub_building_name,building_name,building_number,...,dependent_locality,locality,town_name,administrative_area,post_town,postcode,postcode_locator,postcode_sector,parsed_address1,parsed_address2
0,10012778289,RD02,0,54.036005,-2.340833,E,0.0,0,0,0.0,...,0,RATHMELL,SETTLE,NORTH YORKSHIRE,0,0,BD24 0LP,BD24 0,"SCOUTBER END FARM, OLD OLIVER LANE TO RAGGED HALL",OLD OLIVER LANE TO RAGGED HALL
1,10013819934,RG02,10013819576,54.780687,-1.510651,E,0.0,0,0,0.0,...,0,0,SHERBURN,DURHAM,0,0,DH6 1JH,DH6 1,"GARAGE SITE 53, DOWSEY ROAD",DOWSEY ROAD
2,10014309637,RD04,200003655499,51.26651,0.497553,E,0.0,0,0,0.0,...,0,0,MAIDSTONE,KENT,0,0,ME16 8LD,ME16 8,"ROOM 1, 85, MILTON STREET",MILTON STREET
3,10033213520,RD01,10002821654,53.903252,-0.158754,E,0.0,0,0,0.0,...,0,0,HORNSEA,EAST RIDING OF YORKSHIRE,0,0,HU18 1TL,HU18 1,"33 SOUTHFIELD, LONGBEACH LEISURE PARK, HORNSEA...",HORNSEA BURTON ROAD
4,10033545757,RD06,100023479131,51.514983,-0.17926,E,0.0,0,0,0.0,...,0,0,LONDON,CITY OF WESTMINSTER,0,0,W2 3UJ,W2 3,"SECOND FLOOR, 58, WESTBOURNE TERRACE",WESTBOURNE TERRACE


In [14]:
### ------ DATA CLEANING On RESI AB ----------

#Address matching data clean, make building number into a string 
resi_AB['building_number'] = resi_AB['building_number'].astype('Int64').astype('str') 

In [15]:
resi_AB['pao_start_number'] = resi_AB['pao_start_number'].astype('Int64').astype('str') 


In [16]:
other_AB['building_number'] = other_AB['building_number'].astype('Int64').astype('str') 

In [17]:
other_AB['pao_start_number'] = other_AB['pao_start_number'].astype('Int64').astype('str') 


------ DATA CLEANING PLANNING APPLICATION DATA  ----------


In [18]:

#BATCH 1 ONLY london_data has extra quotation marks, need to remove
#london_data['uprn'] = london_data['uprn'].apply(lambda x: x.strip("''") if not pd.isna(x) else x)


In [55]:
london_data.head()

Unnamed: 0,UCL ID,planning_application_number,lpa_name,application_type,application_type_full,description,number_of_units,postcode_clean,site_number_clean,site_name_clean,street_name,site_name_LPA,uprn,decision,status,application_date,decision_date,PD_type,FPP_PA_Mix
0,5004,2/2014/0391,Allerdale,Prior Approval,,Prior approval of proposed change of use of ag...,,CA7 8NQ,,Clea Green,,"Clea Green, Westward, Wigton, Cumbria, CA7 8NQ",,Approved,,,04/06/2014 00:00,Agricultural to resi,
1,5005,2/2014/0461,Allerdale,Prior Approval,,Notice of proposed development for change of u...,,CA13 9QN,,Jubilee House,Victoria Road,"Jubilee House, Victoria Road, Cockermouth, Cum...",,Approved,,,02/07/2014 00:00,Retail / SG to resi,
2,5006,2/2014/0749,Allerdale,Prior Approval,,Notification for Prior Approval under Class MB...,,CA7 0AB,,Church Rigg Farm,,"Church Rigg Farm, Wigton, Cumbria, CA7 0AB",,Prior Approval Not Required,,,14/10/2014 00:00,Agricultural to resi,
3,5007,2/2014/0832,Allerdale,Prior Approval,,Prior approval of proposed change of use of ag...,,CA7 8AS,,Brackenridge Farm,,"Barn 2, Brackenridge Farm, Brackenthwaite, Wig...",,Approved,,,24/11/2014 00:00,Agricultural to resi,
4,5008,2/2014/0896,Allerdale,Prior Approval,,Prior approval of proposed change of use from ...,,CA7 2RA,,Greengill Farm,,"Greengill Farm, Greengill, Aspatria, Wigton, C...",,Approved,,,19/12/2014 00:00,Agricultural to resi,


In [74]:
#create a function to catch the value error 
def cast_uprn(x): 
    try: 
        return pd.to_numeric(x, errors = 'coerce') #.astype('float32').astype('Int64')
    #pd.to_numeric(x, errors='coerce')
    except ValueError: 
        return None

In [147]:
#covert the UPRN from a string with scientific notation to a numeric to an integer to a string again 
london_data['uprn'] = london_data['uprn'].apply(cast_uprn)

In [149]:
london_data['uprn'] = london_data['uprn'].astype('Int64')

In [151]:
london_data['uprn'] = london_data['uprn'].astype('str')

In [153]:
london_data['uprn'] = london_data['uprn'].apply(lambda x: '{0:0>12}'.format(x) if not pd.isna(x) else x)

In [154]:
london_data['uprn']

0       00000000<NA>
1       00000000<NA>
2       00000000<NA>
3       00000000<NA>
4       00000000<NA>
            ...     
9087    100121000000
9088    100121000000
9089    100121000000
9090    010003387106
9091    010003387118
Name: uprn, Length: 9092, dtype: object

Skip Below if we already have UPRN: 

In [12]:
### ------ DATA CLEANING  ----------
london_data['uprn_x'] = london_data['uprn_x'].astype('Int64')



In [13]:
#cast the strings and make sure it looks good 
london_data['uprn_x'] = london_data['uprn_x'].astype('str') #apply(lambda x: str(x) if not pd.isna(x) else x)

london_data['uprn_x'] = london_data['uprn_x'].apply(lambda x: '{0:0>12}'.format(x) if not pd.isna(x) else x)

OK continue here 

In [156]:
### ------ DATA CLEANING  ----------

london_data = london_data.replace('00000000<NA>',np.NaN)

Ignore the creation of the columns below if matching a batch for a second, third, etc. time

In [158]:
### ------ DATA CLEANING  ----------
#street address from site_name_LPA
london_data['parsed_street_LPA'] = london_data['site_name_LPA'].apply(lambda x: re.findall("[0-9]+.-?.[0-9]+?\s(.+)(Road|Lane|Avenue|Parade|Courtyard|Street|Gardens|Drive)", x) if not pd.isna(x) and '-' in x
                                                                  else (re.findall("[0-9]+\s(.+)(Road|Lane|Avenue|Parade|Courtyard|Street|Gardens|Drive)", x) if not pd.isna(x) else []))


In [159]:
london_data['parsed_street_LPA'] =  london_data['parsed_street_LPA'].apply(lambda x: x[0][0] + x[0][1]  if len(x) > 0 else None)

In [160]:
london_data

Unnamed: 0,UCL ID,planning_application_number,lpa_name,application_type,application_type_full,description,number_of_units,postcode_clean,site_number_clean,site_name_clean,...,uprn,decision,status,application_date,decision_date,PD_type,FPP_PA_Mix,parsed_street_LPA,concat_addr,site_name_LPA_no_pc
0,5004,2/2014/0391,Allerdale,Prior Approval,,Prior approval of proposed change of use of ag...,,CA7 8NQ,,Clea Green,...,,Approved,,,04/06/2014 00:00,Agricultural to resi,,,,"CLEA GREEN, WESTWARD, WIGTON, CUMBRIA,"
1,5005,2/2014/0461,Allerdale,Prior Approval,,Notice of proposed development for change of u...,,CA13 9QN,,Jubilee House,...,,Approved,,,02/07/2014 00:00,Retail / SG to resi,,,,"JUBILEE HOUSE, VICTORIA ROAD, COCKERMOUTH, CUM..."
2,5006,2/2014/0749,Allerdale,Prior Approval,,Notification for Prior Approval under Class MB...,,CA7 0AB,,Church Rigg Farm,...,,Prior Approval Not Required,,,14/10/2014 00:00,Agricultural to resi,,,,"CHURCH RIGG FARM, WIGTON, CUMBRIA,"
3,5007,2/2014/0832,Allerdale,Prior Approval,,Prior approval of proposed change of use of ag...,,CA7 8AS,,Brackenridge Farm,...,,Approved,,,24/11/2014 00:00,Agricultural to resi,,,,"BARN 2, BRACKENRIDGE FARM, BRACKENTHWAITE, WIG..."
4,5008,2/2014/0896,Allerdale,Prior Approval,,Prior approval of proposed change of use from ...,,CA7 2RA,,Greengill Farm,...,,Approved,,,19/12/2014 00:00,Agricultural to resi,,,,"GREENGILL FARM, GREENGILL, ASPATRIA, WIGTON, C..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9087,15350,22/1012/PNR,Wyre,Prior Approval,,Change of Use of Agricultural Buildings to Dwe...,,DY10 3PN,,"Woodcroft,",...,100121000000,Prior Approval Granted,,,13/02/2023 00:00,Agricultural to resi,,,,"WOODCROFT, WAGGON LANE ISMERE KIDDERMINSTER WO..."
9088,15351,23/0018/PNR,Wyre,Prior Approval,,Change of Use of Agricultural Buildings to Dwe...,,DY14 9YG,,Bliss Farm,...,100121000000,Prior Approval Granted,,,03/02/2023 00:00,Agricultural to resi,,,,"OLD BLISS FARM, GORST HILL ROCK KIDDERMINSTER ..."
9089,15352,23/0120/PNR,Wyre,Prior Approval,,Change of Use of Agricultural Buildings to 2 D...,,DY14 9YG,,Bliss Farm,...,100121000000,Prior Approval Granted,,,30/03/2023 00:00,Agricultural to resi,,,,"OLD BLISS FARM, GORST HILL ROCK KIDDERMINSTER ..."
9090,15353,23/0176/PNR,Wyre,Prior Approval,,Change of Use of Agricultural Buildings to Dwe...,,,,Field House,...,010003387106,Prior Approval Granted,,,05/05/2023 00:00,Agricultural to resi,,,,"BARN AT, FIELD HOUSE FARM TENBURY ROAD ROCK KI..."


In [87]:
#cleaning - street address for LONDON only - BATCHES 1 & 2
london_data['parsed_street_GLA'] = london_data['site_name_GLA'].apply(lambda x: re.findall("[0-9]+.-?.[0-9]+?\s(.+)(Road|Lane|Avenue|Parade|Courtyard|Street|Gardens|Drive)", x) if not pd.isna(x) and '-' in x
                                                                  else (re.findall("[0-9]+\s(.+)(Road|Lane|Avenue|Parade|Courtyard|Street|Gardens|Drive)", x) if not pd.isna(x) else []))

IndentationError: unexpected indent (337522128.py, line 3)

In [23]:
#London ONLY - BATCHES 1 & 2
london_data['parsed_street_GLA'] =  london_data['parsed_street_GLA'].apply(lambda x: x[0][0] + x[0][1]  if len(x) > 0 else None)

In [24]:
#if the original street was empty, then we want to use the LPA Or GLA parsed name as street_name 
# LONDON ONLY 
london_data['street_name'] = np.where(london_data['street_name'].isnull(), np.where(london_data['parsed_street_LPA'].isnull(), london_data['parsed_street_GLA'], london_data['parsed_street_LPA']), london_data['street_name'] )

In [161]:
london_data['street_name'] = np.where(london_data['street_name'].isnull(), london_data['parsed_street_LPA'], london_data['street_name'] )

In [27]:
# DATA EXPLORATION - EXPORT CSV IF NEEDED 
#london_data.to_csv('batch2_cleaned.csv', index = False)

Continue here to clean data for all batches

In [89]:
# create new column with the number, street description, and the site name 
london_data['concat_addr'] = np.where(london_data['site_name_clean'].isnull(), '', london_data['site_name_clean'] +  ', ' ) + london_data['site_number_clean'] + ', ' +  london_data['street_name']

In [162]:
london_data['concat_addr'] = london_data['concat_addr'].str.upper()

In [163]:
#make london_data all into capitals to regularize
london_data['street_name'] = london_data['street_name'].str.upper()

In [164]:
#remove unwanted characters
london_data['postcode_clean'] = london_data['postcode_clean'].replace('x000D__x000D_\n', '')

In [165]:
london_data = london_data.replace({'_x000D__x000D_\n': ' ', '_x000d__x000d_\n': ' ' }, regex = True)

In [166]:
#remove white space
london_data['postcode_clean'] = london_data['postcode_clean'].apply(lambda x: str(x).strip())

In [114]:
#substring of site_name_GLA without the postcode .... 
#LONDON ONLY
london_data['site_name_GLA_no_pc'] = london_data.apply(lambda row: str(row['site_name_GLA']).upper().replace(', ' + row['postcode_clean'], ''), axis = 1)

In [167]:
london_data['site_name_LPA_no_pc'] = london_data.apply(lambda row: str(row['site_name_LPA']).upper().replace(row['postcode_clean'], ''), axis = 1)


In [168]:
london_data.tail()

Unnamed: 0,UCL ID,planning_application_number,lpa_name,application_type,application_type_full,description,number_of_units,postcode_clean,site_number_clean,site_name_clean,...,uprn,decision,status,application_date,decision_date,PD_type,FPP_PA_Mix,parsed_street_LPA,concat_addr,site_name_LPA_no_pc
9087,15350,22/1012/PNR,Wyre,Prior Approval,,Change of Use of Agricultural Buildings to Dwe...,,DY10 3PN,,"Woodcroft,",...,100121000000,Prior Approval Granted,,,13/02/2023 00:00,Agricultural to resi,,,,"WOODCROFT, WAGGON LANE ISMERE KIDDERMINSTER WO..."
9088,15351,23/0018/PNR,Wyre,Prior Approval,,Change of Use of Agricultural Buildings to Dwe...,,DY14 9YG,,Bliss Farm,...,100121000000,Prior Approval Granted,,,03/02/2023 00:00,Agricultural to resi,,,,"OLD BLISS FARM, GORST HILL ROCK KIDDERMINSTER ..."
9089,15352,23/0120/PNR,Wyre,Prior Approval,,Change of Use of Agricultural Buildings to 2 D...,,DY14 9YG,,Bliss Farm,...,100121000000,Prior Approval Granted,,,30/03/2023 00:00,Agricultural to resi,,,,"OLD BLISS FARM, GORST HILL ROCK KIDDERMINSTER ..."
9090,15353,23/0176/PNR,Wyre,Prior Approval,,Change of Use of Agricultural Buildings to Dwe...,,,,Field House,...,10003387106,Prior Approval Granted,,,05/05/2023 00:00,Agricultural to resi,,,,"BARN AT, FIELD HOUSE FARM TENBURY ROAD ROCK KI..."
9091,15354,23/0255/PNR,Wyre,Prior Approval,,Change of Use of Agricultural Building into 1 ...,,DY13 9JE,,,...,10003387118,Prior Approval Granted,,,31/05/2023 00:00,Agricultural to resi,,,,"BARNS REAR OF, SYDWISH WILDEN TOP ROAD STOURPO..."


UPRN MATCHING: If reprocessing data that has already been matched, Ignore the Data matching on the UPRN / Parent UPRN

In [169]:
#### -------- DATA MERGING --------
#join the london data with the RESIDENTIAL AddressBase dataset on UPRN
merged = london_data.merge(resi_AB, how = 'left', left_on = 'uprn', right_on = 'parent_uprn')

#merged['UCL_ID'].nunique()

In [172]:
#separate merged into no match and match 
merged_match, merged_no_match = separate_matches(merged, 'parent_uprn', 'parent_uprn')

merged_no_match = merged_no_match.dropna(axis=1, how='all')

In [174]:
merged_match['UCL ID'].nunique()

379

In [175]:
#merge 2 on uprn not parent_uprn 
merged_2 = merged_no_match.merge(resi_AB, how = 'left', left_on = 'uprn_x', right_on = 'uprn')

In [176]:
#separate the merge 2 into two datasets for match v not match 
merged_2_match, merged_2_no_match = separate_matches(merged_2, 'uprn', 'uprn')

merged_2_no_match = merged_2_no_match.dropna(axis=1, how='all')      

In [177]:
merged_2_match = merged_2_match.rename(columns={"uprn": "uprn_OSAB"})
merged_match = merged_match.rename(columns={"uprn_y": "uprn_OSAB"})

In [179]:
merged_2_match['UCL ID'].nunique()

1433

In [180]:
all_matched = pd.concat([merged_2_match, merged_match])


ADDRESS MATCHING: For all matching versions: Address matching starts here

In [115]:
#merged_2_no_match = london_data

-------- DATA MERGING --------  STRATEGY 1 ADDRESS MATCH

In [181]:
#### ADDRESS Strategy 2 - 
## use the site name LPA without post code ...
left_columns_2 = ['site_name_LPA_no_pc', 'postcode_clean'] #London Data 
right_columns_2 =  ['parsed_address1', 'postcode_locator'] #AB

merged_on_address2 = merged_2_no_match.merge(resi_AB, how = 'left', left_on = left_columns_2, right_on = right_columns_2)

In [182]:
merged_on_address2_match = merged_on_address2.drop(merged_on_address2[pd.isna(merged_on_address2['uprn']) == True].index) 
merged_on_address2_no_match = merged_on_address2.drop(merged_on_address2[pd.isna(merged_on_address2['uprn']) == False].index) 


In [183]:
merged_on_address2_match['UCL ID'].nunique()

0

In [184]:
merged_on_address2_match = merged_on_address2_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address2_match['match_strategy'] = 'address_2'

In [185]:
all_matched = pd.concat([all_matched, merged_on_address2_match])

In [186]:
merged_on_address2_no_match = merged_on_address2_no_match.dropna(axis=1, how='all')


-------- DATA MERGING --------  STRATEGY 3 ADDRESS MATCH

In [187]:
#### ADDRESS Strategy 4 
### TODO THIS NEXT 
left_columns_4 = ['site_name_LPA_no_pc', 'postcode_clean']  #London Data 
right_columns_4 = ['parsed_address2', 'postcode_locator'] #AB

merged_on_address4 = merged_on_address2_no_match.merge(resi_AB, how = 'left', left_on = left_columns_4, right_on = right_columns_4)

In [188]:
merged_on_address4_match = merged_on_address4.drop(merged_on_address4[pd.isna(merged_on_address4['uprn']) == True].index) 
merged_on_address4_no_match = merged_on_address4.drop(merged_on_address4[pd.isna(merged_on_address4['uprn']) == False].index) 


In [189]:
merged_on_address4_match['UCL ID'].nunique()

1

In [190]:
merged_on_address4_match = merged_on_address4_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address4_match['match_strategy'] = 'address_4'

frames = [all_matched, merged_on_address4_match]

all_matched = pd.concat(frames)
merged_on_address4_no_match = merged_on_address4_no_match.dropna(axis=1, how='all')

In [191]:
## Strategy 5 
#match on the street number, street name, postcode 

left_columns_5 = ['concat_addr', 'lpa_name']
right_columns_5 = ['parsed_address1', 'administrative_area']

merged_on_address5_match, merged_on_address5_no_match = my_merge(merged_on_address4_no_match, resi_AB, left_columns_5, right_columns_5)

In [192]:
merged_on_address5_match['UCL ID'].nunique()

0

In [193]:
merged_on_address5_match = merged_on_address5_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address5_match['match_strategy'] = 'address_5'

frames = [all_matched, merged_on_address5_match]

all_matched = pd.concat(frames)
merged_on_address5_no_match = merged_on_address5_no_match.dropna(axis=1, how='all')

In [194]:
## Strategy 6
left_columns_6 = ['concat_addr', 'lpa_name']
right_columns_6 = ['parsed_address2', 'administrative_area']

merged_on_address6_match, merged_on_address6_no_match = my_merge(merged_on_address5_no_match, resi_AB, left_columns_6, right_columns_6)

In [195]:
merged_on_address6_match['UCL ID'].nunique()

0

In [196]:
merged_on_address6_match = merged_on_address6_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address6_match['match_strategy'] = 'address_6'

frames = [all_matched, merged_on_address6_match]

all_matched = pd.concat(frames)
merged_on_address6_no_match = merged_on_address6_no_match.dropna(axis=1, how='all')

In [56]:
# pao_start_number + street_description

In [197]:
## Strategy 6
left_columns_7 = ['site_number_clean', 'street_name', 'postcode_clean']
right_columns_7 = ['pao_start_number', 'street_description', 'postcode_locator']

merged_on_address7_match, merged_on_address7_no_match = my_merge(merged_on_address6_no_match, resi_AB, left_columns_7, right_columns_7)

In [198]:
merged_on_address7_match['UCL ID'].nunique()

918

In [199]:
merged_on_address7_match = merged_on_address7_match.rename(columns={"uprn": "uprn_OSAB"})
merged_on_address7_match['match_strategy'] = 'address_7'

frames = [all_matched, merged_on_address7_match]

all_matched = pd.concat(frames)
merged_on_address7_no_match = merged_on_address7_no_match.dropna(axis=1, how='all')

In [200]:
all_matched['UCL ID'].nunique()

2731

------ specific address analysis ----- 

In [66]:
merged_on_address7_match.head(20)

Unnamed: 0,ID,planning_application_number,lpa_name,application_type,application_type_full,description,number_of_units,site_number_clean,street_name,postcode_clean,...,dependent_locality,locality,town_name,administrative_area,post_town,postcode,postcode_locator,parsed_address1,parsed_address2,match_strategy
7,1547,P2017/2905/PRA,ISLINGTON,Prior Approval,Prior Approval (Class M - formerly IA),Notification for Prior Approval for the change...,1.0,194,SEVEN SISTERS ROAD,N4 3NX,...,0,0,LONDON,ISLINGTON,LONDON,N4 3NX,N4 3NX,"FLAT 3, 194, SEVEN SISTERS ROAD","FLAT 3, 194A, SEVEN SISTERS ROAD",address_6
8,1547,P2017/2905/PRA,ISLINGTON,Prior Approval,Prior Approval (Class M - formerly IA),Notification for Prior Approval for the change...,1.0,194,SEVEN SISTERS ROAD,N4 3NX,...,0,0,LONDON,ISLINGTON,LONDON,N4 3NX,N4 3NX,"FLAT 1, 194, SEVEN SISTERS ROAD","FLAT 1, 194A, SEVEN SISTERS ROAD",address_6
9,1547,P2017/2905/PRA,ISLINGTON,Prior Approval,Prior Approval (Class M - formerly IA),Notification for Prior Approval for the change...,1.0,194,SEVEN SISTERS ROAD,N4 3NX,...,0,0,LONDON,ISLINGTON,LONDON,N4 3NX,N4 3NX,"FLAT 2, 194, SEVEN SISTERS ROAD","FLAT 2, 194A, SEVEN SISTERS ROAD",address_6
10,1547,P2016/2604/PRA,ISLINGTON,Prior Approval,Prior Approval (Class O - formerly J),Prior approval application for the proposed ch...,1.0,222,SEVEN SISTERS ROAD,N4 3NX,...,0,0,LONDON,ISLINGTON,0,0,N4 3NX,"FLAT 5, 222, SEVEN SISTERS ROAD",SEVEN SISTERS ROAD,address_6
11,1547,P2016/2604/PRA,ISLINGTON,Prior Approval,Prior Approval (Class O - formerly J),Prior approval application for the proposed ch...,1.0,222,SEVEN SISTERS ROAD,N4 3NX,...,0,0,LONDON,ISLINGTON,0,0,N4 3NX,"FLAT 3, 222, SEVEN SISTERS ROAD",SEVEN SISTERS ROAD,address_6
12,1547,P2016/2604/PRA,ISLINGTON,Prior Approval,Prior Approval (Class O - formerly J),Prior approval application for the proposed ch...,1.0,222,SEVEN SISTERS ROAD,N4 3NX,...,0,0,LONDON,ISLINGTON,0,0,N4 3NX,"FLAT 1, 222, SEVEN SISTERS ROAD",SEVEN SISTERS ROAD,address_6
13,1547,P2016/2604/PRA,ISLINGTON,Prior Approval,Prior Approval (Class O - formerly J),Prior approval application for the proposed ch...,1.0,222,SEVEN SISTERS ROAD,N4 3NX,...,0,0,LONDON,ISLINGTON,0,0,N4 3NX,"FLAT 2, 222, SEVEN SISTERS ROAD",SEVEN SISTERS ROAD,address_6
14,1547,P2016/2604/PRA,ISLINGTON,Prior Approval,Prior Approval (Class O - formerly J),Prior approval application for the proposed ch...,1.0,222,SEVEN SISTERS ROAD,N4 3NX,...,0,0,LONDON,ISLINGTON,0,0,N4 3NX,"FLAT 4, 222, SEVEN SISTERS ROAD",SEVEN SISTERS ROAD,address_6
15,1547,P2016/2604/PRA,ISLINGTON,Prior Approval,Prior Approval (Class O - formerly J),Prior approval application for the proposed ch...,1.0,222,SEVEN SISTERS ROAD,N4 3NX,...,0,0,LONDON,ISLINGTON,0,0,N4 3NX,"FLAT 5, 222, SEVEN SISTERS ROAD",SEVEN SISTERS ROAD,address_6
16,1547,P2016/2604/PRA,ISLINGTON,Prior Approval,Prior Approval (Class O - formerly J),Prior approval application for the proposed ch...,1.0,222,SEVEN SISTERS ROAD,N4 3NX,...,0,0,LONDON,ISLINGTON,0,0,N4 3NX,"FLAT 3, 222, SEVEN SISTERS ROAD",SEVEN SISTERS ROAD,address_6


In [62]:
other_AB[(other_AB['building_number'] == '3') & (other_AB['street_description'] == 'HIGH STREET') & (other_AB['postcode_locator'] == 'BR1 1LF')]

Unnamed: 0,uprn,class,parent_uprn,latitude,longitude,country,legal_name,sub_building_name,building_name,building_number,...,street_description,dependent_locality,locality,town_name,administrative_area,post_town,postcode,postcode_locator,parsed_address1,parsed_address2


In [178]:
### hmm why isnt itfinding the correct record?? 
#this record exists in other AB - okay... so might work once I match taht! 
other_AB[(other_AB['street_description'] == 'SEVEN SISTERS ROAD')  & (other_AB['postcode_locator'] == 'N4 3NX') 
                        & (other_AB['parsed_address1'] == '194, SEVEN SISTERS ROAD')]

Unnamed: 0,uprn,class,parent_uprn,latitude,longitude,country,legal_name,sub_building_name,building_name,building_number,...,street_description,dependent_locality,locality,town_name,administrative_area,post_town,postcode,postcode_locator,parsed_address1,parsed_address2
4055311,5300082025,CR,0,51.562168,-0.109969,E,0.0,0,0,194.0,...,SEVEN SISTERS ROAD,0,0,LONDON,ISLINGTON,LONDON,N4 3NX,N4 3NX,"194, SEVEN SISTERS ROAD","194, SEVEN SISTERS ROAD"
4136519,5300082026,PP,0,51.562168,-0.109969,E,0.0,0,0,,...,SEVEN SISTERS ROAD,0,0,LONDON,ISLINGTON,0,0,N4 3NX,"194, SEVEN SISTERS ROAD",SEVEN SISTERS ROAD


In [171]:
seven_sisters.to_csv('seven_sisters.csv', index=False) 

In [236]:
merged_on_address7_no_match

Unnamed: 0,UCL ID,planning_application_number,lpa_name,application_type,application_type_full,description,number_of_units,postcode_clean,site_number_clean,site_name_clean,...,uprn_x,decision,status,application_date,decision_date,PD_type,FPP_PA_Mix,parsed_street_LPA,concat_addr,site_name_LPA_no_pc
0,5004,2/2014/0391,Allerdale,Prior Approval,,Prior approval of proposed change of use of ag...,,CA7 8NQ,,Clea Green,...,,Approved,,,04/06/2014 00:00,Agricultural to resi,,,,"CLEA GREEN, WESTWARD, WIGTON, CUMBRIA,"
1,5005,2/2014/0461,Allerdale,Prior Approval,,Notice of proposed development for change of u...,,CA13 9QN,,Jubilee House,...,,Approved,,,02/07/2014 00:00,Retail / SG to resi,,,,"JUBILEE HOUSE, VICTORIA ROAD, COCKERMOUTH, CUM..."
2,5006,2/2014/0749,Allerdale,Prior Approval,,Notification for Prior Approval under Class MB...,,CA7 0AB,,Church Rigg Farm,...,,Prior Approval Not Required,,,14/10/2014 00:00,Agricultural to resi,,,,"CHURCH RIGG FARM, WIGTON, CUMBRIA,"
3,5007,2/2014/0832,Allerdale,Prior Approval,,Prior approval of proposed change of use of ag...,,CA7 8AS,,Brackenridge Farm,...,,Approved,,,24/11/2014 00:00,Agricultural to resi,,,,"BARN 2, BRACKENRIDGE FARM, BRACKENTHWAITE, WIG..."
4,5008,2/2014/0896,Allerdale,Prior Approval,,Prior approval of proposed change of use from ...,,CA7 2RA,,Greengill Farm,...,,Approved,,,19/12/2014 00:00,Agricultural to resi,,,,"GREENGILL FARM, GREENGILL, ASPATRIA, WIGTON, C..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10026,15345,22/0417/PNR,Wyre,Prior Approval,,Change of use from agriculture to dwellinghouses,,,376296,,...,010003386706,Prior Approval Granted,,,15/07/2022 00:00,Office to resi,,"271138, Heightington Road","376296, HEIGHTINGTON ROAD","BARN AT 376296 271138, HEIGHTINGTON ROAD HEIGH..."
10027,15347,22/0546/PNR,Wyre,Prior Approval,,Change of Use of Agricultural Building to Dwel...,,DY11 5SQ,,Hill Farm,...,010003386687,Prior Approval Granted,,,11/11/2022 00:00,Agricultural to resi,,,,"BARN AT, CASTLE HILL FARM CASTLE HILL WOLVERLE..."
10028,15349,22/0988/PNR,Wyre,Prior Approval,,Conversion of an agricultural storage building...,,DY11 5XF,,The Shortyard,...,010003385583,Prior Approval Granted,,,13/02/2023 00:00,Agricultural to resi,,,,"LAND AT OS 383575 280035, THE SHORTYARD WOLVER..."
10029,15353,23/0176/PNR,Wyre,Prior Approval,,Change of Use of Agricultural Buildings to Dwe...,,,,Field House,...,010003387106,Prior Approval Granted,,,05/05/2023 00:00,Agricultural to resi,,,,"BARN AT, FIELD HOUSE FARM TENBURY ROAD ROCK KI..."


In [240]:
merged_on_address7_no_match.nunique() # still 6361 Uniuqe UCL IDs, 766 uprns 

UCL ID                         6361
planning_application_number    6359
lpa_name                         98
application_type                  1
application_type_full           270
description                    5529
number_of_units                  55
postcode_clean                 3971
site_number_clean               692
site_name_clean                2542
street_name                    2238
site_name_LPA                  5688
uprn_x                          766
decision                        175
status                           30
application_date               2722
decision_date                  2754
PD_type                           5
FPP_PA_Mix                        1
parsed_street_LPA               694
concat_addr                    1528
site_name_LPA_no_pc            5662
dtype: int64

In [201]:
##-------- DATA MERGING --------  COMMERCIAL PROPERTIES --- Reproduce the process with commercial properties 
#Merge on Parent UPRN 
non_resi_merged = merged_on_address7_no_match.merge(other_AB,how = 'left', left_on = 'uprn_x', right_on = 'parent_uprn')

non_resi_match, non_resi_no_match =  separate_matches(non_resi_merged, 'parent_uprn', 'parent_uprn')

non_resi_no_match = non_resi_no_match.dropna(axis=1, how='all')

In [237]:
other_AB[other_AB['uprn'] == '010003386706']

Unnamed: 0,uprn,class,parent_uprn,latitude,longitude,country,legal_name,sub_building_name,building_name,building_number,...,dependent_locality,locality,town_name,administrative_area,post_town,postcode,postcode_locator,postcode_sector,parsed_address1,parsed_address2
1223014,10003386706,CB,0,52.337794,-2.349273,E,0.0,0,0,0,...,0,HEIGHTINGTON,BEWDLEY,WORCESTERSHIRE,0,0,DY12 2XU,DY12 2,"BARN AT 376296 271138, HEIGHTINGTON ROAD",HEIGHTINGTON ROAD


In [202]:
#merge on UPRN
non_resi_merged_2 = non_resi_no_match.merge(other_AB, how = 'left', left_on = 'uprn_x', right_on = 'uprn')

non_resi_match_2, non_resi_no_match_2 =  separate_matches(non_resi_merged_2, 'parent_uprn', 'uprn')

non_resi_no_match_2 = non_resi_no_match_2.dropna(axis=1, how='all')

In [238]:
non_resi_match_2[non_resi_match_2['uprn_x'] == '010003386706']

Unnamed: 0,UCL ID,planning_application_number,lpa_name,application_type,application_type_full,description,number_of_units,postcode_clean,site_number_clean,site_name_clean,...,locality,town_name,administrative_area,post_town,postcode,postcode_locator,postcode_sector,parsed_address1,parsed_address2,match_strategy
6271,15345,22/0417/PNR,Wyre,Prior Approval,,Change of use from agriculture to dwellinghouses,,,376296,,...,HEIGHTINGTON,BEWDLEY,WORCESTERSHIRE,0,0,DY12 2XU,DY12 2,"BARN AT 376296 271138, HEIGHTINGTON ROAD",HEIGHTINGTON ROAD,uprn


In [203]:
non_resi_match_2 = non_resi_match_2.rename(columns={"uprn": "uprn_OSAB"})
#non_resi_match_2['match_strategy'] = 'uprn'
non_resi_match_2.count()

non_resi_match = non_resi_match.rename(columns={"uprn": "uprn_OSAB"})
#non_resi_match['match_strategy'] = 'parent_uprn'

In [242]:
non_resi_match_2.nunique() #675 were matched on uprn 

UCL ID                         590
planning_application_number    590
lpa_name                        34
application_type                 1
application_type_full           49
description                    565
number_of_units                 31
postcode_clean                 326
site_number_clean              157
site_name_clean                235
street_name                    341
site_name_LPA                  517
uprn_x                         474
decision                        52
status                          13
application_date               421
decision_date                  465
PD_type                          5
FPP_PA_Mix                       1
parsed_street_LPA              126
concat_addr                    200
site_name_LPA_no_pc            518
uprn_OSAB                      474
class                           51
parent_uprn                    114
latitude                       473
longitude                      473
country                          1
legal_name          

In [253]:
#union them together 
non_resi_all_matched = pd.concat([non_resi_match_2, non_resi_match])

In [255]:
#non_resi_all_matched.nunique() #675

---- Commercial address matching --- 

In [249]:
#now join on addresses strat 1 
left_columns = ['site_number_clean', 'street_name', 'postcode_clean']
right_columns = ['building_number', 'street_description', 'postcode_locator']

non_resi_address_merge_match, non_resi_address_merge_no_match = my_merge(non_resi_no_match_2, other_AB, left_columns, right_columns)

In [209]:
#use this one if skipping UPRN 
#non_resi_address_merge_match, non_resi_address_merge_no_match = my_merge(merged_on_address7_no_match, other_AB, left_columns, right_columns)


In [257]:
non_resi_address_merge_no_match = non_resi_address_merge_no_match.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match = non_resi_address_merge_match.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match['match_strategy'] = 'address_1'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match])

In [258]:
non_resi_all_matched['UCL ID'].nunique() #914

914

In [214]:
#USE THIS FIRST TIME THRU 
#non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match])

In [259]:
# STRATEGY 3 ADDRESS MATCH 
left_columns_3 = ['site_name_LPA_no_pc', 'postcode_clean'] 
right_columns_3 = ['parsed_address1', 'postcode_locator']


non_resi_address_merge_match3, non_resi_address_merge_no_match3 = my_merge(non_resi_address_merge_no_match, other_AB, left_columns_3, right_columns_3)

In [260]:
non_resi_address_merge_match3['UCL ID'].nunique()

0

In [261]:
non_resi_address_merge_no_match3 = non_resi_address_merge_no_match3.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match3 = non_resi_address_merge_match3.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match3['match_strategy'] = 'address_3'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match3])

------- ADDRESS ----- Strategy 4 COMM 

In [262]:
# STRATEGY 4A ADDRESS MATCH 
left_columns_4B = ['site_name_LPA_no_pc', 'postcode_clean'] 
right_columns_4B = ['parsed_address2', 'postcode_locator']


non_resi_address_merge_match4B, non_resi_address_merge_no_match4B = my_merge(non_resi_address_merge_no_match3, other_AB, left_columns_4B, right_columns_4B)

In [263]:
non_resi_address_merge_match4B['UCL ID'].nunique()

1

In [264]:
non_resi_address_merge_no_match4B = non_resi_address_merge_no_match4B.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match4B = non_resi_address_merge_match4B.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match4B['match_strategy'] = 'address_4A'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match4B])

In [265]:
#strategy 5 ADDRESS MATCH 
left_columns_5 = ['concat_addr', 'lpa_name'] 
right_columns_5 = ['parsed_address1', 'administrative_area']


non_resi_address_merge_match5, non_resi_address_merge_no_match5 = my_merge(non_resi_address_merge_no_match4B, other_AB, left_columns_5, right_columns_5)


In [266]:
non_resi_address_merge_no_match5 = non_resi_address_merge_no_match5.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match5 = non_resi_address_merge_match5.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match5['match_strategy'] = 'address_5'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match5])

In [267]:
#strategy 5B ADDRESS MATCH 
left_columns_5B = ['concat_addr', 'lpa_name'] 
right_columns_5B = ['parsed_address2', 'administrative_area']


non_resi_address_merge_match5B, non_resi_address_merge_no_match5B = my_merge(non_resi_address_merge_no_match5, other_AB, left_columns_5B, right_columns_5B)


In [268]:
non_resi_address_merge_match5B['UCL ID'].nunique() #was 88

0

In [269]:
non_resi_address_merge_no_match5B = non_resi_address_merge_no_match5B.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match5B = non_resi_address_merge_match5B.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match5B['match_strategy'] = 'address_5B'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match5B])

strategy 6, same as for resi --- 

In [270]:
## Strategy 6
left_columns_6 = ['site_number_clean', 'street_name', 'postcode_clean']
right_columns_6 = ['pao_start_number', 'street_description', 'postcode_locator']

non_resi_address_merge_match6, non_resi_address_merge_no_match6 = my_merge(non_resi_address_merge_no_match5B, other_AB, left_columns_6, right_columns_6)

In [271]:
non_resi_address_merge_match6['UCL ID'].nunique()

67

In [272]:
non_resi_address_merge_no_match6 = non_resi_address_merge_no_match6.dropna(axis=1, how='all')

## Add the merged_on_address_match to the matched
non_resi_address_merge_match6 = non_resi_address_merge_match6.rename(columns={"uprn": "uprn_OSAB"})
non_resi_address_merge_match6['match_strategy'] = 'address_6'

non_resi_all_matched = pd.concat([non_resi_all_matched, non_resi_address_merge_match6])

----- specific address analysis  ---- 

In [119]:
#old address had CR4 4NA as the postcode 
willow = other_AB[(other_AB['administrative_area'] == 'MERTON') & 
        (other_AB['street_description'] == 'WILLOW LANE')]

In [121]:
willow.to_csv('willow_lane_export.csv',index = False )

In [187]:
resi_AB[(resi_AB['administrative_area'] == 'BROMLEY') & 
        (resi_AB['street_description'] == 'LIDDON ROAD') & (resi_AB['building_number'] == '55')]

#.to_csv('merton_central_road.csv', index = False)

#(other_AB['postcode_locator'] == 'BR1 1LF') & 

Unnamed: 0,uprn,class,parent_uprn,latitude,longitude,country,legal_name,sub_building_name,building_name,building_number,...,street_description,dependent_locality,locality,town_name,administrative_area,post_town,postcode,postcode_locator,parsed_address1,parsed_address2
20804318,10070020009,RD06,100022887828,51.401269,0.031991,E,0.0,FLAT 14,NAPOLEON HOUSE,55,...,LIDDON ROAD,0,0,BROMLEY,BROMLEY,BROMLEY,BR1 2GP,BR1 2GP,"FLAT 14, NAPOLEON HOUSE, 55, LIDDON ROAD","FLAT 14, NAPOLEON HOUSE, 55, LIDDON ROAD"
20832268,10070020007,RD06,100022887828,51.401269,0.031991,E,0.0,FLAT 12,NAPOLEON HOUSE,55,...,LIDDON ROAD,0,0,BROMLEY,BROMLEY,BROMLEY,BR1 2GP,BR1 2GP,"FLAT 12, NAPOLEON HOUSE, 55, LIDDON ROAD","FLAT 12, NAPOLEON HOUSE, 55, LIDDON ROAD"
20924122,10070020010,RD06,100022887828,51.401269,0.031991,E,0.0,FLAT 15,NAPOLEON HOUSE,55,...,LIDDON ROAD,0,0,BROMLEY,BROMLEY,BROMLEY,BR1 2GP,BR1 2GP,"FLAT 15, NAPOLEON HOUSE, 55, LIDDON ROAD","FLAT 15, NAPOLEON HOUSE, 55, LIDDON ROAD"
21025891,10070020006,RD06,100022887828,51.401269,0.031991,E,0.0,FLAT 11,NAPOLEON HOUSE,55,...,LIDDON ROAD,0,0,BROMLEY,BROMLEY,BROMLEY,BR1 2GP,BR1 2GP,"FLAT 11, NAPOLEON HOUSE, 55, LIDDON ROAD","FLAT 11, NAPOLEON HOUSE, 55, LIDDON ROAD"
21136690,10070020001,RD06,100022887828,51.401269,0.031991,E,0.0,FLAT 6,NAPOLEON HOUSE,55,...,LIDDON ROAD,0,0,BROMLEY,BROMLEY,BROMLEY,BR1 2GP,BR1 2GP,"FLAT 6, NAPOLEON HOUSE, 55, LIDDON ROAD","FLAT 6, NAPOLEON HOUSE, 55, LIDDON ROAD"
21255647,10070020003,RD06,100022887828,51.401269,0.031991,E,0.0,FLAT 8,NAPOLEON HOUSE,55,...,LIDDON ROAD,0,0,BROMLEY,BROMLEY,BROMLEY,BR1 2GP,BR1 2GP,"FLAT 8, NAPOLEON HOUSE, 55, LIDDON ROAD","FLAT 8, NAPOLEON HOUSE, 55, LIDDON ROAD"
21376683,10070019997,RD06,100022887828,51.401269,0.031991,E,0.0,FLAT 2,NAPOLEON HOUSE,55,...,LIDDON ROAD,0,0,BROMLEY,BROMLEY,BROMLEY,BR1 2GP,BR1 2GP,"FLAT 2, NAPOLEON HOUSE, 55, LIDDON ROAD","FLAT 2, NAPOLEON HOUSE, 55, LIDDON ROAD"
21390613,10070020004,RD06,100022887828,51.401269,0.031991,E,0.0,FLAT 9,NAPOLEON HOUSE,55,...,LIDDON ROAD,0,0,BROMLEY,BROMLEY,BROMLEY,BR1 2GP,BR1 2GP,"FLAT 9, NAPOLEON HOUSE, 55, LIDDON ROAD","FLAT 9, NAPOLEON HOUSE, 55, LIDDON ROAD"
21423503,10070019999,RD06,100022887828,51.401269,0.031991,E,0.0,FLAT 4,NAPOLEON HOUSE,55,...,LIDDON ROAD,0,0,BROMLEY,BROMLEY,BROMLEY,BR1 2GP,BR1 2GP,"FLAT 4, NAPOLEON HOUSE, 55, LIDDON ROAD","FLAT 4, NAPOLEON HOUSE, 55, LIDDON ROAD"
21499252,10070020011,RD06,100022887828,51.401269,0.031991,E,0.0,FLAT 16,NAPOLEON HOUSE,55,...,LIDDON ROAD,0,0,BROMLEY,BROMLEY,BROMLEY,BR1 2GP,BR1 2GP,"FLAT 16, NAPOLEON HOUSE, 55, LIDDON ROAD","FLAT 16, NAPOLEON HOUSE, 55, LIDDON ROAD"


In [127]:
other_AB[(other_AB['administrative_area'] == 'BROMLEY') & 
        (other_AB['street_description'] == 'HIGH STREET')].to_csv('bromley_high_street_OTHER.csv', index = False)

#(other_AB['postcode_locator'] == 'BR1 1LF') & 

In [129]:
non_resi_address_merge_no_match5B.iloc[1]

ID                                                                          1544
planning_application_number                                             15/P0298
lpa_name                                                                  MERTON
application_type                                                  Prior Approval
application_type_full                      Prior Approval (Class O - formerly J)
description                    Prior approval in relation to the change of us...
number_of_units                                                             51.0
site_number_clean                                                       21 & 21A
street_name                                                          WILLOW LANE
postcode_clean                                                           CR4 4NA
site_name_clean                                      Clock House & Connect House
site_name_GLA                  Connect House, 21a And Clock House, 21, Willow...
site_name_LPA               

In [104]:
non_resi_address_merge_no_match5B.head(20)

Unnamed: 0,ID,planning_application_number,lpa_name,application_type,application_type_full,description,number_of_units,site_number_clean,street_name,postcode_clean,...,decision,status,application_date,decision_date,parsed_street_LPA,parsed_street_GLA,short_site_name_LPA],concat_addr,site_name_LPA_no_pc,site_name_GLA_no_pc
0,1544,15/P0298,MERTON,Prior Approval,Prior Approval (Class O - formerly J),Prior approval in relation to the change of us...,1.0,21 & 21A,WILLOW LANE,CR4 4NA,...,Prior Approval Granted,Completed,09/02/2015,29/03/2015,Willow Lane,,clock house 21 willow lane & connect house,"Clock House & Connect House, 21 & 21A, WILLOW ...","Clock House 21 Willow Lane & Connect House, 2...","Connect House, 21a And Clock House, 21, Willow..."
1,1544,15/P0298,MERTON,Prior Approval,Prior Approval (Class O - formerly J),Prior approval in relation to the change of us...,51.0,21 & 21A,WILLOW LANE,CR4 4NA,...,Prior Approval Granted,Completed,09/02/2015,29/03/2015,Willow Lane,,clock house 21 willow lane & connect house,"Clock House & Connect House, 21 & 21A, WILLOW ...","Clock House 21 Willow Lane & Connect House, 2...","Connect House, 21a And Clock House, 21, Willow..."
2,1544,15/P0298,MERTON,Prior Approval,Prior Approval (Class O - formerly J),Prior approval in relation to the change of us...,26.0,21 & 21A,WILLOW LANE,CR4 4NA,...,Prior Approval Granted,Completed,09/02/2015,29/03/2015,Willow Lane,,clock house 21 willow lane & connect house,"Clock House & Connect House, 21 & 21A, WILLOW ...","Clock House 21 Willow Lane & Connect House, 2...","Connect House, 21a And Clock House, 21, Willow..."
3,1544,15/P0404,MERTON,Prior Approval,Prior Approval (Class O - formerly J),Prior approval in relation to the change of us...,5.0,21 & 21A,WILLOW LANE,CR4 4NA,...,Prior Approval Granted,Completed,09/02/2015,01/04/2015,Willow Lane,,clock house 21 willow lane & connect house,"Clock House & Connect House, 21 & 21A, WILLOW ...","Clock House 21 Willow Lane & Connect House, 2...","Clock House And Connect House, 21-21a, Willow ..."
4,1544,13/P4059,MERTON,Prior Approval,Prior Approval: Change of use - offices to dwe...,PRIOR APPROVAL IN RELATION TO THE CHANGE OF US...,46.0,21 & 21A,WILLOW LANE,CR4 4NA,...,,Superseded,,16/02/2015,Willow Lane,,clock house 21 willow lane & connect house,"Clock House & Connect House, 21 & 21A, WILLOW ...","Clock House 21 Willow Lane & Connect House, 2...",
5,1544,14/P1849,MERTON,Prior Approval,Prior Approval: Change of use - offices to dwe...,PRIOR APPROVAL IN RELATION TO THE CHANGE OF US...,46.0,21 & 21A,WILLOW LANE,CR4 4NA,...,,Superseded,,16/07/2014,Willow Lane,,clock house 21 willow lane & connect house,"Clock House & Connect House, 21 & 21A, WILLOW ...","Clock House 21 Willow Lane & Connect House, 2...",
6,1544,14/P3278,MERTON,Prior Approval,Prior Approval: Change of use - offices to dwe...,PRIOR APPROVAL IN RELATION TO THE CHANGE OF US...,101.0,21 & 21A,WILLOW LANE,CR4 4NA,...,,Superseded,,23/10/2014,Willow Lane,,clock house 21 willow lane & connect house,"Clock House & Connect House, 21 & 21A, WILLOW ...","Clock House 21 Willow Lane & Connect House, 2...",
7,1551,18/P0981,MERTON,Prior Approval,Prior Approval (Class M - formerly IA),PRIOR APPROVAL FOR CHANGE OF USE FROM RETAIL (...,1.0,107b,CENTRAL ROAD,SM4 5SQ,...,Prior Approval Granted,Completed,13/03/2018,16/04/2018,,,107b central road morden sm4 5sq,"107b, CENTRAL ROAD",107b Central Road Morden,"107, Central Road"
8,1551,18/P0981,MERTON,Prior Approval,Prior Approval (Class M - formerly IA),PRIOR APPROVAL FOR CHANGE OF USE FROM RETAIL (...,1.0,107b,CENTRAL ROAD,SM4 5SQ,...,Prior Approval Granted,Completed,13/03/2018,16/04/2018,,,107b central road morden sm4 5sq,"107b, CENTRAL ROAD",107b Central Road Morden,"107, Central Road"
10,1560,19/00597/CUTA3,BROMLEY,Prior Approval,Prior Approval: Change of use - retail/service...,Change of use of the ground floor from betting...,0.0,16 - 18,HIGH STREET,BR1 1EA,...,GRPA,Approved,20/02/2019,16/04/2019,High Street,,16 - 18 high street bromley br1 1ea,"16 - 18, HIGH STREET",16 - 18 High Street Bromley,


In [71]:
## ------------------------------- Post Match  -------------------------------

In [273]:
#print match rate
resi_match_rate = 100 * all_matched['UCL ID'].nunique() / london_data['UCL ID'].nunique()

print('Resi Match rate: ', resi_match_rate) 

Resi Match rate:  30.037395512538495


In [275]:
other_match_rate = 100 * non_resi_all_matched['UCL ID'].nunique() / london_data['UCL ID'].nunique()

print('Other Match rate: ', other_match_rate) 

Other Match rate:  10.800703915530136


In [276]:
#before export, make all these columns blank instead of 0 
#'legal_name', 'sub_building_name', 'building_name','building_number','street_description', 'dependent_locality	locality
all_matched['street_name'] = all_matched['street_name'].replace(0,'')
all_matched['legal_name'] = all_matched['legal_name'].replace(0,'')
all_matched['sub_building_name'] = all_matched['sub_building_name'].replace(0,'')
all_matched['building_name'] = all_matched['building_name'].replace(0,'')
all_matched['building_number'] = all_matched['building_number'].replace(0,'')
all_matched['street_description'] = all_matched['street_description'].replace(0,'')
all_matched['dependent_locality'] = all_matched['dependent_locality'].replace(0,'')
all_matched['locality'] = all_matched['locality'].replace(0,'')
all_matched['post_town'] = all_matched['post_town'].replace(0,'')
all_matched['postcode'] = np.where(all_matched['postcode'] == 0, all_matched['postcode_clean'], all_matched['postcode'])

In [279]:
## if exists
all_matched['sao_start_number'] = all_matched['sao_start_number'].replace(0,'')
all_matched['sao_start_suffix'] = all_matched['sao_start_suffix'].replace(0,'')
all_matched['sao_end_number'] = all_matched['sao_end_number'].replace(0,'')
all_matched['sao_end_suffix'] = all_matched['sao_end_suffix'].replace(0,'')
all_matched['sao_text'] = all_matched['sao_text'].replace(0,'')
all_matched['pao_start_number'] = all_matched['pao_start_number'].replace(0,'')
all_matched['pao_start_suffix'] = all_matched['pao_start_suffix'].replace(0,'')
all_matched['pao_end_number'] = all_matched['pao_end_number'].replace(0,'')
all_matched['pao_end_suffix'] = all_matched['pao_end_suffix'].replace(0,'')
all_matched['pao_text'] = all_matched['pao_text'].replace(0,'')

In [280]:
#write out to csv 

all_matched.to_csv('Batch3_address_matched_1305.csv', index=False) 

non_resi_all_matched.to_csv('Batch3_non_resi_match_1305.csv', index=False)  

In [278]:
non_resi_address_merge_no_match6.to_csv('Batch3_no_match_1305.csv', index = False)