In [1]:

#In this script, we read in the OSAddressBase file. We only take in the fields that we need for the matching process.
# Then, we combine a few fields to create new address columns
# We also add a column of postcode sector (the postcode minus the last two characters)
# Then we divide the data into a residential dataset and other dataset based on 'class'


## INPUTS: from the OneDrive: OS ADDRESSBASE file ab_plus_england_202308150944.csv
## OUTPUTS: Residential and Other address base CSVs

In [2]:
import copy
import os
import re
import time

import numpy as np
import pandas as pd

In [3]:
# Create a generator that iterates through a large CSV in chunks. Used for the OS AddressBase file.
def read_csv(file_name, columns, chunksize=10000):
    """Yield successive DataFrame chunks of ``file_name`` restricted to ``columns``.

    Parameters
    ----------
    file_name : str or path-like
        CSV file to read.
    columns : list of str
        Names of the columns to load (passed to ``usecols``).
    chunksize : int, default 10000
        Maximum number of rows in each yielded chunk.

    Yields
    ------
    pandas.DataFrame
        One chunk of the file with the dtypes listed below applied to the
        columns that are present; other columns are inferred by pandas.
    """
    # dtype entries for columns that are not loaded are ignored by pandas, so a
    # misspelled key has no effect: the keys must match the CSV header exactly.
    column_dtypes = {
        'uprn': 'int64',
        # parent_uprn is empty for records without a parent, so it cannot be a
        # plain integer column; float64 keeps every chunk on the same dtype.
        'parent_uprn': 'float64',
        'class': str,
        'latitude': float,
        'longitude': float,
        'country': str,
        'legal_name': str,
        'sub_building_name': str,
        'building_name': str,
        'building_number': 'float64',
        'sao_start_number': 'float64',
        'sao_start_suffix': str,
        'sao_end_number': 'float64',
        'sao_end_suffix': str,
        'sao_text': str,
        'pao_start_number': 'float64',
        'pao_start_suffix': str,
        'pao_end_number': 'float64',
        'pao_end_suffix': str,
        'pao_text': str,
        'street_description': str,
    }

    # The context manager closes the underlying file handle even if the caller
    # stops iterating before the file is exhausted.
    with pd.read_csv(file_name, chunksize=chunksize, usecols=columns,
                     dtype=column_dtypes, low_memory=False) as reader:
        yield from reader


In [4]:
# Reference only (not executed): alternative column selections for the AddressBase file.
#
# fewer_columns = ['uprn', 'parent_uprn', 'class', 'latitude', 'longitude', 'country']
# all_columns = ['uprn', 'parent_uprn', 'class', 'latitude', 'longitude', 'country', 'legal_name','sub_building_name',
#            'building_name','building_number','sao_start_number','sao_start_suffix','sao_end_number',
#            'sao_end_suffix','sao_text','alt_language_sao_text','pao_start_number','pao_start_suffix',
#            'pao_end_number','pao_end_suffix','pao_text','alt_language_pao_text','usrn','usrn_match_indicator',
#            'area_name','level','official_flag','os_address_toid','os_address_toid_version','os_roadlink_toid',
#            'os_roadlink_toid_version','os_topo_toid','os_topo_toid_version','voa_ct_record','voa_ndr_record',
#            'street_description','alt_language_street_description','dependent_thoroughfare','thoroughfare',
#            'double_dependent_locality','dependent_locality','locality', 'town_name',
#            'administrative_area','post_town','postcode','postcode_locator']
#
# Leftover dtype entries (they look like candidates for the dtype mapping in read_csv - TODO confirm):
#   'dependent_locality': str, 'locality': object, 'town_name': str,
#   'administrative_area': str, 'post_town': str, 'postcode': str,
#   'postcode_locator': str

# Columns loaded for the address-matching process.
address_matching_columns = [
    'uprn', 'parent_uprn', 'class', 'latitude', 'longitude', 'country',
    'legal_name', 'sub_building_name', 'building_name', 'building_number',
    'sao_start_number', 'sao_start_suffix', 'sao_end_number',
    'sao_end_suffix', 'sao_text', 'pao_start_number', 'pao_start_suffix',
    'pao_end_number', 'pao_end_suffix', 'pao_text', 'street_description',
    'dependent_locality', 'locality', 'town_name', 'administrative_area',
    'post_town', 'postcode', 'postcode_locator',
]

# Location of the AddressBase CSV. Set the ADDRESS_BASE_FILE environment variable to
# point at a copy elsewhere; the default is the original author's local path.
address_base_file = os.environ.get(
    "ADDRESS_BASE_FILE",
    "/Users/christine/Documents/_UCL_grad school/research/research_pd_24/data/ab_plus_england_202308150944.csv",
)


In [5]:
# Read in the AddressBase data (about 32 million rows) chunk by chunk and time the load.
t = time.time()
df_lst = []

# Collect every chunk yielded by read_csv. No per-chunk copy is taken:
# pd.concat below builds a new frame from the pieces anyway.
for df in read_csv(address_base_file, address_matching_columns):
    df_lst.append(df)

# Combine the chunks into the single working dataframe 'df_final'.
df_final = pd.concat(df_lst)
print('seconds: ', time.time() - t)

seconds:  331.6477301120758


In [6]:
def combine_columns_1(row):
    """Build an address string from the SAO/PAO fields of one record.

    Joins 'sao_text', 'pao_text', 'pao_start_number' and 'street_description'
    (in that order) with ', ', leaving out parts that are missing
    (None / NaN / pd.NA) or empty.

    Parameters
    ----------
    row : mapping or pandas.Series
        A record that can be indexed by the four column names above.

    Returns
    -------
    str
        The joined address, or '' when every part is missing or empty.
    """
    columns = ['sao_text', 'pao_text', 'pao_start_number', 'street_description']
    parts = []
    for name in columns:
        value = row[name]
        # str() would turn a missing value into the text 'nan', 'None' or '<NA>',
        # which is non-empty and would leak into the address, so test it first.
        if pd.isna(value):
            continue
        text = str(value)
        if text:
            parts.append(text)
    return ', '.join(parts)

In [7]:
def combine_columns_2(row):
    """Build an address string from the building fields of one record.

    Joins 'sub_building_name', 'building_name', 'building_number' and
    'street_description' (in that order) with ', ', leaving out parts that are
    missing (None / NaN / pd.NA) or empty.

    Parameters
    ----------
    row : mapping or pandas.Series
        A record that can be indexed by the four column names above.

    Returns
    -------
    str
        The joined address, or '' when every part is missing or empty.
    """
    columns = ['sub_building_name', 'building_name', 'building_number', 'street_description']
    parts = []
    for name in columns:
        value = row[name]
        # str() would turn a missing value into the text 'nan', 'None' or '<NA>',
        # which is non-empty and would leak into the address, so test it first.
        if pd.isna(value):
            continue
        text = str(value)
        if text:
            parts.append(text)
    return ', '.join(parts)

In [8]:
# Render the building number as text: go through the nullable integer type first so a
# value read as 12.0 becomes "12"; missing numbers come out as the text "<NA>".
df_final['building_number'] = (
    df_final['building_number']
    .astype('Int64')
    .astype('str')
)

In [10]:
# Same treatment for the PAO start number: nullable integer first, then text, so
# missing numbers come out as the text "<NA>" instead of breaking the integer cast.
df_final['pao_start_number'] = (
    df_final['pao_start_number']
    .astype('Int64')
    .astype('str')
)

In [12]:
# Replace every NaN (and negative infinity) in the frame with an empty string.
df_final = df_final.replace(to_replace=[np.nan, -np.inf], value='')


In [13]:
# The string casts of the number columns (e.g. building number) leave the text '<NA>'
# where the number was missing; turn those into empty strings as well.
df_final = df_final.replace(to_replace='<NA>', value='')

In [14]:
# Preview the first five rows of the cleaned frame.
df_final.head(n=5)

Unnamed: 0,uprn,class,parent_uprn,latitude,longitude,country,legal_name,sub_building_name,building_name,building_number,...,pao_end_suffix,pao_text,street_description,dependent_locality,locality,town_name,administrative_area,post_town,postcode,postcode_locator
0,10012778289,RD02,,54.036005,-2.340833,E,,,,,...,,SCOUTBER END FARM,OLD OLIVER LANE TO RAGGED HALL,,RATHMELL,SETTLE,NORTH YORKSHIRE,,,BD24 0LP
1,10013268085,LU01,,51.628915,0.471228,E,,,,,...,,MEEPSWOOD,PARK LANE,,RAMSDEN HEATH,CHELMSFORD,ESSEX,,,CM11 1NN
2,10013819934,RG02,10013819576.0,54.780687,-1.510651,E,,,,,...,,GARAGE SITE 53,DOWSEY ROAD,,,SHERBURN,DURHAM,,,DH6 1JH
3,10014205932,PS,,51.590568,0.600675,E,,,,,...,,STREET RECORD,FOOTPATH FROM GLEBE DRIVE TO THE APPROACH,,,RAYLEIGH,ESSEX,,,SS6 9HJ
4,10014309637,RD04,200003655499.0,51.26651,0.497553,E,,,,,...,,,MILTON STREET,,,MAIDSTONE,KENT,,,ME16 8LD


In [15]:
%%time
## Add the column that adds parsed string to the entire dataset...
df_final['parsed_address1'] = df_final.apply(combine_columns_1, axis = 1)
df_final['parsed_address2'] = df_final.apply(combine_columns_2, axis = 1)

CPU times: user 17min 38s, sys: 6min 11s, total: 23min 49s
Wall time: 28min 21s


In [16]:
# Also create a third, alternate address column from building name, town name and post town.
# Missing values were replaced with '' earlier in this notebook, so a null check alone
# removes nothing and would produce strings such as ', SETTLE, '. Empty strings are
# therefore skipped as well as nulls.
df_final['parsed_address3'] = df_final[['building_name', 'town_name', 'post_town']].agg(
    lambda parts: ', '.join(str(part) for part in parts if pd.notna(part) and part != ''),
    axis=1,
)

In [17]:
# Create a new column holding the postcode sector (the postcode minus its last two characters).
# The vectorised .str accessor replaces a per-row Python lambda and leaves a missing
# postcode as missing instead of raising.
df_final['postcode_sector'] = df_final['postcode_locator'].str[:-2]

In [18]:
# Divide the AddressBase data into residential use classes (class code starting with 'R')
# and all other use classes. The mask is computed once; na=False sends records without a
# class code to the "other" set instead of failing the boolean selection.
is_residential = df_final['class'].str.startswith('R', na=False)
resi_AB = df_final[is_residential]
other_AB = df_final[~is_residential]

In [20]:
# Write the residential subset to CSV in the working directory, without the index column.
resi_AB.to_csv(path_or_buf='resi_AB_cleaned_parsed_140624.csv', index=False)

In [21]:
# Write the non-residential subset to CSV in the working directory, without the index column.
other_AB.to_csv(path_or_buf='other_AB_cleaned_parsed_140624.csv', index=False)