In [1]:
""" Script to clean datasets
    Input: AddressBase
    Output: Address Base -> Resi_AB and Other_AB

Written by: Christine Langston, March 2024
"""
import pandas as pd
import time
import numpy as np
import copy
import re

In [2]:
#create a function to iterate through csv. use for OSAddressBase file
def read_csv(file_name, columns):
    for chunk in pd.read_csv(file_name, chunksize=10000, usecols=columns, 
                            dtype={'urpn':int, 'parent_urpn': int, 'class': str, 'latitude': float, 
                                    'longitude': float, 'country': str, 'legal_name': str, 
                                      'sub_building_name': str, 'building_name': str, 'building_number':'float64', 
                                  'sao_start_number': 'float64', 'sao_start_suffix': str,'sao_end_number': 'float64', 
                                  'sao_end_suffix': str, 'sao_text': str, 'pao_start_number': 'float64', 'pao_start_suffix':str, 
                                  'pao_end_number':'float64', 'pao_end_suffix': str , 'pao_text': str, 'street_description': str} 
                               , 
                             low_memory=False):
        yield chunk


In [5]:
# Set the columns and datasource
"""
fewer_columns  = ['uprn', 'parent_uprn', 'class', 'latitude', 'longitude', 'country' ]
all_columns = ['uprn', 'parent_uprn', 'class', 'latitude', 'longitude', 'country', 'legal_name','sub_building_name',
           'building_name','building_number','sao_start_number','sao_start_suffix','sao_end_number',
           'sao_end_suffix','sao_text','alt_language_sao_text','pao_start_number','pao_start_suffix',
           'pao_end_number','pao_end_suffix','pao_text','alt_language_pao_text','usrn','usrn_match_indicator',
           'area_name','level','official_flag','os_address_toid','os_address_toid_version','os_roadlink_toid',
           'os_roadlink_toid_version','os_topo_toid','os_topo_toid_version','voa_ct_record','voa_ndr_record',
           'street_description','alt_language_street_description','dependent_thoroughfare','thoroughfare',
           'double_dependent_locality','dependent_locality','locality', 'town_name', 
          'administrative_area','post_town','postcode','postcode_locator' ]
"""

address_matching_columns = ['uprn', 'parent_uprn', 'class', 'latitude', 'longitude', 'country',
                    'legal_name','sub_building_name', 'building_name','building_number',
                    'sao_start_number','sao_start_suffix','sao_end_number',
           'sao_end_suffix','sao_text', 'pao_start_number','pao_start_suffix',
           'pao_end_number','pao_end_suffix','pao_text', 'street_description',
                    'dependent_locality','locality', 'town_name', 'administrative_area','post_town','postcode', 
                                                        'postcode_locator', 'parsed_address' ]

address_base_file = "/Users/christine/Documents/_UCL_grad school/research/resi_AB_cleaned_parsed_280324.csv"


In [6]:
#read in the address base data - 32 Million rows
t = time.time()
df_lst = [] 


# Iterate over the file based on the criteria and append to the list
for df in read_csv(address_base_file, address_matching_columns):
    df_lst +=   [df.copy()] 

# And finally combine filtered df_lst into the final larger output say 'df_final' dataframe 
df_final = pd.concat(df_lst)
print('seconds: ', time.time() - t)

seconds:  229.36897921562195


In [9]:
#check the data... 

df_final = df_final.replace([np.nan, -np.inf], '')


In [10]:
df_final.head()

Unnamed: 0,uprn,class,parent_uprn,latitude,longitude,country,legal_name,sub_building_name,building_name,building_number,...,pao_text,street_description,dependent_locality,locality,town_name,administrative_area,post_town,postcode,postcode_locator,parsed_address
0,10012778289,RD02,,54.036005,-2.340833,E,,,,,...,SCOUTBER END FARM,OLD OLIVER LANE TO RAGGED HALL,,RATHMELL,SETTLE,NORTH YORKSHIRE,,,BD24 0LP,"<NA>, SCOUTBER END FARM, OLD OLIVER LANE TO RA..."
1,10013819934,RG02,10013819576.0,54.780687,-1.510651,E,,,,,...,GARAGE SITE 53,DOWSEY ROAD,,,SHERBURN,DURHAM,,,DH6 1JH,"<NA>, 22.0, GARAGE SITE 53, DOWSEY ROAD"
2,10014309637,RD04,200003655499.0,51.26651,0.497553,E,,,,,...,,MILTON STREET,,,MAIDSTONE,KENT,,,ME16 8LD,"<NA>, ROOM 1, 85.0, MILTON STREET"
3,10033213520,RD01,10002821654.0,53.903252,-0.158754,E,,,,,...,LONGBEACH LEISURE PARK,HORNSEA BURTON ROAD,,,HORNSEA,EAST RIDING OF YORKSHIRE,,,HU18 1TL,"<NA>, 33 SOUTHFIELD, LONGBEACH LEISURE PARK, H..."
4,10033545757,RD06,100023479131.0,51.514983,-0.17926,E,,,,,...,,WESTBOURNE TERRACE,,,LONDON,CITY OF WESTMINSTER,,,W2 3UJ,"<NA>, SECOND FLOOR, 58.0, WESTBOURNE TERRACE"


In [11]:
### ------ DATA CLEANING  ----------
#edit the parent uprn from OSAddress base to fill out to be strings with 12 digits and leading zeros
#replace nan
#df_final = df_final.replace([np.nan, -np.inf], 0)

#cast as integer
#df_final['parent_uprn'] = df_final['parent_uprn'].astype('Int64')
#df_final['uprn'] = df_final['uprn'].astype('Int64')


In [14]:
### ------ DATA CLEANING  ----------
#cast as string 
df_final['parent_uprn'] = df_final['parent_uprn'].astype(str)
df_final['uprn'] = df_final['uprn'].astype(str)

#fill in with left side padding zeros 
df_final['parent_uprn'] = df_final['parent_uprn'].apply(lambda x: '{0:0>12}'.format(x))
df_final['uprn'] = df_final['uprn'].apply(lambda x: '{0:0>12}'.format(x))

#print(df_final['parent_uprn'])

In [13]:
df_final.info

<bound method DataFrame.info of                   uprn class     parent_uprn   latitude  longitude country  \
0          10012778289  RD02                  54.036005  -2.340833       E   
1          10013819934  RG02   10013819576.0  54.780687  -1.510651       E   
2          10014309637  RD04  200003655499.0  51.266510   0.497553       E   
3          10033213520  RD01   10002821654.0  53.903252  -0.158754       E   
4          10033545757  RD06  100023479131.0  51.514983  -0.179260       E   
...                ...   ...             ...        ...        ...     ...   
26492122  100061604724  RD02                  51.188899  -0.815405       E   
26492123  100070351297  RD04                  52.453545  -1.981924       E   
26492124   10007935972  RD06  100091465340.0  51.790332   1.161525       E   
26492125   10070040744  RD03                  51.902065  -0.367300       E   
26492126   10070372825  RD06                  53.677436  -2.662616       E   

         legal_name sub_buildin

In [28]:
### ------ DATA CLEANING  ----------

#Address matching data clean, make building number into a string 
df_final['building_number'] = df_final['building_number'].astype('Int64').astype('str') 

In [29]:
#split the AddressBase into Residential uses and all others
# 26,492,127 data ponits
resi_AB = df_final[df_final['class'].str.startswith('R')] 
other_AB = df_final[df_final['class'].str.startswith('R') == False] 

In [30]:
# DATA EXPLORATION - EXPORT CSV IF NEEDED 
resi_AB.to_csv('resi_AB_cleaned.csv', index = False)

In [31]:
# DATA EXPLORATION - EXPORT CSV IF NEEDED 
other_AB.to_csv('other_AB_cleaned.csv', index = False)

In [114]:
##-------- DATA MERGING --------  STRATEGY 2 ADDRESS MATCH
# concat AB Building number + AB street description = site_name_LPA before comma
    #  and postcode  = postcode 
#create new column and oncatenate the buildign number and street

resi_AB['number_street'] = np.where(resi_AB['building_name'] != 0,  resi_AB['building_name'].apply(lambda x: re.sub('[A-Z]', '',x) if type(x) == str else x)
, resi_AB['building_number'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  resi_AB['number_street'] = np.where(resi_AB['building_name'] != 0,  resi_AB['building_name'].apply(lambda x: re.sub('[A-Z]', '',x) if type(x) == str else x)


In [115]:
resi_AB['number_street'] =  resi_AB['number_street'] + ' ' + resi_AB['street_description'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  resi_AB['number_street'] =  resi_AB['number_street'] + ' ' + resi_AB['street_description'].str.lower()
