# Location Name Filter
Processing of locations scraped from OneMap. The code extracts the relevant names based on the raw data format in data/singapore-postal-codes. First import `json`, then define a function to read the relevant files.

In [1]:
import json
import re

data_path = '../data/singapore-postal-codes/'

def load_data(file):
    """Parse one JSON file from the raw-data folder and return its contents.

    ``file`` is a file name that is appended to the module-level ``data_path``;
    the file is read as UTF-8.
    """
    source = data_path + file
    with open(source, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

A function to properly capitalise location names is placed here to facilitate cleaning location names for all values later.

In [2]:
# Function to properly capitalise and clean values
# str.title() treats an apostrophe as a word boundary and so upper-cases a
# possessive "s" ("King'S"). This pattern finds exactly that case: an "'S"
# that follows a letter and ends the word. Names such as "D'Almeida" are not
# matched because the letter after the apostrophe does not end the word.
_POSSESSIVE_SUFFIX = re.compile(r"(?<=[^\W\d_])['’]S\b")


def properly_capitalise(location_name):
    """Return ``location_name`` in title case, keeping possessives lower-case.

    Args:
        location_name: The raw name as a string, in any letter case.

    Returns:
        The name with every word capitalised (as ``str.title`` does), except
        that a trailing possessive ``'S`` is written ``'s``.
    """
    title_cased = location_name.lower().title()
    return _POSSESSIVE_SUFFIX.sub(lambda match: match.group(0).lower(), title_cased)

# MRT/LRT Station Names Extraction
A function to extract station names, based on the raw format of the JSON they came in.

In [3]:
# function to extract mrt & lrt station names
def stn_name_filter(data):
    """Return the lower-cased station name found in each entry of ``data``.

    Each entry must have a "Possible Locations" key. The name is taken from the
    "SEARCHVAL" field of the first location that has a non-empty string there;
    entries with no such location contribute nothing to the result.

    The values are read from the parsed structure directly rather than sliced
    out of a re-serialised JSON string, so the result does not depend on the
    order of the keys or on characters that JSON would escape.

    Args:
        data: A sequence of station entries (dicts).

    Returns:
        A list of lower-cased station names, at most one per entry, in input order.

    Raises:
        KeyError: If an entry has no "Possible Locations" key.
    """
    list_of_stns = []
    for entry in data:
        locations = entry["Possible Locations"]
        # Tolerate a single location stored as a bare dict instead of a list
        if isinstance(locations, dict):
            locations = [locations]

        for location in locations or []:
            if not isinstance(location, dict):
                continue
            search_value = location.get("SEARCHVAL")
            if isinstance(search_value, str) and search_value:
                list_of_stns.append(search_value.lower())
                # only the first usable name per entry is kept
                break

    return list_of_stns

Extraction, updating and cleaning of LRT station names

In [4]:
#load LRT station names and append missing stations
lrt_data0 = load_data("lrt_stations.json")
uncleaned_lrt_stns = [
    *stn_name_filter(lrt_data0),
    # stations added by hand
    'senja lrt station',
    'sengkang lrt station',
    'punggol lrt station',
]

In [5]:
#Properly capitalise LRT station names and add to list
# Every station contributes two spellings: the full "<Name> LRT Station" and
# the bare "<Name>". A name without the " LRT Station" suffix yields the same
# string twice, and the input may repeat a station, so each spelling is only
# added the first time it is seen (input order is kept).
lrt_stns = []
seen_lrt_names = set()
for station in uncleaned_lrt_stns:
    lrt_stn_name = properly_capitalise(station).replace("Lrt", "LRT")
    lrt_no_stn = lrt_stn_name.replace(" LRT Station", '')
    for name_variant in (lrt_stn_name, lrt_no_stn):
        if name_variant not in seen_lrt_names:
            seen_lrt_names.add(name_variant)
            lrt_stns.append(name_variant)

Extraction, updating and cleaning of MRT station names

In [6]:
#load MRT station names and append missing stations
mrt_data0 = load_data("mrt_stations.json")
uncleaned_mrt_stns = [
    *stn_name_filter(mrt_data0),
    # stations added by hand
    'woodlands north mrt station',
    'woodlands south mrt station',
    'springleaf mrt station',
    'lentor mrt station',
    'mayflower mrt station',
    'bright hill mrt station',
    'upper thomson mrt station',
]

In [7]:
#Properly capitalise MRT station names and add to list
# Every station contributes two spellings: the full "<Name> MRT Station" and
# the bare "<Name>". A name without the " MRT Station" suffix yields the same
# string twice, and the input may repeat a station, so each spelling is only
# added the first time it is seen (input order is kept).
mrt_stns = []
seen_mrt_names = set()
for station in uncleaned_mrt_stns:
    mrt_stn_name = properly_capitalise(station).replace("Mrt", "MRT")
    mrt_no_stn = mrt_stn_name.replace(" MRT Station", '')
    for name_variant in (mrt_stn_name, mrt_no_stn):
        if name_variant not in seen_mrt_names:
            seen_mrt_names.add(name_variant)
            mrt_stns.append(name_variant)


# Building Names Extraction

From buildings.json, several categories of location values were selected that are meaningful to everyday identification of building locations. The values for POSTAL, BUILDING, ROAD_NAME and BLK_NO were picked out, correctly capitalised and put together to give the following list of values:

SIMPLE ADDRESS - created from the cleaned values of BLK_NO and ROAD_NAME  
POSTCODE - created by prepending "Singapore " to POSTAL  
BUILDING NAME - created by cleaning BUILDING  
ROAD NAME - created by cleaning ROAD_NAME

These lists are segregated so that they can later be filtered as separate entity tags, rather than a single "LOC". For now, they will be concatenated together into a single JSON file.

Previously, SIMPLE ADDRESS was created by taking SEARCHVAL, then deleting the postcode "SINGAPORE XXXXXX" via RegEx, and the building name if duplicated. However, this process was not precise enough - certain private estate names were entered as the building name in the raw data, which resulted in even road names being filtered out of SIMPLE ADDRESS. This left the filtered data with entries that were only numbers, which messed up the EntityRuler later on in the NER creation process.

In [8]:
# Load the scraped building records. The cells below index this object by
# position and read the POSTAL, BUILDING, ROAD_NAME and BLK_NO keys of each entry.
buildings_data0 = load_data("buildings.json")

In [9]:
#creating four separate lists for building name, address, postcode and road name. not sure if it is meaningful to split but doing so for now if we need it in the future.
buildings_name_list, buildings_address_list, buildings_postcode, road_names_list = (
    [] for _ in range(4)
)

In [10]:
def extract_value(value, record=None):
    """Return one field of a building record, cleaned with ``properly_capitalise``.

    Args:
        value: The key to read from the record (for example "POSTAL").
        record: The building entry to read from. When omitted, the function
            falls back to ``buildings_data0[i]``, i.e. the module-level data and
            loop index, which keeps existing one-argument calls working.

    Returns:
        The capitalised text of the field. String fields are used as they are;
        any other JSON value is serialised with ``json.dumps`` first.

    Raises:
        KeyError: If the record has no such key.
    """
    if record is None:
        # legacy behaviour: rely on the globals set up by the calling loop
        record = buildings_data0[i]

    raw_value = record[value]
    if isinstance(raw_value, str):
        # Use the text directly: round-tripping a string through json.dumps
        # would escape non-ASCII characters and embedded quotes.
        text = raw_value
    else:
        text = json.dumps(raw_value, sort_keys=True)
    return properly_capitalise(text)

In [11]:
#running through each item in the scraped buildings list
# The index has to be a global named `i`: extract_value() looks up
# buildings_data0[i] on its own when it is called with just a key.
# Re-running this cell appends to the same lists again, so re-run the cell
# that creates the empty lists first.
for i in range(len(buildings_data0)):
    buildings_postcode.append("Singapore " + extract_value("POSTAL"))
    buildings_name_list.append(extract_value('BUILDING'))

    street = extract_value('ROAD_NAME')
    road_names_list.append(street)

    # simple address = block number followed by the road name
    buildings_address_list.append(" ".join((extract_value("BLK_NO"), street)))

In [12]:
#function to remove duplicate items in list
def duplicate_remover(thelist):
    """Return the distinct items of ``thelist`` as a new sorted list.

    Prints one line reporting the number of items before and after the
    duplicates were dropped. The input list is not modified.
    """
    count_before = len(thelist)
    deduplicated = sorted(set(thelist))
    print(f"Number of unique items reduced from {count_before} to {len(deduplicated)}")
    return deduplicated

In [13]:
#putting unique items into their finalised lists
# The unpacking consumes map() left to right, so the four reports are printed
# in the same order as the names on the left-hand side.
(
    new_buildings_name_list,
    new_buildings_address_list,
    new_buildings_postcode,
    new_road_names_list,
) = map(
    duplicate_remover,
    (buildings_name_list, buildings_address_list, buildings_postcode, road_names_list),
)

Number of unique items reduced from 141726 to 16165
Number of unique items reduced from 141726 to 132757
Number of unique items reduced from 141726 to 121361
Number of unique items reduced from 141726 to 3867


# Transfer of Data to JSON

In [14]:
# NOTE(review): this goes up two directories ("../../data/") while the input
# folder `data_path` goes up one ("../data/") - confirm which is intended.
save_data_path = "../../data/extracted_locations/"

def save_data(file, data):
    """Write ``data`` as indented JSON to ``file`` inside ``save_data_path``.

    The file is opened in write mode as UTF-8, so an existing file of the same
    name is overwritten.
    """
    destination = save_data_path + file
    with open(destination, mode="w", encoding="utf-8") as out_file:
        json.dump(data, out_file, indent=4)

# one output file per extracted list, written in this order
for output_file, extracted_values in (
    ('extracted_mrt_stns.json', mrt_stns),
    ('extracted_lrt_stns.json', lrt_stns),
    ('extracted_buildings_address_list.json', new_buildings_address_list),
    ('extracted_buildings_name_list.json', new_buildings_name_list),
    ('extracted_buildings_postcode.json', new_buildings_postcode),
    ('extracted_road_names_list.json', new_road_names_list),
):
    save_data(output_file, extracted_values)