In [None]:
# get all the different addresses

import pandas as pd

train = pd.read_csv("data/train_origin.csv")
test = pd.read_csv("data/test_origin.csv")

# Give both tables the same normalized key: "<BLOCK> <STREET>", with each part
# cast to str and trimmed, and the joined string upper-cased.
for df in (train, test):
    block_part = df["BLOCK"].astype(str).str.strip()
    street_part = df["STREET"].astype(str).str.strip()
    df["address"] = (block_part + " " + street_part).str.upper()

train_addr = set(train["address"].unique())
test_addr = set(test["address"].unique())

# Sorted union of the distinct addresses seen in either table.
union_addr = sorted(train_addr | test_addr)

union_df = pd.DataFrame({"address": union_addr})
union_df.to_csv("union_address.csv", index=False)

print("file saved: union_address.csv")
print(union_df.head())

In [None]:
# get corresponding coordinates using Google API

import pandas as pd
import requests
import time

# Read the Geocoding API key from the environment so the real key never has to
# be committed with the notebook; "xxx" stays as the placeholder fallback.
import os

GOOGLE_KEY = os.environ.get("GOOGLE_MAPS_API_KEY", "xxx")

def get_coords_google(address):
    """Geocode a single address with the Google Geocoding API.

    ", Singapore" is appended to ``address`` before the lookup.

    Returns:
        ``(lat, lng)`` taken from the first result when the response status is
        "OK"; ``(None, None)`` otherwise. Failures are printed rather than
        raised so that one bad lookup does not abort a long batch run.
    """
    url = "https://maps.googleapis.com/maps/api/geocode/json"
    params = {
        "address": address + ", Singapore",
        "key": GOOGLE_KEY
    }

    try:
        # requests has no default timeout; without one a stalled connection
        # would hang the whole batch indefinitely.
        res = requests.get(url, params=params, timeout=10)
        data = res.json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers a response body that is not valid JSON.
        print("request failed:", address, "| error:", exc)
        return None, None

    if data.get("status") == "OK" and data.get("results"):
        loc = data["results"][0]["geometry"]["location"]
        return loc["lat"], loc["lng"]
    else:
        print("address not found:", address, "| status:", data.get("status"))
        return None, None


# Load the address list produced by the address-union step. That step writes
# "union_address.csv" (singular); the previous "union_addresses.csv" did not
# match it and failed with FileNotFoundError on a fresh run.
df = pd.read_csv("union_address.csv")

latitudes, longitudes = [], []

# Geocode every address in file order; failed lookups contribute (None, None)
# so the two lists stay aligned with the rows of df.
for i, street in enumerate(df["address"], start=1):
    lat, lng = get_coords_google(street)
    latitudes.append(lat)
    longitudes.append(lng)
    print(f"{i}/{len(df)} {street}: ({lat}, {lng})")
    # Short pause between consecutive requests.
    time.sleep(0.2)

df["LATITUDE"] = latitudes
df["LONGITUDE"] = longitudes

df.to_csv("address_with_coords.csv", index=False)
print("file saved: address_with_coords.csv")


In [None]:
# add the coordinates to train and test tables

import pandas as pd

df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
df_addr = pd.read_csv("address_with_coords.csv")

# Build the join key exactly the way the geocoded address list was built:
# cast both parts to str, trim surrounding whitespace, join with one space and
# upper-case. Skipping the trim / str cast here would make padded or missing
# values silently fail to match and end up without coordinates.
for df in [df_train, df_test]:
    df["BLOCK_UPPER"] = df["BLOCK"].astype(str).str.strip().str.upper()
    df["STREET_UPPER"] = df["STREET"].astype(str).str.strip().str.upper()
    df["address"] = df["BLOCK_UPPER"] + " " + df["STREET_UPPER"]

df_addr["address"] = df_addr["address"].str.upper()
# One row per address so the left merges below can never multiply rows.
df_addr = df_addr.drop_duplicates(subset="address")

# Report addresses that have no entry in the coordinate table at all.
for label, frame in (("train", df_train), ("test", df_test)):
    n_unmatched = int((~frame["address"].isin(df_addr["address"])).sum())
    if n_unmatched:
        print(f"{label}: {n_unmatched} rows have no matching address in address_with_coords.csv")

df_train = df_train.merge(df_addr, on="address", how="left")
df_test = df_test.merge(df_addr, on="address", how="left")

df_train.to_csv("train_with_coords.csv", index=False)
df_test.to_csv("test_with_coords.csv", index=False)
# The original message contained a mis-encoded character after "file saved".
print("file saved: train_with_coords.csv / test_with_coords.csv")


In [None]:
# calculate the distances of 5 nearest facilities to every hdb

import pandas as pd
import numpy as np
from tqdm import tqdm

df_train = pd.read_csv("train_with_coords.csv")
df_test = pd.read_csv("test_with_coords.csv")

# Facility label -> CSV with that facility's rows. The insertion order here is
# the order in which the facility tables are later iterated.
_FACILITY_FILES = {
    "PRIMARY": "auxiliary-data/sg-primary-schools.csv",
    "SECONDARY": "auxiliary-data/sg-secondary-schools.csv",
    "MALL": "auxiliary-data/sg-shopping-malls.csv",
    "MRT": "auxiliary-data/sg-mrt-stations.csv",
    "HAWKER": "auxiliary-data/sg-gov-hawkers.csv",
}

facilities = {
    label: pd.read_csv(csv_path) for label, csv_path in _FACILITY_FILES.items()
}

def euclidean_km(lat1, lon1, lat2, lon2):
    """Straight-line distance between coordinate pairs, scaled by 111.

    Computes the Euclidean norm of the (latitude, longitude) differences and
    multiplies it by 111. Inputs may be scalars or NumPy arrays; the usual
    NumPy broadcasting applies.

    NOTE(review): 111 is presumably the approximate number of kilometres per
    degree, which makes this a flat-plane approximation - confirm it is
    accurate enough for the area being analysed.
    """
    d_lat = lat1 - lat2
    d_lon = lon1 - lon2
    return 111 * np.sqrt(d_lat**2 + d_lon**2)

def add_topk_facility_distances(df, facility_df, prefix, k=5):
    """Add the k smallest facility distances to every row of ``df``.

    For each row, the distance from its (LATITUDE, LONGITUDE) to every
    facility is computed with ``euclidean_km``; the k smallest values are
    stored in ascending order in columns ``DIST_<prefix>_1`` ...
    ``DIST_<prefix>_<k>``.

    Args:
        df: table with ``LATITUDE`` and ``LONGITUDE`` columns. It is modified
            in place (the new columns are added to it) and also returned.
        facility_df: table with ``LATITUDE`` and ``LONGITUDE`` columns.
            Rows with a missing coordinate are ignored.
        prefix: label used in the new column names and progress output.
        k: number of nearest distances to keep.

    Returns:
        ``df`` with the distance columns added. When fewer than ``k``
        facilities are usable, the remaining columns repeat the largest
        available distance; when none are usable, the columns are NaN
        (previously this case raised IndexError).
    """
    print(f"Processing {prefix} ...")
    # Drop facilities without coordinates so they cannot occupy a top-k slot
    # or be used as the padding value.
    facility_coords = (
        facility_df[["LATITUDE", "LONGITUDE"]].dropna().to_numpy(dtype=float)
    )
    topk_cols = [f"DIST_{prefix}_{i+1}" for i in range(k)]
    topk_matrix = np.full((len(df), k), np.nan)

    # Number of real distances available per row (may be smaller than k).
    n_avail = min(k, len(facility_coords))

    if n_avail > 0:
        fac_lat = facility_coords[:, 0]
        fac_lon = facility_coords[:, 1]
        for idx, (lat, lon) in tqdm(
            enumerate(zip(df["LATITUDE"], df["LONGITUDE"])),
            total=len(df),
            desc=f"{prefix} distances"
        ):
            distances = euclidean_km(lat, lon, fac_lat, fac_lon)
            topk = np.sort(distances)[:n_avail]
            topk_matrix[idx, :n_avail] = topk
            # Pad missing slots with the farthest of the available distances.
            topk_matrix[idx, n_avail:] = topk[-1]

    for i in range(k):
        df[topk_cols[i]] = topk_matrix[:, i]

    return df

# For every facility type, extend the train table first and then the test
# table with that facility's k=5 nearest-distance columns.
for name, df_fac in facilities.items():
    df_train, df_test = (
        add_topk_facility_distances(frame, df_fac, name, k=5)
        for frame in (df_train, df_test)
    )

# Persist both enriched tables.
for frame, out_path in ((df_train, "train_5.csv"), (df_test, "test_5.csv")):
    frame.to_csv(out_path, index=False)

print("file saved: train_5.csv / test_5.csv")



In [None]:
# normalize the distances just calculated

import pandas as pd

df_train = pd.read_csv("train_5.csv")
df_test = pd.read_csv("test_5.csv")

start_col = "DIST_MRT_MIN"
end_col = "DIST_HAWKER_MIN"

# Drop the contiguous run of columns from start_col to end_col (inclusive).
# The run is located by position in the train table and the same column names
# are then removed from both tables.
start_idx = df_train.columns.get_loc(start_col)
end_idx = df_train.columns.get_loc(end_col) + 1
cols_to_drop = df_train.columns[slice(start_idx, end_idx)]

df_train = df_train.drop(columns=cols_to_drop)
df_test = df_test.drop(columns=cols_to_drop)

dist_cols = [c for c in df_train.columns if c.startswith("DIST_")]

# Inverted min-max scaling per column, using the range over train and test
# together: the smallest distance maps to 1 and the largest to 0.
for col in dist_cols:
    d_min = min(df_train[col].min(), df_test[col].min())
    d_max = max(df_train[col].max(), df_test[col].max())
    if d_max != d_min:
        width = d_max - d_min
        df_train[col] = 1 - (df_train[col] - d_min) / width
        df_test[col] = 1 - (df_test[col] - d_min) / width
    else:
        # Constant column: there is no range to scale by, so use 1.0 throughout.
        df_train[col] = 1.0
        df_test[col] = 1.0

for frame, out_path in ((df_train, "train_5_norm.csv"), (df_test, "test_5_norm.csv")):
    frame.to_csv(out_path, index=False)

print("file saved: train_5_norm.csv / test_5_norm.csv")


In [None]:
# get region map

import pandas as pd

mrt = pd.read_csv("auxiliary-data/sg-mrt-stations.csv")
pri = pd.read_csv("auxiliary-data/sg-primary-schools.csv")
sec = pd.read_csv("auxiliary-data/sg-secondary-schools.csv")
mall = pd.read_csv("auxiliary-data/sg-shopping-malls.csv")

# Only the planning-area / region pair is needed from each auxiliary table.
dfs = [mrt[["PLANNING_AREA", "REGION"]],
       pri[["PLANNING_AREA", "REGION"]],
       sec[["PLANNING_AREA", "REGION"]],
       mall[["PLANNING_AREA", "REGION"]]]

# Build one row per planning area:
#  - rows with a missing area or region are dropped first; a null key would
#    otherwise be matched against null keys by a later merge, and a null
#    REGION row could shadow a valid one for the same area;
#  - names are trimmed and upper-cased before de-duplicating, so spelling
#    variants of the same area collapse into one row (the first one wins).
union_df = (
    pd.concat(dfs, axis=0)
    .dropna(subset=["PLANNING_AREA", "REGION"])
    .assign(PLANNING_AREA=lambda d: d["PLANNING_AREA"].str.strip().str.upper())
    .drop_duplicates(subset=["PLANNING_AREA"])
    .reset_index(drop=True)
)

union_df.to_csv("region.csv", index=False)

print("file saved: region.csv")



In [None]:
# map the regions to train and test tables; then do one-hot encoding

# Imported here so the cell does not depend on an earlier cell having run.
import pandas as pd

region_map = pd.read_csv("region.csv")

# Lookup keyed by the upper-cased town name; one row per key so the left merge
# below cannot multiply rows.
region_lookup = (
    region_map.rename(columns={"PLANNING_AREA": "TOWN_UPPER"})
    .drop_duplicates(subset="TOWN_UPPER")
)

# Fixed list of region levels shared by every file. Encoding each file on its
# own could give train and test different REGION_* columns whenever a region
# is absent from one of them.
region_levels = sorted(region_map["REGION"].dropna().unique())

for name in ["train_5_norm.csv", "test_5_norm.csv"]:
    df = pd.read_csv(name)
    df["TOWN_UPPER"] = df["TOWN"].str.strip().str.upper()
    df = df.merge(region_lookup, on="TOWN_UPPER", how="left")

    # Towns without a region get all-zero REGION_* columns; report them so the
    # gap is visible instead of silent.
    unmatched = sorted(df.loc[df["REGION"].isna(), "TOWN_UPPER"].dropna().unique())
    if unmatched:
        print(f"{name}: no region found for towns {unmatched}")

    # A categorical with explicit categories makes get_dummies emit one column
    # per known region, including regions that do not occur in this file.
    df["REGION"] = pd.Categorical(df["REGION"], categories=region_levels)
    df = pd.get_dummies(df, columns=["REGION"], prefix="REGION")

    out_name = name.replace(".csv", "_region.csv")
    df.to_csv(out_name, index=False)
    print(f"file saved: {out_name}")
