In [1]:
import pandas as pd
import numpy as np
import openrouteservice as ors
from dotenv import load_dotenv
from pyonemap import OneMap
import os
import requests
import json
import time

In [2]:
# Load variables from a local .env file into the process environment; the
# later cells read ORS_API_KEY, ONE_MAP_EMAIL and ONE_MAP_PASSWORD via os.getenv.
load_dotenv()

True

In [3]:
# openrouteservice API key, read from the environment (os.getenv returns None if unset).
ors_key = os.getenv("ORS_API_KEY")

# Shared openrouteservice client; get_route() below uses it for the directions queries.
client = ors.Client(key= ors_key)

In [4]:
# Working directory of the kernel; the relative data paths below resolve against it.
# No os.chdir() is needed: changing into the directory we are already in is a no-op.
directory = os.getcwd()

# Build paths with os.path.join so they resolve on every OS; a raw
# r"..\data\file.csv" literal is only a valid relative path on Windows.
data_dir = os.path.join("..", "data")

# The centroid file has no header row, so the two columns are named explicitly.
hdbCentroids_df = pd.read_csv(
    os.path.join(data_dir, "hdb_cluster_centroids.csv"),
    header=None,
    names=['Latitude', 'Longitude'],
)

# Only columns 1-3 of the station file are kept (the first column is skipped).
# NOTE(review): these are used downstream as 'MRT Name', 'Latitude', 'Longitude'
# -- confirm against the CSV header.
mrt_stations_df = pd.read_csv(
    os.path.join(data_dir, "mrt_stations.csv"),
    usecols=[1, 2, 3],
)

In [5]:

# OneMap credentials come from the environment (never hard-coded in the notebook).
one_map_email = os.getenv("ONE_MAP_EMAIL")
one_map_password = os.getenv("ONE_MAP_PASSWORD")
if not one_map_email or not one_map_password:
    # Fail early with a clear message instead of an opaque KeyError on the response.
    raise RuntimeError("ONE_MAP_EMAIL and ONE_MAP_PASSWORD must be set in the environment")

payload = {
    "email": one_map_email,
    "password": one_map_password,
}

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors (e.g. rejected credentials) directly.
token_response = requests.post(
    "https://www.onemap.gov.sg/api/auth/post/getToken",
    json=payload,
    timeout=30,
)
token_response.raise_for_status()

# Access token used to authenticate the OneMap client created in the next cell.
api_key = token_response.json()["access_token"]

In [6]:
# pyonemap client built from the access token above; get_centroid_name() uses it
# for reverse geocoding.
onemap = OneMap(api_key)

In [7]:
def get_centroid_name(row):
    """Return a human-readable name for a centroid via OneMap reverse geocoding.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'Latitude' and 'Longitude' (e.g. a DataFrame row).

    Returns
    -------
    str or None
        Taken from the first entry of the response's 'GeocodeInfo' list:
        its BUILDINGNAME; if that is "NIL", "<BLOCK> <ROAD>"; if BLOCK is also
        "NIL", just ROAD. None when the response carries no geocode entries,
        so one unmatched coordinate does not abort a whole DataFrame.apply.
    """
    geocode = onemap.reverseGeocode.revGeoCode(row['Latitude'], row['Longitude'])

    # Guard against an empty/missing result list instead of raising IndexError/KeyError.
    matches = geocode.get('GeocodeInfo') if isinstance(geocode, dict) else None
    if not matches:
        return None

    first_match = matches[0]
    if first_match['BUILDINGNAME'] != "NIL":
        return first_match['BUILDINGNAME']
    if first_match['BLOCK'] != "NIL":
        return first_match['BLOCK'] + " " + first_match['ROAD']
    return first_match['ROAD']

In [8]:
# Name every centroid row by row; each row triggers one reverse-geocode call
# inside get_centroid_name.
hdbCentroids_df['centroid_name'] = hdbCentroids_df.apply(get_centroid_name, axis=1)

In [9]:
# Keep each frame's original row position as an explicit column so it survives
# the merge below (where the two columns become index_x and index_y).
hdbCentroids_df['index'] = hdbCentroids_df.index   # residential centroids
mrt_stations_df['index'] = mrt_stations_df.index   # MRT stations

# Constant key shared by every row: merging on it pairs every centroid with
# every station (a cross join).
hdbCentroids_df['join_key'] = "A"
mrt_stations_df['join_key'] = "A"

# Remove a non-existent train station called SUB STATION.
# na=False treats a missing station name as "no match"; without it the mask
# would contain NaN and the `~` inversion would raise.
mrt_stations_df = mrt_stations_df[~mrt_stations_df['MRT Name'].str.contains('SUB', na=False)]

# Cross join to obtain all possible pairings between MRTs and residential centroids.
combined_df = pd.merge(hdbCentroids_df, mrt_stations_df, on='join_key')

print(combined_df)

       Latitude_x  Longitude_x            centroid_name  index_x join_key  \
0        1.432477   103.791322  503A WOODLANDS DRIVE 14        0        A   
1        1.432477   103.791322  503A WOODLANDS DRIVE 14        0        A   
2        1.432477   103.791322  503A WOODLANDS DRIVE 14        0        A   
3        1.432477   103.791322  503A WOODLANDS DRIVE 14        0        A   
4        1.432477   103.791322  503A WOODLANDS DRIVE 14        0        A   
...           ...          ...                      ...      ...      ...   
58845    1.388649   103.901674       COMPASSVALE PLAINS      274        A   
58846    1.388649   103.901674       COMPASSVALE PLAINS      274        A   
58847    1.388649   103.901674       COMPASSVALE PLAINS      274        A   
58848    1.388649   103.901674       COMPASSVALE PLAINS      274        A   
58849    1.388649   103.901674       COMPASSVALE PLAINS      274        A   

                        MRT Name  Latitude_y  Longitude_y  index_y  
0     

In [10]:
# Great-circle distance between two points using the haversine formula.
# (The notebook stores the result in a column called 'euclidean_distance',
# but this is a distance along the sphere, not a straight-line Euclidean one.)
def haversine(lat1, lon1, lat2, lon2, radius_km=6371.0):
    """Return the haversine (great-circle) distance between two points.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude / longitude of the first point(s), in degrees.
    lat2, lon2 : float or array-like
        Latitude / longitude of the second point(s), in degrees.
    radius_km : float, optional
        Sphere radius; defaults to 6371.0 (Earth radius in kilometers).

    Returns
    -------
    float or array-like
        Distance in the units of ``radius_km``. Inputs are combined with
        numpy ufuncs, so scalars, arrays and pandas Series all broadcast.
    """
    lat1, lon1, lat2, lon2 = (np.radians(angle) for angle in (lat1, lon1, lat2, lon2))

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    # Rounding can push `a` marginally above 1 for (near-)antipodal points,
    # which would make arcsin return NaN; clamp it to the valid domain.
    a = np.minimum(a, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))

    return radius_km * c

In [11]:
# Distance from every centroid to every station, computed column-wise in one
# vectorised call and stored under 'euclidean_distance'.
combined_df['euclidean_distance'] = haversine(
    combined_df['Latitude_x'],
    combined_df['Longitude_x'],
    combined_df['Latitude_y'],
    combined_df['Longitude_y'],
)

# For each residential centroid (index_x) find the row label of its closest
# station, pull those rows out, and reset the index (the old labels are kept
# in a new 'index' column).
closest_row_labels = combined_df.groupby('index_x')['euclidean_distance'].idxmin()
result_df = combined_df.loc[closest_row_labels].reset_index()

# Build the [[lon, lat], [lon, lat]] pair (centroid first, station second)
# that is passed to the openrouteservice directions query.
endpoint_tuples = pd.Series(
    list(
        zip(
            result_df['Longitude_x'],
            result_df['Latitude_x'],
            result_df['Longitude_y'],
            result_df['Latitude_y'],
        )
    ),
    index=result_df.index,
)
result_df['coordinate_pair'] = endpoint_tuples.apply(
    lambda pt: [[pt[0], pt[1]], [pt[2], pt[3]]]
)

# Placeholder column that later receives the query responses.
result_df['route'] = np.nan

In [12]:
def get_route(coordinate_pair, profile='cycling-regular', delay_seconds=2):
    """Query openrouteservice for a route between two points.

    Parameters
    ----------
    coordinate_pair : list
        ``[[lon_start, lat_start], [lon_end, lat_end]]`` as passed to
        ``client.directions``.
    profile : str, optional
        Routing profile sent to the API; defaults to 'cycling-regular'.
    delay_seconds : float, optional
        Pause before each request (default 2), used to space out calls.

    Returns
    -------
    dict or None
        The GeoJSON response, or None if the request raised; the error is
        printed so one unroutable pair does not abort a whole ``apply``.
    """
    if delay_seconds > 0:
        time.sleep(delay_seconds)
    try:
        return client.directions(coordinate_pair, profile=profile, format='geojson', validate=False)
    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller continue.
        print(f"Error: {e}")
        return None

In [13]:
# Query one route per centroid/MRT pair. get_route sleeps before each request,
# and a failed query is stored as None (its error is printed below the cell).
result_df['route'] = result_df['coordinate_pair'].apply(get_route)

Error: 404 ({'error': {'code': 2010, 'message': 'Could not find routable point within a radius of 350.0 meters of specified coordinate 1: 103.9878836 1.3574790.'}, 'info': {'engine': {'build_date': '2024-01-29T14:41:12Z', 'version': '7.1.1'}, 'timestamp': 1711276737536}})


In [14]:
def get_distance(route):
    """Pull the 'distance' of the first segment of the first feature out of a route.

    Returns None, after printing the error, when `route` is None or does not
    have the expected features -> properties -> segments nesting.
    """
    node = route
    try:
        # Walk the nested response one subscript at a time.
        for step in ('features', 0, 'properties', 'segments', 0, 'distance'):
            node = node[step]
    except (KeyError, IndexError, TypeError) as e:
        print(f"Error: {e}")
        return None
    return node

# Extract the segment distance from every stored route; rows whose route is
# None get None here as well.
result_df['distance'] = result_df['route'].apply(get_distance)

def get_time(route):
    """Pull the 'duration' of the first segment of the first feature out of a route.

    Returns None, after printing the error, when `route` is None or does not
    have the expected features -> properties -> segments nesting.
    """
    node = route
    try:
        # Walk the nested response one subscript at a time.
        for step in ('features', 0, 'properties', 'segments', 0, 'duration'):
            node = node[step]
    except (KeyError, IndexError, TypeError) as e:
        print(f"Error: {e}")
        return None
    return node
    

# Extract the segment duration from every stored route; rows whose route is
# None get None here as well.
result_df['duration'] = result_df['route'].apply(get_time)

Error: 'NoneType' object is not subscriptable
Error: 'NoneType' object is not subscriptable


In [15]:
# Build output paths with os.path.join so the export works on every OS; a raw
# r"..\data\file.csv" literal is only a valid relative path on Windows.
output_dir = os.path.join("..", "data")

# Tabular pairing data: every column except the nested route payload goes to CSV.
new_result_df = result_df.loc[:, result_df.columns != 'route']
new_result_df.to_csv(os.path.join(output_dir, "HDB_Centroid_MRT pairing data.csv"))

# The route payloads are written separately as a JSON array, one entry per row.
routes = result_df['route'].copy(deep=True)
routes.to_json(os.path.join(output_dir, 'HDB_MRT_routes.json'), orient='records')

In [16]:
# Show the pairs with no stored route, i.e. where get_route returned None.
result_df[result_df['route'].isna()]

Unnamed: 0,index,Latitude_x,Longitude_x,centroid_name,index_x,join_key,MRT Name,Latitude_y,Longitude_y,index_y,euclidean_distance,coordinate_pair,route,distance,duration
65,13949,1.389,103.98795,4 CHANGI VILLAGE ROAD,65,A,CHANGI AIRPORT MRT STATION,1.357479,103.987884,40,3.504935,"[[103.98795002672024, 1.3889995410271168], [10...",,,


In [17]:
# Summarise column dtypes and non-null counts of the final frame.
result_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 275 entries, 0 to 274
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   index               275 non-null    int64  
 1   Latitude_x          275 non-null    float64
 2   Longitude_x         275 non-null    float64
 3   centroid_name       275 non-null    object 
 4   index_x             275 non-null    int64  
 5   join_key            275 non-null    object 
 6   MRT Name            275 non-null    object 
 7   Latitude_y          275 non-null    float64
 8   Longitude_y         275 non-null    float64
 9   index_y             275 non-null    int64  
 10  euclidean_distance  275 non-null    float64
 11  coordinate_pair     275 non-null    object 
 12  route               274 non-null    object 
 13  distance            274 non-null    float64
 14  duration            274 non-null    float64
dtypes: float64(7), int64(3), object(5)
memory usage: 32.4+ KB