In [1]:
import os
import json
import requests
import numpy as np
import pandas as pd
from datetime import datetime
from tqdm.notebook import tqdm
import plotly.graph_objects as go

In [52]:
current_mth = datetime.now().date().strftime("%Y-%m")
mths_2017_onwards = [str(i)[:7] for i in pd.date_range(
    "2017-01-01", current_mth+"-01", freq='MS').tolist()]

In [6]:
df_cols = ['month', 'town', 'flat_type', 'block', 'street_name', 'storey_range', 'floor_area_sqm', 
           'remaining_lease', 'lease_commence_date', 'resale_price' ]

param_fields = ",".join(df_cols)
basic_df = pd.DataFrame()

mth_2017 = "?resource_id=d_8b84c4ee58e3cfc0ece0d773c8ca6abc"
base_url = "https://data.gov.sg/api/action/datastore_search"
url = base_url + mth_2017

for mth in tqdm(mths_2017_onwards):
    params = {
        "fields": param_fields,
        "filters": json.dumps({'month': mth}),
        "limit": 10000
    }
    response = requests.get(url, params=params)
    mth_df = pd.DataFrame(response.json().get("result").get("records"))   
    basic_df = pd.concat([basic_df, mth_df], axis=0)

  0%|          | 0/90 [00:00<?, ?it/s]

In [7]:
basic_df.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,remaining_lease,lease_commence_date,resale_price
0,2017-01,ANG MO KIO,2 ROOM,406,ANG MO KIO AVE 10,10 TO 12,44,61 years 04 months,1979,232000
1,2017-01,ANG MO KIO,3 ROOM,108,ANG MO KIO AVE 4,01 TO 03,67,60 years 07 months,1978,250000
2,2017-01,ANG MO KIO,3 ROOM,602,ANG MO KIO AVE 5,01 TO 03,67,62 years 05 months,1980,262000
3,2017-01,ANG MO KIO,3 ROOM,465,ANG MO KIO AVE 10,04 TO 06,68,62 years 01 month,1980,265000
4,2017-01,ANG MO KIO,3 ROOM,601,ANG MO KIO AVE 5,01 TO 03,67,62 years 05 months,1980,265000


In [21]:
latest_df = basic_df.copy()
latest_df = latest_df[['month', 'town', 'flat_type', 'block', 'street_name', 'storey_range', 
                       'floor_area_sqm', 'remaining_lease', 'resale_price']]

latest_df.columns = ['month', 'town', 'flat', 'block', 'street_name', 'storey_range', 'area', 'lease_left', 'price']
# latest_df['month'] = pd.to_datetime(latest_df['month'])
latest_df = latest_df[latest_df['month'] >= '2020-01']
latest_df['area'] = latest_df['area'].astype(np.float)
# latest_df['price'] = latest_df['price'].astype(np.float16)
# latest_df['start_date'] = latest_df['start_date'].astype(np.uint16)
latest_df['lease_left'] = [i.replace(' years', 'y') for i in latest_df['lease_left']]
latest_df['lease_left'] = [i.replace(' months', 'm') for i in latest_df['lease_left']]
latest_df['lease_left'] = [i.replace(' month', 'm') for i in latest_df['lease_left']]
latest_df['storey_range'] = [i.replace(' TO ', '-') for i in latest_df['storey_range']]
latest_df['flat'] = [i.replace(" ROOM", "R") for i in latest_df['flat']]
latest_df['flat'] = [i.replace("EXECUTIVE", "EC") for i in latest_df['flat']]
latest_df['flat'] = [i.replace("MULTI-GENERATION", "MG") for i in latest_df['flat']]
latest_df['flat'] = latest_df['flat'].astype(str)

In [22]:
latest_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 118467 entries, 0 to 1898
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   month         118467 non-null  object 
 1   town          118467 non-null  object 
 2   flat          118467 non-null  object 
 3   block         118467 non-null  object 
 4   street_name   118467 non-null  object 
 5   storey_range  118467 non-null  object 
 6   area          118467 non-null  float64
 7   lease_left    118467 non-null  object 
 8   price         118467 non-null  object 
dtypes: float64(1), object(8)
memory usage: 9.0+ MB


In [23]:
latest_df.head()

Unnamed: 0,month,town,flat,block,street_name,storey_range,area,lease_left,price
0,2020-01,ANG MO KIO,3R,208,ANG MO KIO AVE 1,04-06,73.0,55y 07m,265000
1,2020-01,ANG MO KIO,3R,307C,ANG MO KIO AVE 1,19-21,70.0,91y 08m,470000
2,2020-01,ANG MO KIO,3R,319,ANG MO KIO AVE 1,01-03,73.0,56y 04m,230000
3,2020-01,ANG MO KIO,3R,216,ANG MO KIO AVE 1,04-06,73.0,55y 03m,280000
4,2020-01,ANG MO KIO,3R,556,ANG MO KIO AVE 10,07-09,68.0,59y 01m,220000


#### MongoDB Upload 

In [3]:
from pymongo import mongo_client
from typing import Dict, List
from datetime import datetime
import time
import os

# MongoDB credentials
MONGO_PASSWORD = os.environ["mongo_pw"]
base_url = "mongodb+srv://cliffchew84:"
end_url = "cliff-nlb.t0whddv.mongodb.net/?retryWrites=true&w=majority"
mongo_url = f"{base_url}{MONGO_PASSWORD}@{end_url}"


def connect_mdb():
    return mongo_client.MongoClient(mongo_url, serverSelectionTimeoutMS=5000)

db = connect_mdb()
db.list_database_names()

['nlb', 'admin', 'local']

In [4]:
# Checking DB
db_nlb = db['nlb']

# Check the collections inside the BD
db_nlb.list_collection_names()

['users',
 'user_status',
 'hdb_dump',
 'user_books',
 'user_search',
 'books_avail',
 'books_info']

#### Dump HDB data into MongoDB 

In [30]:
hdb_dump = db_nlb['hdb_dump']

# Convert DataFrame to dictionary
data_dict = latest_df.to_dict("records")

# Insert data into MongoDB
hdb_dump.insert_many(data_dict)

#### Dump MRT location data into MongoDB 

In [37]:
mrt_df = pd.read_csv("data/mrt_locations.csv")
mrt_df = mrt_df[['code_string', 'code_num', 'BUILDING', 'POSTAL', 'LATITUDE', 'LONGITUDE']]
mrt_df.code_num = mrt_df.code_num.astype(str)
mrt_df['loc_info'] = mrt_df['code_string'] + mrt_df['code_num'] + " | " + mrt_df['BUILDING']
final_mrt_df = mrt_df[['loc_info', "LATITUDE", "LONGITUDE"]]
final_mrt_df.head()

Unnamed: 0,loc_info,LATITUDE,LONGITUDE
0,TE1 | WOODLANDS NORTH,1.448292,103.785693
1,TE2 | WOODLANDS,1.436058,103.787939
2,TE3 | WOODLANDS SOUTH,1.427396,103.793264
3,NS1 | JURONG EAST,1.333153,103.742286
4,NS2 | BUKIT BATOK,1.349033,103.749566


In [38]:
# Convert DataFrame to dictionary
data_dict = final_mrt_df.to_dict("records")

In [39]:
# Insert data into MongoDB
mrt_location = db_nlb['p_mrt']
mrt_location.insert_many(data_dict)

<pymongo.results.InsertManyResult at 0x134ff2340>

In [42]:
final_mrt_df.to_csv("data/p_mrt.csv")

#### Dump MRT location data into MongoDB 

In [43]:
att_df = pd.read_csv("data/attractions_geo_locations.csv")

In [44]:
att_df.head()

Unnamed: 0,title,review_counts,address,search_keyword,LATITUDE,LONGITUDE,X,Y,score,keyword_len
0,Gardens by the Bay,59519,"18 Marina Gardens Drive Bayfront Plaza, Singap...",18953,1.281758,103.861586,31145.901288,29355.847797,0.0,6
1,Singapore Mass Rapid Transit (SMRT),24046,Singapore 069112 Singapore,69112,1.279354,103.846196,29433.149988,29089.977309,0.0,6
2,Singapore Zoo,22349,"80 Mandai Lake Road, Singapore 729826 Singapore",729826,1.403548,103.788541,23016.906993,42822.731295,0.0,6
3,Singapore Botanic Gardens,19802,"1 Cluny Road, Singapore 259 569 Singapore",259569,1.322157,103.818186,26315.88028,33822.886246,0.0,6
4,Singapore Flyer,17334,"30 Raffles Avenue, Singapore 039803 Singapore",39803,1.289713,103.863372,31344.706763,30235.496031,0.0,6


In [48]:
att_df['loc_info'] = att_df['title']
final_att_df = att_df[['loc_info', "LATITUDE", "LONGITUDE"]]
final_att_df.head()

Unnamed: 0,loc_info,LATITUDE,LONGITUDE
0,Gardens by the Bay,1.281758,103.861586
1,Singapore Mass Rapid Transit (SMRT),1.279354,103.846196
2,Singapore Zoo,1.403548,103.788541
3,Singapore Botanic Gardens,1.322157,103.818186
4,Singapore Flyer,1.289713,103.863372


In [49]:
# Convert DataFrame to dictionary
data_dict = final_att_df.to_dict("records")

In [50]:
# Insert data into MongoDB
attractions = db_nlb['p_att']
attractions.insert_many(data_dict)

<pymongo.results.InsertManyResult at 0x135b98820>

In [51]:
final_att_df.to_csv("data/p_attractions.csv")

#### Test query from MongoDB 

In [22]:
query_output = db_nlb['mrt_location'].find({}, {})
documents_list = list(query_output)
pd.DataFrame(documents_list)

Unnamed: 0,_id,code_string,code_num,BUILDING,POSTAL,LATITUDE,LONGITUDE
0,66973d9e6aa26544af2107f3,TE,1,WOODLANDS NORTH,737668,1.448292,103.785693
1,66973d9e6aa26544af2107f4,TE,2,WOODLANDS,737736,1.436058,103.787939
2,66973d9e6aa26544af2107f5,TE,3,WOODLANDS SOUTH,737741,1.427396,103.793264
3,66973d9e6aa26544af2107f6,NS,1,JURONG EAST,609690,1.333153,103.742286
4,66973d9e6aa26544af2107f7,NS,2,BUKIT BATOK,659958,1.349033,103.749566
...,...,...,...,...,...,...,...
140,66973d9e6aa26544af21087f,CC,25,HAW PAR VILLA,117396,1.282571,103.781753
141,66973d9e6aa26544af210880,CC,26,PASIR PANJANG,117424,1.276214,103.791350
142,66973d9e6aa26544af210881,CC,27,LABRADOR PARK,109029,1.272254,103.802632
143,66973d9e6aa26544af210882,CC,28,TELOK BLANGAH,109028,1.270706,103.809762


### Query 

In [None]:
['month', 'town', 'flat', 'block', 'street_name', 'storey_range', 'area', 'lease_left', 'price']

In [31]:
query_output = db_nlb['hdb_dump'].find({ 
        "town": { "$exists": True },
        "month": { "$exists": True },
        "flat": { "$exists": True },
        "block": { "$exists": True },
        "street_name": { "$exists": True },
        "storey_range": { "$exists": True },
        "area": { "$exists": True },
        "lease_left": { "$exists": True },
        "price": { "$exists": True },
        "month": { "$exists": True },
    }, {
        "_id": 0,
        "town": 1,
        "month": 1,
        "flat": 1,
        "block": 1,
        "street_name": 1,
        "storey_range": 1,
        "area": 1,
        "lease_left": 1,
        "price": 1,
        "month": 1,
    }).limit(10)
documents_list = list(query_output)
df = pd.DataFrame(documents_list).head()
df

Unnamed: 0,month,town,flat,block,street_name,storey_range,area,lease_left,price
0,2020-01,ANG MO KIO,3R,208,ANG MO KIO AVE 1,04-06,73.0,55y 07m,265000
1,2020-01,ANG MO KIO,3R,307C,ANG MO KIO AVE 1,19-21,70.0,91y 08m,470000
2,2020-01,ANG MO KIO,3R,319,ANG MO KIO AVE 1,01-03,73.0,56y 04m,230000
3,2020-01,ANG MO KIO,3R,216,ANG MO KIO AVE 1,04-06,73.0,55y 03m,280000
4,2020-01,ANG MO KIO,3R,556,ANG MO KIO AVE 10,07-09,68.0,59y 01m,220000


In [33]:
os.getcwd()

'/Users/cliff/main/sides/hdb_dash'

In [35]:
latest_df.to_csv("local_df.csv", index=False)