In [1]:
import numpy as np
import pandas as pd
import requests
import urllib.parse
import json
import re

from datetime import datetime
from google.oauth2.service_account import Credentials
from geopy.geocoders import Nominatim

In [2]:
target_table = "real_estate.jakarta"
target_table_2 = "real_estate.most_recent"
project_id = "jakarta-housing-price"
credential_file = "jakarta-housing-price-595a9cff2797.json"
credential = Credentials.from_service_account_file(credential_file)
job_location = "asia-southeast2"

In [3]:
df_original = pd.read_gbq(f"SELECT * FROM `{project_id}.{target_table}`", project_id=project_id, credentials=credential)
df_original["date"] = df_original["date"].dt.tz_localize(None)
df_original.head()

Unnamed: 0,date,title,link,address,bedroom,bathroom,garage,land_m2,building_m2,price_idr,monthly_payment_idr,agent,district,kemendagri_code,latitude_longitude
0,2023-06-10,Rumah kost dekat Binus,https://www.rumah123.com/properti/jakarta-bara...,"Palmerah, Jakarta Barat",25.0,,,185,305,6300000000.0,45000000.0,IRWANTO,"Palmerah, Jakarta Barat",31.73.07,"-6.1910017, 106.7943633086912"
1,2023-06-15,Dijual Rumah Kontrakan 20 Pintu Lelang Bank,https://www.rumah123.com/properti/jakarta-bara...,"Pegadungan, Jakarta Barat",20.0,,,200,400,1500000000.0,10000000.0,Suandi Widiarto,"Kalideres, Jakarta Barat",31.73.06,"-6.13700575, 106.70159379305983"
2,2023-06-13,Rumah di Jual Siap Huni Samping Sman 88 Jaktim,https://www.rumah123.com/properti/jakarta-timu...,"Ciracas, Jakarta Timur",,,,179,120,1500000000.0,10000000.0,Erwin Mawati Ndraha,"Ciracas, Jakarta Timur",31.75.09,"-6.3296346, 106.87660392506864"
3,2023-06-14,Bisa Nego Rumah di Bpn Tonjong 200 Meter Jl. R...,https://www.rumah123.com/properti/jakarta-timu...,"Ciracas, Jakarta Timur",,,,120,80,750000000.0,5000000.0,Erwin Mawati Ndraha,"Ciracas, Jakarta Timur",31.75.09,"-6.3296346, 106.87660392506864"
4,2023-06-11,Jual Rumah Harga Nego 12 Menit Pintu Tol Ciman...,https://www.rumah123.com/properti/jakarta-timu...,"Ciracas, Jakarta Timur",,,,72,72,750000000.0,5000000.0,Erwin Mawati Ndraha,"Ciracas, Jakarta Timur",31.75.09,"-6.3296346, 106.87660392506864"


In [4]:
df_original.shape

(22463, 15)

In [5]:
df = df_original.copy()
df = df.drop(["district", "latitude_longitude", "kemendagri_code"], axis=1)

In [6]:
geolocator = Nominatim(user_agent="my_user_agent")

def get_district(text):
    if "Kav" in text:
        text = text.replace("Kav", "Kavling")

    cities = ["Jakarta Utara", "Jakarta Timur", "Jakarta Selatan", "Jakarta Barat", "Jakarta Pusat"]

    if "bintaro" in text.lower():
        result = "Pesanggrahan, Jakarta Selatan"
    elif "daan mogot" in text.lower():
        result = "Grogol Petamburan, Jakarta Barat"
    else:
        try:
            location = geolocator.geocode(text)
            if location is not None:
                address = location.raw["display_name"]
                for city in cities:
                    if city in address:
                        district = address.split(city)[0].strip()
                        district = district.split(",")
                        district = district[-2].strip()
                        break
                result = f"{district}, {city}"
            else:
                result = np.nan
        except:
            result = np.nan

    return result

In [7]:
for idx, val in enumerate(np.sort(df["address"].unique()), start=1):
    print(f"{idx}.\t{val}")

1.	Alfa Indah, Jakarta Barat
2.	Ampera, Jakarta Selatan
3.	Ancol, Jakarta Utara
4.	Angke, Jakarta Barat
5.	Antasari, Jakarta Selatan
6.	Asemka, Jakarta Barat
7.	Bambu Apus, Jakarta Timur
8.	Bandara, Jakarta Barat
9.	Bandengan, Jakarta Utara
10.	Bangka, Jakarta Selatan
11.	Batu Ceper, Jakarta Pusat
12.	Bendungan Hilir, Jakarta Pusat
13.	Bintaro, Jakarta Selatan
14.	Blok M, Jakarta Selatan
15.	Blok S, Jakarta Selatan
16.	Bojong Indah, Jakarta Barat
17.	Buaran, Jakarta Timur
18.	Bukit Duri, Jakarta Selatan
19.	Bungur, Jakarta Pusat
20.	CBD, Jakarta Selatan
21.	Cakung, Jakarta Timur
22.	Cawang, Jakarta Timur
23.	Cempaka Mas, Jakarta Pusat
24.	Cempaka Putih, Jakarta Pusat
25.	Cengkareng Barat, Jakarta Barat
26.	Cengkareng, Jakarta Barat
27.	Central Park, Jakarta Barat
28.	Cibubur, Jakarta Timur
29.	Cideng, Jakarta Pusat
30.	Ciganjur, Jakarta Selatan
31.	Cijantung, Jakarta Timur
32.	Cikini, Jakarta Pusat
33.	Cikoko, Jakarta Selatan
34.	Cilandak, Jakarta Selatan
35.	Cilangkap, Jakarta Timur
3

In [8]:
%%time

unique_locations = pd.DataFrame({"address": df["address"].unique()})
unique_locations["district"] = unique_locations["address"].apply(get_district)
unique_locations["district"] = unique_locations["district"].str.replace(r"(?i)\b(kec(?:amatan)?|kec)\b\.?|^\.|\.$", "", regex=True).str.strip()
unique_locations

CPU times: total: 78.1 ms
Wall time: 2min 5s


Unnamed: 0,address,district
0,"Palmerah, Jakarta Barat","Palmerah, Jakarta Barat"
1,"Pegadungan, Jakarta Barat","Kalideres, Jakarta Barat"
2,"Ciracas, Jakarta Timur","Ciracas, Jakarta Timur"
3,"Marunda, Jakarta Utara","Cilincing, Jakarta Utara"
4,"Kemayoran, Jakarta Pusat","Kemayoran, Jakarta Pusat"
...,...,...
255,"Panglima Polim, Jakarta Selatan","Kebayoran Baru, Jakarta Selatan"
256,"Kota Bambu Utara, Jakarta Barat","Palmerah, Jakarta Barat"
257,"Kapten Tendean, Jakarta Selatan","Mampang Prapatan, Jakarta Selatan"
258,"Tawakal, Jakarta Barat","Grogol Petamburan, Jakarta Barat"


In [9]:
jkt_districts = pd.read_excel("jakarta_districts.xlsx")
jkt_districts.head()

Unnamed: 0,kemendagri_code,district,city,district_city,latitude_longitude
0,31.71.05,Cempaka Putih,Jakarta Pusat,"Cempaka Putih, Jakarta Pusat","-6.181214499999999, 106.86854766095992"
1,31.71.01,Gambir,Jakarta Pusat,"Gambir, Jakarta Pusat","-6.1766841, 106.8306534"
2,31.71.08,Johar Baru,Jakarta Pusat,"Johar Baru, Jakarta Pusat","-6.1857706, 106.8567766"
3,31.71.03,Kemayoran,Jakarta Pusat,"Kemayoran, Jakarta Pusat","-6.1625464999999995, 106.85689034074326"
4,31.71.06,Menteng,Jakarta Pusat,"Menteng, Jakarta Pusat","-6.1950265, 106.83222419920054"


In [10]:
updated_unique_locations = unique_locations.merge(jkt_districts, left_on="district", right_on="district_city", how="inner")
updated_unique_locations = updated_unique_locations[["address", "district_x", "kemendagri_code", "latitude_longitude"]]
updated_unique_locations = updated_unique_locations.rename(columns={"district_x": "district"})
updated_unique_locations.head()

Unnamed: 0,address,district,kemendagri_code,latitude_longitude
0,"Palmerah, Jakarta Barat","Palmerah, Jakarta Barat",31.73.07,"-6.1910017, 106.7943633086912"
1,"Kemanggisan, Jakarta Barat","Palmerah, Jakarta Barat",31.73.07,"-6.1910017, 106.7943633086912"
2,"Slipi, Jakarta Barat","Palmerah, Jakarta Barat",31.73.07,"-6.1910017, 106.7943633086912"
3,"Kota Bambu Utara, Jakarta Barat","Palmerah, Jakarta Barat",31.73.07,"-6.1910017, 106.7943633086912"
4,"Pegadungan, Jakarta Barat","Kalideres, Jakarta Barat",31.73.06,"-6.13700575, 106.70159379305983"


In [11]:
for idx, (val1, val2) in enumerate(zip(updated_unique_locations["address"], updated_unique_locations["district"]), start=1):
    print(idx)
    print(val1)
    print(val2)
    print(10*"-")

1
Palmerah, Jakarta Barat
Palmerah, Jakarta Barat
----------
2
Kemanggisan, Jakarta Barat
Palmerah, Jakarta Barat
----------
3
Slipi, Jakarta Barat
Palmerah, Jakarta Barat
----------
4
Kota Bambu Utara, Jakarta Barat
Palmerah, Jakarta Barat
----------
5
Pegadungan, Jakarta Barat
Kalideres, Jakarta Barat
----------
6
Kalideres, Jakarta Barat
Kalideres, Jakarta Barat
----------
7
Taman Surya, Jakarta Barat
Kalideres, Jakarta Barat
----------
8
Citra Garden, Jakarta Barat
Kalideres, Jakarta Barat
----------
9
Taman Palem, Jakarta Barat
Kalideres, Jakarta Barat
----------
10
Semanan, Jakarta Barat
Kalideres, Jakarta Barat
----------
11
Bandara, Jakarta Barat
Kalideres, Jakarta Barat
----------
12
Ciracas, Jakarta Timur
Ciracas, Jakarta Timur
----------
13
Cibubur, Jakarta Timur
Ciracas, Jakarta Timur
----------
14
Marunda, Jakarta Utara
Cilincing, Jakarta Utara
----------
15
Cilincing, Jakarta Utara
Cilincing, Jakarta Utara
----------
16
Rorotan, Jakarta Utara
Cilincing, Jakarta Utara
----

In [12]:
merged_df = df.merge(updated_unique_locations, on="address", how="inner").reset_index(drop=True)
merged_df = merged_df[["date", "title", "link", "address", "district", "kemendagri_code", "latitude_longitude", "bedroom", "bathroom", "garage", "land_m2", "building_m2", "price_idr", "monthly_payment_idr", "agent"]]
merged_df.head()

Unnamed: 0,date,title,link,address,district,kemendagri_code,latitude_longitude,bedroom,bathroom,garage,land_m2,building_m2,price_idr,monthly_payment_idr,agent
0,2023-06-10,Rumah kost dekat Binus,https://www.rumah123.com/properti/jakarta-bara...,"Palmerah, Jakarta Barat","Palmerah, Jakarta Barat",31.73.07,"-6.1910017, 106.7943633086912",25.0,,,185,305,6300000000.0,45000000.0,IRWANTO
1,2023-06-13,DI RUMAH KOMPLEK MIGAS PALMERAH,https://www.rumah123.com/properti/jakarta-bara...,"Palmerah, Jakarta Barat","Palmerah, Jakarta Barat",31.73.07,"-6.1910017, 106.7943633086912",,,,250,225,3800000000.0,27000000.0,HJ LUCIANA MYPRO
2,2023-06-11,Rumah termurah jarang ada di Tanjung Raya Palm...,https://www.rumah123.com/properti/jakarta-bara...,"Palmerah, Jakarta Barat","Palmerah, Jakarta Barat",31.73.07,"-6.1910017, 106.7943633086912",,,,202,300,4300000000.0,30000000.0,Intan Aprilianda
3,2023-06-14,Rumah Mewah 2 Lantai di Komplek DPR RI Kemangg...,https://www.rumah123.com/properti/jakarta-bara...,"Palmerah, Jakarta Barat","Palmerah, Jakarta Barat",31.73.07,"-6.1910017, 106.7943633086912",4.0,,,560,600,12000000000.0,85000000.0,Litha Hendrilia Zetliec
4,2023-06-13,Dijual Cluster Cantik Cash Bertahap di Lokasi ...,https://www.rumah123.com/properti/jakarta-bara...,"Palmerah, Jakarta Barat","Palmerah, Jakarta Barat",31.73.07,"-6.1910017, 106.7943633086912",2.0,1.0,1.0,60,36,215000000.0,1000000.0,Zulham Munir


In [13]:
schema = [
    {"name": "date", "type": "DATE"},
    {"name": "title", "type": "STRING"},
    {"name": "link", "type": "STRING"},
    {"name": "address", "type": "STRING"},
    {"name": "district", "type": "STRING"},
    {"name": "kemendagri_code", "type": "STRING"},
    {"name": "latitude_longitude", "type": "STRING"},
    {"name": "bedroom", "type": "FLOAT64"},
    {"name": "bathroom", "type": "FLOAT64"},
    {"name": "garage", "type": "FLOAT64"},
    {"name": "land_m2", "type": "FLOAT64"},
    {"name": "building_m2", "type": "FLOAT64"},
    {"name": "price_idr", "type": "FLOAT64"},
    {"name": "monthly_payment_idr", "type": "FLOAT64"},
    {"name": "agent", "type": "STRING"}
]

In [14]:
merged_df.to_gbq(
    destination_table=target_table,
    project_id=project_id,
    if_exists="replace",
    location=job_location,
    chunksize=10_000,
    progress_bar=True,
    credentials=credential,
    table_schema=schema
)

100%|██████████| 1/1 [00:00<?, ?it/s]
