# Capstone Project IBM x Hacktiv8

## Scraping Data

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
import re

# Set up the Chrome driver (window maximized on start)
options = Options()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(service=Service(), options=options)

# Initialize the data stores that scrape_detail() fills in
all_features = set()  # Collects every unique facility column name
data_rows = []        # One dictionary per scraped house

def scrape_detail():
    """Scrape the listing detail page currently loaded in ``driver``.

    Reads the title, price, address, price per m², the main specs and the
    facility list from the page, appends one dictionary describing the house
    to the module-level ``data_rows`` list and registers the facility column
    names in the module-level ``all_features`` set.

    Pages whose title does not contain "Dijual Rumah" are skipped. Any
    exception raised while reading the page is caught and printed, so a
    single broken listing does not stop the crawl.

    Returns:
        None. Results are delivered through ``data_rows`` / ``all_features``.
    """
    try:
        wait = WebDriverWait(driver, 10)
        title = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "h1[class*='p-eciW3Ko_']"))).text

        if "Dijual Rumah" not in title:
            print("Lewati: Judul bukan 'Dijual Rumah'")
            return

        price = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "div.summary__price-and-fomo"))).text
        address = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "p[class*='p-hnE2cqE_']"))).text

        # Price per m² is optional: keep "-" when it is absent or unreadable.
        harga_per_m2 = "-"
        try:
            for el in driver.find_elements(By.CSS_SELECTOR, "div.p-eciW3Ko_.p-hnE2cqE_"):
                text = el.text.strip()
                if "jt/m²" in text.lower():
                    harga_per_m2 = text
                    break
        except Exception:
            # Best effort only; the rest of the listing is still useful.
            harga_per_m2 = "-"

        specs = wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, "span[class*='p-hnE2cqE_']")))

        room = land_area = building_area = house_document = "-"
        ownership_keywords = (
            "sertifikat hak milik", "hak guna bangunan", "girik", "shm", "hgb",
        )

        # The last matching span wins for each field.
        for spec in specs:
            text = spec.text.lower()
            if "kamar tidur" in text:
                room = spec.text.strip()
            elif "luas tanah" in text:
                land_area = spec.text.strip()
            elif "luas bangunan" in text:
                building_area = spec.text.strip()
            elif any(keyword in text for keyword in ownership_keywords):
                house_document = spec.text.strip()

        # Facility labels, already stripped and lower-cased
        fasilitas_elements = driver.find_elements(By.CSS_SELECTOR, "div.p-eciW3Ko_.p-7KvUfE4_")
        fasilitas_texts = [el.text.strip().lower() for el in fasilitas_elements]

        # Default value for every facility column
        fasilitas = {
            'carport': 0, 'garasi': 0, 'kamar mandi': 0, 'lantai': 0, 'akses mobil': 0,
            'listrik': 0, 'sumber air': 'PDAM', 'perabot': 'Tidak', 'keamanan 24 jam': 0,
            'kolam renang': 0, 'ac': 0, 'akses 24/7': 0, 'akses wifi': 0, 'atm center': 0,
            'balkon': 0, 'bank': 0, 'bathtub': 0, 'cctv': 0, 'gorden': 0, 'gym': 0,
            'internet berkecepatan': 0, 'kasur': 0, 'kitchen set': 0, 'kolam publik': 0,
            'kompor': 0, 'kulkas': 0, 'lemari pakaian': 0, 'microwave': 0, 'minimarket': 0,
            'musholla': 0, 'oven': 0, 'pompa air': 0, 'restoran': 0, 'shower': 0,
            'taman': 0, 'tangki air': 0, 'telepon': 0, 'tempat parkir': 0,
            'water heater': 0, 'jogging track': 0
        }

        # Keyword searched in the label -> facility column set to 1
        boolean_map = {
            'keamanan': 'keamanan 24 jam',
            'kolam renang': 'kolam renang',
            'ac': 'ac',
            'akses 24/7': 'akses 24/7',
            'akses wifi': 'akses wifi',
            'atm center': 'atm center',
            'balkon': 'balkon',
            'bank': 'bank',
            'bathtub': 'bathtub',
            'cctv': 'cctv',
            'gorden': 'gorden',
            'gym': 'gym',
            'internet': 'internet berkecepatan',
            'kasur': 'kasur',
            'kitchen set': 'kitchen set',
            'kolam publik': 'kolam publik',
            'kompor': 'kompor',
            'kulkas': 'kulkas',
            'lemari pakaian': 'lemari pakaian',
            'microwave': 'microwave',
            'minimarket': 'minimarket',
            'musholla': 'musholla',
            'oven': 'oven',
            'pompa air': 'pompa air',
            'restoran': 'restoran',
            'shower': 'shower',
            'taman': 'taman',
            'tangki air': 'tangki air',
            'telepon': 'telepon',
            'tempat parkir': 'tempat parkir',
            'water heater': 'water heater',
            'jogging track': 'jogging track'
        }

        # Match keywords on word boundaries. A plain substring test made the
        # short key 'ac' fire on unrelated labels such as "jogging track".
        boolean_patterns = {
            field: re.compile(rf"\b{re.escape(key)}\b")
            for key, field in boolean_map.items()
        }

        # Facilities that carry a count/amount, in priority order: only the
        # first keyword found in a label is processed.
        numeric_fields = (
            'carport', 'garasi', 'kamar mandi', 'lantai', 'akses mobil', 'listrik',
        )

        for item in fasilitas_texts:
            # Boolean facilities
            for field, pattern in boolean_patterns.items():
                if pattern.search(item):
                    fasilitas[field] = 1

            # Numeric or special facilities
            for field in numeric_fields:
                if field in item:
                    # Electricity is written with thousands dots (e.g. "2.200")
                    source = item.replace('.', '') if field == 'listrik' else item
                    match = re.search(r'\d+', source)
                    if match:
                        fasilitas[field] = int(match.group())
                    break
            else:
                if 'sumber air' in item:
                    if 'pdam' in item:
                        fasilitas['sumber air'] = 'PDAM'
                    elif 'pam' in item:
                        fasilitas['sumber air'] = 'PAM'
                    else:
                        fasilitas['sumber air'] = item
                elif 'perabot' in item:
                    fasilitas['perabot'] = 'Perabot'

        rumah_info = {
            "Judul": title,
            "Harga": price,
            "Harga per m²": harga_per_m2,
            "Alamat": address,
            "Kamar": room,
            "Luas Tanah": land_area,
            "Luas Bangunan": building_area,
            "Kepemilikan Tanah": house_document
        }
        # Two macro-economic columns, constant for every row
        rumah_info["Pertumbuhan IHPR YoY"] = "1.05%"
        rumah_info["Suku Bunga KPR Nasional"] = "10.38%"

        # Merge the facility values into the row and record the column names
        for key, value in fasilitas.items():
            rumah_info[key.capitalize()] = value
            all_features.add(key.capitalize())

        data_rows.append(rumah_info)
        print("✔ Data ditambahkan:", title)

    except Exception as e:
        print("Gagal ambil data:", e)

# Iterate over the search-result pages. The try/finally guarantees that the
# browser is shut down even when the crawl is interrupted.
try:
    for page_num in range(1, 100):
        try:
            url = f"https://www.pinhome.id/dijual/cari/kota-surabaya-209?buildingType=building_type.house&page={page_num}"
            driver.get(url)
            time.sleep(3)

            # Close the popup if it shows up
            try:
                WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.CLASS_NAME, 'pin-popup__dialog___u1yak'))
                )
                close_btn = driver.find_element(By.CLASS_NAME, 'pin-popup__close-btn___1z0op')
                close_btn.click()
                time.sleep(1)
                print("Popup ditutup")
            except Exception:
                print("Popup tidak muncul")

            wait = WebDriverWait(driver, 10)
            cards = wait.until(EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, 'a.pin-card__link___h5vvm')
            ))

            # Collect absolute detail-page URLs before leaving the result page
            base_url = "https://www.pinhome.id"
            links = []
            for card in cards:
                href = card.get_attribute('href')
                if href:
                    links.append(href if href.startswith('http') else base_url + href)

            main_handle = driver.current_window_handle
            for link in links:
                try:
                    driver.execute_script("window.open('');")
                    # Pick the new tab by identity instead of by position
                    detail_handle = next(
                        handle for handle in driver.window_handles
                        if handle != main_handle
                    )
                    driver.switch_to.window(detail_handle)
                    driver.get(link)
                    time.sleep(2)
                    scrape_detail()
                except Exception as detail_error:
                    print(f"Gagal buka detail rumah: {detail_error}")
                finally:
                    # Always close the detail tab(s), including after a
                    # failure, so stale tabs do not pile up between listings.
                    for handle in driver.window_handles:
                        if handle != main_handle:
                            driver.switch_to.window(handle)
                            driver.close()
                    driver.switch_to.window(main_handle)
                time.sleep(1)

        except Exception as page_error:
            print(f"Gagal buka halaman {page_num}: {page_error}")
finally:
    # Scraping finished (or aborted): release the browser
    driver.quit()

# Build the DataFrame with a fixed column order
columns = ["Judul", "Harga", "Harga per m²", "Alamat", "Kamar", "Luas Tanah", "Luas Bangunan", "Kepemilikan Tanah",
           "Pertumbuhan IHPR YoY", "Suku Bunga KPR Nasional"] + sorted(all_features)

df = pd.DataFrame(data_rows)

# Make sure every expected column exists; fill absent ones with a real NaN
# (not the string "NaN") so they count as missing values.
for col in columns:
    if col not in df.columns:
        df[col] = float("nan")

# Reorder the columns
df = df[columns]

# Save to CSV
df.to_csv("dataset_baru.csv", index=False, encoding="utf-8")
print("✔ Data CSV dengan fasilitas berhasil disimpan.")


## Exploratory Data Analysis

In [8]:
#import library
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Read the scraped dataset (dataset_baru.csv, hosted on GitHub) into a DataFrame
url = 'https://raw.githubusercontent.com/dimasfajrip/capstone_project_ibmXhacktiv8/main/dataset_baru.csv'
df = pd.read_csv(url)

### Preliminary EDA

In [12]:
# Show the column summary (dtypes, non-null counts) and preview the first 10 rows
df.info()
df.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3435 entries, 0 to 3434
Data columns (total 50 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Judul                    3435 non-null   object
 1   Harga                    3435 non-null   object
 2   Harga per m²             3435 non-null   object
 3   Alamat                   3435 non-null   object
 4   Kamar                    3435 non-null   object
 5   Luas Tanah               3435 non-null   object
 6   Luas Bangunan            3435 non-null   object
 7   Kepemilikan Tanah        3435 non-null   object
 8   Pertumbuhan IHPR YoY     3435 non-null   object
 9   Suku Bunga KPR Nasional  3435 non-null   object
 10  Ac                       3435 non-null   int64 
 11  Akses 24/7               3435 non-null   int64 
 12  Akses mobil              3435 non-null   int64 
 13  Akses wifi               3435 non-null   int64 
 14  Atm center               3435 non-null  

Unnamed: 0,Judul,Harga,Harga per m²,Alamat,Kamar,Luas Tanah,Luas Bangunan,Kepemilikan Tanah,Pertumbuhan IHPR YoY,Suku Bunga KPR Nasional,...,Perabot,Pompa air,Restoran,Shower,Sumber air,Taman,Tangki air,Telepon,Tempat parkir,Water heater
0,Dijual Rumah Shm di Jalan Cumpat Nambangan GG....,Rp430 Jt,"Rp8,3 Jt/m²","Kel. Kedung Cowek, Kec. Bulak, Kota Surabaya",3 Kamar Tidur,Luas Tanah 52m²,Luas Bangunan 50m²,Sertifikat Hak Milik,1.05%,10.38%,...,Perabot,0,0,0,PDAM,0,0,0,0,0
1,Dijual Rumah Murah Cepat Surabaya di Dukuh Men...,Rp700 Jt,"Rp8,2 Jt/m²","Kel. Dukuh Menanggal, Kec. Gayungan, Kota Sura...",3 Kamar Tidur,Luas Tanah 85m²,Luas Bangunan 85m²,-,1.05%,10.38%,...,Perabot,0,0,0,PAM,0,0,0,0,0
2,Dijual Rumah 1KT 48m² di Jl Simo Gunung Baru J...,Rp250 Jt,"Rp5,2 Jt/m²","Kel. Putat Jaya, Kec. Sawahan, Kota Surabaya",1 Kamar Tidur,Luas Tanah 48m²,Luas Bangunan 48m²,Sertifikat Hak Milik,1.05%,10.38%,...,Perabot,0,0,0,PDAM,0,0,0,0,0
3,Dijual Rumah 7KT 116m² di Jl Krakatau Raya No ...,"Rp1,4 M","Rp12,7 Jt/m²","Kel. Petemon, Kec. Sawahan, Kota Surabaya",7 Kamar Tidur,Luas Tanah 116m²,Luas Bangunan 195m²,Sertifikat Hak Milik,1.05%,10.38%,...,Perabot,0,0,0,PDAM,0,0,0,0,0
4,Dijual Rumah SHM Lokasi Strategis (Bisa Nego) ...,"Rp2,2 M","Rp22,9 Jt/m²","Kel. Gading, Kec. Tambaksari, Kota Surabaya",4 Kamar Tidur,Luas Tanah 96m²,Luas Bangunan 96m²,Sertifikat Hak Milik,1.05%,10.38%,...,Perabot,0,0,0,PDAM,0,0,0,0,0
5,Dijual Rumah Asri 1 Lantai Taman Luas Depan Ma...,Rp6 M,Rp15 Jt/m²,"Kel. Rungkut Kidul, Kec. Rungkut, Kota Surabaya",5 Kamar Tidur,Luas Tanah 400m²,Luas Bangunan 250m²,Sertifikat Hak Milik,1.05%,10.38%,...,Perabot,0,0,0,PDAM,0,0,0,0,0
6,Dijual Rumah 2 Lantai Nyaman Strategis di Teng...,"Rp3,3 M","Rp16,5 Jt/m²","Kel. Tengilis Mejoyo, Kec. Tenggilis Mejoyo, K...",5 Kamar Tidur,Luas Tanah 200m²,Luas Bangunan 300m²,Sertifikat Hak Milik,1.05%,10.38%,...,Perabot,1,0,0,PDAM,0,1,0,0,0
7,Dijual Rumah Lokasi Strategis Surabaya di Teng...,"Rp3,3 M","Rp16,5 Jt/m²","Kel. Tengilis Mejoyo, Kec. Tenggilis Mejoyo, K...",5 Kamar Tidur,Luas Tanah 200m²,Luas Bangunan 300m²,Sertifikat Hak Milik,1.05%,10.38%,...,Perabot,0,0,0,sumber air air tanah,0,0,0,0,1
8,Dijual Rumah Tenggilis Surabaya di Jjj,"Rp3,3 M","Rp16,5 Jt/m²","Kel. Tengilis Mejoyo, Kec. Tenggilis Mejoyo, K...",5 Kamar Tidur,Luas Tanah 200m²,Luas Bangunan 300m²,Sertifikat Hak Milik,1.05%,10.38%,...,Perabot,0,0,0,PDAM,0,0,0,0,0
9,Dijual Rumah Cepat Butuh Uang di di Jual Cepat...,"Rp2,2 M",Rp9 Jt/m²,"Kel. Penjaringansari (Penjaringan Sari), Kec. ...",4 Kamar Tidur,Luas Tanah 245m²,Luas Bangunan 300m²,Sertifikat Hak Milik,1.05%,10.38%,...,Perabot,0,0,1,PAM,0,0,0,0,1


In [17]:
#clean data
def clean_price(price):
    """Convert a price label such as "Rp8,3 Jt/m²" or "Rp1,4 M" to rupiah.

    Args:
        price: Raw price text; a missing value (None/NaN) is accepted.

    Returns:
        The price as an int number of rupiah, or None when the value is
        missing or no "<number> <unit>" pair can be parsed from it.
    """
    if pd.isna(price):
        return None

    # Drop "Rp" and the "/m²" suffix, trim, and normalise to lower case
    price = price.lower().replace('rp', '').replace('/m²', '').strip()

    # Decimal comma -> dot so the number can be parsed as a float
    price = price.replace(',', '.')

    # Extract the number and its unit (million or billion)
    match = re.search(r'([\d.]+)\s*(jt|juta|m|miliar)', price)
    if not match:
        return None

    try:
        number = float(match.group(1))
    except ValueError:
        # e.g. "1.2.3" or a lone "." is not a valid number
        return None

    multipliers = {
        'jt': 1_000_000,
        'juta': 1_000_000,
        'm': 1_000_000_000,
        'miliar': 1_000_000_000,
    }
    # round(), not int(): binary floats make 8.2 * 1_000_000 evaluate to
    # 8199999.999999999, which int() would truncate to 8199999.
    return round(number * multipliers[match.group(2)])

def clean_area(area):
    """Convert an area label such as "Luas Tanah 52m²" to a number of m².

    Args:
        area: Raw area text; a missing value (None/NaN) is accepted.

    Returns:
        The area as a float, or None when the value is missing or what is
        left after removing the labels is not a number (e.g. "-").
    """
    if pd.isna(area):
        return None

    # Remove the 'Luas Tanah' / 'Luas Bangunan' labels, the unit and colons
    area = area.lower().replace("luas tanah", "").replace("luas bangunan", "")
    area = area.replace("m²", "").replace(":", "")
    # Accept a decimal comma, the same convention clean_price() handles
    area = area.replace(",", ".").strip()

    try:
        return float(area)
    except ValueError:
        return None

def clean_room(room):
    """Convert a bedroom label such as "3 Kamar Tidur" to a number.

    Args:
        room: Raw bedroom text; a missing value (None/NaN) is accepted.

    Returns:
        The bedroom count as a float, or None when the value is missing or
        what is left after removing the label is not a number (e.g. "-").
    """
    if pd.isna(room):
        return None

    # Remove the 'Kamar Tidur' label
    room = room.lower().replace('kamar tidur', '').strip()

    try:
        return float(room)
    except ValueError:
        return None

# Run each cleaner over its source column. Prices and areas are written to
# new helper columns; 'Kamar' is overwritten in place.
for _target, _source, _cleaner in (
    ('harga_bersih', 'Harga', clean_price),
    ('harga_per_m2_bersih', 'Harga per m²', clean_price),
    ('Luas_Tanah_Bersih', 'Luas Tanah', clean_area),
    ('Luas_Bangunan_Bersih', 'Luas Bangunan', clean_area),
    ('Kamar', 'Kamar', clean_room),
):
    df[_target] = df[_source].apply(_cleaner)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3435 entries, 0 to 3434
Data columns (total 50 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Judul                    3435 non-null   object 
 1   Harga                    3434 non-null   float64
 2   Harga per m²             3418 non-null   float64
 3   Alamat                   3435 non-null   object 
 4   Kamar                    3435 non-null   float64
 5   Luas Tanah               3435 non-null   float64
 6   Luas Bangunan            3435 non-null   float64
 7   Kepemilikan Tanah        3435 non-null   object 
 8   Pertumbuhan IHPR YoY     3435 non-null   object 
 9   Suku Bunga KPR Nasional  3435 non-null   object 
 10  Ac                       3435 non-null   int64  
 11  Akses 24/7               3435 non-null   int64  
 12  Akses mobil              3435 non-null   int64  
 13  Akses wifi               3435 non-null   int64  
 14  Atm center              

In [15]:
# Overwrite each raw column with its cleaned helper column, then delete the
# helper so only the original column names remain.
for _raw_name, _helper_name in (
    ('Harga', 'harga_bersih'),
    ('Harga per m²', 'harga_per_m2_bersih'),
    ('Luas Bangunan', 'Luas_Bangunan_Bersih'),
    ('Luas Tanah', 'Luas_Tanah_Bersih'),
):
    df[_raw_name] = df[_helper_name]
    df.drop(_helper_name, axis=1, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3435 entries, 0 to 3434
Data columns (total 50 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Judul                    3435 non-null   object 
 1   Harga                    3434 non-null   float64
 2   Harga per m²             3418 non-null   float64
 3   Alamat                   3435 non-null   object 
 4   Kamar                    3435 non-null   object 
 5   Luas Tanah               3435 non-null   float64
 6   Luas Bangunan            3435 non-null   float64
 7   Kepemilikan Tanah        3435 non-null   object 
 8   Pertumbuhan IHPR YoY     3435 non-null   object 
 9   Suku Bunga KPR Nasional  3435 non-null   object 
 10  Ac                       3435 non-null   int64  
 11  Akses 24/7               3435 non-null   int64  
 12  Akses mobil              3435 non-null   int64  
 13  Akses wifi               3435 non-null   int64  
 14  Atm center              