# Capstone Project IBM x Hacktiv8

## Scraping Data

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
import re

# Start a Chrome session with the window maximized. Service() is built with
# no arguments, so no explicit chromedriver path is configured here.
options = Options()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(options=options, service=Service())

# Shared accumulators for the scraped data:
#   all_features - every unique facility column name collected so far
#   data_rows    - one dictionary per scraped house
all_features, data_rows = set(), list()

# Default value of every facility column; each listing starts from a copy.
_FACILITY_DEFAULTS = {
    'carport': 0, 'garasi': 0, 'kamar mandi': 0, 'lantai': 0, 'akses mobil': 0,
    'listrik': 0, 'sumber air': 'PDAM', 'perabot': 'Tidak', 'keamanan 24 jam': 0,
    'kolam renang': 0, 'ac': 0, 'akses 24/7': 0, 'akses wifi': 0, 'atm center': 0,
    'balkon': 0, 'bank': 0, 'bathtub': 0, 'cctv': 0, 'gorden': 0, 'gym': 0,
    'internet berkecepatan': 0, 'kasur': 0, 'kitchen set': 0, 'kolam publik': 0,
    'kompor': 0, 'kulkas': 0, 'lemari pakaian': 0, 'microwave': 0, 'minimarket': 0,
    'musholla': 0, 'oven': 0, 'pompa air': 0, 'restoran': 0, 'shower': 0,
    'taman': 0, 'tangki air': 0, 'telepon': 0, 'tempat parkir': 0,
    'water heater': 0, 'jogging track': 0
}

# Keyword searched for in a facility text -> 0/1 facility column it turns on.
_BOOLEAN_FACILITIES = {
    'keamanan': 'keamanan 24 jam',
    'kolam renang': 'kolam renang',
    'ac': 'ac',
    'akses 24/7': 'akses 24/7',
    'akses wifi': 'akses wifi',
    'atm center': 'atm center',
    'balkon': 'balkon',
    'bank': 'bank',
    'bathtub': 'bathtub',
    'cctv': 'cctv',
    'gorden': 'gorden',
    'gym': 'gym',
    'internet': 'internet berkecepatan',
    'kasur': 'kasur',
    'kitchen set': 'kitchen set',
    'kolam publik': 'kolam publik',
    'kompor': 'kompor',
    'kulkas': 'kulkas',
    'lemari pakaian': 'lemari pakaian',
    'microwave': 'microwave',
    'minimarket': 'minimarket',
    'musholla': 'musholla',
    'oven': 'oven',
    'pompa air': 'pompa air',
    'restoran': 'restoran',
    'shower': 'shower',
    'taman': 'taman',
    'tangki air': 'tangki air',
    'telepon': 'telepon',
    'tempat parkir': 'tempat parkir',
    'water heater': 'water heater',
    'jogging track': 'jogging track'
}

# Whole-word patterns for the boolean keywords. A plain substring test would
# let short keywords fire inside unrelated words (e.g. 'ac' inside
# 'jogging track'), so each keyword must not touch another word character.
_BOOLEAN_PATTERNS = [
    (re.compile(r'(?<!\w)' + re.escape(keyword) + r'(?!\w)'), field)
    for keyword, field in _BOOLEAN_FACILITIES.items()
]

# Facilities stored as a count, checked in this order; the first one found in
# a facility text takes the first integer of that text.
_COUNTED_FACILITIES = ('carport', 'garasi', 'kamar mandi', 'lantai', 'akses mobil')

# Phrases that mark a spec entry as the land-ownership document.
_OWNERSHIP_KEYWORDS = ('sertifikat hak milik', 'hak guna bangunan', 'girik', 'shm', 'hgb')


def _first_int(text):
    """Return the first run of digits in *text* as an int, or None if absent."""
    match = re.search(r'\d+', text)
    return int(match.group()) if match else None


def _parse_facilities(facility_texts):
    """Turn an iterable of facility strings into a facility-column dictionary.

    The result starts from ``_FACILITY_DEFAULTS``. Every text may switch on any
    number of 0/1 facilities, and additionally fills at most one of the
    counted/special fields (carport, garasi, kamar mandi, lantai, akses mobil,
    listrik, sumber air, perabot), in that priority order.
    """
    fasilitas = dict(_FACILITY_DEFAULTS)

    for raw in facility_texts:
        item = raw.lower()

        # 0/1 facilities: flag every keyword that appears as a whole word.
        for pattern, field in _BOOLEAN_PATTERNS:
            if pattern.search(item):
                fasilitas[field] = 1

        # Counted / special facilities: only the first matching kind is used,
        # even when the text carries no usable number.
        counted = next((name for name in _COUNTED_FACILITIES if name in item), None)
        if counted is not None:
            amount = _first_int(item)
            if amount is not None:
                fasilitas[counted] = amount
        elif 'listrik' in item:
            # Dots are dropped first so "2.200" is read as 2200.
            amount = _first_int(item.replace('.', ''))
            if amount is not None:
                fasilitas['listrik'] = amount
        elif 'sumber air' in item:
            if 'pdam' in item:
                fasilitas['sumber air'] = 'PDAM'
            elif 'pam' in item:
                fasilitas['sumber air'] = 'PAM'
            else:
                fasilitas['sumber air'] = item
        elif 'perabot' in item:
            fasilitas['perabot'] = 'Perabot'

    return fasilitas


def scrape_detail():
    """Scrape the listing page currently loaded in the global ``driver``.

    Reads the title, price, address, price per m², the spec entries (bedrooms,
    land area, building area, ownership document) and the facility entries,
    then appends one dictionary to the global ``data_rows`` and registers the
    capitalized facility column names in the global ``all_features``.

    Pages whose title does not contain "Dijual Rumah" are skipped. Any
    exception raised while scraping is caught and printed, so the function
    always returns None and never raises to its caller.
    """
    try:
        wait = WebDriverWait(driver, 10)
        title = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "h1[class*='p-eciW3Ko_']"))).text

        if "Dijual Rumah" not in title:
            print("Lewati: Judul bukan 'Dijual Rumah'")
            return

        price = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "div.summary__price-and-fomo"))).text
        address = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "p[class*='p-hnE2cqE_']"))).text

        # Price per m² is optional: keep "-" when no candidate element
        # mentions "jt/m²" or when reading the candidates fails.
        harga_per_m2 = "-"
        try:
            harga_per_m2_elements = driver.find_elements(By.CSS_SELECTOR, "div.p-eciW3Ko_.p-hnE2cqE_")
            for el in harga_per_m2_elements:
                text = el.text.strip()
                if "jt/m²" in text.lower():
                    harga_per_m2 = text
                    break
        except Exception:
            harga_per_m2 = "-"

        specs = wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, "span[class*='p-hnE2cqE_']")))

        room = land_area = building_area = house_document = "-"

        # Each spec entry is classified by the first keyword it contains; when
        # several entries match the same field, the last one wins.
        for spec in specs:
            raw_text = spec.text
            text = raw_text.lower()
            if "kamar tidur" in text:
                room = raw_text.strip()
            elif "luas tanah" in text:
                land_area = raw_text.strip()
            elif "luas bangunan" in text:
                building_area = raw_text.strip()
            elif any(keyword in text for keyword in _OWNERSHIP_KEYWORDS):
                house_document = raw_text.strip()

        # Facility entries, normalised to stripped lowercase text.
        fasilitas_elements = driver.find_elements(By.CSS_SELECTOR, "div.p-eciW3Ko_.p-7KvUfE4_")
        fasilitas = _parse_facilities(
            [el.text.strip().lower() for el in fasilitas_elements])

        rumah_info = {
            "Judul": title,
            "Harga": price,
            "Harga per m²": harga_per_m2,
            "Alamat": address,
            "Kamar": room,
            "Luas Tanah": land_area,
            "Luas Bangunan": building_area,
            "Kepemilikan Tanah": house_document
        }
        # Two macro-economic columns; the same hard-coded value on every row.
        rumah_info["Pertumbuhan IHPR YoY"] = "1.05%"
        rumah_info["Suku Bunga KPR Nasional"] = "10.38%"

        # Merge the facility columns into the row under capitalized names and
        # remember those names for the final column list.
        for key, value in fasilitas.items():
            rumah_info[key.capitalize()] = value
            all_features.add(key.capitalize())

        data_rows.append(rumah_info)
        print("✔ Data ditambahkan:", title)

    except Exception as e:
        print("Gagal ambil data:", e)

# Walk through the search-result pages, collect the listing links on each
# page, and scrape every listing in a temporary browser tab. The browser is
# shut down in `finally` so it is released even if the run is interrupted.
base_url = "https://www.pinhome.id"
try:
    for page_num in range(1, 100):
        try:
            url = f"https://www.pinhome.id/dijual/cari/kota-surabaya-209?buildingType=building_type.house&page={page_num}"
            driver.get(url)
            time.sleep(3)

            # Close the popup if it shows up; its absence is not an error.
            try:
                WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.CLASS_NAME, 'pin-popup__dialog___u1yak'))
                )
                close_btn = driver.find_element(By.CLASS_NAME, 'pin-popup__close-btn___1z0op')
                close_btn.click()
                time.sleep(1)
                print("Popup ditutup")
            except Exception:
                print("Popup tidak muncul")

            wait = WebDriverWait(driver, 10)
            cards = wait.until(EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, 'a.pin-card__link___h5vvm')
            ))

            # Read every href up front: the card elements belong to the result
            # page and are not touched again once detail tabs are opened.
            links = []
            for card in cards:
                href = card.get_attribute('href')
                if href:
                    links.append(href if href.startswith('http') else base_url + href)

            main_window = driver.current_window_handle
            for link in links:
                try:
                    driver.execute_script("window.open('');")
                    # Pick the tab that is not the result page instead of
                    # relying on a fixed position in window_handles.
                    detail_tabs = [h for h in driver.window_handles if h != main_window]
                    driver.switch_to.window(detail_tabs[-1])
                    driver.get(link)
                    time.sleep(2)
                    scrape_detail()
                except Exception as detail_error:
                    print(f"Gagal buka detail rumah: {detail_error}")
                finally:
                    # Close every detail tab, including one left open by a
                    # failed attempt, so stale tabs never accumulate.
                    for handle in driver.window_handles:
                        if handle != main_window:
                            driver.switch_to.window(handle)
                            driver.close()
                    driver.switch_to.window(main_window)
                    time.sleep(1)

        except Exception as page_error:
            print(f"Gagal buka halaman {page_num}: {page_error}")
finally:
    # Scraping finished (or aborted): always release the browser.
    driver.quit()

# Build the DataFrame with a fixed column order: the base columns first, then
# the facility columns sorted by name. Passing `columns` also creates any
# column absent from the rows and fills it with real missing values.
columns = ["Judul", "Harga", "Harga per m²", "Alamat", "Kamar", "Luas Tanah", "Luas Bangunan", "Kepemilikan Tanah",
           "Pertumbuhan IHPR YoY", "Suku Bunga KPR Nasional"] + sorted(all_features)

df = pd.DataFrame(data_rows, columns=columns)

# Save to CSV
df.to_csv("dataset_baru.csv", index=False, encoding="utf-8")
print("✔ Data CSV dengan fasilitas berhasil disimpan.")
