Immov web scrapping
1. Install Selenium and required packages

In [5]:
import requests
from bs4 import BeautifulSoup
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from time import sleep
import pandas as pd


driver = webdriver.Chrome()
root_url = "https://immovlan.be/fr"
driver.get(root_url)

2. Automatically accept the GDPR banner

In [6]:
cookie_button = driver.find_element(By.XPATH, '/html/body/div[1]/div/div/div/div/div/div[2]/button[2]/span')
cookie_button.click()

3. Testing scrapping data in one URL

In [7]:
#Get the Html from one website
url = "https://immovlan.be/en/detail/apartment/for-sale/1030/schaarbeek/vbd88427"
headers =  {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
req = requests.get(url, headers=headers, timeout = 10)
content = req.text 
soup = BeautifulSoup(content, "html.parser")
print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1.0, maximum-scale=1.0" name="viewport"/>
  <meta content="app-id=968691127" name="apple-itunes-app"/>
  <meta content="app-id=com.czaam" name="google-play-app"/>
  <meta content="735734281535384549ac78e4353b4147" name="p:domain_verify">
   <meta content="Apartment for sale | Place de Houffalize 24 - 1030 Schaarbeek | 295‚ÄØ000 ‚Ç¨ | Livable surface 96m¬≤ | EPC E | 2 Bedrooms | 1 Bathroom | Discover all pictures and details." name="description">
    <meta content="Apartment, Apartment, for sale, Schaarbeek, 1030" name="keywords"/>
    <meta content="noindex" name="robots"/>
    <link href="https://immovlan.be/en/detail/apartment/for-sale/1030/schaarbeek/vbd88427" rel="canonical"/>
    <link href="https://immovlan.be/nl/detail/appartement/te-koop/1030/schaarbeek/vbd88427" hreflang="nl" rel="alternate"/>
    <link href="https://immovlan.be/fr/detail/appartement/a-vendre/1

In [8]:
# Make a function to get the data safely. If there is no data, it will return N/A, not crashing
def safe_get_text(soup_element, default="N/A"):
    """For safely take the value, will return NA if there's no value, avoid crashing"""
    if soup_element:
        return soup_element.get_text(strip=True)
    return default

In [9]:
property_dict = {}
price_tag = safe_get_text(soup.find('span', class_='detail__header_price_data'))
if price_tag:
    print(price_tag)

numbers_only = re.findall(r'\d+', price_tag) #Have to clean data by eliminating the currency
clean_price = "".join(numbers_only) #Eliminate the . between numbers

if clean_price:
    final_price = int(clean_price)
    print(final_price)
else:
        print("None")
property_dict["Price"] = final_price
print(property_dict)

295‚ÄØ000 ‚Ç¨
295000
{'Price': 295000}


In [None]:
highlights = soup.find_all('li', class_='property-highlight')

for item in highlights:
    other_information = item.get_text(separator=" ", strip=True)if other_information else "N/A"
    print(other_information)

2 Bedroom(s)
96 m¬≤
1 Garage
1 Parking place(s)


In [None]:
street_tag = safe_get_text(soup.find('span', class_='street-line'))
city_tag = safe_get_text(soup.find('span', class_='city-line'))

full_address = f"{street_tag}, {city_tag}"

print(street_tag) 
print(city_tag)
print(full_address)
property_dict["Locality"] = full_address
property_dict["Street"] = street_tag
property_dict["City"] = city_tag
print(property_dict)

Place de Houffalize 24
1030 Schaarbeek
Place de Houffalize 24, 1030 Schaarbeek
{'Price': 295000, 'Locality': 'Place de Houffalize 24, 1030 Schaarbeek', 'Street': 'Place de Houffalize 24'}


In [85]:
find_data = soup.find_all(class_="d-block d-lg-none margin-bottom-10 text-center")
data = find_data[0].find_all("span")
for info in data:
    print(info.get_text())


Place de Houffalize 24
1030 Schaarbeek


In [None]:
#This is the wrong code. For finding all 'div' and then find h4 and p, it returns many overlapping or looping results
additional_info = {}
additional_information = soup.find_all('div', class_ = "general-info w-100")
for item in additional_information:
    header = item.find('h4')
    value = item.find('p')

    if header and value:
    # Only take the value and clear spaces
        key = header.get_text(strip=True)
        val = value.get_text(strip=True)
            
        # save in dictionary
        additional_info[key] = val
for key, value in additional_info.items():
    print(f"{key}: {value}")                              

State of the property: Normal


In [19]:
#Find all header with h4, then find value p inside
additional_info = {}
additional_information = soup.find('div', class_ = "general-info w-100") #Do not use find_all here as find_all return a list. Then, we cannot use command find() or find_all() anymore
if additional_information:
    # Find all the header in soup
    all_headers = additional_information.find_all('h4')
    
    for header in all_headers:
        # Take only the text in header
        key = header.get_text(strip=True)
        
        # Find the nearest <p> in header h4
        value_tag = header.find_next('p')
        
        if value_tag:
            value = value_tag.get_text(strip=True)
            additional_info[key] = value  
else:
    print ("N/A")
# Update the dictionary
#for k, v in additional_info.items():
    #print(f"{k}: {v}")

property_dict = {
    "State of the property:": additional_info.get("State of the property", "N/A"),
    "Number of rooms": additional_info.get("Number of bedrooms", "N/A"),
    "Living Area":additional_info.get("Livable surface", "N/A"),
    "Fully equiped kitchen":additional_info.get("Kitchen equipment", "N/A"),
    "Furnished":additional_info.get("Furnished", "N/A"),
    "Terrace":additional_info.get("Terrace", "N/A"),
    "Garden":additional_info.get("Garden", "N/A"),
    "Surface of the land":additional_info.get("Total land surface", "N/A"),
    "Number of facades":additional_info.get("Number of facades", "N/A"),
    "Swimming pool":additional_info.get("Swimming pool", "N/A"),}
print(property_dict)
                    

{'State of the property:': 'Normal', 'Number of rooms': '2', 'Living Area': '96 m¬≤', 'Fully equiped kitchen': 'Fully equipped', 'Furnished': 'No', 'Terrace': 'Yes', 'Garden': 'No', 'Surface of the land': 'N/A', 'Number of facades': '2', 'Swimming pool': 'No'}
