# Test #1 of WebScraping

# WebScraping plan

## Immowelt Parameters

### Mieten & Kaufen Categories

- Wohnung
- Haus
- Wohngemeinschaft
- Gewerbeimmobilien

### Mieten & Kaufen States

Baden-Württemberg, Bayern, Berlin, Brandenburg, Bremen, Hamburg, Hessen, Mecklenburg-Vorpommern, Niedersachsen, Nordrhein-Westfalen, Rheinland-Pfalz, das Saarland, Sachsen, Sachsen-Anhalt, Schleswig-Holstein, Thüringen


# Imports

In [87]:
import time
import requests

import pandas as pd
import bs4

from datetime import datetime

from bs4 import BeautifulSoup
from bs4 import ResultSet
from bs4.element import Tag as HttpTag
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

In [89]:
BUY_CATEGORIES = ["kaufen", "mieten"]
PROPERTY_CATEGORIES = ["wohnungen", "haeuser"]
HITS_PER_PAGE = 20
BASE_URL = "https://www.immowelt.de/liste/${state}/${property_category}/${buy_category}?d=true&efs=NEW_BUILDING_PROJECT&sd=DESC&sf=RELEVANCE&sp=${page}"

In [90]:
states = "Baden-Württemberg, Bayern, Berlin, Brandenburg, Bremen, Hamburg, Hessen, Mecklenburg-Vorpommern, Niedersachsen, Nordrhein-Westfalen, Rheinland-Pfalz, Saarland, Sachsen, Sachsen-Anhalt, Schleswig-Holstein, Thüringen"
states = states.replace("ü", "ue").lower().split(", ")
STATES = [f"bl-{state}" for state in states]
STATES

['bl-baden-wuerttemberg',
 'bl-bayern',
 'bl-berlin',
 'bl-brandenburg',
 'bl-bremen',
 'bl-hamburg',
 'bl-hessen',
 'bl-mecklenburg-vorpommern',
 'bl-niedersachsen',
 'bl-nordrhein-westfalen',
 'bl-rheinland-pfalz',
 'bl-saarland',
 'bl-sachsen',
 'bl-sachsen-anhalt',
 'bl-schleswig-holstein',
 'bl-thueringen']

# Immowelt WebScraping

### General Plan

<ol>
    <li>Get list of Properties</li>
    <li>Calculate the total Property pages count</li>
    <li>Get list of Expose URLs of the page</li>
    <li>Navigate to the Page and grab the Infos you want</li>
    <li>Loop</li>
</ol>

In [94]:
STATES = ["bl-berlin", "bl-bremen"]

class ImmoWeltScraper:
    def __init__(
        self,
        base_url: str,
        states: list[str],
        hits_per_page: int,
        property_categories: list[str],
        buy_categories: list[str],
    ) -> None:
        self.driver: webdriver.Chrome = None
        self.base_url = base_url
        self.states = states
        self.hits_per_page = hits_per_page
        self.property_categories = property_categories
        self.buy_categories = buy_categories

        # sleep durations in seconds
        self.PAGE_DURATION = 5
        self.CATEGORY_DURATION = 60
        self.STATE_DURATION = 60 * 2
    
    def scrape(self) -> pd.DataFrame:
        try:
            self._create_driver()
            df = self._scrape()
            self._log("Script successful finished!", "SUCCESS")
        except Exception as exp:
            self._destroy_driver()
            self._log("Script failed", "ERR")
            raise exp
    
    def _scrape(self) -> pd.DataFrame:
        total_property_data: list[dict] = []
        for state in self.states:
            state_property_data: list[dict] = []
            for property_category in self.property_categories:
                for buy_category in self.buy_categories:
                    url = self._get_prep_url(state, property_category, buy_category)
                    soup = self._navigate_to(url)
                    if not self._is_page_found(soup):
                        continue
                    
                    page_count = self._calc_total_relevant_page_count(soup)
                    page_count = 2 if page_count > 2 else page_count
                    for i in range(0, page_count):
                        page = i + 1
                        self._log(f"State: {state}, category: {property_category}, type: {buy_category}\tpage: {page} of {page_count}")
                        page_url = self._get_prep_url_page(url, page)
                        soup = self._navigate_to(url)

                        expose_urls = self._get_expose_urls(soup)
                        for expose_url in expose_urls:
                            self.driver.get(expose_url)
                            property_data = self._get_property_data(
                                property_category,
                                buy_category,
                                state
                            )
                            state_property_data.append(property_data)
                            total_property_data.append(property_data)
            filepath = f"./data/property_immowelt_{state}.csv"
            _ = self._export(total_property_data, filepath)
        filepath = "./data/property_immowelt_data.csv"
        return self._export(total_property_data, filepath)

    def scrape_obj_test(
        self,
        expose_url: str,
    ) -> dict:
        self._create_driver()
        soup = self._navigate_to(expose_url)
        property_data = self._get_property_data(
            soup,
            "property_category",
            "buy_category",
            "state"
        )
        self._destroy_driver()
        return property_data

    def _create_driver(self) -> None:
        if self.driver:
            self._destroy_driver()
        options = Options()
        options.add_argument("--headless")
        self.driver = webdriver.Chrome(
            service=Service(
                ChromeDriverManager().install()
            ),
            options=options
        )
        self._log("Driver created")
        self._check_permission_requirements()
        self._log("Permissions accepted")

    def _destroy_driver(self) -> None:
        self._log("Destroy driver")
        if not self.driver:
            return
        self.driver.close()
        self.driver = None

    def _check_permission_requirements(self):
        self.driver.get("https://www.immowelt.de/immobilienpreise")
        check_element = None
        while not check_element:
            check_element = self.driver.execute_script(
                """return document.querySelector('#usercentrics-root').shadowRoot.querySelector("button[data-testid='uc-accept-all-button']")"""
            )
            if check_element:
                check_element.click()
            else:
                self._log("Permission loop")
                time.sleep(.5)

    def _export(self, data: list[dict], filepath: str) -> pd.DataFrame:
        df = pd.DataFrame(data)
        df.replace("|", "_", inplace=True)
        df.to_csv(filepath, sep="|", index=False)
        self._log(f"CSV-Export to: {filepath}")
        return df

    def _navigate_to(self, url: str) -> BeautifulSoup:
        self.driver.get(url)
        return BeautifulSoup(self.driver.page_source)

    def _is_page_found(self, soup: BeautifulSoup) -> bool:
        is_page_found = not soup.find("div", {"class": "NotFound-d39a0"})
        if not is_page_found:
            url = self.driver.current_url
            self._log(f"Page not Found: {url}", "ERROR")
        return is_page_found

    def _log(self, msg: str, tag: str = "INFO") -> None:
        tag = tag.upper()
        ts = datetime.now().strftime("%Y-%m-%d, %H:%M:%S")
        print(f"[{ts}] {tag}:\t{msg}")

    def _get_prep_url(
        self,
        state: str,
        property_category: str,
        buy_category: str,
        page: int = 1
    ) -> str:
        page = str(page)
        return (
            self.base_url
                .replace("${state}", state)
                .replace("${property_category}", property_category)
                .replace("${buy_category}", buy_category)
                .replace("${page}", page)
        )

    def _get_prep_url_page(self, url: str, page: int) -> str:
        url_page_removed = url[:url.find("sp=")]
        return f"{url_page_removed}sp={page}"

    def _calc_total_relevant_page_count(self, soup: BeautifulSoup) -> int:
        hits_emt = soup.find("h1", {"class": "MatchNumber-a225f"})
        if not hits_emt:
            return 0
        content = hits_emt.text
        hits = content[:content.find(" ")].replace(".", "")
        hits = int(int(hits) * 0.7)
        page_count = int(hits / self.hits_per_page)
        return page_count

    def _get_expose_urls(self, soup: BeautifulSoup) -> list[str]:
        expose_urls_emt = soup.find_all("a", {"class": "mainSection-b22fb"})
        expose_urls = [link["href"] for link in expose_urls_emt]
        return expose_urls

    def _expand_read_more_areas(self, err_loop = 0) -> BeautifulSoup:
        try:
            script_expand_all_read_more = """
            let links = document.getElementsByClassName("link--read-more");
            while(links.length > 0) {
                links[0].click(); 
                links = document.getElementsByClassName("link--read-more");
            }
            """
            self.driver.execute_script(script_expand_all_read_more)
            return BeautifulSoup(self.driver.page_source)
        except Exception as exp:
            print("EXEPTION!!")
            if err_loop >= 5:
                raise exp
            err_loop += + 1
            self._log(f"Can't expand read-more. Try: {err_loop} of 5", "WARN")
            self._expand_read_more_areas(err_loop)

    def _get_overview_container(self, soup: BeautifulSoup) -> HttpTag:
        return soup.find("app-objectmeta", {"id": "aUebersicht"})
    
    def _get_data_title(self, overview_container: HttpTag) -> str:
        emt = overview_container.find("h1", {"class": "ng-star-inserted"})
        return emt.text if emt else "Na"
    
    def _get_data_hardfacts(self, overview_container: HttpTag) -> tuple[str, str, str]:
        hardfact_emts = overview_container.find_all("div", {"class": "hardfact ng-star-inserted"})

        def prep_text(emt):
            content: str = emt.text.strip()
            return content[:content.find(" ")]

        price = prep_text(hardfact_emts[0])
        living_space = prep_text(hardfact_emts[1])
        rooms = prep_text(hardfact_emts[2])
        
        return price, living_space, rooms
    
    def _get_data_badges(self, overview_container: BeautifulSoup) -> str:
        badge_emts = overview_container.find_all("sd-badge")
        badges = "Na"
        if badge_emts:
            badges = ";".join([legal_info.text for legal_info in badge_emts])
        return badges

    def _get_data_ratings(self, soup: BeautifulSoup) -> tuple[str, str]:
        rating_emts = soup.find_all("div", {"class": "rating-meter__value"})
        location_rating = "Na"
        public_transport_rating = "Na"
        if len(rating_emts) == 2:
            location_rating = rating_emts[0].text
            public_transport_rating = rating_emts[1].text
        return location_rating, public_transport_rating

    def _get_data_equipments(self, soup: BeautifulSoup) -> str:
        equipment_emt = soup.find("div", {"class": "equipment card-content ng-star-inserted"})
        equipments = equipment_emt.text if equipment_emt else "Na"
        return equipments
    
    def _get_data_features(self, soup: BeautifulSoup) -> str:
        feature_list_emts = soup.find_all("div", {"class": "textlist"})
        features = "Na"
        if feature_list_emts:
            features = ""
            feature_list_emt: HttpTag
            for feature_list_emt in feature_list_emts:
                feature_emts = feature_list_emt.find_all("li")
                if feature_emts:
                    feature_emt: HttpTag
                    for feature_emt in feature_emts:
                        features += f"{feature_emt.text.strip()};"

        return features

    def _get_data_energy_data(self, soup: BeautifulSoup) -> str:
        energy_data = ""

        def get_data(curr_energy_data: str, energy_cells: ResultSet) -> str:
            if not energy_cells:
                return curr_energy_data

            energy_cell: HttpTag
            for energy_cell in energy_cells:
                content_emts = energy_cell.find_all("p")
                title = content_emts[0].text
                content = content_emts[1].text
                curr_energy_data += f"{title}: {content};"

            return curr_energy_data

        energy_container1 = soup.find("app-energy-equipment")
        if energy_container1:
            energy_cells = energy_container1.find_all("sd-cell-col", {"data-cy": "energy-equipment"})
            energy_data = get_data(energy_data, energy_cells)

        energy_container2 = soup.find("div", {"class": "energy_information ng-star-inserted"})
        if energy_container2:
            energy_cells = energy_container2.find_all("sd-cell-col", {"class": "cell__col"})
            energy_data = get_data(energy_data, energy_cells)

        return energy_data[:len(energy_data)]

    def _get_data_keywords(self, soup: BeautifulSoup) -> str:
        read_more_emts = soup.find_all("sd-read-more")
        keywords = "Na"
        if not read_more_emts:
            return keywords

        last_emt_idx = len(read_more_emts) - 1
        keywords: str = read_more_emts[last_emt_idx].text
        KEYWORDS_NAME = "Stichworte"
        if not KEYWORDS_NAME in keywords:
            return keywords
        start_idx = keywords.find(KEYWORDS_NAME) + len(KEYWORDS_NAME)
        keywords = keywords[start_idx:]
        return keywords

    def _get_property_data(
        self,
        category: str,
        buy_rent: str,
        state: str,
    ) -> dict:
        # NEXT :
        # weiter schauen, ob noch gute Infos (kurz!!!)
        # logs
        # tests!
        # images fetchen
        soup = self._expand_read_more_areas()
        overview_container = self._get_overview_container(soup)

        url = self.driver.current_url
        title = self._get_data_title(overview_container)
        price, living_space, rooms = self._get_data_hardfacts(overview_container)
        badges = self._get_data_badges(overview_container)
        rating_location, rating_public_transport = self._get_data_ratings(soup)
        equipments = self._get_data_equipments(soup)
        features = self._get_data_features(soup)
        energy_data = self._get_data_energy_data(soup)
        keywords = self._get_data_keywords(soup)

        return {
            "url": url,
            "category": category,
            "buy_rent": buy_rent,
            "state": state,
            "title": title,
            "price": price,
            "living_space": living_space,
            "rooms": rooms,
            "badges": badges,
            "rating_location": rating_location,
            "rating_public_transport": rating_public_transport,
            "equipments": equipments,
            "features": features,
            "energy_data": energy_data,
            "keywords": keywords
            # image_paths: str,  # later the saved paths!
        }

# equipments refactoren! -> preprocessing!
immo_scraper = ImmoWeltScraper(
    BASE_URL,
    STATES,
    HITS_PER_PAGE,
    PROPERTY_CATEGORIES,
    BUY_CATEGORIES
)
immo_scraper.scrape()

[2022-10-09, 10:52:33] INFO:	Driver created
[2022-10-09, 10:52:34] INFO:	Permission loop
[2022-10-09, 10:52:35] INFO:	Permission loop
[2022-10-09, 10:52:36] INFO:	Permissions accepted
[2022-10-09, 10:52:37] INFO:	State: bl-berlin, category: wohnungen, type: kaufen	page: 1 of 2
[2022-10-09, 10:52:54] INFO:	State: bl-berlin, category: wohnungen, type: kaufen	page: 2 of 2
[2022-10-09, 10:53:10] INFO:	State: bl-berlin, category: wohnungen, type: mieten	page: 1 of 2
[2022-10-09, 10:53:26] INFO:	State: bl-berlin, category: wohnungen, type: mieten	page: 2 of 2
[2022-10-09, 10:53:42] INFO:	State: bl-berlin, category: haeuser, type: kaufen	page: 1 of 2
[2022-10-09, 10:53:57] INFO:	State: bl-berlin, category: haeuser, type: kaufen	page: 2 of 2
[2022-10-09, 10:54:15] INFO:	CSV-Export to: ./data/property_immowelt_bl-berlin.csv
[2022-10-09, 10:54:15] INFO:	State: bl-bremen, category: wohnungen, type: kaufen	page: 1 of 2
[2022-10-09, 10:54:31] INFO:	State: bl-bremen, category: wohnungen, type: kaufe

In [80]:
from datetime import datetime
now = datetime.now().strftime("%Y-%m-%d, %H:%M:%S")
date_time = now
print("date and time:",date_time)

date and time: 2022-10-09, 10:34:32


In [20]:
immo_scraper = ImmoWeltScraper(
    BASE_URL,
    STATES,
    HITS_PER_PAGE,
    PROPERTY_CATEGORIES,
    BUY_CATEGORIES
)

immo_scraper.scrape_obj_test("https://www.immowelt.de/expose/2733j5d")

'https://www.immowelt.de/expose/2733j5d'

'Ruhig gelegene 3,5 Zimmer Wohnung'

'395.000\xa0€'

'88,79'

'3,5'

'Einzelbesichtigung;Gewerblicher Anbieter'

'Gut'

'Okay'

'WohnungslageDachgeschossBezugEnde 2022/Frühjahr 2023'

'Bad mit Wanne Balkon, Garten Einbauküche Böden: Fliesenboden, Laminat, Parkett Zustand: gepflegt Weitere Räume: Kelleranteil '

'Energieträger: Öl;Heizungsart: Zentralheizung;Energieausweistyp: Verbrauchsausweis;Gebäudetyp: Wohngebäude;Baujahr laut Energieausweis: 1969;Wesentliche Energieträger: Öl;Gültigkeit:  08.05.2018 bis 08.05.2028 ;Effizienzklasse: D;Endenergieverbrauch:  129,00 kWh/(m²·a)  - Warmwasser enthalten ;'

{'category': 'property_category',
 'buy_rent': 'buy_category',
 'state': 'state',
 'description': '',
 'price': '',
 'living_space': '',
 'num_of_rooms': '',
 'land_area': '',
 'public_transport_rating': '',
 'location_rating': '',
 'property_category': '',
 'construction_year': '',
 'features1': '',
 'energy_demand': '',
 'energy_efficient_class': '',
 'energy_other': '',
 'features2': '',
 'price_infos': '',
 'other_infos': '',
 'url': ''}

# WebScraping Script (DUMMY)

In [48]:

url = "https://www.immowelt.de/liste/berlin/immobilien/kaufen?d=true&sd=DESC&sf=RELEVANCE&sp=1"
url = "https://www.immowelt.de/liste/berlin/haeuser/kaufen?d=true&efs=NEW_BUILDING_PROJECT&efs=JUDICIAL_SALE&sd=DESC&sf=RELEVANCE&sp=1"
url = "https://www.immowelt.de/expose/27w3y5f"
url = "https://www.immowelt.de/liste/bl-baden-wuerttemberg/wohnungen/kaufen?d=true&efs=NEW_BUILDING_PROJECT&sd=DESC&sf=RELEVANCE&sp=1"
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.get(url)

soup = BeautifulSoup(driver.page_source)

driver.close()
# soup.find_all("div", {"class": "EstateItem-1c115"})[0]
soup

<html data-react-helmet="lang" lang="de"><head>
<meta content="notranslate" data-react-helmet="true" name="google"/><meta charset="utf-8" data-react-helmet="true"/><meta content="width=device-width, initial-scale=1, shrink-to-fit=no, user-scalable=no" data-react-helmet="true" name="viewport"/><meta content="Wohnung kaufen in Baden-Württemberg. Wohnimmobilien zum Kauf findest du bei Immowelt ⇒ Jetzt dank großer Auswahl Wunschimmobilie finden!" data-react-helmet="true" name="description"/><meta content="noindex" data-react-helmet="true" name="robots"/><meta content="Wohnung kaufen in Baden-Württemberg" data-react-helmet="true" property="og:title"/><meta content="Wohnung kaufen in Baden-Württemberg. Wohnimmobilien zum Kauf findest du bei Immowelt ⇒ Jetzt dank großer Auswahl Wunschimmobilie finden!" data-react-helmet="true" property="og:description"/><meta content="//media-static.immowelt.org/app_themes/mid_0_rwd/image/logo/og-img_iwde.png" data-react-helmet="true" property="og:image"/><me

In [109]:
r = requests.get('https://github.com', timeout=(3.05, 27))
r

<Response [200]>

In [118]:
from bs4 import BeautifulSoup
from selenium import webdriver    

options = webdriver.ChromeOptions()
options.add_argument('--headless')
# executable_path param is not needed if you updated PATH
browser = webdriver.Chrome(options=options, executable_path='YOUR_PATH/chromedriver.exe')
browser.get("http://legendas.tv/busca/walking%20dead%20s03e02")
html = browser.page_source
soup = BeautifulSoup(html, features="html.parser")
print(soup)
browser.quit()

  browser = webdriver.Chrome(options=options, executable_path='YOUR_PATH/chromedriver.exe')


WebDriverException: Message: 'chromedriver.exe' executable needs to be in PATH. Please see https://chromedriver.chromium.org/home


In [117]:
driver = webdriver.Chrome("/usr/local/bin/chromedriver")
driver.get("https://www.youtube.com/watch?v=lTypMlVBFM4&ab_channel=JohnWatsonRooney")

  driver = webdriver.Chrome("/usr/local/bin/chromedriver")


WebDriverException: Message: 'chromedriver' executable needs to be in PATH. Please see https://chromedriver.chromium.org/home
