In [1]:
import os

# preparing directories
path = os.getcwd()
# subfolders
input_dir = os.path.join(path, 'inputs')
output_dir = os.path.join(path, 'outputs')
# the scraped HTML/CSV files are written into output_dir by later cells, so make
# sure the folder exists on a fresh checkout (no-op when it is already there)
os.makedirs(output_dir, exist_ok=True)

In [10]:
# selenium 
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# BeautifulSoup
from bs4 import BeautifulSoup

import datetime
import time

import pandas as pd

In [3]:
# chrome driver location (the executable is expected in the 'inputs' subfolder)
PATH = os.path.join(input_dir, 'chromedriver.exe')

# chrome driver configuration
options = webdriver.ChromeOptions()
for flag in ('--ignore-certificate-errors', '--incognito'):
    options.add_argument(flag)
#options.add_argument('--headless') # without opening browser

# NOTE(review): the driver path is passed positionally; newer Selenium 4 releases
# presumably expect a Service object instead - confirm against the installed version
driver = webdriver.Chrome(PATH, options=options)

# explicit-wait helper bound to the driver (timeout argument: 10)
wait = WebDriverWait(driver, 10)

In [4]:
# search-results URL that is opened first in order to read the number of result pages
url = 'https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/warminsko--mazurskie/olsztyn/olsztyn/olsztyn'

In [5]:
# get number of pages

# open website using webdriver
driver.get(url)
print(f"Opens website: {url}")

# accepting terms and conditions
# step one - opens the preferences dialog of the consent banner
element = wait.until(EC.element_to_be_clickable((By.ID, 'onetrust-pc-btn-handler')))
element.click()
# step two - confirming preferences and closing the modal window
element = wait.until(EC.element_to_be_clickable((By.XPATH, "//*[contains(text(), 'Potwierdzenie moich wyborów')]")))
element.click()

# step - get the number of pagination pages
## using SOUP
soup = BeautifulSoup(driver.page_source, 'html.parser')
date = datetime.datetime.now().strftime("%Y%m%d") # current date

# finds nav with paginations
# scrapped HTML code can be displayed and pretty printed in Notepad++ using the plugin XML tools
target = soup.find('nav', attrs={'aria-label':'Nawigacja po paginacji'})

# the highest number found among the pagination links is the number of pages
# with adds for the given city; stays at 1 when no usable link is found
pagination_last = 1
if target is None:
    # no pagination nav in the page source: nothing to save and nothing to parse
    hrefs = []
    print('Pagination nav not found - assuming a single page of results')
else:
    # keeps a copy of the nav section for inspection
    with open(os.path.join(output_dir, 'nav_pagination' + date + '.html'), "w", encoding='utf-8') as file:
        file.write(str(target))
    # selects all hyperlinks from the nav section
    hrefs = target.find_all("a")

for a in hrefs:
    try:
        num = int(a.get_text())
    except ValueError:
        # link without a page number (e.g. an arrow) - skipped on purpose
        continue
    pagination_last = max(pagination_last, num)
print(f'Number of pages: {pagination_last}')

Opens website: https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/warminsko--mazurskie/olsztyn/olsztyn/olsztyn
Number of pages: 21


#### VARs

In [33]:
# name of the scraped city, used as a prefix of the output file names
city = 'olsztyn'
# pagination URL - the page number gets appended to it
url_n = (
    'https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/'
    'warminsko--mazurskie/olsztyn/olsztyn/olsztyn?page='
)
# dataframe collecting the adds from all the pages
all_df = pd.DataFrame()

In [34]:
def scrap_page(url):
    '''
    - opens the given url in the browser, waits and scrolls to the bottom of the page
    - returns the <div data-cy="search.listing.organic"> section found in the page source
    '''
    scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;"

    driver.get(url)
    time.sleep(2) # fixed pause of n seconds before scrolling
    driver.execute_script(scroll_to_bottom)

    page = BeautifulSoup(driver.page_source, 'html.parser')
    listing = page.find('div', attrs={'data-cy':'search.listing.organic'})
    return listing
    
def has_article(tag):
    '''
    - filter for find_all: True only for <li> tags that contain an <article> tag
    - info about adds is kept in such <li> tags
    '''
    if tag.name != 'li':
        return False
    return tag.find('article') is not None

def get_adds(data):
    '''
    - gets a soup section and returns all the tags in it accepted by has_article
    - info about adds is kept in <li> that contain <article> tag
    - returns an empty list when there is no section to search (data is None),
      so a page without the listing section does not break the scraping loop
    '''
    if data is None:
        return []
    return data.find_all(has_article)

def soup_to_dataframe(data):
    '''
    - gets an iterable of soup tags (one per add, each wrapping an <article>)
      and returns a dataframe with one row per add
    - columns: free_text, address, price, price_meter, rooms, area, more, add_link
    - a piece of info that cannot be found in an add is stored as None
      instead of breaking the whole page
    - items without an <article> tag are skipped
    '''
    columns = ['free_text', 'address', 'price', 'price_meter', 'rooms', 'area', 'more', 'add_link']
    room_words = ('pokoje', 'pokoi', 'pokój')

    adds = []
    for add in data:
        article = add.find('article')
        if article is None:
            continue
        # gets the 'title' attribute of any tags that have it:
        # the first one is used as the free text, the second one as the address
        titles = [tag['title'] for tag in article.find_all(lambda tag: tag.has_attr('title'))]
        free_text = titles[0] if titles else None
        address = titles[1] if len(titles) > 1 else None

        price = price_meter = rooms = area = None
        info = add.find_all('span')
        for span in info:
            text = span.text
            if 'zł/m²' in text:
                price_meter = text
            else:
                # 'zł' and 'm²' only count when the span is not the price per meter
                if 'zł' in text:
                    price = text
                if 'm²' in text:
                    area = text
            if any(word in text for word in room_words):
                rooms = text
        # text of the last <span> of the add
        more = info[-1].text if info else None

        link = add.select_one('a')

        adds.append({
            'free_text': free_text,
            'address': address,
            'price': price,
            'price_meter': price_meter,
            'rooms': rooms,
            'area': area,
            'more': more,
            'add_link': link.get('href') if link is not None else None
        })
    # Creates DataFrame; explicit columns keep the layout stable for an empty page.
    df = pd.DataFrame(adds, columns=columns)

    return df

# scrapes every results page (1..pagination_last), stores one CSV per page
# and, at the end, one CSV with the adds from all the pages
page_frames = []
for n in range(1, pagination_last + 1):
    data = scrap_page(url_n + str(n))
    data = get_adds(data)
    df = soup_to_dataframe(data)
    df.to_csv(os.path.join(output_dir, f'{city}_page_{n}.csv'), encoding='utf-8', index=False)
    page_frames.append(df)

# merges dataframes once and from scratch, so re-running this cell does not
# append the same pages to all_df a second time
all_df = pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()
date = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

all_df.to_csv(os.path.join(output_dir, f'{city}_page_all_pages_{date}.csv'), encoding='utf-8', index=False)
    
    