# Library imports

In [1]:
from bs4 import BeautifulSoup as bs
from urllib.request import Request
import requests 
from random import randint
from time import sleep
import pandas as pd
from pathlib import Path  

# Defining functions 

In [3]:
#Returns the document with HTML code of requested webpage

def downloadPage(url, timeout=30):
    """Download a web page and return it as a parsed BeautifulSoup document.

    The request is sent with browser-like headers. Previously these headers
    were put into an unused ``urllib.request.Request`` object and the page was
    fetched without them; they are now passed to ``requests.get`` directly.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error (default 30). Without a timeout a stalled connection would block
        the whole crawl indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The page body decoded as UTF-8 and parsed with the ``html5lib`` parser.
        The HTTP status code is not checked, so error pages are parsed and
        returned as well.
    """
    # 'Host' and 'Accept-Encoding' are intentionally left to requests:
    # it derives Host from the URL and only advertises encodings it can decode
    # (advertising "br" without a brotli package would yield an unreadable body).
    headers = {
        'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        'Accept-Language': "pl,en-US;q=0.7,en;q=0.3",
        'Referer': "https://www.google.pl/",
        'Sec-Fetch-Dest': "document",
        'Sec-Fetch-Mode': "navigate",
        'Sec-Fetch-Site': "cross-site",
        'Sec-Fetch-User': "?1",
        'Upgrade-Insecure-Requests': "1",
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:108.0) Gecko/20100101 Firefox/108.0",
    }
    webpage = requests.get(url, headers=headers, timeout=timeout)
    doc = bs(webpage.content.decode('utf-8'), 'html5lib')

    return doc

In [4]:
#Returns last page number

def getNoOfPages(doc):
    """Return the number of the last result page as an int.

    The value is read from the ``data-page-number`` attribute of the element
    with class ``pages__controls__last`` found in ``doc``.
    """
    return int(doc.find(class_="pages__controls__last")['data-page-number'])

In [5]:
#Returns the list of offers' links from one result page

def getOfferLink(url):
    """Return the list of offer links found on one result page.

    Parameters
    ----------
    url : str
        Address of a single result page; it is fetched with ``downloadPage``.

    Returns
    -------
    list
        The ``href`` values of the offer title anchors, in page order.
        List items that do not contain a title anchor are skipped instead of
        raising ``TypeError``.
    """
    htmlPage = downloadPage(url)
    offer_links = []

    for table in htmlPage.find_all(class_="ogl__list__wrap"):                  # table with offers
        for frame in table.find_all(class_="list__item__wrap__content"):       # frame of one offer
            anchor = frame.find("a", href=True, class_="list__item__content__title__name link")
            if anchor is None:
                # find() returns None when the frame holds no title link;
                # indexing None would abort the whole crawl.
                continue
            offer_links.append(anchor['href'])

    return offer_links

In [6]:
#Returns full list of searched offers' links

def getListOfOffersLinks(pageNo_End, base_url):
    """Collect offer links from result pages 1 through ``pageNo_End``.

    Each page address is built as ``base_url`` followed by the page number and
    passed to ``getOfferLink``. After every page the function sleeps for a
    random 2-10 seconds before the next request.

    Returns the combined list of links from all visited pages.
    """
    collected_links = []

    for page_number in range(1, pageNo_End + 1):
        page_url = base_url + str(page_number)
        collected_links.extend(getOfferLink(page_url))

        # random pause between consecutive requests
        sleep(randint(2, 10))

    return collected_links

In [7]:
#Returns dataframe with details of one offer

def getOfferDetails(offer_doc, offer_link):
    """Return a one-row dataframe with the details of a single offer.

    Parameters
    ----------
    offer_doc
        Parsed offer page; searched for elements with class
        ``oglField__container``.
    offer_link : str
        Address of the offer, used only in the error message.

    Returns
    -------
    pandas.DataFrame
        One column per detail field (field name -> field text) plus a
        ``Cena`` column taken from the price element of the first field.
        When the page has no detail fields (e.g. the offer has been deleted)
        an error is printed and an empty dataframe is returned.
    """
    main_info = offer_doc.find_all(class_="oglField__container")
    dict_fields = {}                                   # column name -> single-element list (one row)

    if not main_info:                                  # e.g. the offer has been deleted
        print("----------- ERROR -----------")
        print(offer_link)
        print("-----------------------------")
    else:
        price_tag = main_info[0].find("p", class_="oglDetailsMoney autolinkSafariFix")
        # The price element may be missing; previously that raised an
        # uncaught AttributeError and aborted the whole crawl.
        if price_tag is not None:
            dict_fields['Cena'] = [price_tag.text]

    for field in main_info:
        name_tag = field.find(class_="oglField__name")
        value_tag = field.find(class_="oglField__value")
        if name_tag is None or value_tag is None:      # field not used in this offer
            continue
        dict_fields[name_tag.text] = [value_tag.text]

    df_offer = pd.DataFrame.from_dict(dict_fields)     # one-row dataframe built from the dictionary

    return df_offer

In [8]:
#Returns a dataframe with details of all offers

def getDataFrame(offer_links_list, backup_dir=None, backup_every=100):
    """Download every offer and return one dataframe with all their details.

    Parameters
    ----------
    offer_links_list : list
        Addresses of the offers to download.
    backup_dir : str or pathlib.Path, optional
        Directory for the intermediate ``backup_<n>.csv`` files. Defaults to
        the location that was previously hard-coded in this function. The
        directory is created when the first backup is written.
    backup_every : int, optional
        A backup is written (and progress printed) after every
        ``backup_every`` downloaded offers (default 100). Must be positive.

    Returns
    -------
    pandas.DataFrame
        Details of all offers concatenated, with a fresh 0..n-1 index.
        An empty dataframe is returned when there are no offers, instead of
        ``pd.concat`` raising ``ValueError`` on an empty list.
    """
    if backup_dir is None:
        backup_dir = Path('C:/Users/malgo/OneDrive - University of Gdansk (for Students)/Pulpit/data_test/backup')
    else:
        backup_dir = Path(backup_dir)

    offers_datalist = []                      # per-offer dataframes

    for i, offer in enumerate(offer_links_list, start=1):     # i counts downloaded offers
        offer_doc = downloadPage(offer)
        offers_datalist.append(getOfferDetails(offer_doc, offer))

        if i % backup_every == 0:             # periodic progress log and .csv backup
            print(f"Prcoessing of {i}/{len(offer_links_list)} offers completed.")       #progress

            temp_df = pd.concat(offers_datalist).reset_index(drop=True)
            # make sure a missing directory does not discard the data scraped so far
            backup_dir.mkdir(parents=True, exist_ok=True)
            temp_df.to_csv(backup_dir / f'backup_{i}.csv')

        sleep(randint(2,10))                  # wait before next offer details request

    if not offers_datalist:                   # nothing downloaded: pd.concat([]) would raise
        return pd.DataFrame()

    offers_df = pd.concat(offers_datalist).reset_index(drop=True)

    return offers_df

# Running the script

In [9]:
# Address of the search results; a page number is appended to base_url to get a concrete page.

pageNo = 1
base_url = "https://ogloszenia.trojmiasto.pl/nieruchomosci-mam-do-wynajecia/gdansk/?strona="
main_url = f"{base_url}{pageNo}"

In [10]:
# Download and parse the first results page (main_url).

doc = downloadPage(main_url)

In [11]:
# Number of the last results page, read from the first page downloaded above.

pageNo_End = getNoOfPages(doc)

In [12]:
# Links to all offers, collected from result pages 1..pageNo_End.
# The function sleeps between page requests, so this cell takes a while.

offer_links_list = getListOfOffersLinks(pageNo_End, base_url)

In [16]:
# Download the details of every offer and save the resulting table to a CSV file.
# NOTE(review): the output path below is an absolute, machine-specific location —
# adjust it before running the notebook elsewhere.

offers_df = getDataFrame(offer_links_list)

filepath = Path('C:/Users/malgo/OneDrive - University of Gdansk (for Students)/Pulpit/data_test/data_trojmiasto.csv')  
offers_df.to_csv(filepath)  

Prcoessing of 100/2867 offers completed.
Prcoessing of 200/2867 offers completed.
Prcoessing of 300/2867 offers completed.
Prcoessing of 400/2867 offers completed.
Prcoessing of 500/2867 offers completed.
Prcoessing of 600/2867 offers completed.
Prcoessing of 700/2867 offers completed.
Prcoessing of 800/2867 offers completed.
Prcoessing of 900/2867 offers completed.
Prcoessing of 1000/2867 offers completed.
----------- ERROR -----------
https://ogloszenia.trojmiasto.pl/nieruchomosci-mam-do-wynajecia/komfortowe-mieszkanie-z-balkonem-silownia-garaz-wrzeszcz-ambiente-ogl65165220.html
-----------------------------
----------- ERROR -----------
https://ogloszenia.trojmiasto.pl/nieruchomosci-mam-do-wynajecia/przestronne-mieszkanie-2-pokojowe-z-duzym-balkonem-wiszace-ogrody-jasien-ogl65165217.html
-----------------------------
----------- ERROR -----------
https://ogloszenia.trojmiasto.pl/nieruchomosci-mam-do-wynajecia/dwupokojowe-wygodne-mieszkanie-blisko-galerii-forum-kartuska-ogl65165134.h