In [48]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
import logging
from unidecode import unidecode

# ---------------------------------------------------------------------------
# Search parameters
# ---------------------------------------------------------------------------
# Display name of the searched city. The previous literal was mis-encoded
# (UTF-8 bytes decoded as cp1252). The URL below hard-codes the Cologne slug
# and city_id, so this value is informational only.
city = "Köln"
max_rent = 500  # sent as the `rMax` query parameter
min_size = 15   # sent as the `sMin` query parameter

# URL of the search results page for Cologne WG-Zimmer on wg-gesucht.de,
# filtered by the minimum size and maximum rent configured above.
url = f"https://www.wg-gesucht.de/wg-zimmer-in-Koeln.73.0.1.0.html?offer_filter=1&city_id=73&sort_order=0&noDeact=1&categories%5B%5D=0&rent_types%5B%5D=0&sMin={min_size}&rMax={max_rent}"

# ---------------------------------------------------------------------------
# Fetch and parse the results page
# ---------------------------------------------------------------------------
# requests applies no timeout unless one is given, so an unresponsive server
# would block this cell forever; fail after 30 seconds instead.
response = requests.get(url, timeout=30)

# Raise on 4xx/5xx instead of silently parsing an error page, which would
# produce an empty result set that looks like "no listings found".
response.raise_for_status()

html_content = response.content

# parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# find all listings on the page
# NOTE(review): a multi-class string passed to `class_` is matched against the
# exact value of the class attribute, so this depends on the site emitting the
# classes in exactly this order -- confirm against the live markup.
listings = soup.find_all(class_='wgg_card offer_list_item')

# Regular expression applied to each listing's merged info line:
#   group 1 - the single digit directly before "er WG"
#   group 2 - first whitespace-free token after it
#   group 3 - second whitespace-free token
#   group 4 - the remainder of the line
# Tokens are separated by any run of whitespace and/or '|' characters.
pattern = r"(\d)er WG[\s|]*(\S+)[\s|]*(\S+)[\s|]*(\S+.*)"

# rows collected for the DataFrame, one list per successfully parsed listing
data = []

# log each raw info line to a file so regex mismatches can be inspected later
logging.basicConfig(filename='merged_info.log', level=logging.INFO, format='%(asctime)s:%(message)s')

# Walk over every listing card and turn it into one row of `data`.
# A card that lacks any of the expected elements is skipped rather than
# aborting the whole run with an AttributeError on `None.text`.
for listing in listings:
    title_tag = listing.find(class_='truncate_title noprint')
    price_tag = listing.find(class_='col-xs-3')
    size_tag = listing.find(class_='col-xs-3 text-right')
    info_tag = listing.find(class_='col-xs-11')

    if any(tag is None for tag in (title_tag, price_tag, size_tag, info_tag)):
        print("Skipping listing with missing fields")
        continue

    title = title_tag.text.strip()
    price = price_tag.text.strip()
    size = size_tag.text.strip()

    # collapse the multi-line info element into a single line for the regex
    merged_infos = info_tag.text.replace('\n', '')

    # log the value of merged_infos
    logging.info(merged_infos)

    # The pattern contains no ^/$ anchors, so no flags are needed.
    match = re.search(pattern, merged_infos)
    if match is None:
        print("No match found")
        continue

    # extract the relevant information from the match object
    wg_type = match.group(1)
    # Use a dedicated name so the module-level `city` search setting is not
    # overwritten by the value scraped from the last listing.
    listing_city = match.group(2).strip()
    district = match.group(3).strip()
    street = match.group(4).strip()

    # append the extracted data to the list
    data.append([title, price, size, wg_type, listing_city, district, street])

# Build the result table: one row per parsed listing, columns in the same
# order in which the values were appended to `data`.
column_names = ['Title', 'Price', 'Size', 'WG Type', 'City', 'District', 'Street']
df = pd.DataFrame(data, columns=column_names)

# Write the table (without the index column) to wg_listings.csv in the
# current working directory.
output_dir = os.getcwd()
file_path = os.path.join(output_dir, 'wg_listings.csv')
df.to_csv(file_path, index=False)

# Tell the user where the file ended up.
print("DataFrame saved to {}".format(file_path))

# Last expression of the cell: lets the notebook render the DataFrame.
df

DataFrame saved to c:\Coding\wggesucht_webscraper\wg_listings.csv


Unnamed: 0,Title,Price,Size,WG Type,City,District,Street
0,WG Zimmer in Neu Ehrenfeld,480 €,20 m²,4,Köln,Neuehrenfeld,Grolmanstr.43
1,"City UniKlinik,Uni , 2RaumSingleApartment, Möb...",490 €,30 m²,2,Köln,Sülz,Nikolausstr.131
2,16qm Zimmer in 2er WG - Südstadt,475 €,16 m²,2,Köln,Altstadt-Süd,Zwirnerstraße 41
3,Zimmer (Nr. 31) 26 qm in katholischer Studente...,348 €,26 m²,0,Köln,Klettenberg,Wolkenburgstr. 3
4,Ruhiges Zimmer im Haus mit Garten und direkt a...,400 €,15 m²,3,Köln,wahnheide,Magazinstrasse
5,SHORT term Rent (June-July) Zwischenmiete clos...,450 €,15 m²,3,Köln,Köln,Blaubach
6,WG-Zimmer am Rhein zur Zwischenmiete,490 €,16 m²,2,Köln,Deutz,Bebelplatz 18
7,WG Zimmer in Köln Zollstock ab sofort beziehbar,500 €,16 m²,3,Köln,Zollstock,Kierberger Strasse 15
8,FuWo-Friends 🌈 open minded People Köln 🤗🌿 FuWo...,100 €,30 m²,0,Köln,Telegram,Gruppe
9,Alle Zimmer sind ungefähr gleich groß.,460 €,18 m²,3,Köln,Neuehrenfeld,Vogelsanger Str. 185
