In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
import logging
from unidecode import unidecode

# Search settings. max_rent and min_size are sent to the site as the
# rMax and sMin query parameters; the city slug and city_id in the URL
# below are hard-coded and are not derived from `city`.
city = "Köln"
max_rent = 500
min_size = 15

# Search results page for WG rooms in Cologne on wg-gesucht.de, assembled
# from the fixed path, the fixed filter flags and the two numeric limits.
url = (
    "https://www.wg-gesucht.de/wg-zimmer-in-Koeln.73.0.1.0.html"
    "?offer_filter=1&city_id=73&sort_order=0&noDeact=1"
    "&categories%5B%5D=0&rent_types%5B%5D=0"
    f"&sMin={min_size}&rMax={max_rent}"
)


# Send a GET request to the search URL and get the HTML content of the page.
# An explicit timeout is passed because requests waits indefinitely by
# default, which would hang the whole run on a stalled connection.
response = requests.get(url, timeout=30)

# Fail loudly on 4xx/5xx responses instead of parsing an error page and
# silently ending up with zero listings.
response.raise_for_status()

html_content = response.content

# parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# find all listing cards on the page
listings = soup.find_all(class_='wgg_card offer_list_item')

# Regular expression for the merged info text of a listing.
#   group 1: the number in front of "er WG" (stored as the WG type)
#   group 2: first token after it   (stored as the city)
#   group 3: second token           (stored as the district)
#   group 4: the rest of the text   (stored as the street)
# Tokens are separated by any mix of whitespace and "|" characters.
# \d+ is used rather than a single \d so that a two-digit value such as
# "10er WG" is captured as "10" instead of just its last digit "0".
pattern = r"(\d+)er WG[\s|]*(\S+)[\s|]*(\S+)[\s|]*(\S+.*)"

# create an empty list to store the data (one row per parsed listing)
data = []

# Set up logging. The log directory is created first: basicConfig opens the
# log file right away and raises FileNotFoundError if ./logs is missing.
os.makedirs('./logs', exist_ok=True)
logging.basicConfig(filename='./logs/merged_info.log', level=logging.INFO, format='%(asctime)s:%(message)s')

# Compile the pattern once instead of looking it up on every iteration.
# The pattern contains no ^ or $ anchors, so re.MULTILINE had no effect
# and is omitted.
info_regex = re.compile(pattern)

# loop through each listing and extract the information
for listing in listings:
    title_tag = listing.find(class_='truncate_title noprint')
    price_tag = listing.find(class_='col-xs-3')
    size_tag = listing.find(class_='col-xs-3 text-right')
    info_tag = listing.find(class_='col-xs-11')

    # find() returns None when an element is absent; skip such a card
    # instead of letting one malformed listing abort the whole run with
    # an AttributeError on `.text`.
    if any(tag is None for tag in (title_tag, price_tag, size_tag, info_tag)):
        logging.warning("Skipping listing with missing fields")
        continue

    title = title_tag.text.strip()
    price = price_tag.text.strip()
    size = size_tag.text.strip()

    # the individual fields are parsed out of this merged text with the regex
    merged_infos = info_tag.text.replace('\n', '')

    # log the value of merged_infos
    logging.info(merged_infos)

    match = info_regex.search(merged_infos)
    if match is None:
        print("No match found")
        continue

    # Extract the relevant information from the match object. The city is
    # kept in its own name so the module-level `city` search setting is
    # not overwritten by the last parsed listing.
    wg_type = match.group(1)
    listing_city = match.group(2).strip()
    district = match.group(3).strip()
    street = match.group(4).strip()

    # append the extracted data to the list
    data.append([title, price, size, wg_type, listing_city, district, street])

# create a pandas DataFrame from the extracted data
df = pd.DataFrame(data, columns=['Title', 'Price', 'Size', 'WG Type', 'City', 'District', 'Street'])

# Save the DataFrame to data/wg_listings.csv under the current working
# directory. The path is built from components (rather than joining a
# './data/...' string) so it uses one consistent separator, and the
# directory is created first because to_csv fails if it does not exist.
output_dir = os.path.join(os.getcwd(), 'data')
os.makedirs(output_dir, exist_ok=True)
file_path = os.path.join(output_dir, 'wg_listings.csv')
df.to_csv(file_path, index=False)

# print a message to confirm that the file has been saved
print(f"DataFrame saved to {file_path}")

# trailing expression: shows the DataFrame when run as a notebook cell
df

DataFrame saved to c:\Coding\wggesucht_webscraper\./data/wg_listings.csv


Unnamed: 0,Title,Price,Size,WG Type,City,District,Street
0,"Süßes, helles WG-Zimmer",480 €,19 m²,3,Köln,Altstadt-Süd,Rolandstraße 5
1,Große Haus-WG Köln/Bonn,499 €,18 m²,7,Köln,Bornheim,- Sechtem | ...
2,Zwischenmiete in gemütlicher 5er-WG in Köln Ka...,455 €,16 m²,5,Köln,Humboldt/Gremberg,Esserstraße
3,Kalker Schmuckstück,464 €,21 m²,3,Köln,Humboldt-Gremberg,Feldbergstraße x
4,"Die Miete enthält auch Strom, Gas und Internet.",440 €,16 m²,3,Köln,Ehrenfeld,Fröbelstraße 40
5,FuWo-Friends 🌈 open minded People Köln 🤗🌿 FuWo...,100 €,30 m²,0,Köln,Telegram,Gruppe
6,Alle Zimmer sind ungefähr gleich groß.,460 €,18 m²,3,Köln,Neuehrenfeld,Vogelsanger Str. 185
7,"City UniKlinik,Uni , 2RaumSingleApartment, Möb...",490 €,30 m²,2,Köln,Sülz,Nikolausstr.131
8,Studentenzimmer in Verbindungsvilla (Studenten...,290 €,20 m²,9,Köln,Lindenthal,Heinestraße 30
9,16qm Zimmer in 2er WG - Südstadt,475 €,16 m²,2,Köln,Altstadt-Süd,Zwirnerstraße 41
