In [75]:
from selenium import webdriver
from selenium.webdriver.common.by import By               
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select

from bs4 import BeautifulSoup

import numpy as np
import pandas as pd
import time
import re

import geopy
from geopy.geocoders import Nominatim

from pymongo import MongoClient
import textwrap

In [76]:
def set_up_mongo(client_str,database_str,collection_str):
    """Return a handle to a MongoDB collection, creating it if it is missing.

    Args:
        client_str: connection string passed to ``MongoClient``.
        database_str: name of the database to open.
        collection_str: name of the collection to open (and create if absent).

    Returns:
        The collection object for ``collection_str`` inside ``database_str``.
    """
    database = MongoClient(client_str)[database_str]

    # Create the collection explicitly when the database does not list it yet.
    if collection_str not in database.list_collection_names():
        database.create_collection(collection_str)

    return database[collection_str]

--- 

### Scrape all postcodes (PLZ) in Schleswig-Holstein

In [None]:
# First step: scrape the page listing all postcodes (PLZ) of Schleswig-Holstein.
url = "https://home.meinestadt.de/schleswig-holstein/postleitzahlen"
driver = webdriver.Firefox()
try:
    driver.get(url)

    # The cookie banner is dismissed by hand -> TEN SECONDS TO CLICK IT.
    # An automated click on
    #   /html/body/div/div[2]/div[4]/div[2]/button
    # was tried but did not work (the button seems to be blocked or to sit
    # behind an iframe), and only this single page is needed anyway.
    time.sleep(10)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # quit() ends the whole WebDriver session (close() only closes the window),
    # and the finally clause makes sure the browser is shut down even when
    # loading or reading the page raises.
    driver.quit()

In [None]:
#soup

In [None]:
# Collect every element of the scraped page that carries the CSS class "m-table__row".
r = soup.find_all(class_="m-table__row")

In [None]:
# For each table row, gather the contained elements with CSS class "m-table__data".
elements = []
for row in r:
    elements.append(row.find_all(class_="m-table__data"))

In [None]:
# Turn every table row into a dict that maps the cell's "data-label" to its text.
# NOTE: the name `all` shadows the Python builtin; it is kept because the
# following cells refer to it under this name.
all = []
for element in elements:
    # Skip rows without data cells. The former check `element is not []`
    # compared identity against a freshly created list and was always true,
    # so such rows ended up in the result as empty dicts.
    if not element:
        continue

    results = {}
    for e in element:
        data_label = e.get("data-label")
        if data_label:
            # Collapse all whitespace runs into single spaces; a cell without
            # text yields "" (instead of an empty list).
            results[data_label] = ' '.join(e.text.split())
    all.append(results)

In [None]:
# Spot-check a few of the parsed rows.
all[5:10]

In [None]:
def split_every_five_characters(s):
    """Wrap ``s`` into pieces of at most five characters.

    Uses ``textwrap`` line wrapping with a width of 5, so whitespace acts as a
    preferred break point and longer unbroken runs are cut every five
    characters.

    Returns:
        A list of strings (empty for an empty input).
    """
    wrapper = textwrap.TextWrapper(width=5)
    return wrapper.wrap(s)

In [None]:
# Normalise every "PLZ" entry to a list of postcodes.
for a in all:
    if "PLZ" not in a:
        continue

    plz = a["PLZ"]
    # A non-empty list means this entry was already converted by an earlier
    # run of this cell. Converting it again would wrap the list in another
    # list, so it is left untouched to keep the cell safely re-runnable.
    if isinstance(plz, list) and plz:
        continue

    if len(plz) > 5:
        # Several postcodes were scraped into one value: split them apart.
        a["PLZ"] = split_every_five_characters(plz)
    else:
        a["PLZ"] = [plz]  # put also into a list for easier processing

In [None]:
# Spot-check the same rows again, now with the "PLZ" values converted to lists.
all[5:10]

In [None]:
# Number of parsed rows.
len(all)

### Create one object per PLZ, add geopy coordinates and store it in MongoDB

In [None]:
def geocode_coords(plz,geoloc):
    """Geocode ``plz`` and return its coordinates.

    Args:
        plz: the query handed to the geocoder (here: a postcode).
        geoloc: an object with a ``geocode(query, exactly_one=True)`` method
            whose result exposes ``latitude`` and ``longitude``.

    Returns:
        ``{"lat": ..., "lon": ...}``. Both values are ``None`` when the
        geocoder finds nothing or the lookup fails.
    """
    coord_dict = {"lat":None,"lon":None}
    try:
        geocoded_plz = geoloc.geocode(plz,exactly_one=True)
        if geocoded_plz is not None:
            coord_dict = {"lat":geocoded_plz.latitude,"lon":geocoded_plz.longitude}
    except Exception:
        # Best effort: a failed lookup must not abort a long geocoding run.
        # Only Exception is caught, so Ctrl-C (KeyboardInterrupt) and
        # SystemExit still stop the run.
        pass
    return coord_dict

In [None]:
def create_dict_obj(datapoint):
    """Copy the location fields of ``datapoint`` into a new dict.

    The result holds exactly the keys "Stadt", "Stadtteil", "Landkreis" and
    "Bundesland" (in that order). A ``KeyError`` is raised when ``datapoint``
    lacks one of them.
    """
    wanted_keys = ("Stadt", "Stadtteil", "Landkreis", "Bundesland")
    return {key: datapoint[key] for key in wanted_keys}

In [None]:
# Open (and create if needed) the "avg_rent" collection on the local MongoDB instance.
collection = set_up_mongo('mongodb://localhost:27017','webscraping_dataLabKiel','avg_rent') 

In [None]:
# Build one document per postcode, geocode it and store it in MongoDB.
geolocator = Nominatim(user_agent="sad_scraper")
new_results = []

for a in all:
    if "PLZ" not in a:
        continue

    plz_values = a["PLZ"]
    if isinstance(plz_values, str):
        # Tolerate a bare string so it is never iterated character by character.
        plz_values = [plz_values]
    has_several = len(plz_values) > 1

    for one_plz in plz_values:
        # Always work on a fresh dict: the scraped row in `all` stays
        # untouched (insert_one adds an "_id" to the dict it receives), so
        # this cell can be re-run without corrupting its own input.
        # As before, rows with several postcodes keep only the shared
        # location fields, rows with one postcode keep every scraped field.
        my_object = create_dict_obj(a) if has_several else dict(a)

        # geocode and respect the limit on calls (one request per second)
        time.sleep(1)
        coords = geocode_coords(one_plz,geolocator)

        my_object.update({"PLZ":one_plz,"coords":coords})
        collection.insert_one(my_object)
        # Keep track of what was stored so the count can be checked afterwards.
        new_results.append(my_object)

In [None]:
len(new_results) # how many objects the insert loop collected in new_results (stays 0 while its append is commented out)

---

## add rent price to objects

---

In [77]:
# Re-open the "avg_rent" collection so the rent-price section can start from this cell.
collection = set_up_mongo('mongodb://localhost:27017','webscraping_dataLabKiel','avg_rent') 

In [78]:
def extract_numbers(n):
    """Convert a numeric string to ``float``, accepting "," as decimal mark.

    Every comma in ``n`` is replaced by a dot before the conversion; strings
    that are not a valid float afterwards raise ``ValueError``.
    """
    return float(n.replace(",","."))

In [79]:
def check_if_collected(coll,plz):
    """Tell whether the first document with postcode ``plz`` is marked collected.

    Args:
        coll: collection-like object providing ``find_one(filter)``.
        plz: value matched against the documents' "PLZ" field.

    Returns:
        ``False`` when no document matches, when the document has no
        "collected" field, or when that field is the string "False" or the
        boolean ``False``; ``True`` otherwise.
    """
    obj = coll.find_one({"PLZ":plz})
    if obj is None:
        # find_one returns None when nothing matches: nothing collected yet.
        return False

    # A missing flag counts as "not collected" instead of raising KeyError.
    flag = obj.get("collected","False")
    return not (flag == "False" or flag is False)

In [80]:
def price_finder(text):
    """Average all decimal numbers found in ``text``.

    A number is a run of digits, one "," or "." and another run of digits.
    Each match is converted with ``extract_numbers``; the mean of all matches
    is returned rounded to two decimals. When ``text`` contains no such
    number the division by the match count raises ``ZeroDivisionError``.
    """
    matches = re.findall(r"(\d+[,.]\d+)", text)
    values = list(map(extract_numbers, matches))
    mean_value = sum(values)/len(values)  # average
    return np.round(mean_value,2)

In [None]:
#collection.update_many({},{"$set":{"collected":"False"}})

In [81]:
MIET_CHECK_URL = "https://www.miet-check.de/mietspiegel-uebersicht-alle.php"

# Cookie dialog and further overlays that may cover the city list. Each one is
# optional, so a failed click is ignored.
OVERLAY_XPATHS = (
    '/html/body/dialog/div[2]/div/div[2]/div[2]/div[2]/div[1]/button',
    "/html/body/div[1]/div/div[2]/div/div[3]",
    "/html/body/div[1]/div/div[2]/div/div[3]/button",
)


def _click_if_present(driver, xpath):
    """Click the element at ``xpath``; do nothing when it is missing or not clickable."""
    try:
        driver.find_element(By.XPATH, xpath).click()
    except Exception:
        pass  # best effort: the overlay may simply not be shown


def _fetch_city_soup(city):
    """Open the overview page, follow the link for ``city`` and return the parsed page.

    Returns ``None`` when no matching city link can be found or clicked. The
    browser is always shut down exactly once before returning.
    """
    driver = webdriver.Firefox()
    try:
        driver.get(MIET_CHECK_URL)
        driver.implicitly_wait(10)
        for xpath in OVERLAY_XPATHS:
            _click_if_present(driver, xpath)

        try:
            city_button = driver.find_element(
                By.XPATH,
                f"//a[@class='btn btn-white btn-sm' and contains(@href,'{city}')]")
            city_button.click()
            return BeautifulSoup(driver.page_source, 'html.parser')
        except Exception:
            return None
    finally:
        driver.quit()


def _extract_price(city_soup):
    """Return the average price from the first page section that yields one, else -1."""
    for criteria in ({"class_": "col-sm-12 text-center"}, {"id": "h2tab1"}):
        try:
            return price_finder(city_soup.find(**criteria).text)
        except Exception:
            # Section missing or without any number: try the next one.
            continue
    return -1


lost_places = []

for e in collection.find():

    identifier = e["_id"]
    p = e["PLZ"]
    city = e["Stadt"].lower()
    city = (city.replace("ü", "ue")
            .replace("ä", "ae")
            .replace("ö", "oe")
            .replace("ß", "ss"))

    # Read the flag from this very document. Looking it up by PLZ is wrong
    # because several documents (different cities) share one PLZ, so the flag
    # of another document could be returned. A missing flag counts as
    # "not collected".
    flag = e.get("collected", "False")
    if not (flag == "False" or flag is False):
        print(f"already got this: {p}")
        continue

    city_soup = _fetch_city_soup(city)
    price = _extract_price(city_soup) if city_soup is not None else -1

    if price != -1:
        collection.update_one({"_id":identifier},{"$set":{"average_rent":price,"collected":"True"}})
    else:
        # No city link or no price on the page: remember it for manual follow-up.
        lost_places.append({city:p})


already got this: 24943
already got this: 24975
already got this: 24866
already got this: 23746
already got this: 23743
already got this: 23746
already got this: 25541
already got this: 23743
already got this: 23738
already got this: 23774
already got this: 23775
already got this: 25725
already got this: 25725
already got this: 25557
already got this: 24402
already got this: 24395
already got this: 24402
already got this: 24409
already got this: 24407
already got this: 24401
already got this: 24405
already got this: 24891
already got this: 25721
already got this: 23821
already got this: 23719
already got this: 23623
already got this: 25591
already got this: 25786
already got this: 25729
already got this: 25599
already got this: 25590
already got this: 25587
already got this: 25585
already got this: 24797
already got this: 25585
already got this: 25576
already got this: 25554
already got this: 25575
already got this: 24819
already got this: 25573
already got this: 25554
already got this

In [84]:
# Places for which no rent price was stored, as {city: PLZ} entries.
lost_places

[{'dahme/holstein': '23747'},
 {'schoenwalde am bungsberg': '23744'},
 {'grube/holstein': '23749'},
 {'heringsdorf kr. ostholstein': '23777'},
 {'neukirchen b. oldenburg': '23777'},
 {'neukirchen b. oldenburg': '23779'},
 {'gremersdorf/holstein': '23779'},
 {'gremersdorf/holstein': '23758'},
 {'fehmarn (stadt)': '23769'},
 {'kappeln/schlei': '24404'},
 {'kappeln/schlei': '24376'},
 {'huetten b. ascheffel': '24367'},
 {'huetten b. ascheffel': '24357'},
 {'huetten b. ascheffel': '24358'},
 {'osterby b. eckernfoerde': '24367'},
 {'seedorf b. bad segeberg': '23823'},
 {'seedorf b. bad segeberg': '24326'},
 {'klein offenseth-sparrieshoop': '25365'},
 {'klein bennebek': '24863'},
 {'klein bennebek': '24848'},
 {'helgoland': '27498'},
 {'hooge': '25859'},
 {'langeness': '25869'},
 {'langeness': '25863'},
 {'lehe/dithmarschen': '25774'},
 {'schwentinental': '24222'},
 {'schwentinental': '24147'},
 {'schwentinental': '24223'},
 {'horstedt b. husum': '25860'},
 {'gross schenkenberg': '23860'},
 

---

# Update field naming in MongoDB

In [None]:
def set_up_mongo(client_str,database_str,collection_str):
    """Return a handle to a MongoDB collection, creating it if it is missing.

    Same helper as defined near the top of the notebook.

    Args:
        client_str: connection string passed to ``MongoClient``.
        database_str: name of the database to open.
        collection_str: name of the collection to open (and create if absent).

    Returns:
        The collection object for ``collection_str`` inside ``database_str``.
    """
    database = MongoClient(client_str)[database_str]

    # Create the collection explicitly when the database does not list it yet.
    if collection_str not in database.list_collection_names():
        database.create_collection(collection_str)

    return database[collection_str]


collection = set_up_mongo('mongodb://localhost:27017','webscraping_dataLabKiel','avg_rent') 
# The v2 collection is only opened (and created if missing) here; nothing is
# written to it by this cell.
collection_new = set_up_mongo('mongodb://localhost:27017','webscraping_dataLabKiel','avg_rent_v2') 


# Copy the nested "coords" values into top-level "lat" / "lon" fields.
for x in collection.find():
    doc_id = x["_id"]  # not named `id`, which would shadow the builtin
    # Documents without a "coords" sub-document get lat/lon = None instead of
    # aborting the whole loop with a KeyError.
    coords = x.get("coords") or {}
    lat = coords.get("lat")
    lon = coords.get("lon")
    collection.update_one({"_id":doc_id},{"$set":{"lat":lat,"lon":lon}})