In [1]:
import re
import json
import time
import urllib.parse

import requests
import numpy as np
import pandas as pd


from bs4 import BeautifulSoup

from selenium import webdriver ## Driver for Firefox, Chrome, Edge, etc.
from selenium.webdriver.common.by import By # Mode of locating html elements: ID, CSS_SELECTOR, XPATH, ...
from selenium.webdriver.support.select import Select

from pymongo import MongoClient

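<!-- The first three functions (counties, cities, branch lists) each work like a small batch collection with only a few requests, so presumably no delay between requests is necessary. -->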

In [2]:
# ----- FIND COUNTIES IN SH -----
def county_search():
    """Return the names of all counties listed for Schleswig-Holstein.

    Downloads the Gelbe Seiten county overview page and reads the text of
    every element carrying the ``filterlist__item`` class.

    Returns:
        list[str]: one lower-cased, whitespace-stripped name per list item
        (a plain list, despite the original "stack" naming).
    """
    overview_url = "https://www.gelbeseiten.de/branchenbuch/staedte/schleswig-holstein/landkreise"
    page = BeautifulSoup(requests.get(overview_url).text, 'html.parser')

    # every county entry is normalised the same way: lower-case first, then strip
    return [item.text.lower().strip() for item in page.find_all(class_="filterlist__item")]


# ----- FIND CITIES IN PER COUNTY -----
def city_search(county_in_question):
    """List the place names that Gelbe Seiten shows for one county.

    Args:
        county_in_question: county name, inserted verbatim as the last path
            segment of the Schleswig-Holstein county URL.

    Returns:
        list[str]: lower-cased text of every ``boxteaser__title`` element on
        that page (surrounding whitespace is not removed).
    """
    county_url = f"https://www.gelbeseiten.de/branchenbuch/staedte/schleswig-holstein/{county_in_question}"
    response = requests.get(county_url)
    parsed = BeautifulSoup(response.text, 'html.parser')

    # one entry per place title found for this county
    return [title.text.lower() for title in parsed.find_all(class_="boxteaser__title")]


# ----- FIND BRANCHES PER LOCATION ALPHABETICALLY -----
def total_branch_collector(county,city):
    """Collect the URLs of all branch (business category) pages for one city.

    The city page offers one button per letter of the alphabet; each letter
    page in turn lists the branches starting with that letter.

    Args:
        county: county path segment, e.g. "kreisfrei".
        city: city path segment, e.g. "kiel" (percent-encoded before the request).

    Returns:
        list[str]: absolute URLs, built by prefixing "https://www.gelbeseiten.de"
        to the href of every ``link`` element found on the letter pages.
    """
    base_url = "https://www.gelbeseiten.de"

    #first go to county/city
    url = f"{base_url}/branchenbuch/staedte/schleswig-holstein/{county}/{city}"
    url = urllib.parse.quote(url, safe=':/')
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')

    #find links to each letter of the alphabet that has businesses
    letter_urls = []
    for button in soup.find_all(class_="alphabetfilter__btn"):
        href = button.get("href")
        # keep only site-relative links; buttons without href, with an empty
        # href, or pointing to in-page anchors such as "#a" are skipped
        if href and href.startswith('/'):
            letter_urls.append(f"{base_url}{href}")
            #e.g. https://www.gelbeseiten.de/branchenbuch/staedte/schleswig-holstein/dithmarschen/albersdorf%20holstein/branchen/b

    #now go through all the letter pages to get the actual links to the branches
    branchen_refs = []
    for letter_url in letter_urls:
        letter_soup = BeautifulSoup(requests.get(letter_url).text, 'html.parser')
        for link in letter_soup.find_all(class_="link"):
            href = link.get("href")
            if href:  # an element without href would otherwise produce ".../gelbeseiten.deNone"
                branchen_refs.append(f'{base_url}{href}')

    #some places are so small that they have no branch list; tooSmolNoBranch covers those
    return branchen_refs


# ----- COLLECT INFO IN SH -----
def tooSmolNoBranch(county,city):
    """Collect business links for a place that has no per-branch listing.

    Reads the place's ".../unternehmen" page with a plain HTTP request (the
    original author noted that no cookie dialog gets in the way here).

    Args:
        county: county path segment.
        city: city path segment (percent-encoded before the request).

    Returns:
        list[str]: hrefs of all ``<a class="link">`` elements that start with
        "https://www.gelbeseiten.de/gsbiz".
    """
    url = f"https://www.gelbeseiten.de/branchenbuch/staedte/schleswig-holstein/{county}/{city}/unternehmen"
    url = urllib.parse.quote(url, safe=':/')
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')

    all_links = []
    for anchor in soup.find_all('a',class_='link'):
        link_yp = anchor.get("href") #yellow-pages link for one business, None if the anchor has no href
        #only keep relevant links; anchors without href are skipped instead of raising
        if link_yp and link_yp.startswith('https://www.gelbeseiten.de/gsbiz'):
            all_links.append(link_yp)

    return all_links

# ----- COLLECT ACTUAL YP BUSINESS INFO -----
def collect_buisness_info_nittyGritty(my_buisnesses, delay=1, timeout=30):
    """Fetch every business page and return one record per URL.

    For each URL the page is downloaded and its ``application/ld+json``
    elements are looked up; the last one is parsed as JSON and used as the
    record. Failures never abort the run: they are turned into records too.

    Args:
        my_buisnesses: iterable of business page URLs.
        delay: seconds to pause before each request (default 1, as before).
        timeout: seconds passed to ``requests.get`` so that a stalled
            connection ends up as an error record instead of blocking the
            whole scrape.

    Returns:
        list: one entry per input URL, in order. Either the parsed JSON,
        ``{"business", "message"}`` when the page has no JSON-LD element, or
        ``{"website", "error_type", "error_message"}`` when anything raised.
    """
    records = []
    for business_url in my_buisnesses:
        if delay:
            time.sleep(delay)
        #broad try on purpose: timeouts and other network errors are recorded, not raised
        try:
            html = requests.get(business_url, timeout=timeout).text
            soup = BeautifulSoup(html, 'html.parser')
            json_blocks = soup.find_all(type="application/ld+json")
            if json_blocks:
                #the business data seems to always be in the last element (original author's observation)
                data = json.loads(json_blocks[-1].text)
            else:
                data = {"business":business_url, "message":"failure - this buisness seems to not be available in json format"}
        except Exception as err:
            data = {"website":business_url, "error_type":type(err).__name__, "error_message":str(err)}
        records.append(data)
    return records


# ----- EXPAND RESULT COUNT TO 50+ -----
def expand_resultcount(url):
    """Open a result page in Firefox, load all hits and return their links.

    The page initially shows 50 hits; every click on the "load more" button
    adds 10. The total is read from the ``loadMoreGesamtzahl`` element and the
    button is clicked as often as needed, rounding UP so that a final partial
    batch is loaded as well.

    Args:
        url: result page, e.g. 'https://www.gelbeseiten.de/branchen/friseur/lübeck'.

    Returns:
        list[str]: href of the first anchor inside every element whose id
        matches ``treffer_<digits>``; hits without such an anchor are skipped.

    The browser is always closed, also when an error occurs on the way.
    """
    driver = webdriver.Firefox()
    try:
        driver.get(url)
        driver.implicitly_wait(10)

        #manage cookie button (best effort: the banner is not always there)
        try:
            driver.find_element(By.XPATH, "/html/body/div[1]/div[1]/div[2]/span[1]/a").click()
        except Exception:
            pass

        #find out how many results there are in total
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        total_tags = soup.find_all(id="loadMoreGesamtzahl")
        if total_tags:
            total_hits = int(total_tags[0].text)
            #first 50 results are shown, 10 more per click; integer ceiling, never negative
            clicks_needed = max(0, (total_hits - 50 + 9) // 10)
        else:
            clicks_needed = 0 #not more than 50 results available, nothing to click

        clicks_done = 0
        for _ in range(clicks_needed):
            #find_elements returns [] instead of raising when the button is gone
            buttons = driver.find_elements(By.CSS_SELECTOR, "#mod-LoadMore--button")
            if not buttons:
                break
            time.sleep(2)
            #click via JavaScript, since image ads or headers sometimes obscure the button
            driver.execute_script("arguments[0].click();", buttons[0])
            clicks_done += 1
        if clicks_done:
            time.sleep(2) #give the last batch time to arrive before reading the page

        #each business is a 'treffer' (=hit) followed by _ and some digits
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        my_buisnesses = []
        for hit in soup.find_all(id=re.compile(r'^treffer_\d+')):
            anchor = hit.find("a")
            if anchor is not None and anchor.get("href"):
                my_buisnesses.append(anchor["href"])

        return my_buisnesses
    finally:
        driver.quit()


# ----- COMBINE ALL RESULTS PER BRANCHE + COLLECT EACHs INFOS -----
def collect_buisness_infos_2(branche):
    """Scrape all businesses listed on one branch page.

    Args:
        branche: branch URL, e.g. "https://www.gelbeseiten.de/branchen/apotheke/kiel".

    Returns:
        Whatever ``collect_buisness_info_nittyGritty`` returns for the links
        that ``expand_resultcount`` found on that page.
    """
    # step 1: expand the result list and gather every business link
    # step 2: visit each link and collect its details
    return collect_buisness_info_nittyGritty(expand_resultcount(branche))


# ----- SET UP MONGO -----
def set_up_mongo(client_str,database_str,collection_str):
    """Connect to MongoDB and return the requested collection.

    Args:
        client_str: connection string handed to ``MongoClient``.
        database_str: name of the database to open.
        collection_str: name of the collection; created first when the
            database does not list it yet.

    Returns:
        The collection object ``db[collection_str]``.
    """
    database = MongoClient(client_str)[database_str]

    collection_missing = collection_str not in database.list_collection_names()
    if collection_missing:
        database.create_collection(collection_str)

    return database[collection_str]
    

In [29]:
# Collect the branch-page URLs for county segment "kreisfrei", city "kiel".
my_branch_list = total_branch_collector("kreisfrei","kiel")

In [4]:
# Target collection for the scraped records.
#collection = set_up_mongo('mongodb://localhost:27017/','webscraping_dataLabKiel','yellow_pages')
collection = set_up_mongo('mongodb://localhost:27017','sh_data_collection','yp_kiel') #mongodb://mongodb:27017

# NOTE(review): `my_branch_list_smol` is not defined in any cell visible here -
# presumably a slice of `my_branch_list`; define it explicitly before this loop
# so the notebook survives Restart & Run All.
for shu in my_branch_list_smol:
    print(shu)
    sh_unternehmen = collect_buisness_infos_2(shu)
    # insert_many rejects an empty list, so skip branches without any business
    if sh_unternehmen:
        collection.insert_many(sh_unternehmen)

https://www.gelbeseiten.de/branchen/transportunternehmen/kiel
https://www.gelbeseiten.de/branchen/trockenbau/kiel
https://www.gelbeseiten.de/branchen/t%c3%a4schner/kiel
https://www.gelbeseiten.de/branchen/t%c3%bcren/kiel
https://www.gelbeseiten.de/branchen/umzugsfirma/kiel
https://www.gelbeseiten.de/branchen/umzugsunternehmen/kiel
https://www.gelbeseiten.de/branchen/umz%c3%bcge/kiel
https://www.gelbeseiten.de/branchen/unternehmensberatung/kiel
https://www.gelbeseiten.de/branchen/urologe/kiel
https://www.gelbeseiten.de/branchen/venerologie/kiel
https://www.gelbeseiten.de/branchen/veranstaltungstechnik/kiel
https://www.gelbeseiten.de/branchen/vereine/kiel
https://www.gelbeseiten.de/branchen/verlag/kiel
https://www.gelbeseiten.de/branchen/vermessungsb%c3%bcro/kiel
https://www.gelbeseiten.de/branchen/verm%c3%b6gensberatung/kiel
https://www.gelbeseiten.de/branchen/verpackungsmaterial/kiel
https://www.gelbeseiten.de/branchen/verputzer/kiel
https://www.gelbeseiten.de/branchen/versicherungen/k

--- 

# update location naming


In [3]:
def set_up_mongo(client_str,database_str,collection_str):
    """Open a MongoDB collection, creating it when the database lacks it.

    NOTE(review): this repeats the `set_up_mongo` definition from earlier in
    the notebook; consider keeping only one of them.

    Args:
        client_str: connection string handed to ``MongoClient``.
        database_str: name of the database to open.
        collection_str: name of the collection to return.

    Returns:
        The collection object ``db[collection_str]``.
    """
    database = MongoClient(client_str)[database_str]

    already_there = collection_str in database.list_collection_names()
    if not already_there:
        database.create_collection(collection_str)

    return database[collection_str]


collection_yp_clean = set_up_mongo('mongodb://localhost:27017/','sh_data_collection','yp_kiel')

# Give every document numeric "lat"/"lon" fields derived from its
# "latitude"/"longitude" values.
for doc in collection_yp_clean.find():
    try:
        lat = float(doc["latitude"])
        lon = float(doc["longitude"])
    except (KeyError, TypeError, ValueError):
        # missing or non-numeric coordinates: store both as None
        lat = None
        lon = None
    collection_yp_clean.update_one({"_id":doc["_id"]},{"$set":{"lat":lat,"lon":lon}})

some issues:



- **Some business addresses are not actually in the locality where I found them** <br>
What is really odd: the listing for Bargenstedt (one of the places in Dithmarschen), https://www.gelbeseiten.de/branchenbuch/staedte/schleswig-holstein/dithmarschen/bargenstedt/unternehmen, shows for example Fritz Käppner GmbH, whose address is, surprisingly, in Nürnberg (Nuremberg).
- **Number of businesses** <br> Apparently there are 123,000+ businesses in Schleswig-Holstein (SH), so maybe use some hash function so that I don't have to query them all: https://www.schleswig-holstein.de/DE/landesregierung/themen/wirtschaft/mittelstand-handwerk