In [None]:
import re
import json
import time
import urllib.parse
from urllib.parse import unquote

import requests
import numpy as np
import pandas as pd


from bs4 import BeautifulSoup

from selenium import webdriver ## Driver for Firefox, Chrome, Edge, etc.
from selenium.webdriver.common.by import By # Mode of locating html elements: ID, CSS_SELECTOR, XPATH, ...
from selenium.webdriver.support.select import Select

from pymongo import MongoClient

# Helper functions for navigating the website and collecting intermediate data (such as links)

In [None]:
# ----- FIND COUNTIES IN SH -----
def county_search():
    """Scrape the county (Landkreis) names of Schleswig-Holstein.

    Downloads the Gelbe Seiten county overview page and reads every element
    carrying the CSS class ``filterlist__item``.

    Returns:
        list[str]: Text of each matching element, lower-cased and stripped of
        surrounding whitespace, in page order.
    """
    overview_url = "https://www.gelbeseiten.de/branchenbuch/staedte/schleswig-holstein/landkreise"
    html = requests.get(overview_url).text
    page = BeautifulSoup(html, 'html.parser')

    # one list entry per county item found on the page
    return [item.text.lower().strip() for item in page.find_all(class_="filterlist__item")]


# ----- FIND CITIES IN PER COUNTY -----
def city_search(county_in_question):
    """Scrape the city names listed for one county.

    Args:
        county_in_question: County name as it appears in the Gelbe Seiten URL path.

    Returns:
        list[str]: Lower-cased text of every element with the CSS class
        ``boxteaser__title`` on the county page, in page order.
    """
    county_url = f"https://www.gelbeseiten.de/branchenbuch/staedte/schleswig-holstein/{county_in_question}"
    html = requests.get(county_url).text
    page = BeautifulSoup(html, 'html.parser')

    # every teaser title is one city of this county
    return [teaser.text.lower() for teaser in page.find_all(class_="boxteaser__title")]


# ----- FIND BRANCHES PER LOCATION ALPHABETICALLY -----
def total_branch_collector(county,city):
    """Collect the links to all business branches (Branchen) listed for one city.

    The city page offers an alphabet filter; each letter that has businesses
    links to a sub-page listing the branches starting with that letter. This
    function visits every such letter page and gathers the branch links.

    Args:
        county: County name as used in the Gelbe Seiten URL path.
        city: City name as used in the Gelbe Seiten URL path (percent-encoded here).

    Returns:
        list[str]: Absolute URLs of all branch pages found. The list is empty
        when the city page has no usable alphabet filter links.
    """
    base_url = "https://www.gelbeseiten.de"

    # first go to county/city
    url = f"{base_url}/branchenbuch/staedte/schleswig-holstein/{county}/{city}"
    url = urllib.parse.quote(url, safe=':/')
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')

    # links behind the alphabet filter buttons, e.g.
    # https://www.gelbeseiten.de/branchenbuch/staedte/schleswig-holstein/dithmarschen/albersdorf%20holstein/branchen/b
    letter_urls = []
    for button in soup.find_all(class_="alphabetfilter__btn"):
        href = button.get("href")
        # skip buttons without href, empty hrefs and in-page anchors such as "#a"
        if href and href.startswith('/'):
            letter_urls.append(f"{base_url}{href}")

    # now go through every letter page and collect the links of its branches
    branchen_refs = []
    for letter_url in letter_urls:
        html = requests.get(letter_url).text
        soup = BeautifulSoup(html,'html.parser')
        for branch_link in soup.find_all(class_="link"):
            href = branch_link.get("href")
            if not href:
                # an element without href would otherwise yield a bogus ".../None" URL
                continue
            # urljoin keeps absolute hrefs intact and resolves relative ones against the site root
            branchen_refs.append(urllib.parse.urljoin(f"{base_url}/", href))

    # very small places have no branch split at all; callers handle the empty list separately
    return branchen_refs


# ----- COLLECT INFO IN SH -----
def tooSmolNoBranch(county,city):
    """Collect business links for a place that has no per-branch listing.

    Reads the ``/unternehmen`` page of the given city directly and keeps only
    the links that point to a Gelbe Seiten business page.

    Args:
        county: County name as used in the Gelbe Seiten URL path.
        city: City name as used in the Gelbe Seiten URL path (percent-encoded here).

    Returns:
        list[str]: All hrefs starting with ``https://www.gelbeseiten.de/gsbiz``.
    """
    url = f"https://www.gelbeseiten.de/branchenbuch/staedte/schleswig-holstein/{county}/{city}/unternehmen"
    url = urllib.parse.quote(url, safe=':/')
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')

    all_links = []
    for anchor in soup.find_all('a',class_='link'):
        link_yp = anchor.get("href")  # yellow-pages link of one business, None if the anchor has no href
        # guard against None before calling .startswith, then keep only business pages
        if link_yp and link_yp.startswith('https://www.gelbeseiten.de/gsbiz'):
            all_links.append(link_yp)

    return all_links

# ----- COLLECT ACTUAL YP BUSINESS INFO -----
def collect_buisness_info_nittyGritty(my_buisnesses,my_list):
    """Fetch the JSON-LD record of every business page and append it to ``my_list``.

    For each URL exactly one dict-like entry is appended: the parsed content of
    the last ``application/ld+json`` element of the page, a failure notice when
    the page has no such element, or an error description when anything raised.

    Args:
        my_buisnesses: Iterable of business page URLs.
        my_list: List that receives one entry per URL (mutated in place).
    """
    for business_url in my_buisnesses:
        time.sleep(1)  # wait a second before every request
        try:
            html = requests.get(business_url).text
            page = BeautifulSoup(html, 'html.parser')
            ld_json_tags = page.find_all(type="application/ld+json")
            if ld_json_tags:
                # the business record has so far always been the last JSON-LD element
                record = json.loads(ld_json_tags[-1].text)
            else:
                record = {"business":business_url, "message":"failure - this buisness seems to not be available in json format"}
        except Exception as err:
            # keep going on any failure, but remember what went wrong for this URL
            record = {"website":business_url, "error_type":type(err).__name__, "error_message":str(err)}

        my_list.append(record)

### ------ BUSINESSES FROM MY INDEX DB ------
def collect_buisness_info_basic(one_link):
    """Fetch the JSON-LD record of a single business page.

    Args:
        one_link: URL of the business page.

    Returns:
        The parsed content of the last ``application/ld+json`` element of the
        page; a failure notice dict when the page has no such element; or an
        error description dict when anything raised along the way.
    """
    try:
        html = requests.get(one_link).text
        page = BeautifulSoup(html, 'html.parser')
        ld_json_tags = page.find_all(type="application/ld+json")
        if not ld_json_tags:
            return {"business":one_link, "message":"failure - this buisness seems to not be available in json format"}
        # the business record has so far always been the last JSON-LD element
        return json.loads(ld_json_tags[-1].text)
    except Exception as err:
        # report the failure as data instead of raising
        return {"website":one_link, "error_type":type(err).__name__, "error_message":str(err)}


# ----- EXPAND RESULT COUNT TO 50+ -----
def expand_resultcount(url):
    """Open a result page in Firefox, load all hits and return their links.

    The page initially shows 50 hits and each click on the "load more" button
    adds 10 more (figures taken from the original implementation). The total
    hit count is read from the element with id ``loadMoreGesamtzahl``; when it
    is absent no clicking is needed.

    Args:
        url: Address of a branch result page,
            e.g. 'https://www.gelbeseiten.de/branchen/friseur/lübeck'.

    Returns:
        list[str]: The ``href`` of the first ``<a>`` inside every element whose
        id starts with ``treffer_<digits>``. Hits without such a link are skipped.
    """
    from selenium.common.exceptions import NoSuchElementException

    driver = webdriver.Firefox()
    try:
        driver.get(url)
        driver.implicitly_wait(3)

        # dismiss the cookie banner if there is one; its absence is not an error
        try:
            driver.find_element(By.XPATH, "/html/body/div[1]/div[1]/div[2]/span[1]/a").click()
        except Exception:
            pass

        # find out how many additional clicks are needed to reveal every hit
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        total_tag = soup.find(id="loadMoreGesamtzahl")
        clicks_needed = 0
        if total_tag is not None:
            total_hits = int(total_tag.text)
            # round UP: a partial last batch (e.g. 54 hits) still needs one more click
            clicks_needed = max(0, int(np.ceil((total_hits - 50) / 10)))

        for _ in range(clicks_needed):
            try:
                load_more_button = driver.find_element(By.CSS_SELECTOR, "#mod-LoadMore--button")
            except NoSuchElementException:
                break  # button gone: nothing more to load
            # click via JavaScript because image ads or headers sometimes cover the button
            driver.execute_script("arguments[0].click();", load_more_button)
            # wait AFTER the click so the freshly requested batch (including the last one) can render
            time.sleep(1)

        # every business is a 'treffer' (= hit) element: id 'treffer_' followed by digits
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        my_buisnesses = []
        for hit in soup.find_all(id=re.compile(r'^treffer_\d+')):
            anchor = hit.find("a")
            if anchor is not None and anchor.get("href"):
                my_buisnesses.append(anchor["href"])

        return my_buisnesses
    finally:
        # always close the browser, also when scraping failed half-way
        driver.quit()


# ----- COMBINE ALL RESULTS PER BRANCHE + COLLECT EACHs INFOS -----
def collect_buisness_infos_2(branchen):
    """Collect the detail records of every business of the given branch pages.

    Each branch URL (something like "https://www.gelbeseiten.de/branchen/apotheke/kiel")
    is expanded with ``expand_resultcount`` into the list of its business links;
    those links are then handed to ``collect_buisness_info_nittyGritty``, which
    appends one record per business to the shared result list.

    Args:
        branchen: Iterable of branch result page URLs.

    Returns:
        list: All collected records, in visiting order.
    """
    collected = []
    for branch_url in branchen:
        business_links = expand_resultcount(branch_url)
        # failures of single businesses are recorded as entries by the collector itself
        collect_buisness_info_nittyGritty(business_links, collected)
    return collected


# ----- X X X -----
def collect_business_links(branchen):
    """Gather the business links of all given branch pages into one flat list.

    Each branch URL (something like "https://www.gelbeseiten.de/branchen/apotheke/kiel")
    is passed to ``expand_resultcount``, which returns the hrefs of all its businesses.

    Args:
        branchen: Iterable of branch result page URLs.

    Returns:
        list: Business links of all branches, concatenated in visiting order.
    """
    return [link for branch_url in branchen for link in expand_resultcount(branch_url)]


# ---- MAKE YP LINK TO BUSINESSES INTO JSON OBJECT TO DUMP INTO DB -----
def links_to_index_obj(one_link):
    """Wrap a business link into the document format used by the index collection.

    Args:
        one_link: Yellow-pages link of one business.

    Returns:
        dict: ``{"@id": one_link, "data_collected": "false"}``; the flag is the
        string "false", not a boolean.
    """
    return {"@id": one_link, "data_collected": "false"}


# --- CHECK IF THIS BUSINESS HAS ALREADY BEEN NOTED ---
def check_indices(collection,link):
    """Tell whether a business link is already stored in the index collection.

    Uses ``find_one`` so the lookup stops at the first match instead of
    loading every matching document into a list.

    Args:
        collection: Collection object holding the index documents.
        link: Business link to look up; compared against the ``@id`` field
            in its string form.

    Returns:
        bool: True if at least one document with that ``@id`` exists, else False.
    """
    return collection.find_one({'@id': f'{link}'}, {'_id': 0}) is not None


# ----- SET UP MONGO -----
def set_up_mongo(client_str,database_str,collection_str):
    """Connect to MongoDB and return the requested collection, creating it if missing.

    Args:
        client_str: MongoDB connection string.
        database_str: Name of the database.
        collection_str: Name of the collection.

    Returns:
        The collection object ``client[database_str][collection_str]``.
    """
    database = MongoClient(client_str)[database_str]

    # create the collection explicitly when the database does not list it yet
    if collection_str not in database.list_collection_names():
        database.create_collection(collection_str)

    return database[collection_str]
    

# ACTUAL SET UP

### First collect all links to all businesses in SH

- dithmarschen - done

In [None]:
# fetch the county names from the overview page and display them
lk_stack = county_search()
lk_stack

In [None]:
# fetch the city names listed for the county 'pinneberg' and display them
city = city_search('pinneberg')
city

In [None]:
# county that the next scraping cell works on
lk = "pinneberg"

# hand-picked cities of that county to scrape next
# NOTE(review): presumably the part of the county's city list that is still missing — confirm
cities_next = ['schenefeld bezirk hamburg',
 'seester',
 'seestermühe',
 'seeth-ekholt',
 'tangstedt kreis pinneberg',
 'tornesch',
 'uetersen',
 'wedel',
 'westerhorn']

In [None]:
collection_indices = set_up_mongo('mongodb://localhost:27017/','webscraping_dataLabKiel','yellow_pages_index')

# strip round brackets from the city names; they look like the only characters causing trouble
pattern = r"[)(]"
city_chic = [re.sub(pattern, '', e) for e in cities_next]
for c in city_chic:
    sh_unternehmen_links = [] # collector for the business links of this city
    my_branch_list = total_branch_collector(lk,c) # links of all branches available in this city (alphabetically)
    if my_branch_list != []:
        sh_unternehmen_links = collect_business_links(my_branch_list)
        print(f"links for 1 city collected! city/landkreis: {c}/{lk}")
    else:
        # place is so small that there is no split into branches: read its business list directly
        sh_unternehmen_links = tooSmolNoBranch(lk,c)
        print(f"links for 1 SMALL city collected! city/landkreis: {c}/{lk}")

    # insert every link that is not in the index db yet; each link is checked individually
    # on purpose, because avoiding duplicates is the whole point of the index
    for e in sh_unternehmen_links:
        if not check_indices(collection_indices,e):
            d = links_to_index_obj(e)
            collection_indices.insert_one(d)
    print(f"links also inserted for city/landkreis: {c}/{lk}\n")

---

run this over night!!

In [None]:
# Disabled cell: the whole overnight run over the remaining counties is kept as a string
# literal, so nothing below is executed; remove the surrounding triple quotes to run it.
"""collection_indices = set_up_mongo('mongodb://localhost:27017/','webscraping_dataLabKiel','yellow_pages_index')

for lk in ['pinneberg','plön','rendsburg-eckernförde','schleswig-flensburg','segeberg','steinburg','stormarn']:
    cities_next = city_search(lk)
    pattern = r"[)(]" #take care of unclean stuff, look like only brakcets are an issue
    city_chic = [re.sub(pattern, '', e) for e in cities_next]
    for c in city_chic:
        sh_unternehmen_links = [] #set up collector
        my_branch_list = total_branch_collector(lk,c) #get list of all links of all the branchen which are available in 1 city alphabetically
        if my_branch_list == []:
            sh_unternehmen_links = tooSmolNoBranch(lk,c) #get list of all href of buisness for that place (if so small theres no branchen split up)
            print(f"links for 1 SMALL city collected! city/landkreis: {c}/{lk}")
        else:
            sh_unternehmen_links = collect_business_links(my_branch_list)
            print(f"links for 1 city collected! city/landkreis: {c}/{lk}")

        #and then for each city, go through list of all business links, check if they are in index db, and only if not put them in there
        for e in sh_unternehmen_links:
            #only if we assume that in one city businesses arent listed twice, then we can reduce the querying, but since the point literally is not to get duplicates, lets not
            if check_indices(collection_indices,e) == False:
                d = links_to_index_obj(e)
                collection_indices.insert_one(d)
        print(f"links also inserted for city/landkreis: {c}/{lk}\n")"""

---

In [None]:
collection_indices = set_up_mongo('mongodb://localhost:27017/','webscraping_dataLabKiel','yellow_pages_index')

# same procedure for the independent cities (kreisfreie Städte), addressed via the path segment "kreisfrei"
for kfs in ["flensburg","neumünster","lübeck","kiel"]:
    my_branch_list = total_branch_collector("kreisfrei",kfs)
    sh_unternehmen_links = collect_business_links(my_branch_list)
    for e in sh_unternehmen_links:
        # only links that are not indexed yet get inserted
        if not check_indices(collection_indices,e):
            d = links_to_index_obj(e)
            collection_indices.insert_one(d)


### Then go through the new index DB and insert the data into a second collection (YP_DATA_CLEAN), once per indexed business

In [None]:
# index collection: one document per business link, with a "data_collected" flag
collection_indices = set_up_mongo('mongodb://localhost:27017/','webscraping_dataLabKiel','yellow_pages_index')
# target collection that receives the scraped business records
collection_yp_clean = set_up_mongo('mongodb://localhost:27017/','webscraping_dataLabKiel','yellow_pages_clean_v2')

In [None]:
#collect all buisnesses where the flag attr hasnt been set to true
# collect all businesses whose flag has not been set to "true" yet
available_buiz_not_collected = [x for x in collection_indices.find({"data_collected":"false"},{'_id':0})]
for a in available_buiz_not_collected:
    # an index document looks like {"@id": <link>, "data_collected": ...}; the scraper needs the link itself
    link = a["@id"]
    data = collect_buisness_info_basic(link)
    collection_yp_clean.insert_one(data)

    # set the flag of exactly this link to "true" in the index collection
    query = {"@id": link}
    update = {"$set": {"data_collected": "true"}}
    collection_indices.update_one(query, update)

In [None]:
# NOTE(review): this re-defines the set_up_mongo helper that is already defined further up
# in the notebook; presumably kept so this cell can be run on its own — confirm.
def set_up_mongo(client_str,database_str,collection_str):
    """Connect to MongoDB and return the requested collection, creating it if missing.

    Args:
        client_str: MongoDB connection string.
        database_str: Name of the database.
        collection_str: Name of the collection.

    Returns:
        The collection object ``client[database_str][collection_str]``.
    """
    database = MongoClient(client_str)[database_str]

    # create the collection explicitly when the database does not list it yet
    if collection_str not in database.list_collection_names():
        database.create_collection(collection_str)

    return database[collection_str]


collection_yp_clean = set_up_mongo('mongodb://localhost:27017/','webscraping_dataLabKiel','yellow_pages_clean_v2')

# add numeric "lat"/"lon" fields to every record; both become None unless both values convert
for x in collection_yp_clean.find():
    try:
        lat = float(x["latitude"])
        lon = float(x["longitude"])
    except (KeyError, TypeError, ValueError):
        # field missing, not a number-like value, or not parseable as float
        lat = None
        lon = None
    collection_yp_clean.update_one({"_id":x["_id"]},{"$set":{"lat":lat,"lon":lon}})

# SMALL TRIAL SET UP

In [None]:
collection_indices = set_up_mongo('mongodb://localhost:27017/','webscraping_dataLabKiel','yellow_pages_index')

# small trial: one county and a handful of its cities
lk = "dithmarschen"
city_stack = ['arkebek','averlak','bargenstedt','barkenholm','barlt']


# strip round brackets from the city names; they look like the only characters causing trouble
pattern = r"[)(]"
city_chic = [re.sub(pattern, '', e) for e in city_stack]

for c in city_chic:
    sh_unternehmen_links = [] # collector for the business links of this city
    my_branch_list = total_branch_collector(lk,c) # links of all branches available in this city (alphabetically)
    if my_branch_list != []:
        sh_unternehmen_links = collect_business_links(my_branch_list)
        print(f"links for 1 city collected! city/landkreis: {c}/{lk}")
    else:
        # place is so small that there is no split into branches: read its business list directly
        sh_unternehmen_links = tooSmolNoBranch(lk,c)
        print(f"links for 1 SMALL city collected! city/landkreis: {c}/{lk}")

    # insert every link that is not in the index db yet
    for e in sh_unternehmen_links:
        if not check_indices(collection_indices,e):
            d = links_to_index_obj(e)
            collection_indices.insert_one(d)
    print(f"links also inserted for city/landkreis: {c}/{lk}\n")

In [None]:
# index collection: one document per business link, with a "data_collected" flag
collection_indices = set_up_mongo('mongodb://localhost:27017/','webscraping_dataLabKiel','yellow_pages_index')
# target collection of the trial run that receives the scraped business records
collection_yp_clean = set_up_mongo('mongodb://localhost:27017/','webscraping_dataLabKiel','yp_data_clean')

In [None]:
#collect all buisnesses where the flag attr hasnt been set to true
# collect all businesses whose flag has not been set to "true" yet
available_buiz_not_collected = [x for x in collection_indices.find({"data_collected":"false"},{'_id':0})]
for a in available_buiz_not_collected:
    # collect data for one link
    link = a["@id"]
    data = collect_buisness_info_basic(link)
    collection_yp_clean.insert_one(data)

    # set the flag of exactly this link to "true"; the query must use `link`, not a
    # variable left over from another cell
    query = {"@id": link}
    update = {"$set": {"data_collected": "true"}}
    collection_indices.update_one(query, update)

some issues:



- **Some business addresses are not actually in the locality where I found them** <br>
What is really strange: on the page of one of the cities in Dithmarschen, https://www.gelbeseiten.de/branchenbuch/staedte/schleswig-holstein/dithmarschen/bargenstedt/unternehmen, one of the listed businesses is, for example, Fritz Käppner GmbH, and its address is, surprisingly, in Nuremberg.
- **Number of businesses** <br> Apparently there are 123,000+ businesses in SH, so maybe use a hash-based lookup so that I don't have to query all of them: https://www.schleswig-holstein.de/DE/landesregierung/themen/wirtschaft/mittelstand-handwerk

---
---

# update data in collection

In [None]:
collection_yp_clean = set_up_mongo('mongodb://localhost:27017/','webscraping_dataLabKiel','yp_data_clean')

# add numeric "lat"/"lon" fields to every record; both become None unless both values convert
for x in collection_yp_clean.find():
    try:
        lat = float(x["latitude"])
        lon = float(x["longitude"])
    except (KeyError, TypeError, ValueError):
        # field missing, not a number-like value, or not parseable as float
        lat = None
        lon = None
    collection_yp_clean.update_one({"_id":x["_id"]},{"$set":{"lat":lat,"lon":lon}})