In [None]:
import re
import json
import time
import urllib.parse

import requests
import numpy as np
import pandas as pd


from bs4 import BeautifulSoup

from selenium import webdriver ## Driver for Firefox, Chrome, Edge, etc.
from selenium.webdriver.common.by import By # Mode of locating html elements: ID, CSS_SELECTOR, XPATH, ...
from selenium.webdriver.support.select import Select

from pymongo import MongoClient

---

## FUNCTIONS SET UP

---

In [35]:
def county_search():
    """Return the county names listed on the Schleswig-Holstein overview page.

    Downloads the county index of gelbeseiten.de, parses it with
    BeautifulSoup and reads the text of every element carrying the
    ``filterlist__item`` class.

    Returns:
        list[str]: one entry per matching element, lower-cased and stripped
        of surrounding whitespace, in page order.
    """
    start_url = "https://www.gelbeseiten.de/branchenbuch/staedte/schleswig-holstein/landkreise"
    page = BeautifulSoup(requests.get(start_url).text, 'html.parser')

    # Normalise each entry so it can be dropped straight into a URL path later.
    return [
        item.text.lower().strip()
        for item in page.find_all(class_="filterlist__item")
    ]


def city_search(county_in_question):
    """Return the place names listed on one county's overview page.

    Args:
        county_in_question: county name as it appears in the gelbeseiten.de
            URL path (the value is interpolated into the URL unchanged).

    Returns:
        list[str]: lower-cased text of every ``boxteaser__title`` element on
        the county page, in page order. Surrounding whitespace is kept.
    """
    county_url = f"https://www.gelbeseiten.de/branchenbuch/staedte/schleswig-holstein/{county_in_question}"
    page = BeautifulSoup(requests.get(county_url).text, 'html.parser')

    # One title element per place listed for this county.
    return [title.text.lower() for title in page.find_all(class_="boxteaser__title")]


def set_up_mongo(client_str,database_str,collection_str):
    """Connect to MongoDB and return a handle to the requested collection.

    Args:
        client_str: connection string passed to ``MongoClient``.
        database_str: name of the database to open.
        collection_str: name of the collection to return.

    Returns:
        The collection object ``db[collection_str]``. The collection is
        created explicitly first if the database does not list it yet.
    """
    database = MongoClient(client_str)[database_str]

    # Create the collection up front when it is not among the existing ones.
    if collection_str not in database.list_collection_names():
        database.create_collection(collection_str)

    return database[collection_str]


def get_business_per_location(collection,city):
    """Return the ``@id`` values of all stored businesses for one city.

    The ``@id`` field holds the yellow-pages link and serves as the unique
    identifier of a business in the database.

    Args:
        collection: object exposing a pymongo-style ``find(filter, projection)``.
        city: value compared against ``address.addressLocality``.

    Returns:
        list: the ``@id`` of every matching document, in cursor order.

    NOTE(review): the match is exact; callers in this notebook pass
    lower-cased city names - confirm that this matches how
    ``addressLocality`` is stored.
    """
    locality_filter = {'address.addressLocality':f'{city}'}
    id_only = {'_id':0,'@id':1}  # project away everything but the link

    known_links = []
    for document in collection.find(locality_filter, id_only):
        known_links.append(document['@id'])
    return known_links

#### collect the links of the businesses listed behind the 'newly added' link
def get_new_additions(county,city):
    """Collect the business links shown on a city's 'unternehmen' page.

    Args:
        county: county path segment of the gelbeseiten.de URL.
        city: city path segment of the gelbeseiten.de URL.

    Returns:
        list[str]: the ``href`` of every ``<a class="link">`` element whose
        target starts with ``https://www.gelbeseiten.de/gsbiz``, in page
        order. Anchors without an ``href`` are skipped.
    """
    url = f'https://www.gelbeseiten.de/branchenbuch/staedte/schleswig-holstein/{county}/{city}/unternehmen'
    # Percent-encode umlauts and spaces while leaving the scheme and path separators intact.
    url = urllib.parse.quote(url, safe=':/')
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')

    all_links = []
    for elm in soup.find_all('a',class_='link'):
        link_yp = elm.get("href") #yellow-pages link of one business, or None if the anchor has no href
        # Guard against None before calling startswith; only keep business detail links.
        if link_yp and link_yp.startswith('https://www.gelbeseiten.de/gsbiz'):
            all_links.append(link_yp)

    print(f"got all them links! #no: {len(all_links)}. here they are: ",all_links)

    return all_links

#### collect business info as before
def collect_buisness_info_nittyGritty(my_buisnesses,my_list):
    """Fetch every business page and append one record per link to ``my_list``.

    For each link the page is downloaded and the last
    ``application/ld+json`` element is parsed as JSON. If the page has no
    such element, or anything raises along the way, a small failure record
    describing the problem is appended instead, so the output always gets
    exactly one entry per input link.

    Args:
        my_buisnesses: iterable of business page URLs.
        my_list: list that is extended in place with the collected records.

    Returns:
        None. Results are delivered through ``my_list``.
    """
    for link in my_buisnesses:
        time.sleep(1)  # pause one second before every request
        try:
            page = BeautifulSoup(requests.get(link).text, 'html.parser')
            ld_blocks = page.find_all(type="application/ld+json")
            if ld_blocks:
                # The business data is taken from the last JSON-LD element on the page.
                record = json.loads(ld_blocks[-1].text)
            else:
                record = {"business":link, "message":"failure - this buisness seems to not be available in json format"}
        except Exception as err:
            # Best effort: remember what went wrong instead of aborting the whole run.
            record = {"website":link, "error_type":type(err).__name__, "error_message":str(err)}

        my_list.append(record)


#### see if newly found yp links are already in database
def check_if_existing(old,new):
    """Return the links from ``new`` that are not yet known.

    Args:
        old: iterable of links already stored in the database.
        new: iterable of freshly scraped links.

    Returns:
        list: the entries of ``new`` that do not occur in ``old``, in their
        original order. A link that appears several times in ``new`` is
        returned only once, so it is not scraped and inserted twice.

    Links are expected to be hashable (they are URL strings in this notebook).
    """
    known = set(old)  # constant-time membership instead of scanning a list per link
    actually_new = []
    for link in new:
        if link not in known:
            known.add(link)  # remember it so a repeated link is reported only once
            actually_new.append(link)

    print(f"check if some new buiz was found, #no {len(actually_new)}", actually_new)

    return actually_new

---

## Adding newly found businesses to the database

---

In [None]:
# Open the local MongoDB collection; set_up_mongo creates it first if it is missing.
my_mongo = set_up_mongo('mongodb://localhost:27017/','d2v2','yellow_pages')

# Walk every county from the overview page, then every city listed for that county.
lk_stack = county_search()
for lk in lk_stack:
    city = city_search(lk) # list of all city names of this county
    # Remove parentheses from the scraped names; the cleaned name is used both
    # in the URL and in the database query below.
    pattern = r"[)(]"
    city_chic = [re.sub(pattern, '', e) for e in city]
    for c in city_chic:
        new_data = [] # fresh buffer per city, filled in place by the collector below
        possible_new_buizz = get_new_additions(lk,c) # scrape the business links listed for this city
        old_buizz = get_business_per_location(my_mongo,c) # links already stored in MongoDB for this city
        new_buizz = check_if_existing(old_buizz,possible_new_buizz) # keep only links not stored yet
        collect_buisness_info_nittyGritty(new_buizz,new_data)
        # insert_many is only called when there is something to insert.
        if new_data != []:
            my_mongo.insert_many(new_data)
            print(f"data inserted for {lk}: {c}")

In [107]:
# Independent cities are listed under the "kreisfrei" path segment rather than under a county.
# Relies on `my_mongo` from the cell above.
for kfs in ["kiel","lübeck","flensburg","neumünster"]:
    new_data = [] # fresh buffer per city so records from an earlier city are never re-inserted
    possible_new_buizz = get_new_additions("kreisfrei",kfs) # scrape the business links listed for this city
    old_buizz = get_business_per_location(my_mongo,kfs) # links already stored in MongoDB for this city
    new_buizz = check_if_existing(old_buizz,possible_new_buizz) # keep only links not stored yet
    collect_buisness_info_nittyGritty(new_buizz,new_data)
    # insert_many is only called when there is something to insert.
    if new_data:
        my_mongo.insert_many(new_data)
        print(f"data inserted for {kfs}!")