In [1]:
import numpy as np
import pandas as pd

from datetime import datetime

import geopandas as gpd
from geopy.distance import geodesic
import geopy

import osmnx as ox
import networkx as nx
import folium

from pymongo import MongoClient

from bs4 import BeautifulSoup
import requests

from selenium import webdriver ## Driver for Firefox, Chrome, Edge, etc.
from selenium.webdriver.common.by import By # Mode of locating html elements: ID, CSS_SELECTOR, XPATH, ...
from selenium.webdriver.support.select import Select

import spacy
nlp = spacy.load("en_core_web_sm")

## Mongo set up as always

In [2]:
def set_up_mongo(client_str,database_str,collection_str):
    client = MongoClient(client_str) #connect to mongodb client
    db = client[database_str] #connect to database

    existing_collections = db.list_collection_names() #check that dbs collections
    if collection_str not in existing_collections:
        db.create_collection(collection_str) #create collection if needed
    
    my_collection = db[collection_str] #connect to collection

    return my_collection

two collections for the data

In [3]:
coll_yp = set_up_mongo('mongodb://localhost:27017','webscraping_dataLabKiel','yellow_pages') 
coll_osm = set_up_mongo('mongodb://localhost:27017','webscraping_dataLabKiel','osm_pois') 

### helper functions

helper function yp unique business since many duplicates

In [4]:
def unique_business(b):
    unique_buis = []
    new_b = [x for x in b if "@id" in x.keys()]  #only those items which have primary key
    links = [x["@id"] for x in new_b] 
    links = list(set(links)) #make this list unique
    for e in new_b: 
        if e["@id"] in links:
            links.remove(e["@id"])
            unique_buis.append(e)
    return unique_buis

since for osm data its lat, lon and for yp data latitude ..., its easier for plotting function to change it (and also sort out stuff that doesnt have this)

In [5]:
def change_to_LATLON(b):
    lat_lons = []
    for e in b:
        new_dict = {('lat' if k == 'latitude' else 'lon' if k == 'longitude' else k): v for k, v in e.items()} #change naming to be the same for plotting
        if "lat" in new_dict.keys() and "lon" in new_dict.keys():
            lat_lons.append(new_dict)
    return lat_lons

### previous plotting function

In [6]:
def popupStr_generator(df_row):
    pps = f"<b>{df_row["name"]}</b>"
    features = df_row.index
    values = df_row.values
    for f,v in zip(features,values):
        if f in ["telephone","amenity","sameAs"]:
            pps += f"<br>{f}: {v}"
        else:
            pps += ""
    return pps

In [7]:
def plot_spots(data,z):
    poi_df = pd.DataFrame(data) #compile business sample into df
    poi_df["lat"] = pd.to_numeric(poi_df["lat"])
    poi_df["lon"] = pd.to_numeric(poi_df["lon"])
    map_center = [poi_df['lat'].mean(), poi_df['lon'].mean()] # Center the map around the mean latitude and longitude of the POIs
    m = folium.Map(location=map_center, zoom_start=z, tiles='OpenStreetMap') # Initialize the folium map with OpenStreetMap tiles

    # Add markers with popups for each POI
    for _, row in poi_df.iterrows():
        folium.Marker(
            location=[row['lat'], row['lon']],
            popup=popupStr_generator(row),
            tooltip=row['name']
        ).add_to(m)
    
    return m  

## Queries

set up a function where i can pass a dictionary of key words, and it retrieves data from the databses as needed

In [8]:
query_format = {"where":_, #location e.g. Lübeck, Arkebek, Flensburg etc
                "what_general":_, #type of amnenity (only retrieves osm) e.g. pub, restaurant, parking etc
                "what_specific":_, #like pizza, italian, ... (searches the names)
                "special_tag":_} #like specific types of service e.g. reparatur, putzen, etc...

#but order doesnt matter

this function gets a dictionary like above, and then according to the combination of keys passed retrieves data. the query logic differs depending on key words, so have differing underlying logic in setting up query dict.

for example:
- kebab places in lübeck -> 'and' connection (only retrieves yp data since rn only thats with place location, otherwise need geofencing)
- reparatur (passed both what_specific and special_tag -> search for occurence of reparatur both in names of yp and osm data + special tags of yp data)

In [9]:
def create_more_complex_search_string(q,which_db):
    query_dict = {}
    if which_db == "osm":
        if "where" in q.keys():
            #technically this should be some geolocation magic, but as it isnt yet ask for sth impossible
            query_dict.update({"id": "thisisanimprobableid"})
        else: 
            if ("what_general" in q.keys()) and ("what_specific" in q.keys()):
                query_dict.update({"$or": [{"amenity": { "$regex": f"{q["what_general"]}", "$options": "i" }},{"name": { "$regex": f"{q["what_specific"]}", "$options": "i" }}]})   
            elif "what_general" in q.keys():
                query_dict.update({"amenity": { "$regex": f"{q["what_general"]}", "$options": "i" }})
            elif "what_specific" in q.keys():
                query_dict.update({"name": { "$regex": f"{q["what_specific"]}", "$options": "i" }})

    #basically, go from most complex through least complex: first try 3, then all 2 combinations, and then 1s
    if which_db == "yp":
        if ("where" in q.keys()) and ("what_specific" in q.keys()) and ("special_tag" in q.keys()):
            query_dict.update({"$and": [{"address.addressLocality":f"{q["where"]}"}, 
                                        {"$or": [{"name": { "$regex": f"{q["what_specific"]}", "$options": "i" }},{"keywords": { "$regex": f"{q["special_tag"]}", "$options": "i" }}]}]})
        elif ("where" in q.keys()) and ("what_specific" in q.keys()):
            query_dict.update({"$and": [{"address.addressLocality":f"{q["where"]}"}, {"name": { "$regex": f"{q["what_specific"]}", "$options": "i" }}]})
        elif ("where" in q.keys()) and ("special_tag" in q.keys()):
            query_dict.update({"$and": [{"address.addressLocality":f"{q["where"]}"}, {"keywords": { "$regex": f"{q["special_tag"]}", "$options": "i" }}]})
        elif ("special_tag" in q.keys()) and ("what_specific" in q.keys()):
            query_dict.update({"$or": [{"keywords": { "$regex": f"{q["special_tag"]}", "$options": "i" }}, {"name": { "$regex": f"{q["what_specific"]}", "$options": "i" }}]})
        elif ("special_tag" in q.keys()):
            query_dict.update({"keywords": { "$regex": f"{q["special_tag"]}", "$options": "i" }})
        elif ("what_specific" in q.keys()):
            query_dict.update({"name": { "$regex": f"{q["what_specific"]}", "$options": "i" }})
        elif ("where" in q.keys()):
            query_dict.update({"address.addressLocality":f"{q["where"]}"})

    return query_dict

this is just getting both the collection and the already made query dict and retrieves data accordingly

#### try normal input text to query dict funtion

**could also try doing this with an llm** but for now try:
- check if a word is in the list of all cities -> where (in theory, could also add landkreise, but then where key word would also need to be refined)
- get list of possible amenity tags -> check if word is in there
- any adj / adv description -> what_specific and special_tag

- could have used synonyms for both amenities, as well as name and special tag searches, but at least chatgtp synonym finder function isnt good right now

### helper functions: get stacks to look through basically

In [10]:
#GET ALL CITIES

def county_search():
    #go to start page and make into soup html obj
    url = "https://www.gelbeseiten.de/branchenbuch/staedte/schleswig-holstein/landkreise"
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')
    
    #find all counties
    lk = soup.find_all(class_="filterlist__item")
    landkreis_stack = [] #create empty stack to put counties into (not a stack tho)
    for e in lk:
        county_refined = e.text.lower()
        county_refined = county_refined.strip()
        landkreis_stack.append(county_refined)

    return landkreis_stack


# ----- FIND CITIES IN PER COUNTY -----
def city_search(county_in_question):
    #pass relevant counties as argument and get its cities
    url = f"https://www.gelbeseiten.de/branchenbuch/staedte/schleswig-holstein/{county_in_question}"
    #print(url) #remove or comment later
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')

    places = soup.find_all(class_="boxteaser__title")

    city_stack = []
    for e in places:
        city_stack.append(e.text.lower()) #append all city names (for one county)

    return city_stack


In [11]:
# GET ALL AMENITIES
c = set_up_mongo('mongodb://localhost:27017','webscraping_dataLabKiel','osm_pois') #establish connection
available_amenities = [x["amenity"] for x in c.find({},{"_id":0})]
available_amenities = list(set(available_amenities))

In [12]:
"""#import spacy
#python -m spacy download en_core_web_sm

nlp = spacy.load("en_core_web_sm")
doc = nlp("i am looking for car repair shops")

#would ideally yield car and repais, but not shops (could do with repair shops but thats compound nouns then)

# Print each word with its part of speech tag
tokens = {}
for token in doc:
    tokens.update({token.pos_:token.text})
#[k for v,k in tokens.items() if (v=="ADJ" or v=="NOUN") ] #doesnt find repair because repair is assumed to modify shop 
#yields: ['shops']
tokens"""

'#import spacy\n#python -m spacy download en_core_web_sm\n\nnlp = spacy.load("en_core_web_sm")\ndoc = nlp("i am looking for car repair shops")\n\n#would ideally yield car and repais, but not shops (could do with repair shops but thats compound nouns then)\n\n# Print each word with its part of speech tag\ntokens = {}\nfor token in doc:\n    tokens.update({token.pos_:token.text})\n#[k for v,k in tokens.items() if (v=="ADJ" or v=="NOUN") ] #doesnt find repair because repair is assumed to modify shop \n#yields: [\'shops\']\ntokens'

In [13]:
input_query = "I am looking for reparatur shops or repair shops"

#look for compound nouns
doc = nlp(input_query)
chunkss = []
for chunk in doc.noun_chunks:
    chunkss.append(chunk.text)

In [14]:
chunkss

['I', 'reparatur shops', 'repair shops']

In [15]:
input_query = "where is a shop where I could fix my car door"
query_sep = input_query.split()
tokens = {}
for t in query_sep:
    doc = nlp(t)
    for token in doc:
        tokens.update({token.text:token.pos_})
my_search = [k for k,v in tokens.items() if v=="NOUN"]

In [16]:
my_search

['shop', 'fix', 'car', 'door']

In [17]:
def make_list_into_noun_terms(list):
    tokens = {}
    for t in list:
        doc = nlp(t)
        for token in doc:
            tokens.update({token.text:token.pos_})
    my_search = [k for k,v in tokens.items() if v=="NOUN"]
    return my_search

In [18]:
def input_to_query_terms(input):
    
    #look for compound nouns
    doc_1 = nlp(input)
    chunkss = []
    for chunk in doc_1.noun_chunks:
        chunkss.append(chunk.text)

    query_sep = input.split()    
    list_of_options = make_list_into_noun_terms(query_sep) #only search for real nouns but include compound nouns (here we could also get like top 5 synonyms)
    list_of_options = list(set(list_of_options + chunkss))
    list_of_options = [e for e in list_of_options if (e not in ["I","i","you","Your"])] 

    #get rid of pronouns sadly like this
    return list_of_options

In [19]:
q1 = "I am looking for bike repair shops in Lübeck"
print(input_to_query_terms(q1))
q1 = input_to_query_terms(q1) 

['bike repair shops', 'bike', 'repair', 'Lübeck', 'shops']


In [20]:
q2 = "where is a shop so i could fix my bike in lübeck"
print(input_to_query_terms(q2))
q2 = input_to_query_terms(q2) 

['bike', 'my bike', 'lübeck', 'a shop', 'fix', 'shop']


also actually output multiple dicts, similar-ish to search options, to really actually search all the combinations

In [21]:
def qTerm_to_query_dict(terms):
    #ALL AMENITIES
    c = set_up_mongo('mongodb://localhost:27017','webscraping_dataLabKiel','osm_pois') #establish connection
    available_amenities = [x["amenity"] for x in c.find({},{"_id":0})]
    available_amenities = list(set(available_amenities))

    #ALL CITIES
    all_cities_sh = []
    lk_stack = county_search()
    for lk in lk_stack:
        c = city_search(lk)
        all_cities_sh.extend(c)
    all_cities_sh = all_cities_sh + ["Kiel","Flensburg","Neumünster","Lübeck"] #add kreisfreie städe

    #search_dicts = [] BUT basically do this, only for terms at len -1, -2, -3 to get multiple combinations? but also not really, cause location always
    qd = {}
    for t in terms:
        if t in all_cities_sh:
            qd.update({"where":t})
        if t in available_amenities:
            qd.update({"what_general":t})

        qd.update({"what_specific":t})
        qd.update({"special_tag":t})
    return qd


In [22]:
qTerm_to_query_dict(q1)

{'what_specific': 'shops', 'special_tag': 'shops', 'where': 'Lübeck'}

In [23]:
qTerm_to_query_dict(q2) #just picks last term in list, cause only 1 dictionary, disregards lübeck cause lower case in input string

{'what_specific': 'shop', 'special_tag': 'shop'}

try via synonyms? so more search words findable. but dont use this 

In [24]:
"""from nltk.corpus import wordnet

# Function to get synonyms
def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return list(synonyms)

# Example usage
word = "repair"
synonyms = get_synonyms(word)
print(f"Synonyms for '{word}':", synonyms)

#yeah i dont like these synonyms, not gonna use them"""

'from nltk.corpus import wordnet\n\n# Function to get synonyms\ndef get_synonyms(word):\n    synonyms = set()\n    for syn in wordnet.synsets(word):\n        for lemma in syn.lemmas():\n            synonyms.add(lemma.name())\n    return list(synonyms)\n\n# Example usage\nword = "repair"\nsynonyms = get_synonyms(word)\nprint(f"Synonyms for \'{word}\':", synonyms)\n\n#yeah i dont like these synonyms, not gonna use them'

### retrieve data by passing collection and finished search dict

In [25]:
def retrieve_data(collection,query_dict): 
    available_buiz = [x for x in collection.find(query_dict,{'_id':0})]  
    return available_buiz

### to do

- add a function where i can pass literally a text and it takes elements and puts them into query format
- geofencing? anyway, obvs also want to retrieve osm data for locations
- other fancy stuff like: i want parks near bus stops, or reparatur places near parking or sth

### query data

In [26]:
def get_data(coll_osm,coll_yp,query):
    #get osm data
    query_str_osm = create_more_complex_search_string(query,"osm") #create correct query string for yp db
    osm_buiz = retrieve_data(coll_osm, query_str_osm) #just pick up data

    #get yp data
    query_str_yp = create_more_complex_search_string(query,"yp") #create correct query string for yp db
    yp_buiz = retrieve_data(coll_yp, query_str_yp) #just pick up data
    yp_buiz = unique_business(yp_buiz) #make sure everything only once since still many duplicates
    yp_buiz = change_to_LATLON(yp_buiz)

    #concatenate results (since if nothing is found empty list is passed, so no harm done)
    result_set = osm_buiz + yp_buiz

    return result_set

In [27]:
q = {"what_specific":"Reparatur", "special_tag":"Reparatur"}
r = get_data(coll_osm,coll_yp,q)

In [28]:
len(r)

130

## Examples with plot

In [29]:
#reminder:
query_format = {"where":_, # Lübeck                 (search address loc yp)
                "what_general":_,# pub, restaurant etc   (search amenity osm)
                "what_specific":_,# kebab italian        (search name)
                "special_tag":_} # pizza cleaning        (search keywords osm)
#but order doesnt matter

In [30]:
coll_osm = set_up_mongo('mongodb://localhost:27017','webscraping_dataLabKiel','osm_pois') #establish connection
coll_yp = set_up_mongo('mongodb://localhost:27017','webscraping_dataLabKiel','yellow_pages') #establish connection

In [31]:
#q = {"what_specific":"Reparatur", "special_tag":"Reparatur"} 130 results
q = {"what_specific":"Reparatur", "special_tag":"Reparatur","what_general":"repair"} #now also search amenity tag, much more
r = get_data(coll_osm,coll_yp,q)
plot_spots(r,8)

In [32]:
#q = {"what_specific":"Kebab"}
q = {"where":"Lübeck","what_specific":"Kebab"} #cancel out all other kebab places because only 1 in yp data right now and no searching in osm due to 'where'
r = get_data(coll_osm,coll_yp,q)
plot_spots(r,8)

try a different collection for yp: we now have only kiel yey

In [33]:
coll_yp_kiel = set_up_mongo('mongodb://localhost:27017','sh_data_collection','yp_kiel') #establish connection

In [34]:
q = {"what_specific":"Umzüge", "where":"Kiel", "special_tag":"Umzüge"}
r = get_data(coll_osm,coll_yp_kiel,q)
plot_spots(r,8)

In [35]:
q = {"what_specific":"Pizza", "where":"Kiel"}
r = get_data(coll_osm,coll_yp_kiel,q)
#print((len(r)))
#print(r[0])
#plot_spots(r[0:13],8)
#plot_spots(r,8) -> doesnt work i dont know why???

In [36]:
#r[13]

In [37]:
q = {"what_specific":"Italian", "special_tag":"Pizza"}
r = get_data(coll_osm,coll_yp_kiel,q)
plot_spots(r,8)