In [1]:
# connect to mongodb via mongodb compass
# load geodata as in lecture

In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
from geopy.distance import geodesic
import osmnx as ox
import networkx as nx
import geopy
import folium
from pymongo import MongoClient
from datetime import datetime

# mongodb

In [2]:
def set_up_mongo(client_str,database_str,collection_str):
    client = MongoClient(client_str) #connect to mongodb client
    db = client[database_str] #connect to database

    existing_collections = db.list_collection_names() #check that dbs collections
    if collection_str not in existing_collections:
        db.create_collection(collection_str) #create collection if needed
    
    my_collection = db[collection_str] #connect to collection

    return my_collection

In [3]:
collection = set_up_mongo('mongodb://localhost:27017','webscraping_dataLabKiel','yellow_pages') 

## helper functions etc

### all business in one location (YP)

In [5]:
def get_business_per_location(collection,city):
    #get everything except mongodb key
    available_buiz = [x for x in collection.find({"@id":{ "$exists": True}, #primary key of yp page link has to exist
                                                 'address.addressLocality':f'{city}'},{'_id':0})]  #'address':1,'name':1,'telephone':1,'latitude':1,'longitude':1,"@id":1, "opening_Hours":1
    return available_buiz

In [6]:
business = get_business_per_location(collection,"Lübeck")

In [7]:
len(business)

460

## just all business (YP)

In [8]:
def get_all_business(collection):
    available_buiz = [x for x in collection.find({"@id":{ "$exists": True}})] #only get business where primary key exists
    return available_buiz

In [9]:
l = get_all_business(collection)
len(l)

13231

turns out some are there doubly (maybe they are listed in more than one location?) -> sort them out

In [10]:
def get_unique_buisness(b):
    unique_buis = []
    links = [x["@id"] for x in b] #get all the yellow page links (the primary key basically) in my list of business in one location
    links = list(set(links)) #make this list unique
    for e in b: #go through original list
        #basically check if this element is (still) in link list, if yes remove it from links, but add all its info to unique info list
        if e["@id"] in links:
            links.remove(e["@id"])
            unique_buis.append(e)
    return unique_buis

In [11]:
relevant_buiz = get_unique_buisness(business)

In [12]:
len(relevant_buiz)

164

In [13]:
#relevant_buiz[100]

get all business stuff

In [14]:
uql = get_unique_buisness(l)
len(uql)

4451

### check if all data i try to collect is available

in this case particularly important: does it have geolocations? those need to be numerical so transform them

In [15]:
def availablity_check(dct,attribute):
    if attribute in dct.keys():
        if attribute in ["latitude","longitude"]:
            return float(dct[attribute])
        return dct[attribute]
    else:
        return None

select only some data from business list: 

In [16]:
#select buisness examples
i = 0
how_many = len(relevant_buiz)
data_x = []
while i < how_many:
    ith = relevant_buiz[i]
    if "latitude" in ith.keys(): #only take business which has geolocation 
        data_x.append({"name":availablity_check(ith,"name"),"latitude":availablity_check(ith,"latitude"),"longitude":availablity_check(ith,"longitude"),
                    "telephone":availablity_check(ith,"telephone")})
    i += 1

put into function

In [17]:
def pick_only_some_data(my_list, how_many=10):
    i = 0
    if how_many == 10: #otherwise people changed it, so then leave it
        how_many = len(my_list)
    data = []
    while i < how_many:
        ith = my_list[i]
        if "latitude" in ith.keys(): #only take business which has geolocation 
            data.append({"name":availablity_check(ith,"name"),"latitude":availablity_check(ith,"latitude"),"longitude":availablity_check(ith,"longitude"),
                        "telephone":availablity_check(ith,"telephone"),"@id":availablity_check(ith,"@id")}) #@id show up only with like plot chosen features thing
        i += 1
    return data

## make search more dynamic

- display choosable features on map (isnt nice actually)
- select spots based on some tag search / opening hours or something / type of business ideally / key word in name

### make search more flexible

In [18]:
uql[10].keys()

dict_keys(['_id', 'address', 'keywords', '@type', 'latitude', 'name', 'dateModified', 'telephone', '@id', '@context', 'url', 'sameAs', 'longitude'])

In [19]:
# interesting values are: 
# location (via adress) 
# opening hours: is it open now
# do tags contain search word
# show only e.g. restaurants, so by type of business ... maybe via name????

In [20]:
uql[0]["review"]

[{'datePublished': '15.11.2016',
  '@type': 'Review',
  'author': {'@type': 'Person', 'name': 'Burr BenutzerIn'},
  'reviewBody': 'Kompetenz,Freundlichkeit, Pünktlichkeit, wie man das von einer guten Praxis erwarten kann.',
  'reviewRating': {'bestRating': '5',
   '@type': 'Rating',
   'ratingValue': 5,
   'worstRating': '1'}}]

In [21]:
def render_query(query_string):
    "so here we want to get a normal sentence as input and somehow make it into query string format"
    pass

#### query format!

In [22]:
query_format = {"where":_,"is_open":_,"what":_,"special_tag":_} #but order doesnt matter

In [23]:
def create_search_string(q):
    query_dict = {"@id":{ "$exists": True}}
    for k in q.keys():
        if k == "where":
            query_dict.update({"address.addressLocality":f"{q[k]}"})
        if k == "what":
            query_dict.update({"name": { "$regex": f"{q[k]}", "$options": "i" }})
        if k == "special_tag":
            query_dict.update({"keywords": { "$regex": f"{q[k]}", "$options": "i" }})
        if k == "is_open":
            #time = datetime.now()
            #time = time.strftime("%A, %H:%M") #time = 'Monday, 15:48'
            #either use LLM or hard code options
            pass
    return query_dict

In [24]:
query1 = {"where":"Lübeck","special_tag":"Reparatur"}
query1

{'where': 'Lübeck', 'special_tag': 'Reparatur'}

In [25]:
to_ask = create_search_string(query1)
create_search_string(query1)

{'@id': {'$exists': True},
 'address.addressLocality': 'Lübeck',
 'keywords': {'$regex': 'Reparatur', '$options': 'i'}}

In [26]:
def get_business_by_features(collection,query_dict):
    available_buiz = [x for x in collection.find(query_dict,{'_id':0})]  
    return available_buiz

In [27]:
get_business_by_features(collection,to_ask)

[{'address': {'addressCountry': 'DE',
   'streetAddress': 'Kronsforder Allee 130',
   '@type': 'PostalAddress',
   'postalCode': '23560',
   'addressLocality': 'Lübeck'},
  'keywords': ['Zeltreparatur'],
  '@type': 'LocalBusiness',
  'latitude': '53.841564',
  'name': 'Planen Bahr',
  'faxNumber': '0451 59 48 44',
  'dateModified': '2024-10-15',
  'telephone': '0451 5 10 25',
  '@id': 'https://www.gelbeseiten.de/gsbiz/52132bd8-6e1d-4df4-b40b-9723d34e8691',
  '@context': 'https://schema.org/',
  'url': 'https://www.gelbeseiten.de/gsbiz/52132bd8-6e1d-4df4-b40b-9723d34e8691',
  'longitude': '10.676685'}]

### make pick_only_some_data function more flexible

- e.g. let me pass the features i want displayed. but only relevant for display, not actually for search

somehow get a function where i dont have to type the features exactly as they are named

In [28]:
def select_features():
    pass

pass a list of features (but as they are really called) which i want too include for my business, but dont select based on them

to display those in pop up

In [29]:
def pick_data_with_named_features(all_buiz, my_features):
    data = []
    for ith in all_buiz:
        if ("latitude" in ith.keys()) and ("longitude" in ith.keys()): #only take business which has geolocation 
            list_of_dicts = [{f:availablity_check(ith,f)} for f in my_features]
            merged_g = [{k: v for d in list_of_dicts for k, v in d.items()}]
            data.extend(merged_g)
    return data

In [30]:
fs = ["name","latitude","longitude","openingHours","@id"]
#pick_data_with_named_features(relevant_buiz,fs)

helper function to create the pop up string

In [31]:
def popupStr_generator(df_row):
    pps = f"<b>{df_row["name"]}</b>"
    features = df_row.index
    values = df_row.values
    for f,v in zip(features,values):
        if f not in ["longitude","latitude","name"]:
            pps += f"<br>{f}: {v}"
    return pps

### add search for osm data

In [32]:
def osm_get_data(collection,what):
    #get everything except mongodb key
    available_buiz = [x for x in collection.find({"amenity": { "$regex": f"{what}", "$options": "i" }},{'_id':0})]  #'address':1,'name':1,'telephone':1,'latitude':1,'longitude':1,"@id":1, "opening_Hours":1
    return available_buiz

In [33]:
collection_osm = set_up_mongo('mongodb://localhost:27017','webscraping_dataLabKiel','osm_pois')
len(osm_get_data(collection_osm,"pub"))

611

## important: actually combine search options!!!

- 1) so if the goal is to plot business next to some amneties, then we get like regular integration
- 2) but if we just want e.g. all pubs, then we need to do the matching it together thing! so check via name or geofence location -> but suman is doing this so yey
- 3) or do fancier osm: i want restaurants near parks, hotels near bus stops etc

In [34]:
#so write a search function where i can type in basically: i want Friseure near parks or sth
#also do fanicer

# functions for plotting some data

static: only display name and phone number of business

In [35]:
def plot_business_spots(data,save=False):
    poi_df = pd.DataFrame(data) #compile business sample into df
    map_center = [poi_df['latitude'].mean(), poi_df['longitude'].mean()] # Center the map around the mean latitude and longitude of the POIs
    m = folium.Map(location=map_center, zoom_start=15, tiles='OpenStreetMap') # Initialize the folium map with OpenStreetMap tiles

    # Add markers with popups for each POI
    for _, row in poi_df.iterrows():
        folium.Marker(
            location=[row['latitude'], row['longitude']],
            popup=f"<b>{row['name']}</b><br>Phone: {row['telephone']}",
            tooltip=row['name']
        ).add_to(m)

    if save==True:
        m.save("map_with_pois.html")
    
    return m  

In [36]:
#plot_business_spots(data_x) #input data ready to use (from experimentation process above, so all prev steps need to be done)

## put into one function: choosing location + plot

add later: filter by type of business vs all location

In [37]:
def plot_by_location(mdb,location,to_index=10,save_map=False):
    #first connect to mongodb and get all business in one location 
    spots = get_business_per_location(mdb,location)
    #then make sure theres no duplicates
    spots = get_unique_buisness(spots)
    #get only subset of data if you want
    spots = pick_only_some_data(spots,to_index) #displays everything, only if not 10 is diplay changed
    #then into plotting function
    return plot_business_spots(spots,save_map)

now actually input the collection and the location and wheter to save/show everything as you wish

In [38]:
collection = set_up_mongo('mongodb://localhost:27017','webscraping_dataLabKiel','yellow_pages') #same as before, just for the principle
#plot_by_location(collection,"Flensburg",save_map=True)

In [39]:
#plot_by_location(collection,"Arkebek")

In [40]:
#plot_by_location(collection,"Lübeck",to_index=3)

### plot with selected features (but just like what appears in pop up marker) (((((*)))))

(so show e.g. telephone and address or sth)

this one is more dynamic in theory. so if this is used with pick_data_with_named_features, it will display the features listed (named) when picking the data. 
if its used with the 'normal' data collection defined above (pick_only_some_data - but some in the sense of a limited amount, hence why we can pass an index), it just displays whats extracted in that function (so telephone and yp link)

In [41]:
def plot_business_spots_varied_features(data,save=False):
    poi_df = pd.DataFrame(data) #compile business sample into df
    map_center = [poi_df['latitude'].mean(), poi_df['longitude'].mean()] # Center the map around the mean latitude and longitude of the POIs
    m = folium.Map(location=map_center, zoom_start=15, tiles='OpenStreetMap') # Initialize the folium map with OpenStreetMap tiles

    # Add markers with popups for each POI
    for _, row in poi_df.iterrows():
        folium.Marker(
            location=[row['latitude'], row['longitude']],
            popup=popupStr_generator(row),
            tooltip=row['name']
        ).add_to(m)

    if save==True:
        m.save("map_with_pois.html")
    
    return m  

In [42]:
features = ["name","longitude","latitude","telephone","@id","openingHours"]
data_xx = pick_data_with_named_features(relevant_buiz,features)
#plot_business_spots_varied_features(data_xx,save=False)

## plot everything 

(so that then later it can be filtered by type of business / tags / opening hours)

#### simple set up: literally just a subset of everything (becausee 4000 is apparently too much)

In [43]:
def plot_all(mdb,to_index=10,save_map=False):
    #first connect to mongodb and get all business in one location 
    spots = get_all_business(mdb)
    spots = get_unique_buisness(spots)
    spots = pick_only_some_data(spots,to_index) #displays everything, only if not 10 is diplay changed
    return plot_business_spots(spots,save_map)

In [44]:
#modify to_index because 4500 is too much to plot
#plot_all(collection,to_index=500)

## plot osm

In [45]:
def plot_osm_data(data,save=False):
    poi_df = pd.DataFrame(data) #compile business sample into df
    map_center = [poi_df['lat'].mean(), poi_df['lon'].mean()] # Center the map around the mean latitude and longitude of the POIs
    m = folium.Map(location=map_center, zoom_start=15, tiles='OpenStreetMap') # Initialize the folium map with OpenStreetMap tiles

    # Add markers with popups for each POI
    for _, row in poi_df.iterrows():
        folium.Marker(
            location=[row['lat'], row['lon']],
            popup=popupStr_generator(row),
            tooltip=row['name']
        ).add_to(m)

    if save==True:
        m.save("osm_map_with_pois.html")
    
    return m  

In [46]:
collection_osm = set_up_mongo('mongodb://localhost:27017','webscraping_dataLabKiel','osm_pois')
osm_data = osm_get_data(collection_osm,"pub")
#plot_osm_data(osm_data)

## plot yp but filtered (*)

In [47]:
def plot_all_filtered(mdb,query,to_index=10,save_map=False):
    #first set up query so i only get those businesses
    query = create_search_string(query)
    #then connect to mongodb and get all business according to filter options (e.g. location, name, tags)
    spots = get_business_by_features(mdb,query)
    spots = get_unique_buisness(spots)
    spots = pick_only_some_data(spots,to_index) #displays everything, only if not 10 is diplay changed
    return plot_business_spots_varied_features(spots,save_map)

##### examples

In [48]:
#connection to mongodb
collection = set_up_mongo('mongodb://localhost:27017','webscraping_dataLabKiel','yellow_pages') 

In [49]:
#query 1
query1 = {"where":"Lübeck"}
#plot_all_filtered(collection,query1)

In [50]:
#query 2
query2 = {"special_tag":"Reparatur"}
#plot_all_filtered(collection,query2)

In [51]:
#query 3
query3 = {"what":"Kebab", "where":"Lübeck"}
#plot_all_filtered(collection,query3)

In [52]:
#query 4
query4 = {"what":"Pizza"}
#plot_all_filtered(collection,query4)

## try to go via click - gotta simulate click right now

In [53]:
def get_business_in_radius(original_point,radius,data):
    in_radius = []
    counter = 0
    for e in data:
        try:
            #try like this because of naming irregularities with yp and osm data
            try:
                lat, lon = e["latitude"],e["longitude"]
            except KeyError:
                lat, lon = e["lat"],e["lon"]
            other_point = (lat,lon)
            dist = geodesic(original_point, other_point).km
            if dist < radius:
                in_radius.append(e)
        except:
            counter += 1
    #print(f"no geocoordinates for {counter} businesses")
    return in_radius

In [54]:
"""def plot_in_radius(mdb,point,r):
    yp_data = get_all_business(mdb)
    yp_data = get_unique_buisness(yp_data)
    yp_data = get_business_in_radius(point,r,yp_data)
    yp_data = pick_only_some_data(yp_data) #displays everything, only if not 10 is diplay changed
    return plot_business_spots_varied_features(yp_data)"""

'def plot_in_radius(mdb,point,r):\n    yp_data = get_all_business(mdb)\n    yp_data = get_unique_buisness(yp_data)\n    yp_data = get_business_in_radius(point,r,yp_data)\n    yp_data = pick_only_some_data(yp_data) #displays everything, only if not 10 is diplay changed\n    return plot_business_spots_varied_features(yp_data)'

In [2]:
map_center = [54.323334, 10.139444]
m = folium.Map(location=map_center, zoom_start=15, tiles='OpenStreetMap')
loc = folium.LatLngPopup()
folium.LatLngPopup().add_to(m)  # Popup
m

In [56]:
#center of lübeck
click_point = (53.8682,10.6873)
radius = 0.5
yp_data = get_all_business(collection)
yp_data = get_unique_buisness(yp_data)
yp_data = get_business_in_radius(click_point,radius,yp_data)
yp_data = pick_only_some_data(yp_data) #displays everything, only if not 10 is diplay changed
plot_business_spots_varied_features(yp_data)
#plot_in_radius(collection,click_point,r) this doesnt work?? dunno why

### and for osm data

In [57]:
#center of lübeck
collection_osm = set_up_mongo('mongodb://localhost:27017','webscraping_dataLabKiel','osm_pois')

click_point = (53.8682,10.6873)
radius = 0.25
osm_data = osm_get_data(collection_osm,"")
osm_data = get_business_in_radius(click_point,radius,osm_data)
plot_osm_data(osm_data)

## try to go via ipyleaflet

to get click action

In [59]:
"""from ipyleaflet import Map, Marker, Circle
from ipywidgets import Output

# Create an output widget to display Python outputs in Jupyter
out = Output()

# Initialize the map centered at a specific location
center = (40.7128, -74.0060)
m = Map(center=center, zoom=13)

# Add the map to the output widget so clicks can be captured
m.observe(lambda event: out.clear_output(), 'click')

# Function to handle clicks
def handle_map_click(event, location=None):
    # Clear previous output and display the new coordinates
    with out:
        print(f"Latitude: {location[0]}, Longitude: {location[1]}")
        
    # Draw a circle around the clicked location
    circle = Circle()
    circle.location = location
    circle.radius = 500  # Radius in meters
    circle.color = "blue"
    circle.fill_color = "#add8e6"
    circle.fill_opacity = 0.4
    
    # Remove the previous circle if it exists and add the new one
    if len(m.layers) > 1:
        m.remove_layer(m.layers[-1])
    m.add_layer(circle)

# Listen for click events on the map and capture the coordinates
m.on_interaction(handle_map_click)

# Display the map and output widget
display(m, out)"""

'from ipyleaflet import Map, Marker, Circle\nfrom ipywidgets import Output\n\n# Create an output widget to display Python outputs in Jupyter\nout = Output()\n\n# Initialize the map centered at a specific location\ncenter = (40.7128, -74.0060)\nm = Map(center=center, zoom=13)\n\n# Add the map to the output widget so clicks can be captured\nm.observe(lambda event: out.clear_output(), \'click\')\n\n# Function to handle clicks\ndef handle_map_click(event, location=None):\n    # Clear previous output and display the new coordinates\n    with out:\n        print(f"Latitude: {location[0]}, Longitude: {location[1]}")\n        \n    # Draw a circle around the clicked location\n    circle = Circle()\n    circle.location = location\n    circle.radius = 500  # Radius in meters\n    circle.color = "blue"\n    circle.fill_color = "#add8e6"\n    circle.fill_opacity = 0.4\n    \n    # Remove the previous circle if it exists and add the new one\n    if len(m.layers) > 1:\n        m.remove_layer(m.layer