In [1]:
from abc import ABC, abstractmethod
from typing import Any
from pymongo import MongoClient
from geopy.distance import geodesic
import folium
import pandas as pd

In [2]:
class DBHandlers(ABC):
    """Abstract base for the MongoDB-backed query handlers.

    All four methods are abstract, so a subclass has to override each of them,
    but every one carries a default implementation that an override can reuse
    through ``super()``.

    Attributes written by the default implementations:
        collection: collection handle, set by ``connect_db``.
        data: list of documents, set by ``retrieve_data`` and replaced by
            ``check_click_radius``.

    ``retrieve_data`` reads ``self.query_dict``; nothing in this class sets it,
    so the subclass has to assign it first.
    """

    @abstractmethod
    def connect_db(self,Client,client_str,db_str,coll_str):
        """Open the collection ``coll_str`` in database ``db_str``.

        Args:
            Client: callable that takes ``client_str`` and returns a client
                which can be indexed by database name.
            client_str: connection string handed to ``Client``.
            db_str: database name.
            coll_str: collection name; the collection is created when the
                database does not list it yet.
        """
        client = Client(client_str)
        db = client[db_str]
        if coll_str not in db.list_collection_names():
            db.create_collection(coll_str)
        self.collection = db[coll_str]

    @abstractmethod
    def design_query_dict(self):
        """Return the ``$or`` terms that match every document.

        Returns:
            ``[{}, {}]`` - wrapped as ``{"$or": [{}, {}]}`` this returns everything.
        """
        return [{}, {}]

    @abstractmethod
    def retrieve_data(self):
        """Run ``self.query_dict`` against the collection and store the hits in ``self.data``."""
        self.data = list(self.collection.find(self.query_dict))

    @abstractmethod
    def check_click_radius(self,click_point,radius,data):
        """Keep only the records closer than ``radius`` km to ``click_point``.

        The result is stored in ``self.data``; nothing is returned.

        Args:
            click_point: reference point, passed to ``geodesic`` as its first argument.
            radius: exclusive upper bound for the distance in kilometres.
            data: iterable of records carrying either ``latitude``/``longitude``
                or ``lat``/``lon``.

        Records without usable coordinates are skipped. Any other error is
        raised instead of being swallowed.
        """
        in_radius = []
        for e in data:
            try:
                # two key spellings because of naming irregularities between yp and osm data
                try:
                    lat, lon = e["latitude"], e["longitude"]
                except KeyError:
                    lat, lon = e["lat"], e["lon"]
                dist = geodesic(click_point, (lat, lon)).km
            except (KeyError, TypeError, ValueError):
                # no coordinates, or coordinates the distance computation rejects
                continue
            if dist < radius:
                in_radius.append(e)
        self.data = in_radius


In [3]:
class OSM_queryer(DBHandlers):
    """Query handler for the OSM points-of-interest collection.

    Search terms are matched against the ``name`` and ``amenity`` fields.
    """

    def connect_db(self,Client=MongoClient,client_str='mongodb://localhost:27017', db_str='webscraping_dataLabKiel', coll_str='osm_pois'):
        """Connect to the collection; the defaults point at a local MongoDB."""
        return super().connect_db(Client,client_str,db_str,coll_str)

    def design_query_dict(self,input):
        """Build the MongoDB filter for a search and store it in ``self.query_dict``.

        Args:
            input: mapping describing the search. Recognised keys:
                ``"what"`` - a search term or a list of terms; each one is
                matched literally and case-insensitively as a prefix of
                ``name`` or ``amenity``.
                ``"all"`` - when present, every document matches (this
                overrides ``"what"``).

        Without a usable key the filter matches nothing, so an empty search
        does not return the whole collection.
        """
        import re  # local import: keeps this class independent of the import cell

        query_terms = []

        if "what" in input:
            terms = input["what"]
            if isinstance(terms, str):
                # a bare string would otherwise be searched letter by letter
                terms = [terms]
            for val in terms:
                # escape the term so regex metacharacters in it are taken
                # literally instead of changing (or breaking) the pattern
                pattern = "^" + re.escape(str(val))
                for field in ("name", "amenity"):
                    query_terms.append({field: {"$regex": pattern, "$options": "i"}})

        if "all" in input:
            query_terms = super().design_query_dict()

        if not query_terms:
            # impossible id: return nothing rather than everything
            query_terms.append({"_id":"thisisanimpossibleid"})

        self.query_dict = {"$or":query_terms}

    def retrieve_data(self, **kwargs):
        """Fetch the documents matching ``self.query_dict`` into ``self.data``.

        ``**kwargs`` is accepted for call compatibility and ignored.
        """
        return super().retrieve_data()

    def check_click_radius(self, click_point, radius, data):
        """Narrow ``self.data`` to the records of ``data`` within ``radius`` km of ``click_point``."""
        return super().check_click_radius(click_point, radius, data)
    


In [4]:
class YP_queryer(DBHandlers):
    """Query handler for the yellow-pages collection.

    Search terms are matched against the ``name`` and ``keywords`` fields.
    """

    # alternative location noted by the author: db 'webscraping_dataLabKiel', collection 'yellow_pages'
    def connect_db(self,Client=MongoClient,client_str='mongodb://localhost:27017', db_str='sh_data_collection', coll_str='yp_kiel'):
        """Connect to the collection; the defaults point at a local MongoDB."""
        return super().connect_db(Client,client_str,db_str,coll_str)

    def design_query_dict(self, input):
        """Build the MongoDB filter for a search and store it in ``self.query_dict``.

        Args:
            input: mapping describing the search. Recognised keys:
                ``"what"`` - a search term or a list of terms; each one is
                matched literally and case-insensitively as a prefix of
                ``name`` or ``keywords``.
                ``"all"`` - when present, every document matches (this
                overrides ``"what"``).

        Without a usable key the filter matches nothing, so an empty search
        does not return the whole collection.
        """
        import re  # local import: keeps this class independent of the import cell

        query_terms = []

        if "what" in input:
            terms = input["what"]
            if isinstance(terms, str):
                # a bare string would otherwise be searched letter by letter
                terms = [terms]
            for val in terms:
                # escape the term so regex metacharacters in it are taken
                # literally instead of changing (or breaking) the pattern
                pattern = "^" + re.escape(str(val))
                for field in ("name", "keywords"):
                    query_terms.append({field: {"$regex": pattern, "$options": "i"}})

        if "all" in input:
            query_terms = super().design_query_dict()

        if not query_terms:
            # impossible id: return nothing rather than everything
            query_terms.append({"_id":"thisisanimpossibleid"})

        self.query_dict = {"$or":query_terms}

    def retrieve_data(self):
        """Fetch the documents matching ``self.query_dict`` into ``self.data``."""
        return super().retrieve_data()

    # TODO: add this
    def is_open(self):
        """Placeholder, not implemented.

        Idea: keep only the entries of ``self.data`` whose opening hours
        include the current time.
        """
        pass

    def check_click_radius(self, click_point, radius, data):
        """Narrow ``self.data`` to the records of ``data`` within ``radius`` km of ``click_point``."""
        return super().check_click_radius(click_point, radius, data)
    

In [5]:
class RENT_queryer(DBHandlers):
    """Query handler for the average-rent collection."""

    def connect_db(self,Client=MongoClient,client_str='mongodb://localhost:27017', db_str='webscraping_dataLabKiel', coll_str='avg_rent'):
        """Connect to the collection; the defaults point at a local MongoDB."""
        return super().connect_db(Client,client_str,db_str,coll_str)

    def design_query_dict(self, input):
        """Store the rent filter in ``self.query_dict``.

        ``"rent"`` and ``"all"`` both select the documents whose ``collected``
        field is the string ``"True"``; any other input selects nothing.
        """
        wants_rent = "rent" in input.keys() or "all" in input.keys()
        if wants_rent:
            query_terms = [{"collected":"True"}]
        else:
            # impossible id, so that nothing is returned instead of everything
            query_terms = [{"_id":"thisisanimpossibleid"}]
        self.query_dict = {"$or":query_terms}

    def retrieve_data(self):
        """Fetch the documents matching ``self.query_dict`` into ``self.data``."""
        self.data = list(self.collection.find(self.query_dict))

    # still to be added
    def is_open(self):
        """Placeholder without behaviour (an opening-hours filter was planned)."""
        return None

    def check_click_radius(self, click_point, radius, data):
        """Delegate the radius filter to the base implementation."""
        return super().check_click_radius(click_point=click_point, radius=radius, data=data)
    

In [6]:
class OutputManager():
    """Runs searches through the collection handlers and renders the hits on a folium map.

    Attributes written by the methods below:
        osm_data, yp_data, rent_data: raw hit lists per handler.
        data: the three hit lists concatenated (filter searches only).
        results: de-duplicated hits (list of dicts) or, for an aggregate
            search, a DataFrame with the columns ``amenity`` and ``count``.
        m: the folium map, or ``False`` when no map could be built.
    """

    # The aggregate view lists only amenities that occur more often than this.
    AMENITY_COUNT_THRESHOLD = 20

    #-------NORMAL MARKER PLOT --------------------------------------------
    @staticmethod
    def _has_value(row, key):
        """Tell whether ``row[key]`` exists and is neither ``None`` nor NaN."""
        try:
            value = row[key]
        except (KeyError, IndexError, TypeError):
            return False
        if value is None:
            return False
        # NaN is what pandas fills in for keys a record did not have;
        # it is the only float that is unequal to itself
        return not (isinstance(value, float) and value != value)

    @staticmethod
    def _business_popup(df_row):
        """Popup HTML for an OSM / yellow-pages record; KeyError if a field is missing."""
        name = f"<b>{df_row['name']}</b><br><br>"
        link = f"<i>link:</i>: <a href='{df_row['sameAs']}'>{df_row['sameAs']}</a><br>"
        tel = f"<i>tel.:</i> {df_row['telephone']}<br>"
        addr = f"<i>address:</i> {df_row['address']}<br>"
        amenity = f"<i>amenity:</i>: {df_row['amenity']}"
        return name + link + tel + addr + amenity

    def _rent_popup(self, df_row):
        """Popup HTML for an average-rent record; KeyError if a field is missing."""
        if self._has_value(df_row, "Landkreis") and df_row["Landkreis"] != []:
            name = f"<b>{df_row['Landkreis']}</b><br><br>"
        else:
            name = f"<b>{df_row['Stadt']}</b><br><br>"
        plz = f"<i>postcode:</i> {df_row['PLZ']}<br>"
        rent = f"<i>rent:</i> {df_row['average_rent']}€ per m²"
        return name + plz + rent

    def popupStr_generator(self, df_row):
        """Build the popup HTML for one result row.

        Args:
            df_row: mapping-like row (DataFrame row or dict).

        Returns:
            The HTML string, or ``None`` when the row fits neither layout.

        A row with a real ``average_rent`` value gets the rent layout first.
        In a frame mixing rent and business records the business columns
        exist for every row (filled with NaN), so trying the business layout
        first would give rent rows a popup made of "nan" only.
        """
        if self._has_value(df_row, "average_rent"):
            builders = (self._rent_popup, self._business_popup)
        else:
            builders = (self._business_popup, self._rent_popup)
        for build in builders:
            try:
                return build(df_row)
            except (KeyError, IndexError, TypeError, ValueError):
                continue  # row lacks a field of this layout: try the other one
        return None

    def tooltip_generator(self,row):
        """Return the hover text: the name if there is one, else the rent, else ``""``."""
        if self._has_value(row, "name"):
            return row["name"]
        if self._has_value(row, "average_rent"):
            # single quotes inside the f-string: reusing the outer quote type
            # is a SyntaxError before Python 3.12
            return f"{row['average_rent']}€ m²"
        return ""

    #so make sure data has no duplicates and all the naming is the same
    def plot_spots(self,data,z):
        """Draw one marker per record that has numeric ``lat``/``lon`` values.

        Args:
            data: records (list of dicts) to plot.
            z: initial zoom level of the map.

        Sets ``self.m`` to the map, or to ``False`` (after printing a notice)
        when the coordinates are missing or unusable.
        """
        poi_df = pd.DataFrame(data) #compile business sample into df
        try:
            poi_df["lat"] = pd.to_numeric(poi_df["lat"])
            poi_df["lon"] = pd.to_numeric(poi_df["lon"])
            poi_df = poi_df.dropna(subset=["lat","lon"])
            if poi_df.empty:
                raise ValueError("no record with usable coordinates")
            # center the map around the mean latitude and longitude of the POIs
            map_center = [poi_df['lat'].mean(), poi_df['lon'].mean()]
            self.m = folium.Map(location=map_center, zoom_start=z, tiles='OpenStreetMap')

            for _, row in poi_df.iterrows():
                folium.Marker(
                    location=[row['lat'], row['lon']],
                    popup=self.popupStr_generator(row),
                    tooltip=self.tooltip_generator(row)
                ).add_to(self.m)
        except (KeyError, TypeError, ValueError):
            print("no map available due to e.g. naming errors")
            self.m = False

    def display_output(self):
        """Plot ``self.results`` as markers."""
        self.plot_spots(data=self.results,z=8) #z is level of zoom

    #-------AGGREGATE PLOT------------------------------------

    def plot_aggregates(self,click_point,data,z):
        """Draw a single marker at ``click_point`` whose popup lists amenity counts.

        Args:
            click_point: map center and marker location.
            data: DataFrame with the columns ``amenity`` and ``count``.
            z: initial zoom level of the map.

        Sets ``self.m`` to the map, or to ``False`` (after printing a notice)
        when the input cannot be plotted.
        """
        try:
            self.m = folium.Map(location=click_point, zoom_start=z)
            #create text dynamically
            popup_text = f"""<h4>Aggregate Info for selected location:{click_point}</h4>
                            <p>Here is a small summery of which amenities you can find at this location. Based only on OSM data as of now</p>
                            <ul>"""
            for _,row in data.iterrows():
                amenity = row["amenity"]
                count = row["count"]
                popup_text += f"<li>{amenity}: {count}</li>"
            popup_text += "</ul>"
            #add one marker with text
            folium.Marker(
                location=click_point,
                popup=folium.Popup(popup_text, max_width=300),
                tooltip="Click me!"
            ).add_to(self.m)
        except (AttributeError, KeyError, TypeError, ValueError):
            print("no map available due to e.g. naming errors")
            self.m = False

    def display_aggregates(self,click_point):
        """Plot ``self.results`` as an aggregate popup at ``click_point``."""
        self.plot_aggregates(click_point,data=self.results,z=8) #z is level of zoom

    #-------INIT HANDLER FOR EACH COLLECTION AND RETURN DATA------
    def initiate_search(self,Querier,query,point=None,radius=None):
        """Run one handler end to end and return its hits.

        Args:
            Querier: handler class, instantiated without arguments.
            query: search description handed to ``design_query_dict``.
            point, radius: optional radius filter; it is applied only when
                both are given.
        """
        q = Querier()
        q.connect_db()
        q.design_query_dict(query)
        q.retrieve_data()
        if (point is not None) and (radius is not None):
            q.check_click_radius(click_point=point,radius=radius, data=q.data)
        return q.data

    #-------REMOVE DUPLICATES BY NAME------------------------
    def integrate_data(self,dict_list, new_dicts, key):
        """Append the items of ``new_dicts`` to ``dict_list``, skipping duplicates.

        An item is a duplicate when an already collected item has the same
        value under ``key``. Items that do not have ``key`` at all are always
        appended. ``dict_list`` is modified in place and returned.
        """
        for e in new_dicts:
            if key not in e:
                dict_list.append(e)
                continue
            if not any(e.get(key) == d.get(key) for d in dict_list):
                dict_list.append(e)
        return dict_list

    #-------WRAPPER FOR CALLING ABOVE FUNCTIONS--------
    def perform_search(self,**kwargs):
        """Run a search and build the map in ``self.m``.

        Keyword Args:
            query: search description. Searches all three collections,
                de-duplicates the hits by ``name`` and plots them as markers.
            point, radius: optional click point and radius in km. The filter
                search applies them only when both are passed; the aggregate
                search requires both.
            aggregate: used when ``query`` is absent. Counts the OSM
                amenities around ``point`` and plots those occurring more
                than ``AMENITY_COUNT_THRESHOLD`` times.

        Nothing happens when neither ``query`` nor ``aggregate`` is passed.
        """
        if "query" in kwargs:
            query = kwargs["query"]
            if "point" in kwargs and "radius" in kwargs:
                # click + filter (an empty filter with "all" returns everything nearby)
                point, radius = kwargs["point"], kwargs["radius"]
            else:
                # no click + filter
                point = radius = None
            self.osm_data = self.initiate_search(OSM_queryer,query=query,point=point,radius=radius)
            self.yp_data = self.initiate_search(YP_queryer,query=query,point=point,radius=radius)
            self.rent_data = self.initiate_search(RENT_queryer,query=query,point=point,radius=radius)
            self.data = self.osm_data + self.yp_data + self.rent_data
            self.results = self.integrate_data([],self.data,key="name")
            self.display_output()
        #get count of amenities in osm data
        elif "aggregate" in kwargs:
            point, radius = kwargs["point"], kwargs["radius"]
            self.osm_data = self.initiate_search(OSM_queryer,query={"all":True},point=point,radius=radius)
            osm_df = pd.DataFrame(self.osm_data)
            if "amenity" in osm_df.columns:
                counts = osm_df["amenity"].value_counts()
            else:
                # no hits (or none with an amenity): keep an empty count table
                counts = pd.Series(dtype="int64")
            counts = counts[counts > self.AMENITY_COUNT_THRESHOLD]
            # name both columns explicitly: what value_counts() calls its
            # index and values differs between pandas 1.x and 2.x
            self.results = counts.rename_axis("amenity").reset_index(name="count")
            self.display_aggregates(click_point=point)

--- 

## Try it out: do the data handlers do their thing?

---

In [7]:
# OSM handler on its own: prefix search for "pub", then narrow the hits
# to those closer than `radius` km to the click point.
click_point, radius = (54.323334, 10.139444), 5
i = {"what": ["pub"]}

c = OSM_queryer()
c.connect_db()
c.collection  # bare attribute access; nothing is displayed mid-cell
c.design_query_dict(i)
print(c.query_dict)

c.retrieve_data()
results = c.data
print("len of result pre range filter:", len(results))

c.check_click_radius(click_point=click_point, radius=radius, data=results)
results = c.data
print("len of result post range filter:", len(results))

{'$or': [{'name': {'$regex': '^pub', '$options': 'i'}}, {'amenity': {'$regex': '^pub', '$options': 'i'}}]}
len of result pre range filter: 611
len of result post range filter: 59


In [None]:
#results

In [8]:
# Yellow-pages handler on its own, pointed at the 'yellow_pages' collection
# instead of its default; the hits are not de-duplicated at this stage.
click_point, radius = (54.511654, 13.638421), 1
i = {"what": ["reparatur", "repair", "Reparatur"]}

c = YP_queryer()
c.connect_db(db_str="webscraping_dataLabKiel", coll_str="yellow_pages")
print(c.collection)
c.design_query_dict(i)
print(c.query_dict)

c.retrieve_data()
results = c.data
print("len of result pre range filter:", len(results))

c.check_click_radius(click_point=click_point, radius=radius, data=results)
results = c.data
print("len of result post range filter:", len(results))

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'webscraping_dataLabKiel'), 'yellow_pages')
{'$or': [{'name': {'$regex': '^reparatur', '$options': 'i'}}, {'keywords': {'$regex': '^reparatur', '$options': 'i'}}, {'name': {'$regex': '^repair', '$options': 'i'}}, {'keywords': {'$regex': '^repair', '$options': 'i'}}, {'name': {'$regex': '^Reparatur', '$options': 'i'}}, {'keywords': {'$regex': '^Reparatur', '$options': 'i'}}]}
len of result pre range filter: 731
len of result post range filter: 258


In [9]:
# Rent handler on its own: fetch the collected rent records, then keep
# those closer than `radius` km to the click point.
click_point, radius = (54.323334, 10.139444), 10
i = {"rent": True}

c = RENT_queryer()
c.connect_db()
c.collection  # bare attribute access; nothing is displayed mid-cell
c.design_query_dict(i)
print(c.query_dict)

c.retrieve_data()
results = c.data
print("len of result pre range filter:", len(results))

c.check_click_radius(click_point=click_point, radius=radius, data=results)
results = c.data
print("len of result post range filter:", len(results))

{'$or': [{'collected': 'True'}]}
len of result pre range filter: 386
len of result post range filter: 20


----

## Does the `OutputManager` manage its things?

In [10]:
# Click + filter. Only OSM hits are expected here: the yellow pages
# have no entries for these terms.
i = {"what": ["pub", "bar", "club"]}
click_point, radius = (53.8682, 10.6873), 0.5

om = OutputManager()
om.perform_search(query=i, point=click_point, radius=radius)
print("we found", len(om.results), "matches for your query!")
om.m

we found 38 matches for your query!


This returns both OSM and YP data, but it also shows that the regex expressions need fixing: 'asia' returns 19 results, while 'asian' returns nothing.

In [None]:
# Click + filter with a term that hits both OSM and the yellow pages.
i = {"what": ["asia"]}
click_point, radius = (54.323334, 10.139444), 5

om = OutputManager()
om.perform_search(query=i, point=click_point, radius=radius)
print("we found", len(om.results), "matches for your query!")  # may still contain duplicates
# om.m.save("osm_yp_asia.html")  # uncomment to export the map
om.m

---

Does the `all` flag work?

In [None]:
# Click + "all": everything within `radius` km of the click point.
i = {"all": True}
click_point, radius = (54.323334, 10.139444), 0.1

om = OutputManager()
om.perform_search(query=i, point=click_point, radius=radius)
print("we found", len(om.results), "matches for your query!")  # may still contain duplicates
# om.m.save("all_flag.html")  # uncomment to export the map
om.m

Filter only, without a click point: does it work?

In [None]:
# Filter without a click point; the German term adds roughly ten more hits.
i = {"what": ["ferry", "fähre"]}

om = OutputManager()
om.perform_search(query=i)
print("we found", len(om.results), "matches for your query!")  # may still contain duplicates
# om.m.save("no_click.html")  # uncomment to export the map
om.m

Does the rent search work?

In [None]:
# Click + rent flag: average-rent records within `radius` km of the click point.
i = {"rent": True}
click_point, radius = (54.323334, 10.139444), 50

om = OutputManager()
om.perform_search(query=i, point=click_point, radius=radius)
print("we found", len(om.results), "matches for your query!")  # may still contain duplicates
# om.m.save("rent.html")  # uncomment to export the map
om.m

Does aggregation work?
- Right now only OSM data is aggregated.

In [None]:
# Aggregate view: one marker at the click point listing amenity counts.
click_point, radius = (54.323334, 10.139444), 1

om = OutputManager()
om.perform_search(aggregate=True, point=click_point, radius=radius)
# here om.results is the count table, so no match count is printed
# om.m.save("easy_agg.html")  # uncomment to export the map
om.m

---

In [None]:
# Click + rent flag combined with a search term; the map is not displayed here.
i = {"rent": True, "what": ["pub"]}
click_point, radius = (54.323334, 10.139444), 5

om = OutputManager()
om.perform_search(query=i, point=click_point, radius=radius)
print("we found", len(om.results), "matches for your query!")  # may still contain duplicates
# om.m.save("all_flag.html")  # uncomment to export the map
# om.m  # uncomment to display the map