In [3]:
from abc import ABC, abstractmethod
from typing import Any
from pymongo import MongoClient
from geopy.distance import geodesic
import folium
import pandas as pd

In [4]:
class DBHandlers(ABC):

    @abstractmethod 
    def connect_db(self,Client,client_str,db_str,coll_str): 
        client = Client(client_str) #connect to mongodb client
        db = client[db_str] #connect to database
        existing_collections = db.list_collection_names() #check that dbs collections
        if coll_str not in existing_collections:
            db.create_collection(coll_str) #create collection if needed
        self.collection = db[coll_str] #connect to collection

    @abstractmethod
    def design_query_dict(self,input):
        pass

    @abstractmethod
    def retrieve_data(self):
        self.data = [elem for elem in self.collection.find(self.query_dict)]

    @abstractmethod
    def check_click_radius(self,click_point,radius):
        in_radius = []
        for e in self.data:
            try:
                #try like this because of naming irregularities with yp and osm data
                try:
                    lat, lon = e["latitude"],e["longitude"]
                except KeyError:
                    lat, lon = e["lat"],e["lon"]
                other_point = (lat,lon)
                dist = geodesic(click_point, other_point).km
                if dist < radius:
                    in_radius.append(e)
            except:
                pass
        self.data = in_radius


In [5]:
class OSM_queryer(DBHandlers):

    def connect_db(self,Client=MongoClient,client_str='mongodb://localhost:27017', db_str='webscraping_dataLabKiel', coll_str='osm_pois'):
        return super().connect_db(Client,client_str,db_str,coll_str)


    def design_query_dict(self,input):
        query_terms = []

        if "what_general" in input.keys():
                for val in input["what_general"]:
                        query_terms.append({"amenity": { "$regex": rf"^{val}", "$options": "i" }})

        if "what_specific" in input.keys():
                for val in input["what_specific"]:
                        query_terms.append({"name":{ "$regex": rf"^{val}", "$options": "i" }})
                        query_terms.append({"amenity": { "$regex": rf"^{val}", "$options": "i" }})

        if query_terms == []:
                query_terms.append({"_id":"thisisanimpossibleid"}) #so that if nothing is entered nothing will be returned instead of everything
                
        self.query_dict = {"$or":query_terms}


    def retrieve_data(self):
        #self.data = [elem for elem in self.collection.find(self.query_dict)]
        return super().retrieve_data()


    def check_click_radius(self, click_point, radius):
          return super().check_click_radius(click_point, radius)
    


In [6]:
class YP_queryer(DBHandlers):

    #webscraping_dataLabKiel yellow_pages
    def connect_db(self,Client=MongoClient,client_str='mongodb://localhost:27017', db_str='sh_data_collection', coll_str='yp_kiel'):
        return super().connect_db(Client,client_str,db_str,coll_str)

    def design_query_dict(self, input):
        query_terms = []

        if "what_general" in input.keys():
                for val in input["what_general"]:
                        query_terms.append({"keywords": { "$regex": rf"^{val}", "$options": "i" }})

        if "what_specific" in input.keys():
                for val in input["what_specific"]:
                        query_terms.append({"name":{ "$regex": rf"^{val}", "$options": "i" }})
                        query_terms.append({"keyword":{ "$regex": rf"^{val}", "$options": "i" }})

        if query_terms == []:
                query_terms.append({"_id":"thisisanimpossibleid"}) #so that if nothing is entered nothing will be returned instead of everything
                
        self.query_dict = {"$or":query_terms}
    

    def retrieve_data(self):
        self.data = [elem for elem in self.collection.find(self.query_dict)]

    def is_open(self):
          #self.data if time.now is within opening hours would be cool!
          pass
    
    def check_click_radius(self, click_point, radius):
          return super().check_click_radius(click_point, radius)
    

In [None]:
class OutputManager():

    def manage_search(self,Querier,query,point,radius):
        q = Querier()
        q.connect_db()
        q.design_query_dict(query)
        q.retrieve_data()
        q.check_click_radius(click_point=point,radius=radius)
        return q.data

    def integrate_data(self):
        #manages all the results from the children db handlers and integrates the data
        #manage by making one large list of relevant results? without duplicates
        # self.results = self.osm_data + self.yp_data etc
        pass

    def popupStr_generator(self,df_row):
        pps = f"<b>{df_row["name"]}</b>"
        features = df_row.index
        values = df_row.values
        for f,v in zip(features,values):
            if f in ["telephone","amenity","sameAs","address"]:
                pps += f"<br>{f}: {v}"
            else:
                pps += ""
        return pps
    
    def plot_spots(self,data,z):
            poi_df = pd.DataFrame(data) #compile business sample into df
            try:
                if "latitude" in poi_df.columns:
                    if "lat" in poi_df.columns:
                        poi_df.drop(columns=["lat","lon"],inplace=True)
                    poi_df.rename(columns={"latitude":"lat","longitude":"lon"},inplace=True)

                if "coords" in poi_df.columns:
                    poi_df[["lat","lon"]] = poi_df["coords"].apply(pd.Series)
                    
                poi_df["lat"] = pd.to_numeric(poi_df["lat"])
                poi_df["lon"] = pd.to_numeric(poi_df["lon"])
                poi_df.dropna(subset=["lat","lon"],inplace=True)
                map_center = [poi_df['lat'].mean(), poi_df['lon'].mean()] # Center the map around the mean latitude and longitude of the POIs
                self.m = folium.Map(location=map_center, zoom_start=z, tiles='OpenStreetMap') # Initialize the folium map with OpenStreetMap tiles

                # Add markers with popups for each POI
                for _, row in poi_df.iterrows():
                    folium.Marker(
                        location=[row['lat'], row['lon']],
                        popup=self.popupStr_generator(row),
                        tooltip=row['name']
                    ).add_to(self.m)
            except:
                print("no map available due to e.g. naming errors")
                self.m = False
            
    def display_output(self):
        self.plot_spots(data=self.data,z=15) #z is level of zoom
    
    # do this with args / kwargs so that one can search only by passing a point or only by passing a query
    def perform_search(self,query,point,radius):
        self.osm_data = self.manage_search(OSM_queryer,query=query,point=point,radius=radius)
        self.yp_data = self.manage_search(YP_queryer,query=query,point=point,radius=radius)
        self.data = self.osm_data + self.yp_data
        self.display_output()


--- 

Try it out: do the datahandler do their thing?

---

In [8]:
#with yp, try different collection: only kiel data
click_point = (54.323334, 10.139444)
radius = 1

c = OSM_queryer()
c.connect_db()
c.collection
i = {"what_general":["pub"]}
c.design_query_dict(i)
print(c.query_dict)
c.retrieve_data()
results = c.data
print(f"len of result pre range filter: {len(results)}")
c.check_click_radius(click_point=click_point,radius=radius)
results = c.data
print(f"len of result post range filter: {len(results)}")

{'$or': [{'amenity': {'$regex': '^pub', '$options': 'i'}}]}
len of result pre range filter: 611
len of result post range filter: 12


In [9]:
click_point = (54.511654, 13.638421)
radius = 1

c = YP_queryer()
c.connect_db(db_str="webscraping_dataLabKiel",coll_str="yellow_pages")
print(c.collection)
i = {"what_specific":["reparatur","repair","Reparatur"], "what_general":["reparatur","repair"]}
c.design_query_dict(i)
print(c.query_dict)
c.retrieve_data()
results = c.data
print(f"len of result pre range filter: {len(results)}")
c.check_click_radius(click_point=click_point,radius=radius)
results = c.data
print(f"len of result post range filter: {len(results)}") #lots double i imagine

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'webscraping_dataLabKiel'), 'yellow_pages')
{'$or': [{'keywords': {'$regex': '^reparatur', '$options': 'i'}}, {'keywords': {'$regex': '^repair', '$options': 'i'}}, {'name': {'$regex': '^reparatur', '$options': 'i'}}, {'keyword': {'$regex': '^reparatur', '$options': 'i'}}, {'name': {'$regex': '^repair', '$options': 'i'}}, {'keyword': {'$regex': '^repair', '$options': 'i'}}, {'name': {'$regex': '^Reparatur', '$options': 'i'}}, {'keyword': {'$regex': '^Reparatur', '$options': 'i'}}]}
len of result pre range filter: 731
len of result post range filter: 258


----

does query manager manage everything?

In [10]:
click_point = (53.8682,10.6873)
radius = 0.5
i = {"what_general":["pub"]}

om = OutputManager()
om.perform_search(i,click_point,radius)
print(f"we found {len(om.data)} matches for your query!")
#om.m

we found 26 matches for your query!


In [11]:
click_point = (54.323334, 10.139444)
radius = 10
i = {"what_specific":["pizza","italian","italienisch","itali"], "what_general":["pizza","italian","italienisch","itali"]}

om = OutputManager()
om.perform_search(i,click_point,radius)
print(f"we found {len(om.data)} matches for your query!") #but they are double still
#om.m

we found 83 matches for your query!


In [12]:
click_point = (54.323334, 10.139444)
radius = 5
i = {"what":["umzug","umzüge","transport"], "what_general":["umzug","umzüge","transport"]}

om = OutputManager()
om.perform_search(i,click_point,radius)
print(f"we found {len(om.data)} matches for your query!") #but they are double still
#om.m

we found 256 matches for your query!


----

# aggregate search

In [26]:
def set_up_mongo(client_str,database_str,collection_str):
    client = MongoClient(client_str) #connect to mongodb client
    db = client[database_str] #connect to database

    existing_collections = db.list_collection_names() #check that dbs collections
    if collection_str not in existing_collections:
        db.create_collection(collection_str) #create collection if needed
    
    my_collection = db[collection_str] #connect to collection

    return my_collection


collection = set_up_mongo('mongodb://localhost:27017','webscraping_dataLabKiel','avg_rent') 
collection_new = set_up_mongo('mongodb://localhost:27017','webscraping_dataLabKiel','avg_rent_v2') 


for x in collection.find():
    id = x["_id"]
    coords = x["coords"]
    lat = coords["lat"]
    lon = coords["lon"]
    x.update({"lat":lat,"lon":lon})
    #collection_new.insert_one(x)
    collection.update_one({"_id":id},{"$set":{"lat":lat,"lon":lon}})