In [1]:
import json
import re
from abc import ABC, abstractmethod
from datetime import timedelta
from typing import Any

import folium
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
from bson import ObjectId
from geopy.distance import geodesic
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from pymongo import MongoClient
from shapely import from_wkt
from shapely.geometry import shape, Point
from thefuzz import fuzz

# Database handlers

## parent class

In [2]:
class DBHandlers(ABC):
    """Abstract base class for the data-source handlers in this notebook.

    Every method is declared abstract, so a concrete handler has to override
    all of them.  The methods that carry a body here are shared default
    implementations which the subclasses call through ``super()``.
    """

    # CSV with the boundary polygons read by ``click_polygon``; it must have
    # a ``name`` column and a ``geometry`` column holding WKT strings.
    BOUNDARIES_CSV = "sh_boundaries_6.csv"

    @abstractmethod
    def connect_db(self, Client, client_str, db_str, coll_str):
        """Open a collection and store it on ``self.collection``.

        Args:
            Client: callable that takes ``client_str`` and returns a client
                object supporting ``client[db_str]``.
            client_str: connection string passed to ``Client``.
            db_str: name of the database.
            coll_str: name of the collection; it is created when missing.
        """
        client = Client(client_str)
        db = client[db_str]
        if coll_str not in db.list_collection_names():
            db.create_collection(coll_str)
        self.collection = db[coll_str]

    @abstractmethod
    def design_query_dict(self):
        """Return the ``$or`` terms that match every document."""
        # {"$or": [{}, {}]} returns everything
        return [{}, {}]

    @abstractmethod
    def retrieve_data(self):
        """Run ``self.query_dict`` on ``self.collection`` into ``self.data``."""
        self.data = list(self.collection.find(self.query_dict))

    @abstractmethod
    def check_click_radius(self, click_point, radius, data):
        """Keep the records closer than ``radius`` km to ``click_point``.

        Args:
            click_point: ``(lat, lon)`` of the reference point.
            radius: maximum geodesic distance in kilometres (exclusive).
            data: iterable of records carrying either ``lat``/``lon`` or
                ``latitude``/``longitude`` keys.

        The matching records are stored on ``self.results``.  Records whose
        coordinates are missing or cannot be measured are skipped.
        """
        in_radius = []
        for e in data:
            try:
                # The coordinate keys are named differently across sources
                # (yp vs. osm data), so try both spellings.
                try:
                    lat, lon = e["lat"], e["lon"]
                except KeyError:
                    lat, lon = e["latitude"], e["longitude"]
                if geodesic(click_point, (lat, lon)).km < radius:
                    in_radius.append(e)
            except Exception:
                # Best effort: a broken record must not abort the filter.
                # ``Exception`` (not a bare except) lets KeyboardInterrupt
                # and SystemExit propagate.
                continue
        self.results = in_radius

    @abstractmethod
    def click_polygon(self, click_point, data):
        """Keep the records lying in the boundary polygon that was clicked.

        Args:
            click_point: ``(lat, lon)`` of the click.
            data: iterable of records; only those with non-``None`` ``lon``
                and ``lat`` keys are considered.

        Returns:
            DataFrame (columns ``name``, ``geometry``) with the boundary
            polygons that contain the click.  The records inside any of those
            polygons are stored on ``self.results``.
        """
        df = pd.read_csv(self.BOUNDARIES_CSV)[["name", "geometry"]].copy()
        df["geometry"] = df["geometry"].apply(from_wkt)

        # shapely expects (x, y) == (lon, lat)
        lat, lon = click_point
        clicked = Point(lon, lat)
        poly = df[df["geometry"].apply(lambda area: area.contains(clicked))]

        # Hoisted out of the loop: the candidate polygons never change.
        areas = list(poly["geometry"])
        self.results = []
        for e in data:
            if "lon" in e and "lat" in e and e["lon"] is not None and e["lat"] is not None:
                try:
                    location = Point(float(e["lon"]), float(e["lat"]))
                except (TypeError, ValueError):
                    # non-numeric coordinates cannot be placed on the map
                    continue
                if any(area.contains(location) for area in areas):
                    self.results.append(e)
        return poly

    @abstractmethod
    def no_duplicates(self, dict_list, new_dicts, key):
        """Append to ``dict_list`` the new records not already present.

        A record from ``new_dicts`` is appended when it lacks ``key`` or when
        no record already in ``dict_list`` has the same value under ``key``.
        ``dict_list`` is modified in place and also returned.
        """
        for e in new_dicts:
            # records without the relevant key are always kept
            if key not in e:
                dict_list.append(e)
                continue
            # only add when no collected record shares the value
            if not any(e.get(key) == d.get(key) for d in dict_list):
                dict_list.append(e)
        return dict_list

    @abstractmethod
    def retrieve_by_id(self, Client, client_str, db_str, coll_str, batch_ids):
        """Connect and load the documents whose ``_id`` is in ``batch_ids``.

        The documents are stored on ``self.data``.
        """
        self.connect_db(Client, client_str, db_str, coll_str)
        self.data = list(self.collection.find({"_id": {"$in": batch_ids}}))

    @abstractmethod
    def popupStr_generator(self, df_row):
        """Build the popup text for one record (implemented by subclasses)."""

    @abstractmethod
    def tooltip_generator(self, df_row):
        """Build the tooltip text for one record (implemented by subclasses)."""

    @abstractmethod
    def gestalte_map(self):
        """Add this handler's markers to a map (implemented by subclasses)."""

    @abstractmethod
    def orderly_output(self):
        """Return a tidy list of result dicts (implemented by subclasses)."""



## OSM

In [3]:
class OSM_queryer(DBHandlers):
    """Handler for the ``osm_pois`` collection (points of interest)."""

    def connect_db(self,Client=MongoClient,client_str='mongodb://localhost:27017', db_str='webscraping_dataLabKiel', coll_str='osm_pois'):
        """Connect to the POI collection; see ``DBHandlers.connect_db``."""
        return super().connect_db(Client,client_str,db_str,coll_str)

    def design_query_dict(self,input):
        """Build ``self.query_dict`` from the search request ``input``.

        Recognised keys of ``input``:
            ``what``: iterable of search terms; each is matched literally and
                case-insensitively as a prefix of ``name`` or ``amenity``.
            ``all``: when present, match every document.
            ``by_ids``: list of ``_id`` values to include.

        With none of these the query matches nothing.
        """
        query_terms = []

        if "what" in input:
            for val in input["what"]:
                # Escape the term so user text is matched literally instead
                # of being interpreted as a regular expression.
                prefix = "^" + re.escape(str(val))
                query_terms.append({"name": {"$regex": prefix, "$options": "i"}})
                query_terms.append({"amenity": {"$regex": prefix, "$options": "i"}})

        if "all" in input:
            query_terms = super().design_query_dict()

        if "by_ids" in input:
            query_terms.append({"_id": {"$in": input["by_ids"]}})

        if not query_terms:
            # if nothing is entered, return nothing instead of everything
            query_terms.append({"_id": "thisisanimpossibleid"})

        self.query_dict = {"$or": query_terms}

    def retrieve_data(self, **kwargs):
        """Load the query result into ``self.data`` (``kwargs`` are ignored)."""
        return super().retrieve_data()

    def check_click_radius(self, click_point, radius, data):
        """See ``DBHandlers.check_click_radius``."""
        return super().check_click_radius(click_point, radius, data)

    def no_duplicates(self, dict_list, new_dicts, key):
        """See ``DBHandlers.no_duplicates``."""
        return super().no_duplicates(dict_list, new_dicts, key)

    def popupStr_generator(self, df_row):
        """Return the popup HTML for one POI, or ``None`` if a field is missing."""
        try:
            name = f"<b>{df_row['name']}</b><br><br>"
            amenity = f"<i>amenity:</i>: {df_row['amenity']}"
            return name + amenity
        except Exception:
            return None

    def tooltip_generator(self,row):
        """Return the POI name, or an empty string when it is unavailable."""
        try:
            return row['name']
        except Exception:
            return ""

    def gestalte_map(self,map,data):
        """Add one gray marker per record of ``data`` to ``map``.

        Records whose ``lat``/``lon`` are missing or not numeric are skipped,
        so a single bad record no longer suppresses the other markers.

        Returns:
            The same ``map`` object.
        """
        poi_df = pd.DataFrame(data)
        if not {"lat", "lon"}.issubset(poi_df.columns):
            return map
        try:
            poi_df["lat"] = pd.to_numeric(poi_df["lat"], errors="coerce")
            poi_df["lon"] = pd.to_numeric(poi_df["lon"], errors="coerce")
        except Exception:
            return map

        for _, row in poi_df.dropna(subset=["lat", "lon"]).iterrows():
            try:
                folium.Marker(
                    location=[row['lat'], row['lon']],
                    popup=self.popupStr_generator(row),
                    tooltip=self.tooltip_generator(row),
                    icon=folium.Icon(color="gray")
                ).add_to(map)
            except Exception:
                # best effort: skip a record folium cannot render
                continue
        return map

    def display_output(self,map,data):
        """Draw ``data`` on ``map`` and return the map."""
        return self.gestalte_map(map=map,data=data)

    def click_polygon(self, click_point, data):
        """See ``DBHandlers.click_polygon``."""
        return super().click_polygon(click_point, data)

    def plot_polygon(self,polygons,map):
        """Draw the boundary ``polygons`` and the markers of ``self.results``.

        Args:
            polygons: frame with ``name`` and ``geometry`` columns, as
                returned by ``click_polygon``.
            map: folium map to draw on.

        Returns:
            The same ``map`` object.
        """
        # add county area
        gdf = gpd.GeoDataFrame(polygons, geometry='geometry')
        gdf = gdf.set_crs(epsg=4326)
        for _, row in gdf.iterrows():
            # simplify the outline to keep the rendered GeoJSON small
            sim_geo = gpd.GeoSeries(row["geometry"]).simplify(tolerance=0.001)
            geo_j = folium.GeoJson(data=sim_geo.to_json(), style_function=lambda x: {"fillColor": "lightgray"})
            folium.Popup(row["name"]).add_to(geo_j)
            geo_j.add_to(map)

        # add POI markers
        self.display_output(map=map,data=self.results)

        return map

    def orderly_output(self, data):
        """Return ``[{"name": ..., "amenity": ...}]`` for the records.

        A field that a record does not have is reported as ``None``.
        """
        return [
            {"name": df_row.get("name"),
             "amenity": df_row.get("amenity")}
            for df_row in data
        ]

    def retrieve_by_id(self, Client, client_str, db_str, coll_str, batch_ids):
        """See ``DBHandlers.retrieve_by_id``."""
        return super().retrieve_by_id(Client, client_str, db_str, coll_str, batch_ids)
            

## YELLOW PAGES

In [4]:
class YP_queryer(DBHandlers):
    """Handler for the ``yp_kiel`` collection (yellow-pages businesses)."""

    def connect_db(self,Client=MongoClient,client_str='mongodb://localhost:27017', db_str='sh_data_collection', coll_str='yp_kiel'):
        """Connect to the business collection; see ``DBHandlers.connect_db``."""
        return super().connect_db(Client,client_str,db_str,coll_str)

    def design_query_dict(self, input):
        """Build ``self.query_dict`` from the search request ``input``.

        Recognised keys of ``input``:
            ``what``: iterable of search terms; each is matched literally and
                case-insensitively as a prefix of ``name`` or ``keywords``.
            ``all``: when present, match every document.
            ``by_ids``: list of ``_id`` values to include.

        With none of these the query matches nothing.
        """
        query_terms = []

        if "what" in input:
            for val in input["what"]:
                # Escape the term so user text is matched literally instead
                # of being interpreted as a regular expression.
                prefix = "^" + re.escape(str(val))
                query_terms.append({"name": {"$regex": prefix, "$options": "i"}})
                query_terms.append({"keywords": {"$regex": prefix, "$options": "i"}})

        if "all" in input:
            query_terms = super().design_query_dict()

        if "by_ids" in input:
            query_terms.append({"_id": {"$in": input["by_ids"]}})

        if not query_terms:
            # if nothing is entered, return nothing instead of everything
            query_terms.append({"_id": "thisisanimpossibleid"})

        self.query_dict = {"$or": query_terms}

    def retrieve_data(self):
        """Run ``self.query_dict`` on ``self.collection`` into ``self.data``."""
        self.data = list(self.collection.find(self.query_dict))

    def time_check(self,time):
        """Map a weekday name to the token used in the opening hours.

        Args:
            time: German or English weekday name or abbreviation, in any
                letter case.

        Returns:
            One of ``"Mo"``, ``"Di"``, ``"Mi"``, ``"Do"``, ``"Fr"``, ``"Sa"``,
            ``"So"``, or ``None`` when ``time`` is not recognised.
        """
        mapping = {"Mo": ["Montag", "Mo", "Monday", "Mon"],
                   "Di": ["Dienstag", "Di", "Tuesday", "Tue", "Tu"],
                   "Mi": ["Mittwoch", "Mi", "Wednesday", "Wed", "We"],
                   "Do": ["Donnerstag", "Do", "Thursday", "Thu", "Th"],
                   "Fr": ["Freitag", "Fr", "Friday", "Fri"],
                   "Sa": ["Samstag", "Sa", "Saturday", "Sat"],
                   "So": ["Sonntag", "So", "Sunday", "Sun", "Su"],
                   }
        if not isinstance(time, str):
            return None
        wanted = time.strip().lower()
        for token, aliases in mapping.items():
            if wanted in (alias.lower() for alias in aliases):
                return token
        return None

    def within_time(self,data,date):
        """Keep the businesses whose opening hours mention the given day.

        Args:
            data: iterable of records with an optional ``openingHours`` entry.
            date: ``"today"``, ``"tomorrow"``, a weekday name, or a date
                (day-first strings are accepted).

        Returns:
            The matching records, also stored on ``self.results``.  The list
            is empty when ``date`` cannot be resolved to a weekday.
        """
        if date == "today":
            weekday = pd.to_datetime("today").date().strftime("%A")
        elif date == "tomorrow":
            weekday = (pd.to_datetime("today").date() + timedelta(days=1)).strftime("%A")
        elif self.time_check(date) is not None:
            # already a weekday name such as "monday" or "Di"
            weekday = date
        else:
            try:
                weekday = pd.to_datetime(date, dayfirst=True).date().strftime("%A")
            except (ValueError, TypeError, AttributeError):
                weekday = None

        # token in the format of the opening hours: Mo, Di, Mi, ...
        time = self.time_check(weekday)
        results = []
        if time is not None:
            for e in data:
                opening_hours = e.get("openingHours")
                if opening_hours is None:
                    continue
                if isinstance(opening_hours, str):
                    # treat a single string as one entry, not as characters
                    opening_hours = [opening_hours]
                # NOTE(review): this is a substring test, so a range such as
                # "Mo-Fr" would only match its two end days - confirm the
                # stored format.
                if any(time in k for k in opening_hours):
                    results.append(e)
        self.results = results
        return results

    def check_click_radius(self, click_point, radius, data):
        """See ``DBHandlers.check_click_radius``."""
        return super().check_click_radius(click_point, radius, data)

    def no_duplicates(self, dict_list, new_dicts, key):
        """See ``DBHandlers.no_duplicates``."""
        return super().no_duplicates(dict_list, new_dicts, key)

    # PLOTS
    def popupStr_generator(self, df_row):
        """Return the popup HTML for one business, or ``None`` on missing fields."""
        try:
            name = f"<b>{df_row['name']}</b><br><br>"
            link = f"<i>link:</i>: <a href='{df_row['sameAs']}'>{df_row['sameAs']}</a><br>"
            tel = f"<i>tel.:</i> {df_row['telephone']}<br>"
            address = df_row["address"]
            try:
                address = address["streetAddress"] + ", " + address["postalCode"] + ", " + address["addressLocality"]
            except Exception:
                # keep the raw address value when it is not the expected mapping
                pass
            addr = f"<i>address:</i> {address}<br>"
            return name + link + tel + addr
        except Exception:
            return None

    def tooltip_generator(self,row):
        """Return the business name, or an empty string when unavailable."""
        try:
            return row['name']
        except Exception:
            return ""

    def gestalte_map(self,map,data):
        """Add one beige marker per record of ``data`` to ``map``.

        Records whose ``lat``/``lon`` are missing or not numeric are skipped,
        so a single bad record no longer suppresses the other markers.

        Returns:
            The same ``map`` object.
        """
        poi_df = pd.DataFrame(data)
        if not {"lat", "lon"}.issubset(poi_df.columns):
            return map
        try:
            poi_df["lat"] = pd.to_numeric(poi_df["lat"], errors="coerce")
            poi_df["lon"] = pd.to_numeric(poi_df["lon"], errors="coerce")
        except Exception:
            return map

        for _, row in poi_df.dropna(subset=["lat", "lon"]).iterrows():
            try:
                folium.Marker(
                    location=[row['lat'], row['lon']],
                    popup=self.popupStr_generator(row),
                    tooltip=self.tooltip_generator(row),
                    icon=folium.Icon(color="beige")
                ).add_to(map)
            except Exception:
                # best effort: skip a record folium cannot render
                continue
        return map

    def display_output(self,map,data):
        """Draw ``data`` on ``map`` and return the map."""
        return self.gestalte_map(map=map,data=data)

    def click_polygon(self, click_point, data):
        """See ``DBHandlers.click_polygon``."""
        return super().click_polygon(click_point, data)

    def plot_polygon(self,polygons,map):
        """Draw the boundary ``polygons`` and the markers of ``self.results``.

        Args:
            polygons: frame with ``name`` and ``geometry`` columns, as
                returned by ``click_polygon``.
            map: folium map to draw on.

        Returns:
            The same ``map`` object.
        """
        # add county area
        gdf = gpd.GeoDataFrame(polygons, geometry='geometry')
        gdf = gdf.set_crs(epsg=4326)
        for _, row in gdf.iterrows():
            # simplify the outline to keep the rendered GeoJSON small
            sim_geo = gpd.GeoSeries(row["geometry"]).simplify(tolerance=0.001)
            geo_j = folium.GeoJson(data=sim_geo.to_json(), style_function=lambda x: {"fillColor": "lightgrey"})
            folium.Popup(row["name"]).add_to(geo_j)
            geo_j.add_to(map)

        # add business markers
        self.display_output(map=map,data=self.results)

        return map

    def orderly_output(self, data):
        """Return ``name``/``tel``/``address``/``open`` dicts for the records.

        A field that a record does not have is reported as ``None``.  The
        address is formatted when it has the expected parts, otherwise the
        stored value is passed through unchanged.
        """
        output = []
        for df_row in data:
            address = df_row.get("address")
            try:
                nice_address = address["streetAddress"] + ", " + address["postalCode"] + ", " + address["addressLocality"]
                address = f"<i>address:</i> {nice_address}<br>"
            except Exception:
                pass

            output.append({"name": df_row.get("name"),
                           "tel": df_row.get("telephone"),
                           "address": address,
                           "open": df_row.get("openingHours")})
        return output

    def retrieve_by_id(self, Client, client_str, db_str, coll_str, batch_ids):
        """See ``DBHandlers.retrieve_by_id``."""
        return super().retrieve_by_id(Client, client_str, db_str, coll_str, batch_ids)
    
    
    

## RENT

In [5]:
class RENT_queryer(DBHandlers):
    """Handler for the ``avg_rent`` collection (average rents per area)."""

    def connect_db(self,Client=MongoClient,client_str='mongodb://localhost:27017', db_str='webscraping_dataLabKiel', coll_str='avg_rent'):
        """Connect to the rent collection; see ``DBHandlers.connect_db``."""
        return super().connect_db(Client,client_str,db_str,coll_str)

    def design_query_dict(self, input):
        """Build ``self.query_dict`` from the search request ``input``.

        The documents flagged ``collected == "True"`` are selected when
        ``input`` has a ``rent`` or ``all`` key, or when one of the ``what``
        terms is a rent keyword.  ``by_ids`` adds documents by ``_id``.
        With none of these the query matches nothing.
        """
        rent_words = ["rent", "rents", "miete", "Miete"]
        wants_rent = "rent" in input or "all" in input
        if "what" in input:
            for val in input["what"]:
                if val in rent_words:
                    wants_rent = True

        query_terms = [{"collected": "True"}] if wants_rent else []

        if "by_ids" in input:
            query_terms.append({"_id": {"$in": input["by_ids"]}})

        if not query_terms:
            # if nothing is entered, return nothing instead of everything
            query_terms.append({"_id": "thisisanimpossibleid"})

        self.query_dict = {"$or": query_terms}

    def retrieve_data(self):
        """Run ``self.query_dict`` on ``self.collection`` into ``self.data``."""
        self.data = list(self.collection.find(self.query_dict))

    def check_click_radius(self, click_point, radius, data):
        """See ``DBHandlers.check_click_radius``."""
        return super().check_click_radius(click_point, radius, data)

    def no_duplicates(self, dict_list, new_dicts, key):
        """See ``DBHandlers.no_duplicates``."""
        return super().no_duplicates(dict_list, new_dicts, key)

    def tooltip_generator(self,row):
        """Return the rent label of a record, or an empty string."""
        try:
            # single quotes inside the f-string keep this valid before
            # Python 3.12 as well
            return f"{row['average_rent']}€ m²"
        except Exception:
            return ""

    def popupStr_generator(self,df_row):
        """Return the popup HTML for one record, or ``None`` on missing fields."""
        try:
            # use the county when one is listed, otherwise the city
            if df_row["Landkreis"] != []:
                name = f"<b>{df_row['Landkreis']}</b><br><br>"
            else:
                name = f"<b>{df_row['Stadt']}</b><br><br>"
            plz = f"<i>postcode:</i> {df_row['PLZ']}<br>"
            rent = f"<i>rent:</i> {df_row['average_rent']}€ per m²"
            return name + plz + rent
        except Exception:
            return None

    def gestalte_map(self,map,data):
        """Add one marker per record, coloured relative to the mean rent.

        Rents above the mean of ``data`` get a light-red marker, all others a
        light-blue one; the red markers are added first.  Records whose
        ``lat``/``lon`` are missing or not numeric are skipped.

        Returns:
            The same ``map`` object.
        """
        poi_df = pd.DataFrame(data)
        if not {"lat", "lon", "average_rent"}.issubset(poi_df.columns):
            return map
        try:
            poi_df["lat"] = pd.to_numeric(poi_df["lat"], errors="coerce")
            poi_df["lon"] = pd.to_numeric(poi_df["lon"], errors="coerce")
            poi_df = poi_df.dropna(subset=["lat", "lon"])
            rents = pd.to_numeric(poi_df["average_rent"], errors="coerce")
        except Exception:
            return map

        # despite the old variable name this has always been the mean
        mean_rent = rents.mean()
        is_high = rents > mean_rent

        for subset, color in ((poi_df[is_high], "lightred"), (poi_df[~is_high], "lightblue")):
            for _, row in subset.iterrows():
                try:
                    folium.Marker(
                        location=[row['lat'], row['lon']],
                        popup=self.popupStr_generator(row),
                        tooltip=self.tooltip_generator(row),
                        icon=folium.Icon(color=color)
                    ).add_to(map)
                except Exception:
                    # best effort: skip a record folium cannot render
                    continue
        return map

    def display_output(self,map,data):
        """Draw ``data`` on ``map`` and return the map."""
        return self.gestalte_map(map=map,data=data)

    def click_polygon(self, click_point, data):
        """See ``DBHandlers.click_polygon``."""
        return super().click_polygon(click_point, data)

    def plot_polygon(self,polygons,map):
        """Draw the boundary ``polygons`` and the markers of ``self.results``.

        Args:
            polygons: frame with ``name`` and ``geometry`` columns, as
                returned by ``click_polygon``.
            map: folium map to draw on.

        Returns:
            The same ``map`` object.
        """
        # add county area
        gdf = gpd.GeoDataFrame(polygons, geometry='geometry')
        gdf = gdf.set_crs(epsg=4326)
        for _, row in gdf.iterrows():
            # simplify the outline to keep the rendered GeoJSON small
            sim_geo = gpd.GeoSeries(row["geometry"]).simplify(tolerance=0.001)
            geo_j = folium.GeoJson(data=sim_geo.to_json(), style_function=lambda x: {"fillColor": "lightgray"})
            folium.Popup(row["name"]).add_to(geo_j)
            geo_j.add_to(map)

        # add rent markers
        self.display_output(map=map,data=self.results)

        return map

    def orderly_output(self, data):
        """Return county/city/postcode/rent dicts for the records.

        Raises:
            KeyError: if a record lacks one of the four source fields.
        """
        return [
            {"county": df_row["Landkreis"],
             "city": df_row['Stadt'],
             "postcode": df_row['PLZ'],
             "avgerage rent": df_row['average_rent']}
            for df_row in data
        ]

    def retrieve_by_id(self, Client, client_str, db_str, coll_str, batch_ids):
        """See ``DBHandlers.retrieve_by_id``."""
        return super().retrieve_by_id(Client, client_str, db_str, coll_str, batch_ids)
    
    

## EVENTS

In [6]:
class EVENT_queryer(DBHandlers):
    """Handler for the ``event_data`` collection."""

    def connect_db(self,Client=MongoClient,client_str='mongodb://localhost:27017', db_str='webscraping_dataLabKiel', coll_str='event_data'):
        """Connect to the event collection; see ``DBHandlers.connect_db``."""
        return super().connect_db(Client,client_str,db_str,coll_str)

    def design_query_dict(self, input):
        """Build ``self.query_dict`` from the search request ``input``.

        Recognised keys of ``input``:
            ``what``: iterable of search terms; each is matched literally and
                case-insensitively as a prefix of ``title`` or
                ``categories``.  A generic term ("event", "events", ...)
                selects every document.
            ``event``: when equal to ``True``, match every document.
            ``all``: when present, match every document.
            ``by_ids``: list of ``_id`` values to include.

        With none of these the query matches nothing.
        """
        query_terms = []

        if "what" in input:
            for val in input["what"]:
                # Escape the term so user text is matched literally instead
                # of being interpreted as a regular expression.
                prefix = "^" + re.escape(str(val))
                query_terms.append({"title": {"$regex": prefix, "$options": "i"}})
                query_terms.append({"categories": {"$regex": prefix, "$options": "i"}})
                if val in ["event", "events", "Event", "Events"]:
                    query_terms = super().design_query_dict()

        if "event" in input and input["event"] == True:
            query_terms = super().design_query_dict()

        if "all" in input:
            query_terms = super().design_query_dict()

        if "by_ids" in input:
            query_terms.append({"_id": {"$in": input["by_ids"]}})

        if not query_terms:
            # if nothing is entered, return nothing instead of everything
            query_terms.append({"_id": "thisisanimpossibleid"})

        self.query_dict = {"$or": query_terms}

    def retrieve_data(self):
        """Run ``self.query_dict`` on ``self.collection`` into ``self.data``."""
        self.data = list(self.collection.find(self.query_dict))

    def check_click_radius(self, click_point, radius, data):
        """See ``DBHandlers.check_click_radius``."""
        return super().check_click_radius(click_point, radius, data)

    def no_duplicates(self, dict_list, new_dicts, key):
        """See ``DBHandlers.no_duplicates``."""
        return super().no_duplicates(dict_list, new_dicts, key)

    def within_time(self,data,date):
        """Keep the events whose first time interval covers ``date``.

        Args:
            data: iterable of records with an optional ``timeIntervals`` list
                of ``{"start": ..., "end": ...}`` entries.
            date: ``"today"``, ``"tomorrow"`` or a date (day-first strings are
                accepted).

        Returns:
            The matching records, also stored on ``self.results``.  The list
            is empty when ``date`` cannot be parsed.
        """
        results = []
        try:
            if date == "today":
                target = pd.to_datetime("today").date()
            elif date == "tomorrow":
                target = pd.to_datetime("today").date() + timedelta(days=1)
            else:
                target = pd.to_datetime(date, dayfirst=True).date()
        except (ValueError, TypeError, AttributeError):
            # An unparseable date cannot match anything; previously the raw
            # string was compared with date objects, raising TypeError.
            self.results = results
            return results

        for e in data:
            intervals = e.get("timeIntervals")
            if not intervals:
                continue
            # NOTE(review): only the first interval is inspected, as before -
            # confirm whether later intervals should count as well.
            first = intervals[0]
            try:
                start = pd.to_datetime(first["start"]).date()
                end = pd.to_datetime(first["end"]).date()
                if start <= target <= end:
                    results.append(e)
            except (KeyError, TypeError, ValueError):
                # skip events with an incomplete or unreadable interval
                continue
        self.results = results
        return results

    @staticmethod
    def _lookup(row, key):
        """Return ``row[key]``, or ``None`` when the key is not available."""
        try:
            return row[key]
        except (KeyError, IndexError, TypeError):
            return None

    def _format_address(self, row):
        """Join the street, city and zip that are present with ``", "``."""
        parts = []
        for key in ("street", "city", "zip"):
            value = self._lookup(row, key)
            # skip None and NaN (NaN is the only value unequal to itself)
            if value is None or value != value:
                continue
            text = str(value)
            if text:
                parts.append(text)
        return ", ".join(parts)

    # PLOTS
    def popupStr_generator(self, df_row):
        """Return the popup HTML for one event, or ``None`` on missing fields.

        ``title``, ``source.url``, ``phone`` and ``categories`` are required;
        the address parts are optional.
        """
        try:
            name = f"<b>{df_row['title']}</b><br><br>"
            link = f"<i>link:</i>: <a href='{df_row['source.url']}'>{df_row['source.url']}</a><br>"
            tel = f"<i>tel.:</i> {df_row['phone']}<br>"
            addr = f"<i>address:</i> {self._format_address(df_row)}<br>"
            event_type = df_row['categories']
            # a plain string is one category, not a sequence of characters
            event_cat = event_type if isinstance(event_type, str) else ", ".join(event_type)
            cat = f"<i>event type:</i> {event_cat}<br>"
            return name + link + tel + cat + addr
        except Exception:
            return None

    def tooltip_generator(self,row):
        """Return the event title, or an empty string when unavailable."""
        try:
            return row['title']
        except Exception:
            return ""

    def gestalte_map(self,map,data):
        """Add one purple marker per record of ``data`` to ``map``.

        Records whose ``lat``/``lon`` are missing or not numeric are skipped,
        so a single bad record no longer suppresses the other markers.

        Returns:
            The same ``map`` object.
        """
        poi_df = pd.DataFrame(data)
        if not {"lat", "lon"}.issubset(poi_df.columns):
            return map
        try:
            poi_df["lat"] = pd.to_numeric(poi_df["lat"], errors="coerce")
            poi_df["lon"] = pd.to_numeric(poi_df["lon"], errors="coerce")
        except Exception:
            return map

        for _, row in poi_df.dropna(subset=["lat", "lon"]).iterrows():
            try:
                folium.Marker(
                    location=[row['lat'], row['lon']],
                    popup=self.popupStr_generator(row),
                    tooltip=self.tooltip_generator(row),
                    icon=folium.Icon(color="purple")
                ).add_to(map)
            except Exception:
                # best effort: skip a record folium cannot render
                continue
        return map

    def display_output(self,map,data):
        """Draw ``data`` on ``map`` and return the map."""
        return self.gestalte_map(map=map,data=data)

    def click_polygon(self, click_point, data):
        """See ``DBHandlers.click_polygon``."""
        return super().click_polygon(click_point, data)

    def plot_polygon(self,polygons,map):
        """Draw the boundary ``polygons`` and the markers of ``self.results``.

        Args:
            polygons: frame with ``name`` and ``geometry`` columns, as
                returned by ``click_polygon``.
            map: folium map to draw on.

        Returns:
            The same ``map`` object.
        """
        # add county area
        gdf = gpd.GeoDataFrame(polygons, geometry='geometry')
        gdf = gdf.set_crs(epsg=4326)
        for _, row in gdf.iterrows():
            # simplify the outline to keep the rendered GeoJSON small
            sim_geo = gpd.GeoSeries(row["geometry"]).simplify(tolerance=0.001)
            geo_j = folium.GeoJson(data=sim_geo.to_json(), style_function=lambda x: {"fillColor": "lightgray"})
            folium.Popup(row["name"]).add_to(geo_j)
            geo_j.add_to(map)

        # add event markers
        self.display_output(map=map,data=self.results)

        return map

    def orderly_output(self, data):
        """Return ``name``/``tel``/``address``/``time`` dicts for the records.

        ``name`` is taken from the record's ``name`` field and falls back to
        ``title``, the field the rest of this class uses.  A field that a
        record does not have is reported as ``None``; a missing address part
        is left out of the address.
        """
        output = []
        for df_row in data:
            name = self._lookup(df_row, "name")
            if name is None:
                name = self._lookup(df_row, "title")
            output.append({"name": name,
                           "tel": self._lookup(df_row, 'phone'),
                           "address": self._format_address(df_row),
                           "time": self._lookup(df_row, 'timeIntervals')})
        return output

    def retrieve_by_id(self, Client, client_str, db_str, coll_str, batch_ids):
        """See ``DBHandlers.retrieve_by_id``."""
        return super().retrieve_by_id(Client, client_str, db_str, coll_str, batch_ids)
    

## NATURAL AREAS

In [7]:
class DIGITIZEDPLANET_queryer(DBHandlers):
    """Handler for the ``digitized_planet_v2`` collection (natural areas)."""

    # Projected CRS used only to compute centroids.  EPSG:25832
    # (ETRS89 / UTM zone 32N) replaces EPSG:2263, which is a New York
    # State Plane grid.
    # NOTE(review): assumes the areas lie in northern Germany - confirm.
    CENTROID_EPSG = 25832

    #CONNECT TO DB - COLLECTION
    def connect_db(self,Client=MongoClient,client_str='mongodb://localhost:27017', db_str='webscraping_dataLabKiel', coll_str='digitized_planet_v2'):
        """Connect to the area collection; see ``DBHandlers.connect_db``."""
        return super().connect_db(Client,client_str,db_str,coll_str)

    #GET QUERY DICTIONARY FOR SEARCH
    def design_query_dict(self, input):
        """Build ``self.query_dict`` from the search request ``input``.

        Every document is selected when ``input`` has an ``all`` key or when
        one of the ``what`` terms is a nature keyword.  ``by_ids`` adds
        documents by ``_id``.  With none of these the query matches nothing.
        """
        nature_words = ["nature", "Natur", "natural area", "protected area", "naturschutzgebiet", "park"]
        query_terms = []

        if "what" in input:
            for e in input["what"]:
                if e in nature_words:
                    query_terms = super().design_query_dict()

        if "all" in input:
            query_terms = super().design_query_dict()

        if "by_ids" in input:
            query_terms.append({"_id": {"$in": input["by_ids"]}})

        if not query_terms:
            # if nothing is entered, return nothing instead of everything
            query_terms.append({"_id": "thisisanimpossibleid"})

        self.query_dict = {"$or": query_terms}

    #RETRIEVE DATA BASED ON QUERY DICT
    def retrieve_data(self):
        """Run ``self.query_dict`` on ``self.collection`` into ``self.data``."""
        self.data = list(self.collection.find(self.query_dict))

    #FILTER DATA BASED ON RADIUS
    def check_click_radius(self, click_point, radius, data):
        """See ``DBHandlers.check_click_radius``."""
        return super().check_click_radius(click_point, radius, data)

    def no_duplicates(self, dict_list, new_dicts, key):
        """See ``DBHandlers.no_duplicates``."""
        return super().no_duplicates(dict_list, new_dicts, key)

    # PLOTS
    def popupStr_generator(self, df_row):
        """Return the popup HTML for one area, or ``None`` on missing fields."""
        try:
            name = f"<b>{df_row['name']}</b><br><br>"
            area = f"<i>area in m²:</i> {df_row['area']}<br>"
            geom_s = f"<i>source:</i> {df_row['geometry_source']}<br>"
            return name + area + geom_s
        except Exception:
            return None

    def tooltip_generator(self,row):
        """Return the area name, or an empty string when unavailable."""
        try:
            return row['name']
        except Exception:
            return ""

    def gestalte_map(self,map,data):
        """Add one dark-blue marker per record of ``data`` to ``map``.

        Records whose ``lat``/``lon`` are missing or not numeric are skipped,
        so a single bad record no longer suppresses the other markers.

        Returns:
            The same ``map`` object.
        """
        poi_df = pd.DataFrame(data)
        if not {"lat", "lon"}.issubset(poi_df.columns):
            return map
        try:
            poi_df["lat"] = pd.to_numeric(poi_df["lat"], errors="coerce")
            poi_df["lon"] = pd.to_numeric(poi_df["lon"], errors="coerce")
        except Exception:
            return map

        for _, row in poi_df.dropna(subset=["lat", "lon"]).iterrows():
            try:
                folium.Marker(
                    location=[row['lat'], row['lon']],
                    popup=self.popupStr_generator(row),
                    tooltip=self.tooltip_generator(row),
                    icon=folium.Icon(color="darkblue")
                ).add_to(map)
            except Exception:
                # best effort: skip a record folium cannot render
                continue
        return map

    def click_polygon(self, click_point, data):
        """See ``DBHandlers.click_polygon``."""
        return super().click_polygon(click_point, data)

    def plot_area_polygon(self,map,data):
        """Draw each area of ``data`` as a polygon plus a centroid marker.

        Args:
            map: folium map to draw on.
            data: records whose ``geometry`` entry is a GeoJSON-like mapping
                accepted by ``shapely.geometry.shape``.

        Returns:
            The same ``map`` object.  Drawing is best effort: on any error
            the map is returned with whatever was added so far.
        """
        try:
            poi_df = pd.DataFrame(data)
            poi_df["geometry"] = poi_df['geometry'].apply(shape)
            gdf = gpd.GeoDataFrame(poi_df, geometry='geometry')
            gdf = gdf.set_crs(epsg=4326)
            for _, row in gdf.iterrows():
                # simplify the outline to keep the rendered GeoJSON small
                sim_geo = gpd.GeoSeries(row["geometry"]).simplify(tolerance=0.001)
                geo_j = folium.GeoJson(data=sim_geo.to_json(), style_function=lambda x: {"fillColor": "darkblue"})
                folium.Popup(self.popupStr_generator(row)).add_to(geo_j)
                geo_j.add_to(map)

            # Centroids are computed in a projected CRS and converted back
            # to lat/lon for the markers.
            gdf = gdf.to_crs(epsg=self.CENTROID_EPSG)
            gdf["centroid"] = gdf.centroid.to_crs(epsg=4326)
            for _, row in gdf.iterrows():
                folium.Marker(
                    location=[row["centroid"].y, row["centroid"].x],
                    popup=self.popupStr_generator(row),
                    tooltip=self.tooltip_generator(row),
                    icon=folium.Icon(color="darkblue")
                ).add_to(map)
            return map
        except Exception:
            return map

    def plot_polygon(self,polygons,map):
        """Draw the boundary ``polygons`` and the areas of ``self.results``.

        Args:
            polygons: frame with ``name`` and ``geometry`` columns, as
                returned by ``click_polygon``.
            map: folium map to draw on.

        Returns:
            The same ``map`` object.
        """
        # add county area
        gdf = gpd.GeoDataFrame(polygons, geometry='geometry')
        gdf = gdf.set_crs(epsg=4326)
        for _, row in gdf.iterrows():
            sim_geo = gpd.GeoSeries(row["geometry"]).simplify(tolerance=0.001)
            geo_j = folium.GeoJson(data=sim_geo.to_json(), style_function=lambda x: {"fillColor": "lightgray"})
            folium.Popup(row["name"]).add_to(geo_j)
            geo_j.add_to(map)

        # add areas & markers
        self.plot_area_polygon(map=map,data=self.results)

        return map

    def display_output(self,map,data):
        """Draw ``data`` as area polygons on ``map`` and return the map."""
        return self.plot_area_polygon(map=map,data=data)

    def orderly_output(self, data):
        """Return name/area/source dicts for the records.

        Raises:
            KeyError: if a record lacks one of the three source fields.
        """
        return [
            {"name": df_row["name"],
             "area": df_row['area'],
             "source": df_row['geometry_source']}
            for df_row in data
        ]

    def retrieve_by_id(self, Client, client_str, db_str, coll_str, batch_ids):
        """See ``DBHandlers.retrieve_by_id``."""
        return super().retrieve_by_id(Client, client_str, db_str, coll_str, batch_ids)
    
    

## NLP search

In [8]:
class NLP_search(DBHandlers):
    """Handler that answers a free-text query through a vector store.

    Only ``connect_db`` and ``retrieve_data`` do any work. Every other method
    is an empty override that returns None - presumably present only because
    DBHandlers declares these methods as abstract.
    """

    #CONNECT TO VECTOR STORE
    def connect_db(self,Client,embedding,collection,path):
        """Open the vector store and keep a retriever for it on the instance.

        Parameters:
            Client: callable building the vector store; it is called with the
                    keywords collection_name, persist_directory and
                    embedding_function
            embedding: passed on as embedding_function
            collection: passed on as collection_name
            path: passed on as persist_directory

        Side effect: sets ``self.retriever`` (created with search_kwargs k=20).
        Returns the vector store object.
        """
        store = Client(
            embedding_function=embedding,
            persist_directory=path,
            collection_name=collection,
        )
        self.retriever = store.as_retriever(search_kwargs={"k": 20})
        return store

    #RETRIEVE MATCHES FOR A TEXT INPUT
    def retrieve_data(self,input):
        """Run ``input`` through the retriever and return the matching ids.

        The raw retriever result is stored in ``self.data``. For each hit the
        "mongo_id" entry of its metadata is converted to an ObjectId; the list
        of these ids is returned.
        """
        hits = self.retriever.invoke(input)
        self.data = hits
        mongo_ids = []
        for hit in hits:
            mongo_ids.append(ObjectId(hit.metadata["mongo_id"]))
        return mongo_ids

    #EVERYTHING BELOW IS A NO-OP FOR THIS HANDLER
    def design_query_dict(self):
        """No-op for this handler."""

    def check_click_radius(self, click_point, radius, data):
        """No-op for this handler."""

    def no_duplicates(self, dict_list, new_dicts, key):
        """No-op for this handler."""

    def popupStr_generator(self, df_row):
        """No-op for this handler."""

    def tooltip_generator(self, df_row):
        """No-op for this handler."""

    def gestalte_map(self,map,data):
        """No-op for this handler."""

    def click_polygon(self, click_point, data):
        """No-op for this handler."""

    def plot_area_polygon(self,map,data):
        """No-op for this handler."""

    def plot_polygon(self,polygons,map):
        """No-op for this handler."""

    def display_output(self,map,data):
        """No-op for this handler."""

    def orderly_output(self, data):
        """No-op for this handler."""

    def retrieve_by_id(self, Client, client_str, db_str, coll_str, batch_ids):
        """No-op for this handler."""
    
    

# OUTPUT MANAGER

In [9]:
class OutputManager():
    """Coordinates one search over all data-base handlers.

    ``perform_search`` creates one querier per data source, removes
    cross-source duplicates, optionally filters by time, draws everything on a
    folium map and stores the outcome on the instance:

        self.map        the folium map
        self.querier_*  the querier objects (osm, yp, event, rent, area)
        self.*_data     the result list per source
        self.nice_data  the per-source ``orderly_output`` lists concatenated
        self.data       the raw result lists concatenated
    """

    #settings of the NLP vector store; override on an instance or subclass to
    #point at another model / collection / directory
    NLP_EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
    NLP_COLLECTION = "nlp_search"
    NLP_PERSIST_DIR = r"C:\Users\dorar\Documents\kiel\data_science\semester_2\application_projects\dataLab_kiel\code\SH_data_platform\vector_store\nlp_search"

    def disambiguate(self,queriers,threshold=80,max_distance_km=0.1):
        """Drop results that duplicate a result already kept from any querier.

        A result counts as a duplicate when an already kept result has a
        similar name (``fuzz.token_set_ratio`` >= ``threshold``) AND lies
        closer than ``max_distance_km``. Each querier's ``results`` list is
        replaced in place by the surviving elements.

        Parameters:
            queriers: objects with a ``results`` list of dicts
            threshold: name similarity needed to compare locations at all
                       (default 80, selected by just trying it out)
            max_distance_km: distance below which two similarly named results
                             are treated as the same place (default 0.1 = 100 m)

        Elements are taken from the end of each list, so the surviving results
        come out in reverse order. An element without "name" is skipped, and so
        is one for which a location comparison hits a missing "lat"/"lon".
        NOTE(review): only the "lat"/"lon" keys are read here - confirm that no
        querier delivers "latitude"/"longitude" instead.
        """
        kept = [] #everything accepted so far, across all queriers

        for q in queriers:
            pending = q.results #get data for the querier
            q.results = [] #empty slate for querier.results

            while pending:
                elem = pending.pop()
                try:
                    name = elem["name"]
                    #already kept results with a similar name
                    similar = [k for k in kept if fuzz.token_set_ratio(name,k.get("name")) >= threshold]
                    if similar:
                        elem_loc = (elem["lat"],elem["lon"])
                        #same place if any similar name is closer than the limit
                        if any(
                            geodesic(elem_loc,(other["lat"],other["lon"])).km < max_distance_km
                            for other in similar
                        ):
                            continue
                except KeyError:
                    #best effort: elements lacking the needed keys are left out
                    continue
                kept.append(elem)
                q.results.append(elem) #pass it back to the database handler for output

    def initiate_search(self,Querier,query,key):
        """Create a querier, let it fetch its data and de-duplicate the result.

        Parameters:
            Querier: class to instantiate (called without arguments)
            query: passed to ``design_query_dict``
            key: passed to ``no_duplicates`` as the de-duplication key

        Returns the querier; its ``results`` hold the de-duplicated data.
        """
        q = Querier()
        q.connect_db()
        q.design_query_dict(query)
        q.retrieve_data()
        q.results = q.no_duplicates([],q.data,key)
        return q

    def unbounded_data(self,q,map):
        """Draw all results of ``q`` on ``map``; returns ``(map, q.results)``."""
        #draw on the map that was handed in (and is returned), not on self.map
        q.display_output(map=map,data=q.results)
        return map, q.results

    def radius_data(self,q,map,point,radius):
        """Restrict ``q`` to results within ``radius`` of ``point`` and draw them.

        The filtering is done by ``q.check_click_radius``; afterwards
        ``q.results`` is drawn on ``map``. Returns ``(map, q.results)``.
        """
        q.check_click_radius(click_point=point,radius=radius, data=q.results)
        #draw on the map that was handed in (and is returned), not on self.map
        q.display_output(map=map,data=q.results)
        return map, q.results

    def polygon_data(self,q,map,point):
        """Get data within the polygon (== county) around ``point`` and draw it.

        ``q.click_polygon`` delivers the polygon(s), ``q.plot_polygon`` draws
        them. Returns ``(map, q.results)``.
        """
        poly = q.click_polygon(click_point=point,data=q.results)
        q.plot_polygon(poly,map)
        return map, q.results

    def process_query_click(self, querier_instance, area, point, map_object):
        """Dispatch on ``area``: "polygon", a radius (int/float) or anything else.

        Anything that is neither the string "polygon" nor a number (e.g. None)
        means no spatial restriction. Returns ``(map, results)``.
        """
        if area == "polygon":
            return self.polygon_data(querier_instance, map_object, point)
        if isinstance(area, (int, float)): #is radius essentially
            return self.radius_data(querier_instance, map_object, point, area)
        return self.unbounded_data(querier_instance, map_object)

    def perform_search(self,**kwargs):
        """Run the full search and store map and data on the instance.

        Keyword arguments:
            query: required; a dict-like object. Its optional "time" entry
                   switches on the time filter for yellow-page and event data.
            point, area: optional, only used when BOTH are given. ``point``
                   centres the map, ``area`` is a radius or "polygon".
                   Without them the search is not area specific.

        Raises:
            TypeError: when ``query`` is missing.
        """
        # SET UP: get arguments
        if "query" not in kwargs:
            raise TypeError("perform_search() requires a 'query' keyword argument")
        query = kwargs["query"]
        if "point" in kwargs and "area" in kwargs:
            # click + filter / click + empty filter
            point, area = kwargs["point"], kwargs["area"]
            self.map = folium.Map(location=point,zoom_start=8)
        else:
            # no click + filter: neither radius nor polygon
            point, area = None, None
            self.map = folium.Map(location=(54.2194,9.6961),zoom_start=6)

        time = query.get("time") #None means: no time filter

        # INSTANTIATE QUERIERS (get data, apply no duplicate, get results per querier)
        self.querier_osm = self.initiate_search(OSM_queryer, query=query,key="")
        self.querier_yp = self.initiate_search(YP_queryer, query=query,key="name")
        self.querier_event = self.initiate_search(EVENT_queryer, query=query,key="")
        self.querier_rent = self.initiate_search(RENT_queryer, query=query,key="PLZ")
        self.querier_area = self.initiate_search(DIGITIZEDPLANET_queryer, query=query,key="name")

        # DISAMBIGUATE ALL DATA BY NAME SIMILARITY AND LOCATION
        self.disambiguate([self.querier_yp,self.querier_osm,self.querier_event])

        # NOW CHECK TIMES
        if time is not None:
            self.querier_yp.results = self.querier_yp.within_time(data=self.querier_yp.results,date=time)
            self.querier_event.results = self.querier_event.within_time(data=self.querier_event.results,date=time)
            #no_duplicates hands back the cleaned list (see initiate_search), so it has to be stored
            self.querier_event.results = self.querier_event.no_duplicates([],self.querier_event.results,key="title")

        # NOW PUT DATA ON MAP
        self.map, self.osm_data = self.process_query_click(self.querier_osm, area=area, point=point, map_object=self.map)
        self.map, self.yp_data = self.process_query_click(self.querier_yp, area=area, point=point, map_object=self.map)
        self.map, self.event_data = self.process_query_click(self.querier_event, area=area, point=point, map_object=self.map)
        self.map, self.rent_data = self.process_query_click(self.querier_rent, area=area, point=point, map_object=self.map)
        self.map, self.area_data = self.process_query_click(self.querier_area, area=area, point=point, map_object=self.map)

        # COMBINE ALL DATA
        self.nice_data = (
            self.querier_yp.orderly_output(data=self.yp_data)
            + self.querier_rent.orderly_output(data=self.rent_data)
            + self.querier_event.orderly_output(data=self.event_data)
            + self.querier_area.orderly_output(data=self.area_data)
            + self.querier_osm.orderly_output(data=self.osm_data)
        )
        self.data = self.yp_data + self.osm_data + self.event_data + self.rent_data + self.area_data

    def nlp_search(self, **kwargs):
        """Search by free text: look up matching ids, then run ``perform_search``.

        Keyword arguments:
            nlp: the text to search for. Without it the call does nothing.

        The vector store is opened with the class attributes
        NLP_EMBEDDING_MODEL, NLP_COLLECTION and NLP_PERSIST_DIR; the ids it
        returns are passed on as ``query={"by_ids": ids}``.
        """
        if "nlp" not in kwargs:
            return
        text = kwargs["nlp"]
        nlp = NLP_search()
        nlp.connect_db(
            Chroma,
            HuggingFaceEmbeddings(model_name=self.NLP_EMBEDDING_MODEL),
            self.NLP_COLLECTION,
            self.NLP_PERSIST_DIR,
        )
        ids = nlp.retrieve_data(text)
        self.perform_search(query={"by_ids":ids})
        

In [10]:
# One reference point per county named in each record;
# presumably (latitude, longitude) pairs - TODO confirm
coordinates = [
    (lat, lon)
    for _county, lat, lon in (
        ("Nordfriesland", 54.51450, 8.86610),
        ("Dithmarschen", 54.17230, 9.03890),
        ("Rendsburg-Eckernförde", 54.36980, 9.73560),
        ("Schleswig-Flensburg", 54.71020, 9.41230),
        ("Plön", 54.20840, 10.41490),
        ("Ostholstein", 54.17650, 10.93020),
        ("Pinneberg", 53.67620, 9.66230),
        ("Segeberg", 53.91540, 10.25580),
        ("Stormarn", 53.70010, 10.39460),
    )
]