Skip to content

Commit

Permalink
Merge pull request #11 from codders/feat/processor-pipelines
Browse files Browse the repository at this point in the history
Add processing pipelines
  • Loading branch information
codders committed Jun 9, 2020
2 parents 3ca75c3 + 959fe66 commit 66101c1
Show file tree
Hide file tree
Showing 17 changed files with 408 additions and 155 deletions.
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ Flask = "*"
Flask-API = "*"
firebase-admin = "*"
mock-firestore = "*"
pytest-mock = "*"
pytest = "*"

[dev-packages]
Expand Down
7 changes: 1 addition & 6 deletions flathunter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@
import logging
import time
import yaml
from flathunter.crawl_ebaykleinanzeigen import CrawlEbayKleinanzeigen
from flathunter.crawl_immobilienscout import CrawlImmobilienscout
from flathunter.crawl_wggesucht import CrawlWgGesucht
from flathunter.crawl_immowelt import CrawlImmowelt
from flathunter.idmaintainer import IdMaintainer
from flathunter.hunter import Hunter
from flathunter.config import Config
Expand Down Expand Up @@ -39,10 +35,9 @@


def launch_flat_hunt(config):
searchers = [CrawlImmobilienscout(), CrawlWgGesucht(), CrawlEbayKleinanzeigen(), CrawlImmowelt()]
id_watch = IdMaintainer('%s/processed_ids.db' % os.path.dirname(os.path.abspath(__file__)))

hunter = Hunter(config, searchers, id_watch)
hunter = Hunter(config, id_watch)
hunter.hunt_flats()

while config.get('loop', dict()).get('active', False):
Expand Down
4 changes: 4 additions & 0 deletions flathunter/abstract_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
class Processor:
    """Base class for pipeline steps that transform exposes one at a time.

    Subclasses override ``process_expose(expose)`` to transform a single
    expose; ``process_exposes`` maps that method lazily over a whole
    iterable of exposes.
    """

    def process_exposes(self, exposes):
        """Return a lazy map of ``process_expose`` over *exposes*."""
        # Bound-method reference avoids the redundant lambda wrapper.
        return map(self.process_expose, exposes)
13 changes: 13 additions & 0 deletions flathunter/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,16 @@
import yaml
import logging

from flathunter.crawl_ebaykleinanzeigen import CrawlEbayKleinanzeigen
from flathunter.crawl_immobilienscout import CrawlImmobilienscout
from flathunter.crawl_wggesucht import CrawlWgGesucht
from flathunter.crawl_immowelt import CrawlImmowelt
from flathunter.filter import Filter

class Config:

__log__ = logging.getLogger(__name__)
__searchers__ = [CrawlImmobilienscout(), CrawlWgGesucht(), CrawlEbayKleinanzeigen(), CrawlImmowelt()]

def __init__(self, filename=None, string=None):
if string is not None:
Expand All @@ -27,6 +32,14 @@ def __getitem__(self, value):
def get(self, key, value=None):
    """Return the configuration entry for *key*, or *value* when absent."""
    settings = self.config
    return settings.get(key, value)

@staticmethod
def set_searchers(searchers):
    """Replace the class-level list of active crawler instances."""
    Config.__searchers__ = searchers

@staticmethod
def searchers():
    """Return the class-level list of active crawler instances."""
    return Config.__searchers__

def get_filter(self):
builder = Filter.builder()
if "excluded_titles" in self.config:
Expand Down
37 changes: 37 additions & 0 deletions flathunter/default_processors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import re
import logging
from flathunter.abstract_processor import Processor

class Filter(Processor):
    """Pipeline step that drops uninteresting exposes via a filter object."""

    def __init__(self, config, filter):
        self.filter = filter
        self.config = config

    def process_exposes(self, exposes):
        """Filter the whole expose stream in one call rather than per-expose."""
        return self.filter.filter(exposes)

class AddressResolver(Processor):
    """Pipeline step that resolves a URL-valued 'address' to a real address.

    Crawlers may leave a details-page URL in ``expose['address']``; this
    processor asks the searcher whose URL_PATTERN matches to load the
    actual address from that page.
    """

    __log__ = logging.getLogger(__name__)

    def __init__(self, config):
        self.config = config

    def process_expose(self, expose):
        """Replace a URL address with the resolved address, in place."""
        if expose['address'].startswith('http'):
            url = expose['address']
            for searcher in self.config.searchers():
                if re.search(searcher.URL_PATTERN, url):
                    expose['address'] = searcher.load_address(url)
                    # Lazy %-args: the message is only formatted when
                    # DEBUG logging is actually enabled.
                    self.__log__.debug("Loaded address %s for url %s", expose['address'], url)
                    break
        return expose

class LambdaProcessor(Processor):
    """Pipeline step that runs an arbitrary callback on each expose."""

    def __init__(self, config, func):
        self.func = func

    def process_expose(self, expose):
        # The callback is invoked for its side effects only; its return
        # value is deliberately discarded (was previously bound to an
        # unused local) and the expose is passed through unchanged.
        self.func(expose)
        return expose
53 changes: 29 additions & 24 deletions flathunter/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,37 +103,42 @@ def is_interesting(self, expose):
return True
return False

class FilterBuilder:

def __init__(self):
self.filters = []
class PredicateFilter:

def title_filter(self, filtered_titles):
self.filters.append(TitleFilter(filtered_titles))
return self

def min_price_filter(self, min_price):
self.filters.append(MinPriceFilter(min_price))
return self
def __init__(self, predicate):
self.predicate = predicate

def max_price_filter(self, max_price):
self.filters.append(MaxPriceFilter(max_price))
return self
def is_interesting(self, expose):
return self.predicate(expose)

def min_size_filter(self, min_size):
self.filters.append(MinSizeFilter(min_size))
return self
class FilterBuilder:

def max_size_filter(self, max_size):
self.filters.append(MaxSizeFilter(max_size))
return self
def __init__(self):
self.filters = []

def min_rooms_filter(self, min_rooms):
self.filters.append(MinRoomsFilter(min_rooms))
def read_config(self, config):
if "excluded_titles" in config:
self.filters.append(TitleFilter(config["excluded_titles"]))
if "filters" in config:
filters_config = config["filters"]
if "excluded_titles" in filters_config:
self.filters.append(TitleFilter(filters_config["excluded_titles"]))
if "min_price" in filters_config:
self.filters.append(MinPriceFilter(filters_config["min_price"]))
if "max_price" in filters_config:
self.filters.append(MaxPriceFilter(filters_config["max_price"]))
if "min_size" in filters_config:
self.filters.append(MinSizeFilter(filters_config["min_size"]))
if "max_size" in filters_config:
self.filters.append(MaxSizeFilter(filters_config["max_size"]))
if "min_rooms" in filters_config:
self.filters.append(MinRoomsFilter(filters_config["min_rooms"]))
if "max_rooms" in filters_config:
self.filters.append(MaxRoomsFilter(filters_config["max_rooms"]))
return self

def max_rooms_filter(self, max_rooms):
self.filters.append(MaxRoomsFilter(max_rooms))
def predicate_filter(self, predicate):
self.filters.append(PredicateFilter(predicate))
return self

def build(self):
Expand Down
79 changes: 79 additions & 0 deletions flathunter/gmaps_duration_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import logging
import datetime
import time
import urllib
import requests

from flathunter.abstract_processor import Processor

class GMapsDurationProcessor(Processor):
    """Pipeline step that annotates exposes with Google Maps travel durations.

    For every destination/mode pair configured under 'durations', the
    travel time from the expose's address is queried from the Google Maps
    API and collected into a human-readable 'durations' field.
    """

    GM_MODE_TRANSIT = 'transit'
    GM_MODE_BICYCLE = 'bicycling'
    GM_MODE_DRIVING = 'driving'

    __log__ = logging.getLogger(__name__)

    def __init__(self, config):
        self.config = config

    def process_expose(self, expose):
        """Attach the formatted duration summary to the expose and return it."""
        expose['durations'] = self.get_formatted_durations(expose['address']).strip()
        return expose

    def get_formatted_durations(self, address):
        """Build a multi-line "> name (mode): duration" summary for *address*."""
        out = ""
        for duration in self.config.get('durations', list()):
            if 'destination' in duration and 'name' in duration:
                dest = duration.get('destination')
                name = duration.get('name')
                for mode in duration.get('modes', list()):
                    if 'gm_id' in mode and 'title' in mode and 'key' in self.config.get('google_maps_api', dict()):
                        # Use a distinct name: the original reassigned the
                        # outer loop variable 'duration' here, shadowing the
                        # config entry being iterated.
                        travel_time = self.get_gmaps_distance(address, dest, mode['gm_id'])
                        out += "> %s (%s): %s\n" % (name, mode['title'], travel_time)

        return out.strip()

    def get_gmaps_distance(self, address, dest, mode):
        """Return the fastest route duration text for address -> dest, or None.

        Queries the configured Google Maps API URL with an arrival time of
        next Monday 09:00 and picks the route with the smallest duration.
        """
        # get timestamp for next monday at 9:00:00 o'clock
        now = datetime.datetime.today().replace(hour=9, minute=0, second=0)
        next_monday = now + datetime.timedelta(days=(7 - now.weekday()))
        arrival_time = str(int(time.mktime(next_monday.timetuple())))

        # decode from unicode and url encode addresses
        address = urllib.parse.quote_plus(address.strip().encode('utf8'))
        dest = urllib.parse.quote_plus(dest.strip().encode('utf8'))
        self.__log__.debug("Got address: %s", address)

        # get google maps config stuff
        base_url = self.config.get('google_maps_api', dict()).get('url')
        gm_key = self.config.get('google_maps_api', dict()).get('key')

        if not gm_key and mode != self.GM_MODE_DRIVING:
            # Fixed typo in the original message ("drinving").
            self.__log__.warning("No Google Maps API key configured and without using a mode different from "
                                 "'driving' is not allowed. Downgrading to mode 'driving' thus. ")
            mode = 'driving'
            base_url = base_url.replace('&key={key}', '')

        # retrieve the result
        url = base_url.format(dest=dest, mode=mode, origin=address, key=gm_key, arrival=arrival_time)
        result = requests.get(url).json()
        if result['status'] != 'OK':
            # Bug fix: the original passed 'result' as a logging arg with no
            # matching placeholder, which raises a formatting error inside
            # the logging machinery instead of logging the payload.
            self.__log__.error("Failed retrieving distance to address %s: %s", address, result)
            return None

        # get the fastest route
        distances = dict()
        for row in result['rows']:
            for element in row['elements']:
                if 'status' in element and element['status'] != 'OK':
                    self.__log__.warning("For address %s we got the status message: %s", address, element['status'])
                    self.__log__.debug("We got this result: %s", repr(result))
                    continue
                self.__log__.debug("Got distance and duration: %s / %s (%i seconds)",
                                   element['distance']['text'], element['duration']['text'],
                                   element['duration']['value'])
                # Keyed by duration in seconds so min() picks the fastest route.
                distances[element['duration']['value']] = '%s (%s)' % \
                    (element['duration']['text'], element['distance']['text'])
        return distances[min(distances.keys())] if distances else None
Loading

0 comments on commit 66101c1

Please sign in to comment.