# Crawler

## dtacrawl

In [4]:
import argparse
import json
import logging
import os
import random
import time
from urllib.parse import urlencode
from urllib.parse import urlsplit
from urllib.parse import urlunsplit
from urllib.request import urlopen

# Log every message at INFO level and above, prefixed with a wall-clock time.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s: %(message)s",
    datefmt="%H:%M:%S",
)

# Base URLs the URL builders below append to.
query_baseurl = 'https://kaskade.dwds.de/dstar/dta/dstar.perl'
tei_baseurl = 'https://www.deutschestextarchiv.de/book/download_xml'
tcf_baseurl = 'https://www.deutschestextarchiv.de/book/download_fulltcf'

def tei_url(basename):
    """Return ``tei_baseurl`` with ``basename`` appended as the last path segment."""
    return "/".join((tei_baseurl, basename))

def tcf_url(id):
    """Return ``tcf_baseurl`` with ``id`` (a string) appended as the last path segment."""
    return "/".join((tcf_baseurl, id))

def query_url(query):
    """Return ``query_baseurl`` with its query string set to the encoded ``query``.

    ``query`` is handed to ``urlencode`` unchanged, so it can be anything
    that function accepts (the notebook passes a dict).
    """
    base = urlsplit(query_baseurl)
    return urlunsplit(base._replace(query=urlencode(query)))

# Show one sample result of each URL builder, one per line.
print(
    tei_url('brehm_thierleben05_1869'),
    tcf_url('12345'),
    query_url({'q': 'abc'}),
    sep="\n",
)

https://www.deutschestextarchiv.de/book/download_xml/brehm_thierleben05_1869
https://www.deutschestextarchiv.de/book/download_fulltcf/12345
https://kaskade.dwds.de/dstar/dta/dstar.perl?q=abc


In [5]:
def download(url):
    """Fetch ``url`` and return what the response's ``read()`` yields.

    Before the request the function sleeps for a random 0.5-1.5 seconds and
    then logs the URL together with the time it waited.
    """
    secs = random.uniform(0.5, 1.5)
    time.sleep(secs)
    logging.info(f"downloading {url} after waiting {secs}s")
    with urlopen(url) as response:
        payload = response.read()
    return payload

def download_to(url, out):
    """Download ``url`` and write the response body to the file ``out``.

    The download runs *before* the output file is opened, so a failed
    request neither leaves an empty file behind nor truncates a file from an
    earlier run. Missing parent directories of ``out`` are created.
    """
    payload = download(url)
    parent = os.path.dirname(out)
    if parent:
        # exist_ok keeps this safe when several workers create the same directory.
        os.makedirs(parent, exist_ok=True)
    with open(out, 'wb') as f:
        f.write(payload)
        
def query(q):
    """Download the search URL built from ``q`` and return the parsed JSON."""
    raw = download(query_url(q))
    return json.loads(raw)


In [6]:
def dtaids(max, q):
    """Collect up to ``max`` distinct ids for the search ``q``.

    The search is paged with a page size of ``max`` until enough distinct
    ids are found or a page comes back without hits.

    Args:
        max: maximum number of distinct ids to return.
        q: dict of query parameters; it is copied, the caller's dict is
            left untouched.

    Returns:
        dict mapping each hit's ``dtaid`` to its ``basename``, in order of
        first appearance. May hold fewer than ``max`` entries when the
        search is exhausted.
    """
    params = dict(q)  # never write paging parameters into the caller's dict
    ids = {}
    start = 1
    while len(ids) < max:
        params.update(limit=max, start=start, fmt="json")
        page = query(params)["hits_"]
        if not page:
            # No hits left: stop instead of requesting empty pages forever.
            # NOTE(review): assumes the server answers an out-of-range
            # `start` with an empty "hits_" list - confirm.
            break
        for hit in page:
            meta = hit["meta_"]
            ids.setdefault(meta["dtaid"], meta["basename"])
            if len(ids) == max:
                return ids
        start += max
    return ids

# Demo: look up three distinct ids for the query "Axolotl" and show the mapping.
axolotl_ids = dtaids(3, {'q': 'Axolotl'})
print(axolotl_ids)

16:04:47: downloading https://kaskade.dwds.de/dstar/dta/dstar.perl?q=Axolotl&limit=3&start=1&fmt=json after waiting 0.6934956890238664s
16:04:51: downloading https://kaskade.dwds.de/dstar/dta/dstar.perl?q=Axolotl&limit=3&start=4&fmt=json after waiting 1.0205144961539503s
{'25164': 'haeckel_schoepfungsgeschichte_1868', '16241': 'weismann_keimplasma_1892', '25165': 'brehm_thierleben05_1869'}


In [7]:
    # Sequential crawl: resolve the ids first, then fetch both files for each id.
    ids = dtaids(3, {'q': 'Axolotl'})
    for id, basename in ids.items():
        download_to(tei_url(basename), f"out/{id}.tei.xml")
        download_to(tcf_url(id), f"out/{id}.tcf.xml")
    logging.info('done')

16:04:54: downloading https://kaskade.dwds.de/dstar/dta/dstar.perl?q=Axolotl&limit=3&start=1&fmt=json after waiting 0.8054506189337978s
16:04:58: downloading https://kaskade.dwds.de/dstar/dta/dstar.perl?q=Axolotl&limit=3&start=4&fmt=json after waiting 1.4683739383820988s
16:05:04: downloading https://www.deutschestextarchiv.de/book/download_xml/haeckel_schoepfungsgeschichte_1868 after waiting 0.8962228530395613s
16:05:09: downloading https://www.deutschestextarchiv.de/book/download_fulltcf/25164 after waiting 1.274685395392647s
16:05:30: downloading https://www.deutschestextarchiv.de/book/download_xml/weismann_keimplasma_1892 after waiting 0.5403405918591221s
16:05:34: downloading https://www.deutschestextarchiv.de/book/download_fulltcf/16241 after waiting 1.0757443464914163s
16:05:52: downloading https://www.deutschestextarchiv.de/book/download_xml/brehm_thierleben05_1869 after waiting 1.1805839690431699s
16:05:59: downloading https://www.deutschestextarchiv.de/book/download_fulltcf/2

## Asynchroner Crawler
* lineares Vorgehen:
  1. lade alle Suchergebnisse herunter
  2. lade die zugeh&ouml;rigen Dateien herunter
* asynchrones Vorgehen:
  * lade die Suchergebnisse herunter
  * parallel dazu lade die Dateien herunter sobald Suchergebnisse vorhanden sind.

### Threads
* asynchrone Pfade durch den Code
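* Threads k&ouml;nnen gleichzeitig auf verschiedenen CPUs laufen (in CPython f&uuml;hrt wegen des Global Interpreter Lock immer nur ein Thread Python-Code aus; bei I/O-lastigen Aufgaben wie Downloads lohnen sich Threads trotzdem)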
* Reihenfolge der Threads ist nicht deterministisch
* Kommunikation zwischen den Threads muss synchronisiert werden (Mutex, atomare Variablen ...)
* unsynchronisierter Zugriff auf gemeinsame Daten f&uuml;hrt zu Problemen

In [8]:
import concurrent.futures

def thread(name, max):
    """Log ``max`` numbered messages for ``name``, pausing before each one.

    Every pause lasts a random 0.5-1.5 seconds; the messages are numbered
    from 0 to ``max - 1``.
    """
    for i in range(max):
        pause = random.uniform(0.5, 1.5)
        time.sleep(pause)
        logging.info(f'{name} producing {i}')

# Three jobs are submitted to a pool of only two workers.
jobs = (("Thread 1", 3), ("Thread 2", 5), ("Thread 3", 2))
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    for job_name, count in jobs:
        executor.submit(thread, job_name, count)
logging.info('done')


16:06:47: Thread 1 producing 0
16:06:47: Thread 2 producing 0
16:06:48: Thread 2 producing 1
16:06:48: Thread 1 producing 1
16:06:49: Thread 1 producing 2
16:06:49: Thread 2 producing 2
16:06:50: Thread 3 producing 0
16:06:51: Thread 2 producing 3
16:06:51: Thread 3 producing 1
16:06:52: Thread 2 producing 4
16:06:52: done


### Wettlaufsituation (Race Condition)

In [9]:
# Shared balance; both worker threads read and write it without any lock.
deposit = 50
    
def withdraw(amount):
    """Keep subtracting ``amount`` from the global ``deposit`` while it suffices.

    Intentionally unsynchronized: this cell demonstrates a race condition,
    so the check and the update are NOT protected against other threads.
    """
    global deposit
    while True:
        if deposit >= amount:
            # The pause sits between the balance check and the update, so
            # another thread can lower `deposit` in the meantime.
            time.sleep(random.uniform(0.5, 1.5))
            deposit = deposit - amount
            logging.info(f'deposit: {deposit}')
        else:
            return
        
# Run two withdrawals concurrently against the same balance.
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    executor.submit(withdraw, 13)
    executor.submit(withdraw, 17)
logging.info(f'final deposit: {deposit}')


16:06:53: deposit: 37
16:06:53: deposit: 20
16:06:54: deposit: 3
16:06:54: deposit: -10
16:06:54: final deposit: -10


### Synchronisation mit Locks

In [10]:
import threading

# Shared balance and the lock that guards every access to it.
locked_deposit = 50
lock = threading.Lock()


def withdraw(amount):
    """Keep subtracting ``amount`` from ``locked_deposit`` while it suffices.

    The lock is taken by hand before the balance check and released by hand
    on both paths, so check and update happen as one uninterrupted step.
    """
    global locked_deposit
    while True:
        lock.acquire()
        if locked_deposit < amount:
            # Not enough left: give the lock back before leaving.
            lock.release()
            return
        time.sleep(random.uniform(0.5, 1.5))
        locked_deposit -= amount
        logging.info(f'deposit: {locked_deposit}')
        lock.release()
        
# Run two withdrawals concurrently; the lock inside withdraw() serializes them.
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    for amount in (13, 17):
        executor.submit(withdraw, amount)
logging.info(f'final deposit: {locked_deposit}')


16:06:55: deposit: 37
16:06:57: deposit: 24
16:06:58: deposit: 11
16:06:58: final deposit: 11


### Locks mit `with ...`

In [11]:
import threading

# Shared balance and the lock that guards every access to it.
locked_deposit = 50
lock = threading.Lock()


def withdraw(amount):
    """Keep subtracting ``amount`` from ``locked_deposit`` while it suffices.

    Each round runs inside ``with lock``, which releases the lock on every
    way out of the block, including the ``return``.
    """
    global locked_deposit
    while True:
        with lock:
            if locked_deposit < amount:
                return
            time.sleep(random.uniform(0.5, 1.5))
            locked_deposit -= amount
            logging.info(f'deposit: {locked_deposit}')
        
# Run two withdrawals concurrently on the lock-protected balance.
withdrawals = (13, 17)
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    for amount in withdrawals:
        executor.submit(withdraw, amount)
logging.info(f'final deposit: {locked_deposit}')


16:06:59: deposit: 37
16:07:00: deposit: 24
16:07:01: deposit: 11
16:07:01: final deposit: 11


### Erzeuger-Verbraucher (Producer-Consumer) Threading
* Erzeuger produzieren Daten
* Verbraucher verarbeiten die Daten weiter
* Verbraucher und Erzeuger laufen in verschiedenen Threads
* Kommunikation (und Synchronisation) der Erzeuger und Verbraucher &uuml;ber Queues
* je nach Anwendung verschiedene Anzahlen von Erzeugern und Verbrauchern

### Queue
* FiFo (First in, first out) Datenstruktur
* Elemente werden in der Reihenfolge heraus genommen in der sie eingef&uuml;gt werden

![Queue](https://upload.wikimedia.org/wikipedia/commons/thumb/5/52/Data_Queue.svg/300px-Data_Queue.svg.png)

Quelle: https://en.wikipedia.org/wiki/Queue_(abstract_data_type)#/media/File:Data_Queue.svg

### Pipeline
* verwendet Pythons `queue.Queue`-Implementierung als Basis
* dient der Kommunikation zwischen Erzeuger und Verbraucher
* Schließen der Pipeline signalisiert Ende der Arbeit

In [12]:
import queue
class Pipeline(queue.Queue):
    """Bounded FIFO queue of ``(url, out)`` jobs with an end-of-work marker."""

    # Marker item that stands for "pipeline closed".
    _SENTINEL = (None, None)

    def __init__(self):
        super().__init__(maxsize=10)

    def close(self):
        """Append the end marker and log that the pipeline was closed."""
        self.put(self._SENTINEL)
        logging.info('pipeline closed')

    def add_url(self, url, out):
        """Enqueue one ``(url, out)`` job."""
        self.put((url, out))

    def get_url(self):
        """Take the next item and return it as ``(url, out, ok)``.

        For a regular job ``ok`` is True. When the end marker is taken, it
        is put back so that every other reader sees it as well, and
        ``(None, None, False)`` is returned.
        """
        item = self.get()
        if item != self._SENTINEL:
            return (item[0], item[1], True)
        self.put(item)  # leave the marker in place for the next reader
        return (None, None, False)

### Verbraucher
* liest URLs aus der Pipeline
* l&auml;dt die entsprechenden Dateien herunter
* mehrere parallele Verbraucher

In [13]:
def consumer(pipeline):
    """Download jobs taken from ``pipeline`` until it reports being closed.

    A failing download is logged with its traceback and skipped. Without
    this, the exception would end the worker and stay hidden inside the
    never-inspected future, while the remaining jobs are left to fewer
    workers (or to none at all).

    Args:
        pipeline: object whose ``get_url()`` returns ``(url, out, ok)``.
    """
    while True:
        url, out, ok = pipeline.get_url()
        if not ok:
            return
        try:
            download_to(url, out)
        except Exception:
            logging.exception(f"download of {url} to {out} failed")

### Erzeuger (Producer)
* stellt Suchanfragen
* schreibt die URLs (tcf und tei) in die Pipeline
* signalisiert Ende der Arbeit an die Verbraucher durch Schließen der Pipeline
* nur ein Erzeuger

In [14]:
def producer(pipeline, out, max, q):
    """Search for ``q`` and enqueue two download jobs per distinct id.

    For each new ``dtaid`` the TEI and the TCF download are added to
    ``pipeline``, with target files ``<out>/<id>.tei.xml`` and
    ``<out>/<id>.tcf.xml``. The search is paged with a page size of ``max``
    until ``max`` distinct ids were seen or a page comes back without hits.

    The pipeline is closed on every way out, including exceptions;
    otherwise the consumers would wait for further jobs forever.

    Args:
        pipeline: target pipeline (``add_url`` / ``close``).
        out: directory part of the target file names.
        max: maximum number of distinct ids to enqueue.
        q: dict of query parameters; it is copied, the caller's dict is
            left untouched.
    """
    params = dict(q)  # never write paging parameters into the caller's dict
    seen = set()
    start = 1
    try:
        while len(seen) < max:
            params.update(limit=max, start=start, fmt="json")
            page = query(params)["hits_"]
            if not page:
                # No hits left: stop instead of requesting empty pages forever.
                # NOTE(review): assumes the server answers an out-of-range
                # `start` with an empty "hits_" list - confirm.
                break
            for hit in page:
                meta = hit["meta_"]
                dta_id = meta["dtaid"]
                basename = meta["basename"]
                if dta_id not in seen:
                    seen.add(dta_id)
                    pipeline.add_url(tei_url(basename), os.path.join(out, f'{dta_id}.tei.xml'))
                    pipeline.add_url(tcf_url(dta_id), os.path.join(out, f'{dta_id}.tcf.xml'))
                if len(seen) == max:
                    return
            start += max
    finally:
        pipeline.close()

### Asynchroner Crawler

In [15]:
# One producer and three consumers share a single pipeline.
pipeline = Pipeline()
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    executor.submit(producer, pipeline, 'out', 3, {'q': 'Axolotl'})
    for _ in range(3):
        executor.submit(consumer, pipeline)
logging.info('done')

16:07:02: downloading https://kaskade.dwds.de/dstar/dta/dstar.perl?q=Axolotl&limit=3&start=1&fmt=json after waiting 1.2653087105882421s
16:07:06: downloading https://kaskade.dwds.de/dstar/dta/dstar.perl?q=Axolotl&limit=3&start=4&fmt=json after waiting 1.0527904047534369s
16:07:07: downloading https://www.deutschestextarchiv.de/book/download_xml/haeckel_schoepfungsgeschichte_1868 after waiting 1.3053905381316873s
16:07:07: downloading https://www.deutschestextarchiv.de/book/download_fulltcf/25164 after waiting 1.439911512642786s
16:07:09: pipeline closed
16:07:10: downloading https://www.deutschestextarchiv.de/book/download_xml/weismann_keimplasma_1892 after waiting 0.8100858010322504s
16:07:11: downloading https://www.deutschestextarchiv.de/book/download_fulltcf/16241 after waiting 0.599968221850295s
16:07:15: downloading https://www.deutschestextarchiv.de/book/download_xml/brehm_thierleben05_1869 after waiting 1.0681195739987117s
16:07:28: downloading https://www.deutschestextarchiv.d

### Pipeline als Kontextmanagerobjekt
* die `with ... as ...` Syntax erm&ouml;glicht das automatische Schließen von Ressourcen (Dateihandles ...)
* mit `contextmanager` Objekten k&ouml;nnen eigene Klassen mit `with ... as ...` verwendet werden
* `contextmanager` Objekte in Python m&uuml;ssen zwei Methoden implementieren
  1. `__enter__` gibt das mit `as` referenzierte Objekt zur&uuml;ck (wird automatisch geschlossen)
  2. `__exit__` wird beim Verlassen des Blocks aufgerufen, r&auml;umt auf und steuert &uuml;ber den R&uuml;ckgabewert die Fehlerbehandlung (`False`: eine Ausnahme wird weitergereicht)
* genaueres in der Python [Dokumentation](https://docs.python.org/3/library/stdtypes.html#typecontextmanager)

In [16]:
class PipelineCM(Pipeline):
    """Pipeline that can be used in a ``with`` statement and closes itself."""

    # No __init__ here: the inherited Pipeline.__init__ does the same job.

    def __enter__(self):
        """Return the pipeline itself as the ``as`` target."""
        return self

    def __exit__(self, et, ev, etb):
        """Close the pipeline; returning False lets any exception propagate."""
        self.close()
        return False

def producer2(pipeline, out, max, q):
    """Search for ``q`` and enqueue two download jobs per distinct id.

    Works on ``pipeline`` inside a ``with`` block, so the pipeline's
    ``__exit__`` runs on every way out. For each new ``dtaid`` the TEI and
    the TCF download are added, with target files ``<out>/<id>.tei.xml`` and
    ``<out>/<id>.tcf.xml``. The search is paged with a page size of ``max``
    until ``max`` distinct ids were seen or a page comes back without hits.

    Args:
        pipeline: context-manager pipeline (``add_url``).
        out: directory part of the target file names.
        max: maximum number of distinct ids to enqueue.
        q: dict of query parameters; it is copied, the caller's dict is
            left untouched.
    """
    params = dict(q)  # never write paging parameters into the caller's dict
    seen = set()
    start = 1
    with pipeline as p:
        while len(seen) < max:
            params.update(limit=max, start=start, fmt="json")
            page = query(params)["hits_"]
            if not page:
                # No hits left: stop instead of requesting empty pages forever.
                # NOTE(review): assumes the server answers an out-of-range
                # `start` with an empty "hits_" list - confirm.
                break
            for hit in page:
                meta = hit["meta_"]
                dta_id = meta["dtaid"]
                basename = meta["basename"]
                if dta_id not in seen:
                    seen.add(dta_id)
                    p.add_url(tei_url(basename), os.path.join(out, f'{dta_id}.tei.xml'))
                    p.add_url(tcf_url(dta_id), os.path.join(out, f'{dta_id}.tcf.xml'))
                if len(seen) == max:
                    return
            start += max

# Same setup with the context-manager pipeline: one producer, three consumers.
pipeline = PipelineCM()
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    executor.submit(producer2, pipeline, 'out', 2, {'q': 'Axolotl'})
    for _ in range(3):
        executor.submit(consumer, pipeline)
logging.info('done')

16:08:30: downloading https://kaskade.dwds.de/dstar/dta/dstar.perl?q=Axolotl&limit=2&start=1&fmt=json after waiting 0.6286216931088301s
16:08:34: downloading https://kaskade.dwds.de/dstar/dta/dstar.perl?q=Axolotl&limit=2&start=3&fmt=json after waiting 0.9097789644972452s
16:08:34: downloading https://www.deutschestextarchiv.de/book/download_fulltcf/25164 after waiting 1.3418201584607246s
16:08:34: downloading https://www.deutschestextarchiv.de/book/download_xml/haeckel_schoepfungsgeschichte_1868 after waiting 1.3778800391157429s
16:08:37: pipeline closed
16:08:38: downloading https://www.deutschestextarchiv.de/book/download_xml/weismann_keimplasma_1892 after waiting 1.2849920341912444s
16:08:40: downloading https://www.deutschestextarchiv.de/book/download_fulltcf/16241 after waiting 1.091954962302415s
16:09:10: done


## time

| Name | time - misst die Laufzeit von Programmen |
|:---|:---|
|Überblick| time \[OPTION\]... \[CMD\] \[ARGS\]... |
|Beschreibung | Misst die Laufzeit von Programmen |
| Wichtige Optionen: | |
| -v, --verbose | detaillierte Ausgabe |

In [17]:
%%bash
# Measure the run time of dtacrawl.py for the query 'Axolotl' (--max 2, --dir out).
# NOTE(review): dtacrawl.py is not part of this notebook; presumably it is the
# script version of the sequential crawler shown above - confirm.
time python3 dtacrawl.py --max 2 --dir out 'Axolotl'

16:09:11: downloading https://kaskade.dwds.de/dstar/dta/dstar.perl?q=Axolotl&limit=2&start=1&fmt=json after waiting 0.9914523984143253s
16:09:15: downloading https://kaskade.dwds.de/dstar/dta/dstar.perl?q=Axolotl&limit=2&start=3&fmt=json after waiting 0.5640277921646498s
16:09:18: downloading https://www.deutschestextarchiv.de/book/download_xml/haeckel_schoepfungsgeschichte_1868 after waiting 0.5693842343056343s
16:09:22: downloading https://www.deutschestextarchiv.de/book/download_fulltcf/25164 after waiting 0.890126520972706s
16:09:43: downloading https://www.deutschestextarchiv.de/book/download_xml/weismann_keimplasma_1892 after waiting 0.6919032067725522s
16:09:47: downloading https://www.deutschestextarchiv.de/book/download_fulltcf/16241 after waiting 0.9144674417724756s

real	0m52.728s
user	0m1.521s
sys	0m1.610s


In [18]:
%%bash
# Measure the run time of dtacrawl_async.py with the same arguments as the
# dtacrawl.py run, for comparison.
# NOTE(review): dtacrawl_async.py is not part of this notebook; presumably it
# is the script version of the producer/consumer crawler shown above - confirm.
time python3 dtacrawl_async.py --max 2 --dir out 'Axolotl'

16:10:05: downloading https://kaskade.dwds.de/dstar/dta/dstar.perl?q=Axolotl&limit=2&start=1&fmt=json after waiting 0s
16:10:08: downloading https://kaskade.dwds.de/dstar/dta/dstar.perl?q=Axolotl&limit=2&start=3&fmt=json after waiting 0s
16:10:08: downloading https://www.deutschestextarchiv.de/book/download_xml/haeckel_schoepfungsgeschichte_1868 after waiting 0s
16:10:08: downloading https://www.deutschestextarchiv.de/book/download_fulltcf/25164 after waiting 0s
16:10:11: pipeline closed
16:10:12: downloading https://www.deutschestextarchiv.de/book/download_xml/weismann_keimplasma_1892 after waiting 0s
16:10:13: downloading https://www.deutschestextarchiv.de/book/download_fulltcf/16241 after waiting 0s

real	0m42.906s
user	0m1.615s
sys	0m1.378s
