# Anwendungsbeispiel: Nachrichten-Crawler

In [56]:
import requests 
import lxml.html as lh
from urllib.parse import urljoin

base = 'https://deutschlan'
visited = set()
ignored = set()
todo = { base }

while len(todo) > 0:
    print(f"URLs to crawl: {len(todo)}")
    url = todo.pop()
    print(f"crawling {url}")
    visited.add(url)
    
    resp = requests.get(url)
    if not resp:
        continue
        
    # url könnte aufgrund redirect verändert sein
    url = resp.url
    visited.add(url)
    doc = lh.fromstring(resp.content)
    for link in doc.xpath('//a'):
        href = link.get('href')
        # URLs außerhalb der FH nicht folgen
        if href is None or len(href) == 0:
            continue
            
        # Löse Link URL auf    
        href = urljoin(url, href)
        
        if not href.startswith(base):
            # print(f"ignoring: {href}")
            ignored.add(href)
            continue
        
        if not href in visited:
            todo.add(href)
        
print(f"{len(visited)} pages crawled, {len(ignored)} links ignored")


URLs to crawl: 1
crawling https://github.com/fhswf
URLs to crawl: 12
crawling https://github.com/fhswf/latex-ifv/issues
URLs to crawl: 22
crawling https://github.com/fhswf/latex-ifv/labels
URLs to crawl: 35
crawling https://github.com/fhswf#start-of-content
URLs to crawl: 34
crawling https://github.com/fhswf/latex-ifv/projects
URLs to crawl: 34
crawling https://github.com/fhswf/latex-ifv
URLs to crawl: 50
crawling https://github.com/fhswf/skriptsprachen/pulls
URLs to crawl: 59
crawling https://github.com/fhswf/latex-ifv#installation-unter-debian-9
URLs to crawl: 58
crawling https://github.com/fhswf/latex-ifv/pulse
URLs to crawl: 65
crawling https://github.com/fhswf/skriptsprachen/pulls#start-of-content
URLs to crawl: 64
crawling https://github.com/fhswf/latex-ifv/milestones
URLs to crawl: 74
crawling https://github.com/fhswf/latex-ifv/network/members
URLs to crawl: 74
crawling https://github.com/fhswf/latex-ifv/labels?sort=count-desc
URLs to crawl: 74
crawling https://github.com/fhswf/

URLs to crawl: 241
crawling https://github.com/fhswf/image_captioning#changes-from-the-original-project-by-trang-nguyen
URLs to crawl: 240
crawling https://github.com/fhswf/image_captioning/graphs/code-frequency
URLs to crawl: 240
crawling https://github.com/fhswf/latex-ifv/milestones?direction=asc&sort=count&state=open
URLs to crawl: 240
crawling https://github.com/fhswf/image_captioning/security/advisories?state=published
URLs to crawl: 240
crawling https://github.com/fhswf/image_captioning/blob/master/utils.py
URLs to crawl: 245
crawling https://github.com/fhswf/skriptsprachen/stargazers#start-of-content
URLs to crawl: 244
crawling https://github.com/fhswf/image_captioning/graphs/contributors
URLs to crawl: 244
crawling https://github.com/fhswf/latex-ifv/issues?q=is%3Aissue+is%3Aopen+comments%3A%3E50#start-of-content
URLs to crawl: 243
crawling https://github.com/fhswf/image_captioning/commits/master/utils.py
URLs to crawl: 257
crawling https://github.com/fhswf/latex-ifv/search?l=te

URLs to crawl: 460
crawling https://github.com/fhswf/skriptsprachen/compare
URLs to crawl: 469
crawling https://github.com/fhswf/latex-ifv/branches/active
URLs to crawl: 469
crawling https://github.com/fhswf/image_captioning/archive/1e6a0540401350654d7ebed1505b8b88a685be3f.zip
URLs to crawl: 468
crawling https://github.com/fhswf/image_captioning/network/members
URLs to crawl: 468
crawling https://github.com/fhswf/latex-ifv/blob/master/README.md#start-of-content
URLs to crawl: 467
crawling https://github.com/fhswf/latex-ifv/stargazers
URLs to crawl: 468
crawling https://github.com/fhswf/image_captioning/issues?q=is%3Aissue+sort%3Aupdated-desc+is%3Aopen
URLs to crawl: 469
crawling https://github.com/fhswf/image_captioning/issues?q=is%3Aissue+is%3Aopen+comments%3A%3E50+sort%3Areactions-rocket-desc
URLs to crawl: 471
crawling https://github.com/fhswf/latex-ifv/blame/8c0b7aaacd0eabf50253ba8400695b07212ed2b9/.gitlab-ci.yml
URLs to crawl: 484
crawling https://github.com/fhswf/latex-ifv/tree/8

URLs to crawl: 747
crawling https://github.com/fhswf/skriptsprachen/labels/wontfix#start-of-content
URLs to crawl: 746
crawling https://github.com/fhswf/latex-ifv/commit/22344ea764d5d79f37375c7629b49e5869fe88e0
URLs to crawl: 750
crawling https://github.com/fhswf/image_captioning/commit/5cded40661b31af3f63a86124fc9086af365f088
URLs to crawl: 752
crawling https://github.com/fhswf/image_captioning/commit/b79627bc44d6c302f66abb0d494bef0e8f5feca2#diff-93ff136fae812392eb0f68d1ce89b7fe
URLs to crawl: 751
crawling https://github.com/fhswf/latex-ifv/commits/8c0b7aaacd0eabf50253ba8400695b07212ed2b9/tex/ifv.sty
URLs to crawl: 755
crawling https://github.com/fhswf/latex-ifv/tree/8c0b7aaacd0eabf50253ba8400695b07212ed2b9#installation-unter-debian-9
URLs to crawl: 754
crawling https://github.com/fhswf/image_captioning/commits/5cded40661b31af3f63a86124fc9086af365f088/images
URLs to crawl: 755
crawling https://github.com/fhswf/skriptsprachen/pulls?q=is%3Apr+is%3Aopen#start-of-content
URLs to crawl: 75

URLs to crawl: 1000
crawling https://github.com/fhswf/image_captioning/tree/d4530dfb585786be3c37b8c45524b048335b177d#data
URLs to crawl: 999
crawling https://github.com/fhswf/image_captioning/issues?q=is%3Aissue+is%3Aopen+comments%3A%3E50+updated%3A%3C2019-09-20+sort%3Areactions-%2B1-desc
URLs to crawl: 1002
crawling https://github.com/fhswf/image_captioning/issues?q=is%3Aissue+sort%3Areactions-tada-desc+comments%3A%3E50+is%3Aopen
URLs to crawl: 1017
crawling https://github.com/fhswf/image_captioning/issues?q=is%3Aissue+comments%3A%3E50+sort%3Areactions-eyes-desc+is%3Aclosed
URLs to crawl: 1018
crawling https://github.com/fhswf/latex-ifv/milestones?direction=desc&sort=completeness&state=open#start-of-content
URLs to crawl: 1017
crawling https://github.com/fhswf/image_captioning/blame/8fa3bf3d73307f96848d9bb3db1ece203ab50f21/.gitignore#start-of-content
URLs to crawl: 1016
crawling https://github.com/fhswf/image_captioning/commits/master/0_Dataset.ipynb
URLs to crawl: 1017
crawling https

URLs to crawl: 1301
crawling https://github.com/fhswf/image_captioning/issues?q=is%3Apr+author%3Acgawron+sort%3Areactions-eyes-desc+review%3Anone
URLs to crawl: 1317
crawling https://github.com/fhswf/image_captioning/issues?q=is%3Apr+is%3Aopen+sort%3Aupdated-desc+review%3Arequired
URLs to crawl: 1320
crawling https://github.com/fhswf/image_captioning#data
URLs to crawl: 1319
crawling https://github.com/fhswf/skriptsprachen/commit/524cccdfb6f0752004a17503558954244b541d68#diff-04c6e90faac2675aa89e2176d2eec7d8
URLs to crawl: 1318
crawling https://github.com/fhswf/image_captioning/issues?q=is%3Aopen+is%3Apr+review%3Achanges-requested+sort%3Acomments-asc
URLs to crawl: 1326
crawling https://github.com/fhswf/image_captioning/commits/5cded40661b31af3f63a86124fc9086af365f088#start-of-content
URLs to crawl: 1325
crawling https://github.com/fhswf/latex-ifv/find/fdfed412bc7f79f90e35f9dd0daaddc2f714a63c
URLs to crawl: 1326
crawling https://github.com/fhswf/latex-ifv/commits/44f97b3401b47e2c69e4b39

KeyboardInterrupt: 

In [55]:
for page in visited:
    print(f"visited: {page}")
for page in ignored:
    print(f"ignored: {page}")
    

visited: https://gitlab.com/fh-swf?sort=created_asc
visited: https://gitlab.com/fh-swf?sort=updated_asc#content-body
visited: https://gitlab.com/fh-swf#content-body
visited: https://gitlab.com/fh-swf?sort=created_asc#content-body
visited: https://gitlab.com/fh-swf?sort=name_asc#content-body
visited: https://gitlab.com/fh-swf?sort=created_desc#content-body
visited: https://gitlab.com/fh-swf?sort=name_desc
visited: https://gitlab.com/fh-swf?sort=name_asc
visited: https://gitlab.com/fh-swf
visited: https://gitlab.com/fh-swf?sort=updated_desc
visited: https://gitlab.com/fh-swf?sort=stars_desc#content-body
visited: https://gitlab.com/fh-swf?sort=stars_desc
visited: https://gitlab.com/fh-swf?sort=updated_desc#content-body
visited: https://gitlab.com/fh-swf?sort=updated_asc
visited: https://gitlab.com/fh-swf?sort=created_desc
visited: https://gitlab.com/fh-swf?sort=name_desc#content-body
ignored: https://gitlab.com/groups/fh-swf/-/shared
ignored: https://gitlab.com/groups/fh-swf/-/activity
ig

In [16]:


help(links[0])

Help on HtmlElement in module lxml.html object:

class HtmlElement(lxml.etree.ElementBase, HtmlMixin)
 |  ElementBase(*children, attrib=None, nsmap=None, **_extra)
 |  
 |  The public Element class.  All custom Element classes must inherit
 |  from this one.  To create an Element, use the `Element()` factory.
 |  
 |  __new__ as it is absolutely undefined when these objects will be
 |  created or destroyed.  All persistent state of Elements must be
 |  stored in the underlying XML.  If you really need to initialize
 |  the object after creation, you can implement an ``_init(self)``
 |  method that will be called directly after object creation.
 |  
 |  Subclasses of this class can be instantiated to create a new
 |  Element.  By default, the tag name will be the class name and the
 |  namespace will be empty.  You can modify this with the following
 |  class attributes:
 |  
 |  * TAG - the tag name, possibly containing a namespace in Clark
 |    notation
 |  
 |  * NAMESPACE - the def

In [17]:
help(doc)

Help on HtmlElement in module lxml.html object:

class HtmlElement(lxml.etree.ElementBase, HtmlMixin)
 |  ElementBase(*children, attrib=None, nsmap=None, **_extra)
 |  
 |  The public Element class.  All custom Element classes must inherit
 |  from this one.  To create an Element, use the `Element()` factory.
 |  
 |  __new__ as it is absolutely undefined when these objects will be
 |  created or destroyed.  All persistent state of Elements must be
 |  stored in the underlying XML.  If you really need to initialize
 |  the object after creation, you can implement an ``_init(self)``
 |  method that will be called directly after object creation.
 |  
 |  Subclasses of this class can be instantiated to create a new
 |  Element.  By default, the tag name will be the class name and the
 |  namespace will be empty.  You can modify this with the following
 |  class attributes:
 |  
 |  * TAG - the tag name, possibly containing a namespace in Clark
 |    notation
 |  
 |  * NAMESPACE - the def

In [41]:
resp.url

'https://www4.fh-swf.de/de/home/studieninteressierte/studienangebote/stg_so/esem_master/index.php'

In [20]:
for t in doc.itertext():
    print(t)




    
FH-SWF Home

	

	

	

	

	

	

	

	

    

	

	

    
 

    

        // Definiert die Anzahl der sichtbaren Page-Rotator Elemente
        visibleRotatorItems = 5;
    

    





   
        

            

                

                    
Sprungmarken

                    
Zur Hauptnavigation springen

                    
Zur Subnavigation springen

                    
Zum Inhalt springen

                

                
Fachhochschule Südwestfalen

                

                

                    

					                            
Mobile Version >

						                    

				

                

                

                

                

  	

    
Studieninteressierte

  	
Das Studienangebot

  	
Studiengangsuche

  	
Gesamtübersicht

  	
Duales Studium / Verbundstudium
Übersicht
Verbundstudium
Kooperative Modelle
Angebote von Bildungspartnern

  	
Studium Flexibel

  	
International studieren
Internationale Angebote
Studierende von Partner