# Crawling von Stellenanzeigen mit `scrapy`

Mithilfe des Pakets `scrapy` wollen wir heute Stellenangebote von [stellenanzeigen.de](https://www.stellenanzeigen.de) crawlen und in einer *MongoDB* speichern. 

In [None]:
import scrapy
from scrapy.linkextractors import LinkExtractor

class Job(scrapy.Item):
    """Item holding one scraped job posting."""
    # Text of the posting's first <h1> element, as filled in by the spider.
    header = scrapy.Field()
    # Markup of the posting document, with surrounding whitespace stripped.
    text = scrapy.Field()
    # URL associated with the posting; the spider normally takes it from
    # the Referer header of the request that fetched the posting.
    link = scrapy.Field()
    
class JobSpider(scrapy.Spider):
    """Spider that crawls job postings on stellenanzeigen.de.

    Pages are followed through their ``a.position-link`` anchors. A job
    page embeds the actual posting in an ``iframe.jobview-iframe``; that
    iframe document is requested with a higher priority and converted
    into a ``Job`` item by :meth:`parse_iframe`.
    """

    name = "stellenanzeigen.de"

    custom_settings = {
        'DOWNLOAD_FAIL_ON_DATALOSS': False,
        'DOWNLOAD_DELAY': 0.5,
        'ITEM_PIPELINES': {
            # The pipeline class lives in the running notebook/script,
            # hence the ``__main__`` module path.
            '__main__.MongoPipeline': 100,
        }
    }

    # Plain strings are passed on purpose: the original parenthesised
    # values such as ('iframe') were strings as well, not tuples.
    link_extractor = LinkExtractor(restrict_css='a.position-link')
    iframe_extractor = LinkExtractor(
        tags='iframe',
        attrs='src',
        restrict_css='iframe.jobview-iframe',
    )

    def start_requests(self):
        """Yield the initial request(s) the crawl starts from."""
        urls = [
            "https://www.stellenanzeigen.de/stellenangebote/it/",
            # Single posting, handy for debugging the iframe handling:
            # "https://www.stellenanzeigen.de/job/softwareentwickler-python-m-w-d-2896251/"
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Schedule posting iframes and further job links found on a page.

        Yields:
            scrapy.Request: one request per embedded posting iframe
            (priority 100, handled by :meth:`parse_iframe`) followed by
            one request per ``a.position-link`` anchor (handled by this
            method again).
        """
        for link in self.iframe_extractor.extract_links(response):
            print(f"link: {link}")
            yield scrapy.Request(link.url, priority=100, callback=self.parse_iframe)

        for link in self.link_extractor.extract_links(response):
            print(f"link: {link}")
            yield scrapy.Request(link.url, callback=self.parse)

    def parse_iframe(self, response):
        """Turn a downloaded posting iframe into ``Job`` items.

        The item's ``link`` is the Referer of the iframe request, i.e. the
        page that embedded the iframe. If the request carries no Referer
        header, the iframe's own URL is used instead of failing.

        Yields:
            Job: one item per ``html`` element of the response.
        """
        # The header value is bytes or None; calling .decode() on None
        # would raise AttributeError, so guard it. The value is the same
        # for every item, so it is resolved once before the loop.
        referer = response.request.headers.get('Referer')
        if referer is not None:
            job_url = referer.decode('latin1')
        else:
            job_url = response.url

        for job in response.css('html'):
            markup = job.get()
            print(f"job: {markup}")
            item = Job()
            item['header'] = job.css('h1 ::text').get()
            item['text'] = markup.strip()
            item['link'] = job_url
            yield item
            

In [None]:
import json

class JSONPipeline:
    """Item pipeline that appends every scraped item to a JSON Lines file.

    The file ``jobs.jl`` is created (or truncated) in the current working
    directory when the spider opens and closed when the spider closes.
    """

    def open_spider(self, spider):
        """Open the output file for writing."""
        self.file = open('jobs.jl', 'w')

    def close_spider(self, spider):
        """Close the output file."""
        self.file.close()

    def process_item(self, item, spider):
        """Write *item* as one JSON line and pass it on unchanged."""
        print(f"processing: {item}")
        serialized = json.dumps(dict(item))
        self.file.write(f"{serialized}\n")
        return item

In [None]:
import pymongo

class MongoPipeline:
    """Item pipeline that stores every scraped item in a MongoDB collection.

    The connection URI and database name are read from the crawler
    settings ``MONGO_URI`` and ``MONGO_DATABASE`` (default ``'items'``).
    """

    # Name of the collection all items are inserted into.
    collection_name = 'scrapy_items'

    def __init__(self, mongo_uri, mongo_db):
        """Remember the connection parameters; no connection is opened yet."""
        self.mongo_uri, self.mongo_db = mongo_uri, mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the settings of *crawler*."""
        settings = crawler.settings
        uri = settings.get('MONGO_URI')
        database = settings.get('MONGO_DATABASE', 'items')
        return cls(mongo_uri=uri, mongo_db=database)

    def open_spider(self, spider):
        """Create the MongoDB client and select the configured database."""
        client = pymongo.MongoClient(self.mongo_uri)
        self.client = client
        self.db = client[self.mongo_db]

    def close_spider(self, spider):
        """Close the MongoDB client."""
        self.client.close()

    def process_item(self, item, spider):
        """Insert a dict copy of *item* and pass the item on unchanged."""
        collection = self.db[self.collection_name]
        collection.insert_one(dict(item))
        return item

In [None]:
from scrapy.crawler import CrawlerProcess

# Settings for the crawl: MONGO_URI and MONGO_DATABASE are the keys
# MongoPipeline.from_crawler reads; DOWNLOAD_DELAY is also set in the
# spider's custom_settings.
process = CrawlerProcess(
    settings={
        'MONGO_URI': "mongodb://172.22.0.2:27017",
        'MONGO_DATABASE': 'jobs',
        'DOWNLOAD_DELAY': 0.5,
        # Optional user agent override, currently disabled:
        #'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    },
)

# Register the spider, then run the crawl.
process.crawl(JobSpider)
process.start()

In [None]:
# Shell escapes: print the working directory and list its contents.
!pwd
!ls

In [None]:
# Shell escape: list the installed Python packages with their versions.
!pip3 freeze