# Scraper


In [2]:
# Importing in each cell because need to restart kernel
import scrapy # version 2.2.0
from scrapy.crawler import CrawlerProcess


class CFRSpider(scrapy.Spider):
    name = "CFR" # Naming the spider if you  running more than one spider of this class simultaneously.

    # URL(s) to start with.
    start_urls = [
        'https://www.ekr.admin.ch/f524/2018-037N.html#2018-037N',
        'https://www.ekr.admin.ch/f524/2018-036N.html#2018-036N',
    ]

    # What to do with the URL. 
    def parse(self, response):
        
        # Yield a dictionary with the values we want.
        yield {
            
            'case': response.xpath('body/div/div[2]/div/div/div/div/div/p/text()').extract_first(),
            'name': response.xpath('body/div/div[2]/div/div/div/div/div/h2/text()').extract_first(),
            'location': response.xpath('body/div/div[2]/div/div/div/div/div/p[2]/text()').extract_first(),
            
            # procedure history | Historique de la procédure | Verfahrensgeschichte | Cronistoria della procedura
            'year': response.xpath('body/div/div[2]/div/div/div/div/div/table[1]/tr/td/text()').extract()[0],
            'link': response.xpath('body/div/div[2]/div/div/div/div/div/table[1]/tr/td[2]/span/a/@href').extract_first(),
            'history': response.xpath('string(body/div/div[2]/div/div/div/div/div/table[1]/tr/td[3])').extract_first(),
            
            # keywords | Mots-clés | Stichwörter | Parole chiave
            'authors': response.xpath('string(body/div/div[2]/div/div/div/div/div/table[3]/tr[1]/td[2])').extract_first(),
            'victims': response.xpath('string(body/div/div[2]/div/div/div/div/div/table[3]/tr[2]/td[2])').extract_first(),
            'means': response.xpath('string(body/div/div[2]/div/div/div/div/div/table[3]/tr[3]/td[2])').extract_first(),
            'social_env': response.xpath('string(body/div/div[2]/div/div/div/div/div/table[3]/tr[4]/td[2])').extract_first(),
            'ideology': response.xpath('string(body/div/div[2]/div/div/div/div/div/table[3]/tr[5]/td[2])').extract_first(),
            
            # not sure yet what is the best for the text...
            # there is no new div, so it is not possible to distinguish it from the table above in the hierarchy
            # usually there are 2 elements: "Synthèse" and "Dècision", but sometimes more.
            # we will try to get them separately, both title h3 and text p,
            # with the idea to do some Pandas magic after
            # we think that there are no more than 5 titles, let's start with 6 to be sure
            'title_1': response.xpath('body/div/div[2]/div/div/div/div/div/h3[1]/text()').extract_first(),
            'title_2': response.xpath('body/div/div[2]/div/div/div/div/div/h3[2]/text()').extract_first(),
            'title_3': response.xpath('body/div/div[2]/div/div/div/div/div/h3[3]/text()').extract_first(),
            'title_4': response.xpath('body/div/div[2]/div/div/div/div/div/h3[4]/text()').extract_first(),
            'title_5': response.xpath('body/div/div[2]/div/div/div/div/div/h3[5]/text()').extract_first(),
            'title_6': response.xpath('body/div/div[2]/div/div/div/div/div/h3[6]/text()').extract_first()
            # start at p[3], because the 2 first p are case and location
            'text_1': response.xpath('string(body/div/div[2]/div/div/div/div/div/p[3])').extract_first(),
            'text_2': response.xpath('string(body/div/div[2]/div/div/div/div/div/p[4])').extract_first(),
            'text_3': response.xpath('string(body/div/div[2]/div/div/div/div/div/p[5])').extract_first(),
            'text_4': response.xpath('string(body/div/div[2]/div/div/div/div/div/p[6])').extract_first(),
            'text_5': response.xpath('string(body/div/div[2]/div/div/div/div/div/p[7])').extract_first(),
            'text_6': response.xpath('string(body/div/div[2]/div/div/div/div/div/p[8])').extract_first()
        }
            
        page = response.url.split("/")[-1]
#         filename = 'case_%s' % page
#         with open(filename, 'wb') as f:
#             f.write(response.body)
#         self.log('Saved file %s' % filename)
        
# Instantiate our crawler.
process = CrawlerProcess({
    'FEED_FORMAT': 'json',         # Store data in JSON format.
    'FEED_URI': 'test.json',       # Name our storage file.
    'LOG_ENABLED': False           # Turn off logging for now.
})

# Start the crawler with our spider.
process.crawl(CFRSpider)
process.start()
print("Done!")

Done!


In [3]:
import pandas as pd

test = pd.read_json('test.json', orient='records')
print(test.shape)
test.head()

(2, 17)


Unnamed: 0,case,name,location,year,link,history,authors,victims,means,social_env,ideology,title_1,title_2,title_3,title_4,title_5,title_6
0,Cas 2018-036N,Empfehlung einer Untersuchung beim Joseph Mengele,Zurich,2018,/f524/2018-036N.html#2018-036N,Die zuständige Strafverfolgungsbehörde verfügt...,Particuliers,Aucune indication sur la victime,Ecrits;\nCommunication électronique,Mass media (Internet inclus);\nMédias sociaux,Antisémitisme,Synthèse,Décision,,,,
1,Cas 2018-037N,Tatbestand unbekannt,Bâle-Ville,2018,/f524/2018-037N.html#2018-037N,Der Beschuldigte wird der Rassendiskriminierun...,Aucune indication sur l'auteur,Aucune indication sur la victime,Aucune indication sur les moyens utilisés,Aucune indication sur l'environnement social,Aucune indication sur l'idéologie,Synthèse,Décision,,,,


In [5]:
test.history[0]


'Die zuständige Strafverfolgungsbehörde verfügt eine Nichtanhandnahme.'