# Scraper


In [1]:
# Build the start URLs for all the cases.
# As there are no "previous/next" links between cases,
# we cannot crawl from one case page to the next.
# Instead we rely on the fact that the case names are structured:
# year + '-' + zero-padded 3-digit number + 'N'   (e.g. 2018-001N)
# (This is surely not the best way!)

last_year_in_data = 2018 # to be updated

# years as strings, so they can be concatenated with the case numbers
years = [str(y) for y in range(2018, last_year_in_data + 1)]

# case-number suffixes '-001N' ... '-059N' (zfill pads with zeros on the left)
case_num = ['-' + str(c).zfill(3) + 'N' for c in range(1, 60)]

# every year/number combination, years in the outer loop
cases = [y + c for y in years for c in case_num]

# complete URLs:
cases = ['https://www.ekr.admin.ch/f524/' + case + '.html' for case in cases]


### Issues to check:

* 2014-005N -> several cases in history: 2015-048N, (and thus appears here?)
    * https://www.ekr.admin.ch/f524/2015-043N.html#2016-020N ==> 2 separate case numbers in the URL -> use html?p=1 ?
    * https://www.ekr.admin.ch/prestations/f524/2014-005N.html?p=1 il y a bcp de texte...
    

* 2017-029N is missing
    * because no history? 


* 2016-020N missing too

* have duplicates: 2015-043N, 2015-047N, 2017-010N (4), 2018-009N, 2018-015N (3), 

* the accents are not read well by Excel. Excel issue?

* le titre de https://www.ekr.admin.ch/prestations/f524/2018-005N.html ne contient que <<

__Decisions:__  
* scrape https://www.ekr.admin.ch/f524/2014-005N.html: no need for the case number twice, the part after `#` is just an anchor
* do not care about second-instance judgments (jugement en seconde instance): just extract everything and then remove duplicates if needed, because both pages have the full text!


In [1]:
# Importing in each cell because need to restart kernel
import scrapy # version 2.2.0
import time
from scrapy.crawler import CrawlerProcess

# Record the wall-clock start time; the end of this cell prints the elapsed seconds.
t0 = time.time()
print("Running...")

class CFRSpider(scrapy.Spider):
    """Scrape one item per case page and keep a copy of the raw HTML.

    For every URL in ``start_urls`` the spider yields a dict with the case
    header, the first row of the procedure-history table, the keyword table
    and up to six title/text pairs, then writes the response body to a
    ``case_<page>`` file in the working directory.
    """
    name = "CFR" # Spider name; tells spiders apart when several run in the same process.

    # URL(s) to start with.
    start_urls = [
       'https://www.ekr.admin.ch/f524/2014-005N.html#2014-005N',
       'https://www.ekr.admin.ch/f524/2015-048N.html#2016-020N',
    ]
#     start_urls = cases

    # What to do with the URL.
    def parse(self, response):
        """Yield the extracted fields of one case page, then save its HTML.

        Fields that are not present on the page are returned as ``None``
        (or ``''`` for the ``string(...)`` XPath lookups).
        """
        # Every field lives in the same container element.
        content = 'body/div/div[2]/div/div/div/div/div'

        def first(path):
            # First match of `path` below the content container, None if absent.
            return response.xpath(content + '/' + path).extract_first()

        def string(path):
            # String value (text of the node and all its descendants) of the first match.
            return response.xpath('string(' + content + '/' + path + ')').extract_first()

        item = {
            'case': first('p/text()'),
            'name': first('h2/text()'),
            'location': first('p[2]/text()'),

            # procedure history | Historique de la procédure | Verfahrensgeschichte | Cronistoria della procedura
            # extract_first() instead of extract()[0]: a page without a history
            # table used to raise IndexError, and with logging disabled the
            # whole case was dropped silently.
            'year': first('table[1]/tr/td/text()'),
            'link': first('table[1]/tr/td[2]/span/a/@href'),
            'history': string('table[1]/tr/td[3]'),

            # keywords | Mots-clés | Stichwörter | Parole chiave
            'authors': string('table[3]/tr[1]/td[2]'),
            'victims': string('table[3]/tr[2]/td[2]'),
            'means': string('table[3]/tr[3]/td[2]'),
            'social_env': string('table[3]/tr[4]/td[2]'),
            'ideology': string('table[3]/tr[5]/td[2]'),
        }

        # The texts are not wrapped in their own div, so they cannot be told apart
        # from the tables above by hierarchy. Usually there are 2 sections
        # ("Synthèse" and "Décision"), sometimes more; we take up to 6 title/text
        # pairs separately and sort them out with pandas afterwards.
        for i in range(1, 7):
            item['title_%d' % i] = first('h3[%d]/text()' % i)
        # start at p[3], because the 2 first p are case and location
        for i in range(1, 7):
            item['text_%d' % i] = string('p[%d]' % (i + 2))

        # Yield a dictionary with the values we want.
        yield item

        # Keep the raw page next to the notebook for later inspection.
        page = response.url.split("/")[-1]
        filename = 'case_%s' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
        
# Start from a clean feed file: the local-file feed storage appends to an
# existing file, and a second JSON array appended to test.json makes the file
# unparseable for pd.read_json.
import os
if os.path.exists('test.json'):
    os.remove('test.json')

# Instantiate our crawler.
# FEEDS replaces the FEED_FORMAT / FEED_URI pair, deprecated since Scrapy 2.1.
process = CrawlerProcess({
    'FEEDS': {'test.json': {'format': 'json'}},  # Store the items as JSON in test.json.
    'LOG_ENABLED': False           # Turn off logging for now (this also hides spider errors).
})

# Start the crawler with our spider.
# process.start() blocks until the crawl is finished; the reactor cannot be
# restarted, so the kernel has to be restarted before running this again.
process.crawl(CFRSpider)
process.start()
print("Done! It took " + str(time.time() - t0) + " seconds." )

Running...
Done! It took 1.1657781600952148 seconds.


In [1]:
# Importing in each cell because need to restart kernel
import scrapy # version 2.2.0
import time
from scrapy.crawler import CrawlerProcess

# Record the wall-clock start time; the end of this cell prints the elapsed seconds.
t0 = time.time()
print("Running...")

class CFRSpider(scrapy.Spider):
    """Scrape one item per case page, collecting titles and texts as lists.

    Unlike fixed ``title_N`` / ``text_N`` fields, this variant stores every
    ``h3`` title and every paragraph text node of the content container in
    two lists, so the number of sections per case does not matter.
    """
    name = "CFR" # Spider name; tells spiders apart when several run in the same process.

    # URL(s) to start with.
    start_urls = [
       'https://www.ekr.admin.ch/f524/2014-005N.html#2014-005N',
       'https://www.ekr.admin.ch/f524/2015-048N.html#2016-020N',
    ]
#     start_urls = cases

    # What to do with the URL.
    def parse(self, response):
        """Yield a dict with the header, history, keyword and text fields of one case page."""
        # Every field lives in the same container element.
        content = 'body/div/div[2]/div/div/div/div/div'

        def first(path):
            # First match of `path` below the content container, None if absent.
            return response.xpath(content + '/' + path).extract_first()

        def string(path):
            # String value (text of the node and all its descendants) of the first match.
            return response.xpath('string(' + content + '/' + path + ')').extract_first()

        # all the titles of the texts, and all the paragraph text nodes
        # (getall() already returns a list, no copy loop needed)
        titles = response.xpath(content + '/h3/text()').getall()
        texts = response.xpath(content + '/p/text()').getall()

        # Yield a dictionary with the values we want.
        yield {
            'case': first('p/text()'),
            'name': first('h2/text()'),
            'location': first('p[2]/text()'),

            # procedure history | Historique de la procédure | Verfahrensgeschichte | Cronistoria della procedura
            # extract_first() instead of extract()[0]: a page without a history
            # table used to raise IndexError, and with logging disabled the
            # whole case was dropped silently.
            'year': first('table[1]/tr/td/text()'),
            'link': first('table[1]/tr/td[2]/span/a/@href'),
            'history': string('table[1]/tr/td[3]'),

            # keywords | Mots-clés | Stichwörter | Parole chiave
            'authors': string('table[3]/tr[1]/td[2]'),
            'victims': string('table[3]/tr[2]/td[2]'),
            'means': string('table[3]/tr[3]/td[2]'),
            'social_env': string('table[3]/tr[4]/td[2]'),
            'ideology': string('table[3]/tr[5]/td[2]'),

            'titles': titles,

            # start at p[3], because the 2 first p are case and location
            # NOTE(review): the list is 0-indexed and holds text nodes, not <p>
            # elements, so [3:] skips three text nodes while the comment above
            # speaks of two paragraphs. Confirm whether [2:] was intended.
            'texts': texts[3:]
        }
            
        
# Start from a clean feed file: the local-file feed storage appends to an
# existing file, and a second JSON array appended to test.json makes the file
# unparseable for pd.read_json.
import os
if os.path.exists('test.json'):
    os.remove('test.json')

# Instantiate our crawler.
# FEEDS replaces the FEED_FORMAT / FEED_URI pair, deprecated since Scrapy 2.1.
process = CrawlerProcess({
    'FEEDS': {'test.json': {'format': 'json'}},  # Store the items as JSON in test.json.
    'LOG_ENABLED': False           # Turn off logging for now (this also hides spider errors).
})

# Start the crawler with our spider.
# process.start() blocks until the crawl is finished; the reactor cannot be
# restarted, so the kernel has to be restarted before running this again.
process.crawl(CFRSpider)
process.start()
print("Done! It took " + str(time.time() - t0) + " seconds." )

Running...
Done! It took 2.0776572227478027 seconds.


In [1]:
# Importing in each cell because need to restart kernel
import scrapy # version 2.2.0
import time
from scrapy.crawler import CrawlerProcess

# Record the wall-clock start time; the end of this cell prints the elapsed seconds.
t0 = time.time()
print("Running...")

class CFRSpider(scrapy.Spider):
    """Scrape one item per case page, collecting titles and texts as lists."""
    name = "CFR" # Spider name; tells spiders apart when several run in the same process.
    # allowed_domains takes bare domain names, not URLs: an entry with a scheme
    # and a path is rejected by Scrapy's offsite filtering.
    allowed_domains = ['ekr.admin.ch']
    # URL(s) to start with.
    start_urls = [
       'https://www.ekr.admin.ch/f524/2014-005N.html',
       'https://www.ekr.admin.ch/f524/2016-020N.html',
    ]
#     start_urls = cases

    # What to do with the URL.
    def parse(self, response):
        """Yield a dict with the header, history, keyword and text fields of one case page."""
        # the css styles are not really useful in these webpages... thus we use xpath
        # Every field lives in the same container element.
        content = 'body/div/div[2]/div/div/div/div/div'

        def first(path):
            # First match of `path` below the content container, None if absent.
            return response.xpath(content + '/' + path).extract_first()

        def string(path):
            # String value (text of the node and all its descendants) of the first match.
            return response.xpath('string(' + content + '/' + path + ')').extract_first()

        # all the titles of the texts, and all the paragraph text nodes
        # (getall() already returns a list, no copy loop needed)
        titles = response.xpath(content + '/h3/text()').getall()
        texts = response.xpath(content + '/p/text()').getall()

        # The dictionary with the values we want.
        item = {}

        item['case'] = first('p/text()')
        item['name'] = first('h2/text()')
        item['location'] = first('p[2]/text()')

        # procedure history | Historique de la procédure | Verfahrensgeschichte | Cronistoria della procedura
        # extract_first() instead of extract()[0]: a page without a history
        # table used to raise IndexError, and with logging disabled the whole
        # case was dropped silently.
        item['year'] = first('table[1]/tr/td/text()')
        item['link'] = first('table[1]/tr/td[2]/span/a/@href')
        item['history'] = string('table[1]/tr/td[3]')

        # keywords | Mots-clés | Stichwörter | Parole chiave
        item['authors'] = string('table[3]/tr[1]/td[2]')
        item['victims'] = string('table[3]/tr[2]/td[2]')
        item['means'] = string('table[3]/tr[3]/td[2]')
        item['social_env'] = string('table[3]/tr[4]/td[2]')
        item['ideology'] = string('table[3]/tr[5]/td[2]')

        item['titles'] = titles

        # start at p[3], because the 2 first p are case and location
        # NOTE(review): the list is 0-indexed and holds text nodes, not <p>
        # elements, so [3:] skips three text nodes while the comment above
        # speaks of two paragraphs. Confirm whether [2:] was intended.
        item['texts'] = texts[3:]

        yield item

            
        
# Start from a clean feed file: the local-file feed storage appends to an
# existing file, and a second JSON array appended to test.json makes the file
# unparseable for pd.read_json.
import os
if os.path.exists('test.json'):
    os.remove('test.json')

# Instantiate our crawler.
# FEEDS replaces the FEED_FORMAT / FEED_URI pair, deprecated since Scrapy 2.1.
process = CrawlerProcess({
    'FEEDS': {'test.json': {'format': 'json'}},  # Store the items as JSON in test.json.
    'LOG_ENABLED': False           # Turn off logging for now (this also hides spider errors).
})

# Start the crawler with our spider.
# process.start() blocks until the crawl is finished; the reactor cannot be
# restarted, so the kernel has to be restarted before running this again.
process.crawl(CFRSpider)
process.start()
print("Done! It took " + str(time.time() - t0) + " seconds." )

Running...
Done! It took 1.3137657642364502 seconds.


In [2]:
# Importing in each cell because need to restart kernel
import scrapy # version 2.2.0
import time
from scrapy.crawler import CrawlerProcess

# Record the wall-clock start time; the end of this cell prints the elapsed seconds.
t0 = time.time()
print("Running...")

class CFRSpider(scrapy.Spider):
    """Scrape the header and the full text (as HTML) of every generated case URL.

    The text sections of a case page have no regular structure, so instead of
    fixed fields the spider collects every wanted ``<h3>`` title together with
    the ``<p>`` paragraphs that follow it, keeping the HTML tags.
    """
    name = "CFR" # Spider name; tells spiders apart when several run in the same process.
    # allowed_domains takes bare domain names, not URLs: an entry with a scheme
    # and a path is rejected by Scrapy's offsite filtering.
    allowed_domains = ['ekr.admin.ch']
#     start_urls = [
#         'https://www.ekr.admin.ch/f524/2014-005N.html',
#         'https://www.ekr.admin.ch/f524/2016-020N.html'
#     ]
    # `cases` is the URL list built in the first cell of the notebook;
    # that cell has to be run before this one.
    start_urls = cases

    def parse(self, response):
        """Yield a dict with case, name, location and the HTML text of one case page."""
        content = 'body/div/div[2]/div/div/div/div/div'

        item = {} # a dictionary to store the results

        item['case'] = response.xpath(content + '/p/text()').extract_first()
        item['name'] = response.xpath(content + '/h2/text()').extract_first()
        item['location'] = response.xpath(content + '/p[2]/text()').extract_first()

        # procedure history | Historique de la procédure | Verfahrensgeschichte | Cronistoria della procedura

        text = [] # collect all the text with html tags in one list
        # we do this, because there is no regular structure in the texts displayed

        # h3 titles that are not part of the interesting data:
        titles_not_wanted = {'<h3>Actualité</h3>', '<h3>Thèmes</h3>', '<h3>Bases juridiques</h3>', '<h3>International</h3>', '<h3>Prestations</h3>', '<h3>Publications</h3>', '<h3>La CFR</h3>', '<h3>Restez informé</h3>'}

        for h3_selector in response.css('h3'):
            key = h3_selector.extract() # <-with html tags | without-> xpath('normalize-space()').get() # get the h3 title's text
            if key in titles_not_wanted:
                continue
            text.append(key) # append the h3 title

            # Rank of this h3 among its own siblings. The paragraphs that belong
            # to it are the following <p> siblings with exactly that many <h3>
            # siblings before them. Counting locally (instead of using the
            # position of the h3 in the whole page) keeps the match correct
            # when h3 elements outside the content container come first in
            # the document.
            cnt = len(h3_selector.xpath('preceding-sibling::h3')) + 1
            values = h3_selector.xpath('following-sibling::p[count(preceding-sibling::h3)=$cnt]', cnt=cnt).getall()
            text.append(''.join(values)) # add the paragraphs <p> in between the <h3>

        item['html_text'] = ''.join(text) # all the text together, in a single string (with html tags)
        item['html_text_as_list'] = text # all the text together, as a list (with html tags)

        yield item
    
    
# Start from a clean feed file: the local-file feed storage appends to an
# existing file, and a second JSON array appended to test.json makes the file
# unparseable for pd.read_json.
import os
if os.path.exists('test.json'):
    os.remove('test.json')

# Instantiate our crawler.
# FEEDS replaces the FEED_FORMAT / FEED_URI pair, deprecated since Scrapy 2.1.
process = CrawlerProcess({
    'FEEDS': {'test.json': {'format': 'json'}},  # Store the items as JSON in test.json.
    'LOG_ENABLED': False           # Turn off logging for now (this also hides spider errors).
})

# Start the crawler with our spider.
# process.start() blocks until the crawl is finished; the reactor cannot be
# restarted, so the kernel has to be restarted before running this again.
process.crawl(CFRSpider)
process.start()
print("Done! It took " + str(time.time() - t0) + " seconds." )

Running...
Done! It took 9.980156183242798 seconds.


In [3]:
import pandas as pd

# Load the scraped feed (a JSON array with one record per case) into a DataFrame.
test = pd.read_json(path_or_buf='test.json', orient='records')
print(test.shape)
# Preview the first rows transposed, so that every field gets its own line.
test.head().transpose()

(59, 5)


Unnamed: 0,0,1,2,3,4
case,Cas 2018-002N,Cas 2018-001N,Cas 2018-006N,Cas 2018-008N,Cas 2018-010N
name,Internationale Konferenz der Anti-Zensur-Koali...,"Gens du voyage, travaux de construction",Hausfriedensbruchs in der Asylunterkunft,Provokativen Postkarten von Sharia_pride II,Rassistisches Video
location,Grisons,Fribourg,Bâle-Campagne,Schwyz,Berne
html_text_as_list,"[<h3>Synthèse</h3>, <p>Der Beschuldigte gründe...","[<h3>Synthèse</h3>, <p>Plusieurs associations ...","[<h3>Synthèse</h3>, <p>Der Beschuldigte betrat...","[<h3>Synthèse</h3>, <p>In der Nacht wurden Pos...","[<h3>Synthèse</h3>, <p>Der Beschuldigte – welc..."
html_text_in_one,<h3>Synthèse</h3><p>Der Beschuldigte gründete ...,<h3>Synthèse</h3><p>Plusieurs associations fri...,<h3>Synthèse</h3><p>Der Beschuldigte betrat oh...,<h3>Synthèse</h3><p>In der Nacht wurden Postka...,<h3>Synthèse</h3><p>Der Beschuldigte – welcher...


In [5]:
# Scratch inspection of the text of a single scraped case.
# Leftover experiments on columns from an earlier layout of the feed:
# test["En droit / considérants_4"][0]
# test[pd.notna(test["Décision_5"])]

# Join the <h3>/<p> fragments of one case into a single HTML string.
# (The former `test.head()` in the middle of this cell was removed: only the
# last expression of a cell is displayed, so its result was discarded.)
# NOTE(review): rows seem to be stored in the order the responses arrived, so
# position 12 may point at a different case after every crawl - select by the
# `case` column for a stable lookup.
''.join(test.html_text_as_list[12])


'<h3>Synthèse</h3><p>Der Beschuldigte verfasste mit seinem Mobiltelefon auf dem Facebook-Profil der Zeitschrift Blick unter dem Artikel « <i>Zu wenig Tage in der Schweiz verbracht - Ägypter wird ausgeschafft </i>», die folgenden Kommentare: « <i>Na endlich! Christen zu Christen, Araber zu Araber! </i>»; «<i>Und genau deswegen MUSS der Islam bekämpft und ausgerottet werden </i>»; «<i>Und genau darum muss der Islam ausgerottet werden, Ich figge Allah i Arsch. </i>».<br>\nNach Ansicht der Staatsanwaltschaft rief der Beschuldigte mit diesen Kommentaren auf einer öffentlich zugänglichen Internetplattform wissentlich und willentlich gegen Angehörige der Religion des Islams zum Hass auf und verstiess somit gegen Art. 261<sup>bis</sup> StGB.<br>\n</p><h3>Décision</h3><p>Der Beschuldigte wird wegen Rassendiskriminierung schuldig erklärt und wird mit einer Geldstrafe von 60 Tagessätzen zu je CHF 30.00 und eine Busse von CHF 400.00 bestraft. Die Geldstrafe wird bedingt ausgesprochen bei einer Pro

In [6]:
# Show the same record as a list: <h3> titles alternating with their <p> paragraphs.
test['html_text_as_list'][12]

['<h3>Synthèse</h3>',
 '<p>Der Beschuldigte hat auf einem Fussballplatz in Anwesenheit von Kindern und anderen Zuschauern einen Mann als « <i>Dreckneger</i> » betitelt und ihm gesagt « <i>Wenn ich eine Banane werfen würde, würdest du wie ein Affe die Banane holen</i> ».Diese öffentliche Herabsetzung in der Menschenwürde verstösst nach Meinung der zuständigen Strafverfolgungsbehörde gegen das Verbot der Rassendiskriminierung in Art. 261<sup>bis</sup> Abs. 4 StGB, weshalb der Beschuldigte zu einer Geldstrafe verurteilt wurde.<br>\n</p>',
 '<h3>Décision</h3>',
 '<p>Der Beschuldigte wird wegen Rassendiskriminierung (Art. 261<sup>bis</sup> Abs. 4 StGB), für schuldig erklärt und wird mit einer Geldstrafe von 30 Tagessätzen zu je CHF 50.00, ausmachend CHF 300.00 bestraft. Die Geldstrafe wird bedingt ausgesprochen bei einer Probezeit von 3 Jahren. Die Kosten des Verfahrens im Umfang von CHF 530.00 werden dem Beschuldigten auferlegt.<br>\n</p>']

In [None]:
# some cleaning 

In [6]:
# save data table as .csv
# 'utf-8-sig' writes a byte-order mark at the start of the file, which Excel
# needs in order to recognise the file as UTF-8; without it the accented
# characters (é, è, ü, ...) are displayed garbled.

test.to_csv(path_or_buf='test.csv', encoding='utf-8-sig')