# Scraper


In [1]:
# Build the start URLs for every candidate case page.
# There are no "previous/next" links between cases, so the pages cannot be
# discovered by following links from one case to the next.
# Instead we rely on the fact that the case names are structured:
# year + 3-digit number + N (YYYY-nnnN), and enumerate every combination.
# (This is surely not the best way!)

last_year_in_data = 2019  # to be updated when the CFR adds data for new years
extract_from_year = 1995  # first year in the dataset is 1995

max_cases_per_year = 86  # highest case number within a single year (so far)

# years as strings, ready to be concatenated with the case suffixes
years = [str(y) for y in range(extract_from_year, last_year_in_data + 1)]

# case suffixes '-001N' ... '-086N' (zfill pads with zeros on the left)
case_num = ['-' + str(c).zfill(3) + 'N' for c in range(1, max_cases_per_year + 1)]

# every year combined with every case suffix, wrapped into the complete URL
cases = [
    'https://www.ekr.admin.ch/f524/' + y + c + '.html'
    for y in years
    for c in case_num
]


## Subtleties in the dataset

Cases that were judged more than once, typically because the issue was brought to another court "at second instance" or to a higher court, have a separate case number. They do not have a new entry as a new webpage with a new URL (e.g. case _[1997-032N](https://www.ekr.admin.ch/prestations/f524/1997-032N.html)_); instead, the URL redirects to the original (first related) case. In other words: second instance judgments do have a new case number, but not a new entry (URL). The URL where the original case is hosted contains the information for both the original (initial) case and the later judgments (the keywords are the same, but the text is extended to comment on the second instance judgment(s)).

We thus added two case numbers in the dataset: the original initial case number (_original_case_) from the title of the page, and the related later second instance judgment(s) (_current_case_), from the URL. Thus, anyone doing statistics on the data can choose which one they want to use.  

The consequence is that the number of _current_case_ values in the scraped data is slightly higher than the number of cases displayed on the [CFR online database](https://www.ekr.admin.ch/prestations/f518.html). However, the number of distinct _original_case_ values is the same.


In [2]:
# Imports are repeated in each cell because the kernel has to be restarted every time in Jupyter
import scrapy # version 2.2.0
import time
from scrapy.crawler import CrawlerProcess

# Remember when the run started so the elapsed time can be reported after the crawl.
t0 = time.time()
print('Running...')

class CFRSpider(scrapy.Spider):
    """Spider that turns every CFR case page into one dictionary item.

    Each URL in ``start_urls`` is downloaded and passed to :meth:`parse`,
    which reads the case header, the procedure history, the legal search
    criteria, the keywords and the descriptive text from fixed positions in
    the page markup.
    """

    name = "CFR"  # the name Scrapy uses to identify this spider
    # allowed_domains takes bare domain names; Scrapy ignores entries that are full URLs
    allowed_domains = ['ekr.admin.ch']

    start_urls = cases  # URLs constructed in the previous code chunk

    # XPath of the container element that holds all the case data
    _CONTENT = 'body/div/div[2]/div/div/div/div/div'

    # item keys for the rows of the keywords table, in the order the rows are displayed
    _KEYWORD_FIELDS = ('authors', 'victims', 'means', 'social_env', 'ideology')

    # h3 titles that are not part of the interesting data
    _TITLES_NOT_WANTED = (
        '<h3>Actualité</h3>', '<h3>Thèmes</h3>', '<h3>Bases juridiques</h3>',
        '<h3>International</h3>', '<h3>Prestations</h3>', '<h3>Publications</h3>',
        '<h3>La CFR</h3>', '<h3>Restez informé</h3>',
    )

    def parse(self, response):
        """Extract the data of one case page and yield it as a dictionary.

        Args:
            response: the Scrapy response for one case URL.

        Yields:
            dict: the fixed keys ``original_case``, ``current_case``, ``name``,
            ``location``, ``url``, ``history``, ``authors``, ``victims``,
            ``means``, ``social_env``, ``ideology`` and ``html_text``, plus one
            key per legal search criterion listed on the page.
        """
        content = self._CONTENT
        item = {}  # a dictionary to store the results

        # [4:] drops the first four characters, i.e. the leading "Cas "
        item['original_case'] = response.xpath(content + '/p/text()').extract_first()[4:]
        # the easiest is to take the case number from its fixed position in the requested URL
        item['current_case'] = response.request.url[30:39]
        # string(.) instead of ./text() is useful to also get the text inside <i>
        item['name'] = response.xpath('string(' + content + '/h2)').extract_first()
        item['location'] = response.xpath(content + '/p[2]/text()').extract_first()
        item['url'] = response.request.url

        # --- procedure history | Historique de la procédure | Verfahrensgeschichte | Cronistoria della procedura
        # all related cases in the history (year, name, decision) go into one field, separated by a |
        history_rows = response.xpath(content + '/table[1]/tr')
        item['history'] = '|'.join([row.xpath('normalize-space()').get() for row in history_rows])

        # --- legal search criteria | Critères de recherche juridiques | Juristische Suchbegriffe | Criteri di ricerca giuridici
        # Not all cases list the same number of criteria, although there seems
        # to be a fixed list of possible criteria (7).
        # We take advantage of the facts that the first column (keys) is a
        # <td class="width50pr"> and the second column (values) is a
        # <td class="verticalaligntop">.
        key_cells = response.xpath(content + '/table[2]/tr/td[contains(@class, "width50pr")]')
        for cnt, key_cell in enumerate(key_cells, start=1):
            key = key_cell.xpath('normalize-space()').get()
            # there is always only one value (second column) associated with the key
            value = response.xpath(
                'string(' + content + '/table[2]/tr[$cnt]/td[contains(@class, "verticalaligntop")])',
                cnt=cnt,
            ).getall()
            # (special characters such as \n are handled later, when cleaning the data)
            item[key] = ''.join(value)  # add as a new variable

        # --- Keywords | Mots-clés | Stichwörter | Parole chiave
        # there are at most these 5; they seem to be always displayed, always in the same order
        for row, field in enumerate(self._KEYWORD_FIELDS, start=1):
            item[field] = response.xpath(
                'string(' + content + '/table[3]/tr[' + str(row) + ']/td[2])'
            ).extract_first()

        # --- The text describing the case
        # The texts have no regular structure, so all of it is collected (with
        # its html tags) into one list: every wanted h3 title followed by the
        # <p> elements that come after it.
        # (adapted from https://stackoverflow.com/questions/45957062/how-to-select-and-extract-texts-between-two-elements)
        text = []
        for cnt, h3_selector in enumerate(response.css('h3'), start=1):
            heading = h3_selector.extract()  # the h3 title, html tags included
            if heading in self._TITLES_NOT_WANTED:
                continue
            text.append(heading)
            paragraphs = h3_selector.xpath(
                'following-sibling::p[count(preceding-sibling::h3)=$cnt]', cnt=cnt
            ).getall()
            text.append(''.join(paragraphs))  # the <p> elements in between the <h3>
        # the html tags are kept because the structure of the text (titles etc.) matters
        item['html_text'] = ''.join(text)  # all the text together, in a single string
        # (store `text` itself if the data is ever needed as a list instead)

        yield item
    
    
import os

# Name of our storage file.
raw_feed_path = '../data/CFR_' + str(extract_from_year) + '-' + str(last_year_in_data) + '_raw.json'

# Scrapy's local-file feed storage appends to an existing file. A second run
# would therefore leave two concatenated JSON arrays (invalid JSON) behind, so
# the output of any previous crawl is removed first.
if os.path.exists(raw_feed_path):
    os.remove(raw_feed_path)

# Instantiate our crawler.
process = CrawlerProcess({
    'FEED_FORMAT': 'json',              # Store data in JSON format.
    'FEED_EXPORT_ENCODING': 'utf-8',    # set encoding, for special (German and French) characters
    'FEED_URI': raw_feed_path,
    'LOG_ENABLED': False                # Turn off logging for now.
})

# Start the crawler with our spider; start() blocks until the crawl has finished.
process.crawl(CFRSpider)
process.start()
print("Done! It took " + str(time.time() - t0) + " seconds.")

Running...
Done! It took 223.2725908756256 seconds.


In [4]:
import pandas as pd
import numpy as np

# Load the raw scraped feed into a DataFrame.
# The file name is built from the year settings, the same way the crawler
# builds it, so it keeps matching when the year range is updated.
raw_json_path = '../data/CFR_' + str(extract_from_year) + '-' + str(last_year_in_data) + '_raw.json'
CFR_raw = pd.read_json(raw_json_path, orient='records')
print(CFR_raw.shape)
CFR_raw.head().T

(2150, 16)


Unnamed: 0,0,1,2,3,4
original_case,1995-003N,1995-003N,1995-002N,1995-005N,1995-004N
current_case,1995-003N,1995-007N,1995-002N,1995-005N,1995-004N
name,Strafanzeige gegen Polizeidirektor wegen Nicht...,Strafanzeige gegen Polizeidirektor wegen Nicht...,"Schriftzug: «Wir kriegen euch alle, ihr Scheis...",Leserbrief mit dem Titel «Wer trug die Schuld ...,Zeitungsartikel über eine religiöse Gruppierun...
location,Soleure,Soleure,Zurich,Grisons,Zurich
url,https://www.ekr.admin.ch/f524/1995-003N.html,https://www.ekr.admin.ch/f524/1995-007N.html,https://www.ekr.admin.ch/f524/1995-002N.html,https://www.ekr.admin.ch/f524/1995-005N.html,https://www.ekr.admin.ch/f524/1995-004N.html
history,1995 1995-007N Nichtanhandnahme (zuständige St...,1995 1995-007N Nichtanhandnahme (zuständige St...,1995 1995-002N Die zuständige Strafverfolgungs...,1995 1995-005N Die zuständige Strafverfolgungs...,1995 1995-004N Die zuständige Strafverfolgungs...
Acte / Eléments constitutifs objectifs,Art. 261bis CP / 171c CPM (aucune spécificatio...,Art. 261bis CP / 171c CPM (aucune spécificatio...,Incitation à la haine et à la discrimination (...,Négation dun génocide (al. 4 2ème phrase),Art. 261bis CP / 171c CPM (aucune spécificatio...
Objet de protection,,,Ethnie,Objet de protection en général,
Questions spécifiques sur l'élément constitutif,Elément constitutif subjectif de linfraction,Elément constitutif subjectif de linfraction,,,
authors,Employés du service public;\nExtrémistes de dr...,Employés du service public;\nExtrémistes de dr...,Extrémistes de droite;\nJeunes,Particuliers,Journalistes / éditeurs


## Some Cleaning

In [6]:
# Drop the empty rows, i.e. those whose case number (and thus URL) does not
# exist; they carry this placeholder text in the "original_case" field.
is_missing_page = CFR_raw["original_case"] == 'wurde gelöscht oder der Link auf die Seite ist falsch.'
CFR = CFR_raw[~is_missing_page]

print('number of distinct original cases:', CFR['original_case'].nunique())
print('number of distinct current cases:', CFR['current_case'].nunique())

# give the long (French) column names short names:
short_names = {
    'Acte / Eléments constitutifs objectifs': 'act',
    'Objet de protection': 'protection_object',
    "Questions spécifiques sur l'élément constitutif": 'specific_questions',
    'Autorité/Instance': 'authority',
}
CFR = CFR.rename(columns=short_names)


number of distinct original cases: 935
number of distinct current cases: 1108


In [7]:
# First turn every field that is empty or made only of whitespace into NaN,
# then replace all remaining line breaks (\n) with a space ' '.
CFR = (
    CFR
    .replace(to_replace=r'^\s*$', value=np.nan, regex=True)
    .replace(to_replace=r'\n', value=' ', regex=True)
)

In [8]:
# we add a new column with the text without the html tags in case (but we lose the structure!)
import re

# Matches one HTML tag; the non-greedy quantifier stops at the first '>'.
_HTML_TAG = re.compile(r'<.*?>')


def remove_html_tags(text):
    """Replace every HTML tag in *text* with a single space.

    Args:
        text: the string to clean. Non-string values (e.g. NaN or None
            standing in for a missing value) are accepted.

    Returns:
        The text with each tag replaced by a space, or *text* itself,
        unchanged, when it is not a string.
    """
    if not isinstance(text, str):
        return text  # nothing to strip from a missing value
    return _HTML_TAG.sub(' ', text)

# derive the plain-text column by cleaning the html_text of each row:
CFR['text'] = CFR['html_text'].map(remove_html_tags)

In [9]:
# New column layout: "authority" is placed next to the other legal search criteria.
column_order = [
    'original_case', 'current_case', 'name', 'location', 'url', 'history',
    'act', 'protection_object', 'specific_questions', 'authority',
    'authors', 'victims', 'means', 'social_env', 'ideology',
    'html_text', 'text',
]
CFR = CFR[column_order]

In [11]:
# save the data table as .csv and as .json

# Excel does not handle the data well (accents + sudden line breaks), unlike Numbers (or Python). Can we do something here?
out_base = '../data/CFR_' + str(extract_from_year) + '-' + str(last_year_in_data)

# the "sig" in utf-8-sig stands for "signature", which Microsoft software uses to detect the encoding
CFR.to_csv(path_or_buf=out_base + '.csv', index=False, encoding='utf-8-sig')

# The .json file has to contain JSON, so it is written with to_json rather
# than to_csv. orient='records' matches how the raw feed is read back, and
# force_ascii=False keeps the accented characters readable.
CFR.to_json(path_or_buf=out_base + '.json', orient='records', force_ascii=False)
           
           