## BGE Scraping

A Jupyter notebook to scrape all BGEs that contain the keyword 'svg' and are relevant for our task.
They are saved to a simple pipe-delimited CSV file with two columns: `id` (the decision id) and `bge` (the decision text).

#### Handle imports

In [1]:
import requests
import csv

from bs4 import BeautifulSoup

# Collects the href of every search hit; filled by the result-page loop
# and later iterated to scrape each decision.
link_list = []

#### Get all relevant BGE links

In [2]:
# Walk result pages 1-89 of the 'svg' BGE search and collect the link of every hit.
for i in range(1, 90):
    url = f'https://www.bger.ch/ext/eurospider/live/de/php/clir/http/index.php?lang=de&type=simple_query&page={i}&from_date=&to_date=&from_year=1954&to_year=2023&sort=relevance&insertion_date=&from_date_push=&top_subcollection_clir=bge&query_words=svg&part=all&de_fr=&de_it=&fr_de=&fr_it=&it_de=&it_fr=&orig=&translation='

    page_soup = BeautifulSoup(requests.get(url).text, 'html.parser')

    # Every hit is a <span class="rank_title">; its first anchor carries the link.
    link_list.extend(
        title_span.find('a')['href']
        for title_span in page_soup.find_all('span', class_='rank_title')
    )


In [4]:
# Sanity check: show how many links were collected.
print(len(link_list))

884


#### Get content by scraping the links

In [5]:
def scrape_url(url):
    """Download one BGE page and split its text into an id and a body.

    The text of the page's ``<div class="content">`` element is cut at its
    first period. Everything before the period, with the newline-wrapped
    'Urteilskopf' heading removed, becomes the id. Everything from the period
    onwards (the period included), with all newlines removed, becomes the
    content.

    Args:
        url: Address of the BGE page to download.

    Returns:
        A ``(formatted_id, formatted_content)`` tuple of strings on success.
        ``None`` when the request fails, the status code is not 200, the page
        has no content div, or the text contains no period. Callers must
        check for ``None`` before unpacking the result.
    """
    try:
        # A timeout keeps a single unresponsive page from stalling the whole run.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        # Network errors take the same failure path as a bad status code.
        print(f'Failed to scrape {url}')
        return None

    if response.status_code != 200:
        print(f'Failed to scrape {url}')
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    content = soup.find('div', class_='content')
    if content is None:
        # find() returns None when nothing matches; calling get_text() on it
        # would raise AttributeError.
        print(f'No content element found at {url}')
        return None

    content_text = content.get_text()
    period_index = content_text.find('.')
    if period_index == -1:
        print('No period found in the content.')
        return None

    unformatted_id = content_text[:period_index]
    unformatted_content = content_text[period_index:]
    formatted_id = unformatted_id.replace('\nUrteilskopf\n', '')
    formatted_content = unformatted_content.replace('\n', '')
    return formatted_id, formatted_content

In [6]:
# Scrape every collected link and write one pipe-delimited row per decision.
with open('scraped_bges.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file, delimiter='|')
    csv_writer.writerow(['id', 'bge'])

    for url in link_list:
        result = scrape_url(url)
        # scrape_url returns None on failure; unpacking None directly would
        # raise TypeError and abort the run with a half-written CSV.
        bge_id, bge = result if result is not None else (None, None)

        if bge_id is not None and bge is not None:
            csv_writer.writerow([bge_id, bge])
        else:
            print(f'Error for bge with URL: {url} - Either id of bge content was None.')