In [None]:
!pip install requests --quiet
!pip install beautifulsoup4 --quiet
!pip install pandas --quiet
!pip install datetime --quiet

**Imports**

In [1]:
import pandas as pd
import numpy as np
import requests
import datetime
from bs4 import BeautifulSoup

**Functions**

In [96]:
def read_metadata(filename):
    '''
    reads in metadata from the api.py file and adds an empty column where the content of the pages will be in the end
    '''
    data = pd.read_csv(filename)
    data['Content'] = None
    return data

def get_url(cellar_ref, doctype="03"):
    '''
    creates a url based on the cellar reference in the metadata, which will be used to scrape the content
    '''
    psid = cellar_ref
    psname = "cellar" # other options: cellar, celex, oj...
    lancode = "0006" # language code
    doctype = doctype # default: 03
    docnum = "DOC_1"
    # for further information, see Documentation Page 37: https://op.europa.eu/en/publication-detail/-/publication/50ecce27-857e-11e8-ac6a-01aa75ed71a1/language-en/format-PDF/source-73059305
    return f"http://publications.europa.eu/resource/{psname}/{psid}.{lancode}.{doctype}/{docnum}"

def get_content(URL):
    '''
    main function, scrapes content. added some code to catch errors.
    '''
    response = requests.get(URL, headers={"Accept-Language":"en-US"})
    # one minor bug still in there: some requests (for example number 58 in 20220601_larger_data_b) are a valid request but have to download many mb first. the solution would be to stop the request.get if it runs longer than x seconds
    try:
        soup = BeautifulSoup(response.content, "html.parser")
        if str(soup)[1:4] == "PDF":
            '''
            in some (few) cases, the doctype is not 03 but 02. change it for these cases
            '''
            print("pdf detected, but fixed")
            doctype = '02'
            URL = URL[:-8] + doctype + URL[-6:]
            response = requests.get(URL, headers={"Accept-Language":"en-US"})
            soup = BeautifulSoup(response.content, "html.parser")
        else:
            print("no problem here")
            doctype = '03'
    except:
        '''
        in case there is an error
        '''
        print("yes problem here")
        URL = URL[:-8] + '02' + URL[-6:]
        response = requests.get(URL, headers={"Accept-Language":"en-US"})
        soup = BeautifulSoup(response.content, "html.parser")
        
    if soup.find("p", class_="oj-normal") == None:
        content = ' '.join([item.text for item in soup.find_all("p", class_="normal")])
    else:
        content = ' '.join([item.text for item in soup.find_all("p", class_="oj-normal")])
    return content

def clean_data(data):
    '''
    takes scraped data and removes rows which contain, no information, information in non-english and the head of all the valid content
    '''
    data = data[data['Content'] != ""]
    data = data[data['Content'].str[0:3] == 'THE'] #remove content in other languages
    data = data[data['Content'].str.contains('Whereas: ')] # contains the split word
    data.loc[:, 'Content'] = data['Content'].apply(lambda x: x.split('Whereas: ', 1)[1]) # split off header
    data = data[data['Content'].str[0:3] == "(1)"] #gotta make sure it's standardized!
    return data.reset_index().drop(columns = "index")

def get_all_content(data):
    '''
    loops over the functions to get all content
    '''
    cellar_references = data['cellar']    
    for index, ref in enumerate(cellar_references):
        data.loc[index, 'Content'] = get_content(get_url(ref))
        print(f'Row {index} with cellar-number {ref} done')
    return data

def get_all_content_with_splitting(data, batchsize=500, path ="../raw_data/", filename_without_csv="20220602"):
    '''
    loops over the functions to get all content. for more than 500 files the process will be split up
    '''
    if len(data) > batchsize:
        remaining = len(data) % batchsize
        iterations = int((len(data)-remaining) / batchsize)
        for batch in range(0,iterations):
            tmp = data.iloc[batch*batchsize:(batch+1)*batchsize]
            cellar_references = tmp['cellar'] 
            for index, ref in enumerate(cellar_references):
                tmp.loc[index, 'Content'] = get_content(get_url(ref))
                print(f'Row {index} from batch {batch} with cellar-number {ref} done')
                tmp.to_csv(f"{path}{filename_without_csv}_tmp_batch{batch}.csv")
        # add remaining rows
        #return "partitioned data can be found in '../raw_data/'
    else:
        cellar_references = data['cellar']    
        for index, ref in enumerate(cellar_references):
            data.loc[index, 'Content'] = get_content(get_url(ref))
            print(f'Row {index} with cellar-number {ref} done')
        return data

**Workflow**

In [None]:
%%time
#retrieve metadata
path = "../raw_data/"
filename_without_csv = "20220602"
data = read_metadata(path + filename_without_csv + '.csv')

# subset metadata
data = data.iloc[8000:] 
# started at 12:00 at 8000

# get content
data_with_content = get_all_content_with_splitting(data, batchsize = 500)

# clean content
#data_with_content_clean = clean_data(data_with_content)

# export data to csv
#data_with_content_clean.to_csv(path + filename_without_csv + "_scraped_test.csv")

no problem here
Row 0 from batch 0 with cellar-number a9754cd1-e50a-4a3b-becb-704873aba01d done


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp.loc[index, 'Content'] = get_content(get_url(ref))


no problem here
Row 1 from batch 0 with cellar-number 700cf946-b6ac-11ea-bb7a-01aa75ed71a1 done
no problem here
Row 2 from batch 0 with cellar-number 54b35a83-cd0a-11ec-a95f-01aa75ed71a1 done




no problem here
Row 3 from batch 0 with cellar-number 0091c5ed-9d1a-11e2-ab01-01aa75ed71a1 done
no problem here
Row 4 from batch 0 with cellar-number 6fcc7323-9d00-11e7-b92d-01aa75ed71a1 done
no problem here
Row 5 from batch 0 with cellar-number 6914afe7-e9fc-11e2-99b0-01aa75ed71a1 done
no problem here
Row 6 from batch 0 with cellar-number bd856048-7547-11e4-b593-01aa75ed71a1 done
no problem here
Row 7 from batch 0 with cellar-number aae0d092-37f6-11e4-8c3c-01aa75ed71a1 done
no problem here
Row 8 from batch 0 with cellar-number 38e850de-d174-11e7-a5b9-01aa75ed71a1 done
no problem here
Row 9 from batch 0 with cellar-number 1ff55a20-14b4-11e5-8817-01aa75ed71a1 done
no problem here
Row 10 from batch 0 with cellar-number c43335ba-5e00-11e6-9b08-01aa75ed71a1 done
no problem here
Row 11 from batch 0 with cellar-number a8d3732d-a1f7-11e2-ab01-01aa75ed71a1 done




no problem here
Row 12 from batch 0 with cellar-number 5b83c6fa-e2cb-11e6-ad7c-01aa75ed71a1 done
no problem here
Row 13 from batch 0 with cellar-number 77e17a4b-2632-4db7-b455-5b6b1ec9ca01 done
no problem here
Row 14 from batch 0 with cellar-number 342a85d5-4c74-11e9-a8ed-01aa75ed71a1 done
no problem here
Row 15 from batch 0 with cellar-number 1212ce4f-60ef-11e4-9cbe-01aa75ed71a1 done
no problem here
Row 16 from batch 0 with cellar-number 0e086e57-bca1-11e8-99ee-01aa75ed71a1 done
no problem here
Row 17 from batch 0 with cellar-number b289b30c-b243-48ea-a05d-c3df94c08eb2 done
no problem here
Row 18 from batch 0 with cellar-number b8d9fc04-9518-11ec-b4e4-01aa75ed71a1 done




no problem here
Row 19 from batch 0 with cellar-number e7f353fd-86ab-4913-ad4b-1fa1be4d436e done
no problem here
Row 20 from batch 0 with cellar-number bf338ccc-1dc7-11e2-91ce-01aa75ed71a1 done




no problem here
Row 21 from batch 0 with cellar-number b646b0cd-ca4d-11ea-adf7-01aa75ed71a1 done
no problem here
Row 22 from batch 0 with cellar-number 343d2f1f-fb82-4c5a-b571-9ba1e8443112 done
no problem here
Row 23 from batch 0 with cellar-number 5d2498d4-fcc2-11e1-8e28-01aa75ed71a1 done




no problem here
Row 24 from batch 0 with cellar-number f6a8dd1e-8308-11ea-bf12-01aa75ed71a1 done
no problem here
Row 25 from batch 0 with cellar-number bd89b5db-efc2-11e4-a3bf-01aa75ed71a1 done
no problem here
Row 26 from batch 0 with cellar-number 5c64b5e2-7e89-11e3-b889-01aa75ed71a1 done
no problem here
Row 27 from batch 0 with cellar-number c16b7c33-f10a-11e2-a22e-01aa75ed71a1 done
no problem here
Row 28 from batch 0 with cellar-number 27445b50-49f0-11e8-be1d-01aa75ed71a1 done
no problem here
Row 29 from batch 0 with cellar-number 9c3dc100-1d84-11e6-ba9a-01aa75ed71a1 done
no problem here
Row 30 from batch 0 with cellar-number 9f92e418-04c1-11e3-a352-01aa75ed71a1 done
no problem here
Row 31 from batch 0 with cellar-number a152eb07-ee20-11e8-b690-01aa75ed71a1 done
no problem here
Row 32 from batch 0 with cellar-number b0d03ba3-f75f-11e7-b8f5-01aa75ed71a1 done
no problem here
Row 33 from batch 0 with cellar-number 86fd1748-0eac-11ec-b771-01aa75ed71a1 done
no problem here
Row 34 from ba



no problem here
Row 36 from batch 0 with cellar-number 54f7c03b-a104-11e2-ab01-01aa75ed71a1 done
no problem here
Row 37 from batch 0 with cellar-number 8ba0a623-9099-11ec-b4e4-01aa75ed71a1 done
no problem here
Row 38 from batch 0 with cellar-number a7da69e9-547f-11ea-aece-01aa75ed71a1 done
no problem here
Row 39 from batch 0 with cellar-number 781ee249-4ac5-11e4-a0cb-01aa75ed71a1 done
no problem here
Row 40 from batch 0 with cellar-number 03e03497-f903-11e3-831f-01aa75ed71a1 done
no problem here
Row 41 from batch 0 with cellar-number 8b144ef5-3b91-11e7-a08e-01aa75ed71a1 done
no problem here
Row 42 from batch 0 with cellar-number 7e066043-9cff-11e7-b92d-01aa75ed71a1 done
no problem here
Row 43 from batch 0 with cellar-number f18b4e67-8e01-11e8-8a53-01aa75ed71a1 done
no problem here
Row 44 from batch 0 with cellar-number 7b2ebd92-12e4-4f73-9a64-dc690becb429 done
no problem here
Row 45 from batch 0 with cellar-number ba726d7d-114a-11e3-8d1c-01aa75ed71a1 done
no problem here
Row 46 from ba



no problem here
Row 49 from batch 0 with cellar-number 5b36e86c-fc38-11e3-831f-01aa75ed71a1 done
no problem here
Row 50 from batch 0 with cellar-number 8d5afbdf-fe91-11e3-831f-01aa75ed71a1 done
no problem here
Row 51 from batch 0 with cellar-number 5b475c91-36a0-11e7-a08e-01aa75ed71a1 done
no problem here
Row 52 from batch 0 with cellar-number df49da7d-c1d6-11e6-a6db-01aa75ed71a1 done
no problem here
Row 53 from batch 0 with cellar-number f123fc8c-c60a-11e8-9424-01aa75ed71a1 done
no problem here
Row 54 from batch 0 with cellar-number 87817dc1-ea7d-11e5-a2a7-01aa75ed71a1 done
no problem here
Row 55 from batch 0 with cellar-number 1f97b10c-ad3c-11eb-9767-01aa75ed71a1 done
no problem here
Row 56 from batch 0 with cellar-number 421f21bb-3704-11e3-a86c-01aa75ed71a1 done
no problem here
Row 57 from batch 0 with cellar-number 2b72e998-8a3f-11e2-b5c3-01aa75ed71a1 done




no problem here
Row 58 from batch 0 with cellar-number 9e70487d-d7fb-40b6-a642-dc5536ef5d12 done
no problem here
Row 59 from batch 0 with cellar-number edfe7ab1-c94d-11e4-bbe1-01aa75ed71a1 done
no problem here
Row 60 from batch 0 with cellar-number e96dd688-e400-11e4-b1d3-01aa75ed71a1 done
no problem here
Row 61 from batch 0 with cellar-number a8b39684-f0f9-11e1-8e28-01aa75ed71a1 done




no problem here
Row 62 from batch 0 with cellar-number 3918d2a6-36b5-11ea-ba6e-01aa75ed71a1 done
no problem here
Row 63 from batch 0 with cellar-number 61dfb603-0e00-11e8-966a-01aa75ed71a1 done
no problem here
Row 64 from batch 0 with cellar-number d28eef9a-9a76-11e6-9bca-01aa75ed71a1 done
no problem here
Row 65 from batch 0 with cellar-number 5b09cc6c-9dcb-11e7-b92d-01aa75ed71a1 done
no problem here
Row 66 from batch 0 with cellar-number cd9701fe-0849-11e7-8a35-01aa75ed71a1 done
no problem here
Row 67 from batch 0 with cellar-number a823dbb9-44cf-11e6-9c64-01aa75ed71a1 done
no problem here
Row 68 from batch 0 with cellar-number f393df18-84ec-11e4-91cd-01aa75ed71a1 done
no problem here
Row 69 from batch 0 with cellar-number 98bb494f-7f9a-11e5-b8b7-01aa75ed71a1 done
no problem here
Row 70 from batch 0 with cellar-number 9fc958ec-034b-11e3-a352-01aa75ed71a1 done
no problem here
Row 71 from batch 0 with cellar-number 6c6f32df-13ea-11e5-8817-01aa75ed71a1 done
no problem here
Row 72 from ba



no problem here
Row 80 from batch 0 with cellar-number 54f82a33-7597-11e6-b076-01aa75ed71a1 done
no problem here
Row 81 from batch 0 with cellar-number 1610abc4-7bd1-11e4-97c9-01aa75ed71a1 done
no problem here
Row 82 from batch 0 with cellar-number 23fc0846-935e-11ea-aac4-01aa75ed71a1 done
no problem here
Row 83 from batch 0 with cellar-number 9a5fd854-bde5-4650-a8e0-1d67050b0cfa done
no problem here
Row 84 from batch 0 with cellar-number ceb17fdb-cead-11e3-b682-01aa75ed71a1 done
no problem here
Row 85 from batch 0 with cellar-number 732ac5bb-46d6-4bf9-9890-af178f33c96c done
no problem here
Row 86 from batch 0 with cellar-number 3add343a-55dc-11e6-89bd-01aa75ed71a1 done
no problem here
Row 87 from batch 0 with cellar-number bcb4b4f8-4a27-11e9-a8ed-01aa75ed71a1 done
no problem here
Row 88 from batch 0 with cellar-number 2e3fe5f1-a91e-11e4-8e01-01aa75ed71a1 done
no problem here
Row 89 from batch 0 with cellar-number 6db42361-0206-11e6-b713-01aa75ed71a1 done
no problem here
Row 90 from ba



no problem here
Row 114 from batch 0 with cellar-number e84b4a5e-0269-11e8-b8f5-01aa75ed71a1 done
no problem here
Row 115 from batch 0 with cellar-number a00abef6-df07-11e2-9165-01aa75ed71a1 done




no problem here
Row 116 from batch 0 with cellar-number 1e8be5d2-6f8e-11e2-9294-01aa75ed71a1 done


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 117 from batch 0 with cellar-number e234c41f-9e90-11eb-b85c-01aa75ed71a1 done
no problem here
Row 118 from batch 0 with cellar-number bb4b1db6-3e52-11e4-8c3c-01aa75ed71a1 done
no problem here
Row 119 from batch 0 with cellar-number c36e55d1-be4b-11e4-bbe1-01aa75ed71a1 done
no problem here
Row 120 from batch 0 with cellar-number 0f126219-de57-11e2-9165-01aa75ed71a1 done
no problem here
Row 121 from batch 0 with cellar-number 9bdc4387-c755-11e6-a6db-01aa75ed71a1 done
no problem here
Row 122 from batch 0 with cellar-number c7a2b837-5370-11e4-a0cb-01aa75ed71a1 done
no problem here
Row 123 from batch 0 with cellar-number e7eedca3-ee74-11eb-a71c-01aa75ed71a1 done
no problem here
Row 124 from batch 0 with cellar-number d7b1f15c-1a79-11eb-b57e-01aa75ed71a1 done
no problem here
Row 125 from batch 0 with cellar-number 3eb36f76-0673-11e4-831f-01aa75ed71a1 done
no problem here
Row 126 from batch 0 with cellar-number 26362af8-6203-11eb-aeb5-01aa75ed71a1 done
no problem h

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 133 from batch 0 with cellar-number 2b0ee521-00ad-11ea-8c1f-01aa75ed71a1 done
no problem here
Row 134 from batch 0 with cellar-number a56bb904-43f9-11eb-b59f-01aa75ed71a1 done
no problem here
Row 135 from batch 0 with cellar-number 3cb5cd3b-1cd2-11ec-b4fe-01aa75ed71a1 done
no problem here
Row 136 from batch 0 with cellar-number 24ab7c27-5999-46e7-94ac-dd4d315ba50c done


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 137 from batch 0 with cellar-number 0166e052-9ddb-11eb-b85c-01aa75ed71a1 done
no problem here
Row 138 from batch 0 with cellar-number 3a3c4980-5edf-11ec-9c6c-01aa75ed71a1 done
no problem here
Row 139 from batch 0 with cellar-number 21586cde-8850-454d-befc-db160d4acfc1 done




no problem here
Row 140 from batch 0 with cellar-number 36ee7954-37d5-11e8-b5fe-01aa75ed71a1 done
no problem here
Row 141 from batch 0 with cellar-number 74c6f2ec-e09a-11e7-9749-01aa75ed71a1 done




no problem here
Row 142 from batch 0 with cellar-number f1145725-07c5-11e2-8e28-01aa75ed71a1 done
no problem here
Row 143 from batch 0 with cellar-number be7e05fe-9f8a-11e3-8b87-01aa75ed71a1 done
no problem here
Row 144 from batch 0 with cellar-number e880931d-fa65-11e6-8a35-01aa75ed71a1 done
no problem here
Row 145 from batch 0 with cellar-number 47780e56-352e-11eb-b27b-01aa75ed71a1 done
no problem here
Row 146 from batch 0 with cellar-number 1d9602e8-0126-499e-bfe4-cca6ad7288f0 done




no problem here
Row 147 from batch 0 with cellar-number b21bf6b3-c518-11e2-ab01-01aa75ed71a1 done


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 148 from batch 0 with cellar-number cf46df1b-e20d-11e7-9749-01aa75ed71a1 done
no problem here
Row 149 from batch 0 with cellar-number 70751579-5122-11e9-a8ed-01aa75ed71a1 done
no problem here
Row 150 from batch 0 with cellar-number 7705f6fa-423b-11e3-ae03-01aa75ed71a1 done
no problem here
Row 151 from batch 0 with cellar-number 263e49b3-876a-11e9-9f05-01aa75ed71a1 done
no problem here
Row 152 from batch 0 with cellar-number bd4772d3-23ce-11e6-86d0-01aa75ed71a1 done
no problem here
Row 153 from batch 0 with cellar-number 563bbbf5-47c6-11e5-9f5a-01aa75ed71a1 done
no problem here
Row 154 from batch 0 with cellar-number 18d1c66a-37d6-11e8-b5fe-01aa75ed71a1 done
no problem here
Row 155 from batch 0 with cellar-number bde0320b-55d1-11e4-a0cb-01aa75ed71a1 done
no problem here
Row 156 from batch 0 with cellar-number 9fd3fc1f-ad78-11e7-837e-01aa75ed71a1 done
no problem here
Row 157 from batch 0 with cellar-number 7f792e2c-56b7-11e5-afbf-01aa75ed71a1 done
no problem h

KeyboardInterrupt: 

**Read partitioned Data**

In [88]:
data_full = pd.read_csv("../raw_data/tmp/20220602_tmp_batch0.csv")

liste = list(np.arange(1,15)) # number+1 of tmp-dataframes, minus the first one

for i in liste:
    data_tmp = pd.read_csv(f"../raw_data/tmp/20220602_tmp_batch{i}.csv")
    data_tmp['Content'].iloc[0:499] = data_tmp['Content'].iloc[500:999].reset_index()['Content']
    data_tmp = data_tmp.drop(list(np.arange(500,1000)))
    data_full = data_full.append(data_tmp)
    
data_full = data_full.reset_index().drop(columns = ['Unnamed: 0.1', 'Unnamed: 0', 'index'])
data_full = clean_data(data_full)
#data_full.to_csv("../raw_data/20220602_part1_scraped.csv") # save to csv

# drop duplicates!

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_tmp['Content'].iloc[0:499] = data_tmp['Content'].iloc[500:999].reset_index()['Content']
  data_full = data_full.append(data_tmp)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_tmp['Content'].iloc[0:499] = data_tmp['Content'].iloc[500:999].reset_index()['Content']
  data_full = data_full.append(data_tmp)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_tmp['Content'].iloc[0:499] = data_tmp['Content'].iloc[500:999].reset_index()['Content']
  data_full = d

**Test Area**

In [None]:
get_url(data['cellar'][100])