In [None]:
!pip install requests --quiet
!pip install beautifulsoup4 --quiet
!pip install pandas --quiet
!pip install datetime --quiet

**Imports**

In [None]:
import pandas as pd
import numpy as np
import requests
import datetime
from bs4 import BeautifulSoup

**Functions**

In [None]:
def read_metadata(filename):
    """Load the metadata CSV and add an empty ``Content`` column.

    Parameters
    ----------
    filename : str, path object or file-like object
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The CSV columns without the ``Unnamed: 0`` column (if there is one),
        plus a ``Content`` column in which every value is ``None``.
    """
    # 'Unnamed: 0' is only present when the CSV was written together with its
    # index; errors="ignore" lets files saved without it load as well instead
    # of failing with a KeyError.
    data = pd.read_csv(filename).drop(columns='Unnamed: 0', errors='ignore')
    data['Content'] = None
    return data

def get_url(cellar_ref, doctype="03"):
    """Return the publications.europa.eu resource URL for one cellar reference.

    The URL has the form
    ``http://publications.europa.eu/resource/cellar/<cellar_ref>.0006.<doctype>/DOC_1``.

    Parameters
    ----------
    cellar_ref : str
        Identifier of the document inside the "cellar" scheme.
    doctype : str, optional
        Two-character document type placed after the language code
        (default ``"03"``).

    For further information, see Documentation Page 37:
    https://op.europa.eu/en/publication-detail/-/publication/50ecce27-857e-11e8-ac6a-01aa75ed71a1/language-en/format-PDF/source-73059305
    """
    root = "http://publications.europa.eu/resource"
    scheme = "cellar"  # other options: cellar, celex, oj...
    language_code = "0006"
    document_number = "DOC_1"
    resource = f"{cellar_ref}.{language_code}.{doctype}"
    return "/".join([root, scheme, resource, document_number])

def _fetch_soup(url, timeout):
    """Fetch ``url`` and parse the body as HTML; return ``None`` for a PDF body.

    The response is streamed so that a PDF can be recognised by its leading
    "%PDF" bytes and dropped without downloading the whole (possibly many MB)
    file. Raises ``requests.RequestException`` on network errors, timeouts
    and 4xx/5xx status codes.
    """
    with requests.get(url, headers={"Accept-Language": "en-US"},
                      timeout=timeout, stream=True) as response:
        response.raise_for_status()
        chunks = response.iter_content(chunk_size=64 * 1024)
        # Collect just enough bytes to inspect the file signature.
        head = b""
        for chunk in chunks:
            head += chunk
            if len(head) >= 4:
                break
        if head.startswith(b"%PDF"):
            return None
        # Not a PDF: read the remainder of the same stream.
        body = head + b"".join(chunks)
    return BeautifulSoup(body, "html.parser")


def get_content(URL, timeout=30):
    """Download one document and return the text of its body paragraphs.

    ``URL`` is requested first. If that request fails or returns a PDF, the
    same URL with doctype "02" instead of the original two-character doctype
    is tried (in a few cases the doctype is not 03 but 02).

    Parameters
    ----------
    URL : str
        Resource URL ending in ``.<two-character doctype>/DOC_1``, as built
        by ``get_url``.
    timeout : float or tuple, optional
        Passed to ``requests.get``. It limits how long the server may stay
        silent (connect / between bytes); it is not a cap on the total
        download time.

    Returns
    -------
    str
        Space-joined text of all ``<p class="oj-normal">`` elements, or of all
        ``<p class="normal">`` elements when there is no "oj-normal"
        paragraph. An empty string is returned when no such paragraph exists
        or when neither URL delivers an HTML document.
    """
    try:
        soup = _fetch_soup(URL, timeout)
    except Exception as error:  # deliberately broad: any failure triggers the fallback
        print("yes problem here")
        print(f"  {URL}: {error!r}")
        soup = None
    else:
        if soup is None:
            print("pdf detected, but fixed")
        else:
            print("no problem here")

    if soup is None:
        # Swap the two doctype characters that sit in front of "/DOC_1".
        fallback_url = URL[:-8] + '02' + URL[-6:]
        try:
            soup = _fetch_soup(fallback_url, timeout)
        except requests.RequestException as error:
            print(f"could not retrieve {fallback_url}: {error!r}")
            return ""
        if soup is None:
            print(f"{fallback_url} is a pdf as well, no text extracted")
            return ""

    # Prefer "oj-normal" paragraphs; use "normal" ones when there are none.
    paragraphs = soup.find_all("p", class_="oj-normal") or soup.find_all("p", class_="normal")
    return ' '.join(item.text for item in paragraphs)  # .split('Whereas:', 1)[1] would drop the head

def get_all_content(data):
    """Fill the ``Content`` column of ``data`` with the scraped document texts.

    For every value in ``data['cellar']`` the URL is built with ``get_url``
    and its text fetched with ``get_content``. The frame is modified in place
    row by row (so results gathered so far survive an interruption) and is
    also returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``cellar`` column; any index is accepted.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with ``Content`` filled.
    """
    # Series.items() yields (index label, value) pairs. Using the label, not
    # the enumerate() position, makes .loc write into the row the reference
    # came from even when the index is not 0..n-1 (e.g. a slice of a larger
    # frame); a position would make .loc append new rows instead.
    for label, ref in data['cellar'].items():
        data.loc[label, 'Content'] = get_content(get_url(ref))
        print(f'Row {label} with cellar-number {ref} done')
    return data

**Workflow**

In [None]:
%%time
# Retrieve the metadata table from the raw CSV export.
filename = "../raw_data/20220601_larger_data_b.csv"
metadata = read_metadata(filename)

# Optional: subset the metadata (for easier processing at the beginning).
#metadata_subset = metadata.iloc[40:70]

# Get the content for every row. get_all_content writes into the 'Content'
# column of `metadata` itself and returns that same frame.
metadata_with_content = get_all_content(metadata)
#metadata_with_content[metadata_with_content['Content'] == ""]

# Optional: remove rows that didn't work at the beginning.
# NOTE(review): `data_with_content` is not defined in the visible cells;
# presumably `metadata_with_content` is meant. Confirm before enabling.
#data_with_content_subset = data_with_content[data_with_content['Content'] != ""].reset_index().drop(columns = "index")

# TODO: split the content afterwards?

# Export data to CSV.
# NOTE(review): `data_b_with_content` is not defined in the visible cells;
# presumably `metadata_with_content` is meant. Confirm before enabling.
#data_b_with_content.to_csv("../raw_data/20220601_larger_data_b_scraped.csv")

**Test Area**

In [None]:
# Load the metadata again into `data` for experiments in this test area
# (needs `filename` from the workflow cell).
data = read_metadata(filename)

In [None]:
# URL of the document at index label 0, with the default doctype "03".
URL = get_url(metadata['cellar'][0])

In [None]:
# Index label 58 of 20220601_larger_data_b: a valid request, but one that has
# to download many MB first.
URL = get_url(metadata['cellar'][58], "03")

In [None]:
# Step-by-step version of the download logic for inspecting a single URL.
# It leaves `response`, `soup`, `content` and the URL that was finally used
# in the namespace.
headers = {"Accept-Language": "en-US"}
try:
    # The request is inside the try block so that a network failure also leads
    # to the doctype-02 attempt; the timeout keeps a silent server from
    # blocking the cell indefinitely.
    response = requests.get(URL, headers=headers, timeout=30)
    # A PDF body starts with the bytes "%PDF".
    use_fallback = response.content[:4] == b"%PDF"
    if use_fallback:
        print("pdf detected, but fixed")
    else:
        print("no problem here")
except requests.RequestException:
    # in case there is an error
    print("yes problem here")
    use_fallback = True

if use_fallback:
    # In some (few) cases the doctype is not 03 but 02: swap the two doctype
    # characters that sit in front of "/DOC_1".
    URL = URL[:-8] + '02' + URL[-6:]
    response = requests.get(URL, headers=headers, timeout=30)

soup = BeautifulSoup(response.content, "html.parser")

# Prefer "oj-normal" paragraphs; use "normal" ones when there are none.
paragraphs = soup.find_all("p", class_="oj-normal") or soup.find_all("p", class_="normal")
content = ' '.join(item.text for item in paragraphs)

print(URL)

content#.split('Whereas:', 1)[1] # only return text without the head