In [None]:
!pip install requests --quiet
!pip install beautifulsoup4 --quiet
!pip install pandas --quiet
!pip install datetime --quiet

**Imports**

In [43]:
import pandas as pd
import numpy as np
import requests
import datetime
from bs4 import BeautifulSoup

**Functions**

In [106]:
def read_metadata(filename):
    """Load the metadata CSV and derive a searchable ``Keywords`` column.

    Parameters
    ----------
    filename : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``. The file must contain the
        columns ``EUROVOC descriptor``, ``Subject matter`` and ``Directory code``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents plus two added columns: ``Keywords`` (the three
        descriptor columns lower-cased and joined with ``", "``) and
        ``Content`` (a placeholder column initialised to ``None``).
    """
    data = pd.read_csv(filename)
    keyword_columns = ['EUROVOC descriptor', 'Subject matter', 'Directory code']
    # A missing value in any descriptor column would otherwise turn the whole
    # concatenation into NaN, and substring tests against NaN (a float) raise
    # a TypeError. Treat missing descriptors as empty strings instead.
    lowered = [data[column].fillna('').astype(str).str.lower() for column in keyword_columns]
    data['Keywords'] = lowered[0] + ", " + lowered[1] + ", " + lowered[2]
    data['Content'] = None
    return data

def filter_data_words(data, searchwords = None):
    """Keep only the rows whose ``Keywords`` contain at least one search word.

    Matching is a case-insensitive substring test: every search word is
    lower-cased and looked up inside the row's ``Keywords`` string.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Keywords`` column of strings.
    searchwords : list of str, optional
        Words to look for. ``None`` disables filtering.

    Returns
    -------
    pandas.DataFrame or str
        * ``data`` itself (unchanged) when ``searchwords`` is ``None``;
        * an explanatory message string when ``searchwords`` is not a list;
        * otherwise a new frame with the matching rows and a fresh 0..n-1 index.
    """
    if searchwords is None: # if no searchwords are given
        print('no filtering done')
        return data
    if not isinstance(searchwords, list): # if search words are not in list format
        return 'please pass searchwords in list format'
    searchwords = [word.lower() for word in searchwords] # convert searchwords to lowercase

    def contains_searchword(keywords):
        # Rows without keywords (NaN is a float) can never match and must not
        # raise. The generator yields the *test result*, not the word itself,
        # so the outcome does not depend on the truthiness of the word.
        return isinstance(keywords, str) and any(word in keywords for word in searchwords)

    # astype(bool) keeps the mask boolean even when `data` has no rows.
    mask = data['Keywords'].apply(contains_searchword).astype(bool)
    print(f"filtered on {searchwords}")
    # drop=True discards the old index regardless of its name, and never
    # touches a real column that happens to be called 'index'.
    return data[mask].reset_index(drop=True)

def filter_data_words_and_time(data, searchwords = None, startdate = None, enddate = None):
    """Filter rows by keyword and by an inclusive ``Date of document`` range.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Keywords`` column and a ``Date of document`` column
        that ``pandas.to_datetime`` can parse. The frame is not modified.
    searchwords : list of str, optional
        Words matched case-insensitively as substrings of ``Keywords``.
        Pass ``['']`` to skip the word filter and filter by date only.
    startdate, enddate : str, optional
        Range bounds in ``'%Y-%m-%d'`` format (e.g. ``'2022-05-25'``);
        both bounds are inclusive.

    Returns
    -------
    pandas.DataFrame or str
        * ``data`` itself when no argument besides ``data`` is given;
        * an explanatory message string when the arguments are incomplete
          or inconsistent (non-list search words, only one date, no dates,
          start after end);
        * otherwise a new frame with the matching rows, ``Date of document``
          converted to datetimes, and a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If a date string does not match ``'%Y-%m-%d'``.
    """
    if searchwords is None and startdate is None and enddate is None:
        # case: nothing to filter on
        print('no filtering done')
        return data
    if not isinstance(searchwords, list): # if search words are not in list format
        return "Please pass searchwords in list format. If you don't want to filter for words, please apply the filter ['']"
    if (startdate is None) != (enddate is None):
        # case: only start- or enddate given. This check has to run before the
        # "no dates at all" check, otherwise it can never be reached.
        return 'please select both start and end-date in format "2022-05-25"'
    if startdate is None:
        # case: search words given, but no date range at all
        return "please pass searchwords in list-format and a start- and end-date in format '2022-05-25'. If you don't want to apply search words, please apply the filter ['']"

    # case: searchwords and time given
    startdate = datetime.datetime.strptime(startdate, '%Y-%m-%d')
    enddate = datetime.datetime.strptime(enddate, '%Y-%m-%d')
    if startdate > enddate:
        return 'startdate must be smaller than enddate'

    # Parse the dates on a copy so the caller's frame keeps its original column.
    dated = data.assign(**{'Date of document': pd.to_datetime(data['Date of document'])})

    searchwords = [word.lower() for word in searchwords] # convert searchwords to lowercase
    if searchwords == [""]:
        data_filtered_word = dated
        print("no word-filter applied")
    else:
        def contains_searchword(keywords):
            # Rows without keywords (NaN is a float) can never match and must not raise.
            return isinstance(keywords, str) and any(word in keywords for word in searchwords)

        # astype(bool) keeps the mask boolean even when `data` has no rows.
        word_mask = dated['Keywords'].apply(contains_searchword).astype(bool)
        data_filtered_word = dated[word_mask]
        print(f"filtered on {searchwords}")

    document_dates = data_filtered_word['Date of document']
    in_range = (document_dates >= startdate) & (document_dates <= enddate)
    print(f"selected data between {startdate} and {enddate}")
    return data_filtered_word[in_range].reset_index(drop=True)

def get_url(cellar_ref, doctype="03"):
    """Build the publications.europa.eu resource URL for one Cellar document.

    ``cellar_ref`` is the document's Cellar reference; ``doctype`` is the code
    placed after the language code in the URL (``"03"`` unless overridden).
    """
    # URL layout, see Documentation Page 37:
    # https://op.europa.eu/en/publication-detail/-/publication/50ecce27-857e-11e8-ac6a-01aa75ed71a1/language-en/format-PDF/source-73059305
    base = "http://publications.europa.eu/resource"
    # Identifier scheme. Other options: cellar, celex, oj, com, genpub, ep,
    # jurisprudence, dd, mtf, consolidation, eurostat, eesc, cor, nim, pegase,
    # agent, uriserv, join, swd, comnat, mdr, legissum, ecli, procedure,
    # procedure-event, eli, immc, planjo
    scheme = "cellar"
    language = "0006"  # language code
    document = "DOC_1"
    return f"{base}/{scheme}/{cellar_ref}.{language}.{doctype}/{document}"

def get_content(URL, timeout=30):
    """Download one document and return the text of its ``oj-normal`` paragraphs.

    Parameters
    ----------
    URL : str
        Resource URL ending in ``<doctype>/DOC_1`` with a two-character
        doctype, as produced by ``get_url``.
    timeout : float, optional
        Seconds to wait for the server before giving up (per request).

    Returns
    -------
    str
        The text of all ``<p class="oj-normal">`` elements joined by spaces.
        When the text contains ``'Whereas:'`` only the part after its first
        occurrence is returned (i.e. the head of the document is dropped);
        otherwise the complete text is returned.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    headers = {"Accept-Language": "en-US"}
    response = requests.get(URL, headers=headers, timeout=timeout)
    response.raise_for_status()
    if response.content[:4] == b"%PDF":
        # In some (few) cases the doctype is not 03 but 02: the first request
        # then returns a PDF. Swap the two doctype characters in front of
        # '/DOC_1' and fetch again.
        URL = URL[:-8] + '02' + URL[-6:]
        response = requests.get(URL, headers=headers, timeout=timeout)
        response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    content = ' '.join(item.text for item in soup.find_all("p", class_="oj-normal"))
    # partition never raises: if the marker is missing, return the text as is
    # instead of failing with an IndexError.
    _head, marker, body = content.partition('Whereas:')
    return body if marker else content

def get_all_content(data):
    """Download the document text for every row and return the relevant columns.

    For each value in ``Cellar reference`` the document is fetched with
    ``get_content(get_url(ref))`` and stored in the ``Content`` column of
    ``data`` (the frame passed in is updated in place).

    Parameters
    ----------
    data : pandas.DataFrame
        Metadata frame containing at least the columns selected below.

    Returns
    -------
    pandas.DataFrame
        The columns of ``data`` that are needed downstream, including the
        freshly filled ``Content`` column.
    """
    # Assign the whole column at once: a list is matched to the rows by
    # position, so this is correct for any index. Writing with
    # `.loc[counter, ...]` is only correct for a 0..n-1 index and otherwise
    # fills the wrong rows or appends new ones.
    data['Content'] = [get_content(get_url(ref)) for ref in data['Cellar reference']]
    # omit unnecessary columns
    return data[['Date of document', 'Title', 'Subtitle', 'CELEX number', 'Cellar reference', 'EUROVOC descriptor', 'Subject matter', 'Directory code', 'Author', 'In force indicator', 'Content']]

**Workflow**

In [107]:
# Retrieve the metadata from the exported search results
# (read_metadata also adds the 'Keywords' and 'Content' columns).
filename = "../raw_data/Search results 20220531.csv"
metadata = read_metadata(filename)

# Option 1: filter on keywords only.
metadata_filtered = filter_data_words(metadata, ['medical'])
# Option 2: filter on keywords and on a date range.
# NOTE: this assignment overwrites the result of option 1, so only the
# keyword-and-date filter feeds the scraping step below.
metadata_filtered = filter_data_words_and_time(metadata, ['medical'], "2019-01-01", "2022-12-01")

# Get the content for the filtered data (at least one HTTP request per row).
data_with_content = get_all_content(metadata_filtered)
data_with_content

# Optional: export the data to csv (uncomment to run).
#data_with_content.to_csv("../raw_data/test_data_scraped.csv")

filtered on ['medical']
filtered on ['medical']
selected data between 2019-01-01 00:00:00 and 2022-12-01 00:00:00


Unnamed: 0,Date of document,Title,Subtitle,CELEX number,Cellar reference,EUROVOC descriptor,Subject matter,Directory code,Author,In force indicator,Content
0,2022-01-25,Regulation (EU) 2022/123 of the European Parli...,PE/76/2021/REV/1,32022R0123,197e1547-823a-11ec-8c40-01aa75ed71a1,"medical device, European Medicines Agency, inf...","Public health, Provisions governing the Instit...","General, financial and institutional matters, ...","European Parliament, Council of the European U...",True,(1) Pursuant to Articles 9 and 168 of the Tre...
1,2021-12-15,Regulation (EU) 2021/2282 of the European Parl...,PE/80/2021/INIT,32021R2282,177f73e7-62c9-11ec-a033-01aa75ed71a1,"public health, scientific cooperation, medical...","Research and training, Public health","Science, information, education and culture, E...","European Parliament, Council of the European U...",True,(1) The development of health technologies is...
