In [None]:
# Install the third-party packages used by this notebook. Each line shells out
# to pip; --quiet suppresses the normal installation log.
!pip install requests --quiet
!pip install beautifulsoup4 --quiet
!pip install pandas --quiet
# NOTE(review): `datetime` is part of the Python standard library and no visible
# cell imports it; this line presumably installs an unrelated PyPI distribution
# of the same name and can likely be removed - confirm before deleting.
!pip install datetime --quiet

**Imports**

In [None]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

**Functions**

In [None]:
def read_metadata(filename):
    """Load the metadata CSV and derive the helper columns used downstream.

    Args:
        filename: path (or any source accepted by ``pd.read_csv``) of the
            metadata export. It must contain the columns 'EUROVOC descriptor',
            'Subject matter' and 'Directory code'.

    Returns:
        pandas.DataFrame: the CSV content plus two added columns:
            * 'Keywords' - the three descriptor columns, lowercased and joined
              with ", " (always a string, never NaN).
            * 'Content'  - placeholder column initialised to None.

    Raises:
        KeyError: if one of the three descriptor columns is missing.
    """
    data = pd.read_csv(filename)
    keyword_columns = ['EUROVOC descriptor', 'Subject matter', 'Directory code']
    # A missing value in any single column would otherwise turn the whole
    # concatenation into NaN, and a substring test against NaN later raises
    # TypeError. Filling with '' and casting to str keeps every row searchable,
    # and also copes with columns that pandas parsed as numeric.
    lowered = [data[column].fillna('').astype(str).str.lower() for column in keyword_columns]
    data['Keywords'] = lowered[0] + ", " + lowered[1] + ", " + lowered[2]
    data['Content'] = None
    return data

def filter_data(data, searchwords = None):
    """Keep the rows whose 'Keywords' text contains at least one search word.

    Matching is a case-insensitive substring test: every search word is
    lowercased and looked up inside the 'Keywords' string of each row.

    Args:
        data: DataFrame with a 'Keywords' column of lowercase strings.
        searchwords: list of strings to search for, or None to skip filtering.

    Returns:
        * ``data`` itself (unchanged) when ``searchwords`` is None.
        * the string 'please pass searchwords in list format' when
          ``searchwords`` is not a list (kept as a return value rather than an
          exception so existing callers keep working).
        * otherwise a new DataFrame with the matching rows and a fresh
          0..n-1 index.
    """
    if searchwords is None: # if no searchwords are given
        print('no filtering done')
        return data
    if not isinstance(searchwords, list): # if search words are not in list format
        return 'please pass searchwords in list format'
    searchwords = [word.lower() for word in searchwords] # convert searchwords to lowercase
    # Empty search words never select a row (an empty string is a substring of
    # everything, which would otherwise match the whole frame).
    terms = [word for word in searchwords if word]

    def _matches(keywords):
        # Rows with missing keywords (NaN / None) are not strings; treat them
        # as "no match" instead of letting the `in` test raise TypeError.
        return isinstance(keywords, str) and any(term in keywords for term in terms)

    # astype(bool) guarantees a boolean mask even for an empty frame, where
    # apply() would return an object-dtype Series.
    mask = data['Keywords'].apply(_matches).astype(bool)
    data_filtered = data[mask]
    print(f"filtered on {searchwords}")
    # drop=True discards the old index directly, so a named index or an
    # existing column called 'index' cannot interfere.
    return data_filtered.reset_index(drop=True)

def get_url(cellar_ref, doctype="03"):
    """Build the publications.europa.eu resource URL for one document.

    Args:
        cellar_ref: identifier of the document; inserted into the URL as-is.
        doctype: document-type code placed after the language code
            (default: "03").

    Returns:
        str: "http://publications.europa.eu/resource/cellar/<cellar_ref>.0006.<doctype>/DOC_1"
    """
    # Production-system name. Other options: cellar, celex, oj, com, genpub, ep,
    # jurisprudence, dd, mtf, consolidation, eurostat, eesc, cor, nim, pegase,
    # agent, uriserv, join, swd, comnat, mdr, legissum, ecli, procedure,
    # procedure-event, eli, immc, planjo
    system_name = "cellar"
    language_code = "0006"  # language code
    document_number = "DOC_1"
    # for further information, see Documentation Page 37: https://op.europa.eu/en/publication-detail/-/publication/50ecce27-857e-11e8-ac6a-01aa75ed71a1/language-en/format-PDF/source-73059305
    resource_root = "http://publications.europa.eu/resource"
    identifier = f"{cellar_ref}.{language_code}.{doctype}"
    return "/".join([resource_root, system_name, identifier, document_number])

def get_content(URL, timeout=30):
    """Download one document and return the text of its ``oj-normal`` paragraphs.

    The page is requested with an English ``Accept-Language`` header. If the
    response turns out to be a PDF, the request is repeated with the
    document-type code in the URL replaced by "02". The text of all
    ``<p class="oj-normal">`` elements is joined with single spaces, and
    everything up to and including the first 'Whereas:' is removed.

    Args:
        URL: document URL whose last path segment is preceded by a two-character
            document-type code (e.g. ".../<id>.0006.03/DOC_1").
        timeout: seconds to wait for the server before giving up (default 30).

    Returns:
        str: the text after the first 'Whereas:'; if that marker does not occur,
        the complete joined paragraph text (possibly an empty string).

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection problems or timeouts.
    """
    request_headers = {"Accept-Language":"en-US"}
    # A timeout prevents one unresponsive request from blocking a whole
    # scraping run indefinitely.
    response = requests.get(URL, headers=request_headers, timeout=timeout)
    # Fail loudly on error pages; otherwise the marker fallback below would
    # silently return empty or unrelated text for them.
    response.raise_for_status()
    # PDF files start with the magic bytes "%PDF"; checking the raw bytes avoids
    # running the HTML parser over binary data.
    if response.content[:4] == b"%PDF":
        # in some (few) cases, the doctype is not 03 but 02. change it for these cases
        resource, document_number = URL.rsplit("/", 1)
        URL = f"{resource[:-2]}02/{document_number}"
        response = requests.get(URL, headers=request_headers, timeout=timeout)
        response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    content = ' '.join(item.text for item in soup.find_all("p", class_="oj-normal"))
    # only return text without the head; documents that lack the marker are
    # returned in full instead of raising IndexError
    _head, marker, body = content.partition('Whereas:')
    return body if marker else content

def get_all_content(data):
    """Fetch the document text for every row and return the columns of interest.

    For each value in 'Cellar reference' the document URL is built with
    ``get_url`` and downloaded with ``get_content``; the results are stored in
    the 'Content' column of ``data`` (the frame passed in is modified).

    Args:
        data: DataFrame containing 'Cellar reference' plus the metadata columns
            listed in the return statement.

    Returns:
        pandas.DataFrame: ``data`` restricted to the selected metadata columns
        and 'Content'.

    Raises:
        KeyError: if a required column is missing.
        Any exception raised by ``get_content`` for an individual document.
    """
    # Assign the whole column positionally. Using enumerate() counters as .loc
    # labels only works when the index is exactly 0..n-1; with any other index
    # it writes to the wrong rows or appends new ones.
    data['Content'] = [get_content(get_url(ref)) for ref in data['Cellar reference']]
    # omit unnecessary columns
    return data[['Date of document', 'Title', 'Subtitle', 'CELEX number', 'EUROVOC descriptor', 'Subject matter', 'Directory code', 'Author', 'In force indicator', 'Content']]

**Workflow**

In [None]:
# Retrieve metadata: load the search-result export; read_metadata also adds the
# lowercase 'Keywords' column and an empty 'Content' column.
filename = "../raw_data/Search results 20220531.csv"
metadata = read_metadata(filename)

# Filter for keywords: keep only rows whose 'Keywords' text contains 'medical'
# (case-insensitive substring match).
metadata_filtered = filter_data(metadata, ['medical'])

# Get content for filtered data: downloads the text of every remaining document
# (at least one HTTP request per row) and displays the resulting frame.
data_with_content = get_all_content(metadata_filtered)
data_with_content

# Export data to csv (disabled; uncomment the next line to write the result to disk).
#data_with_content.to_csv("../raw_data/test_data_scraped.csv")

**Test Area**

In [None]:
# Scratch cell: reload the metadata. Relies on `filename` from the workflow cell.
metadata = read_metadata(filename)

In [None]:
# Scratch cell: try the filter with the single search word 'a'. Matching is a
# substring test, so any 'Keywords' string containing the letter "a" is kept.
metadata_filtered = filter_data(metadata, ['a'])
metadata_filtered

In [None]:
def filter_data(data, searchwords = None):
    """Keep the rows whose 'Keywords' text contains at least one search word.

    NOTE: this notebook also defines ``filter_data`` in an earlier cell;
    whichever definition was executed last is the one in effect.

    Matching is a case-insensitive substring test: every search word is
    lowercased and looked up inside the 'Keywords' string of each row.

    Args:
        data: DataFrame with a 'Keywords' column of lowercase strings.
        searchwords: list of strings to search for, or None to skip filtering.

    Returns:
        * ``data`` itself (unchanged) when ``searchwords`` is None.
        * the string 'please pass searchwords in list format' when
          ``searchwords`` is not a list (kept as a return value rather than an
          exception so existing callers keep working).
        * otherwise a new DataFrame with the matching rows and a fresh
          0..n-1 index.
    """
    if searchwords is None: # if no searchwords are given
        print('no filtering done')
        return data
    if not isinstance(searchwords, list): # if search words are not in list format
        return 'please pass searchwords in list format'
    searchwords = [word.lower() for word in searchwords] # convert searchwords to lowercase
    # Empty search words never select a row (an empty string is a substring of
    # everything, which would otherwise match the whole frame).
    terms = [word for word in searchwords if word]

    def _matches(keywords):
        # Rows with missing keywords (NaN / None) are not strings; treat them
        # as "no match" instead of letting the `in` test raise TypeError.
        return isinstance(keywords, str) and any(term in keywords for term in terms)

    # astype(bool) guarantees a boolean mask even for an empty frame, where
    # apply() would return an object-dtype Series.
    mask = data['Keywords'].apply(_matches).astype(bool)
    data_filtered = data[mask]
    print(f"filtered on {searchwords}")
    # drop=True discards the old index directly, so a named index or an
    # existing column called 'index' cannot interfere.
    return data_filtered.reset_index(drop=True)

In [None]:
# Scratch cell: show the 'Date of document' value stored under index label 0
# (the first row when the frame has the default index from read_csv).
metadata['Date of document'][0]