In [None]:
!pip install requests --quiet
!pip install beautifulsoup4 --quiet
!pip install pandas --quiet

In [311]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [323]:
def read_metadata(filename):
    """Load the search-result metadata CSV and add the helper columns.

    Parameters
    ----------
    filename : str, path-like or file-like
        Anything ``pandas.read_csv`` accepts. The file must contain the
        columns 'EUROVOC descriptor', 'Subject matter' and 'Directory code'.

    Returns
    -------
    pandas.DataFrame
        The CSV contents plus two added columns:

        * ``Content``  -- initialised to ``None``.
        * ``Keywords`` -- the three descriptor columns, lowercased and joined
          with ``", "``.
    """
    data = pd.read_csv(filename)
    data['Content'] = None

    keyword_columns = ['EUROVOC descriptor', 'Subject matter', 'Directory code']
    # A missing value in any single column would turn the whole concatenation
    # into NaN (string + NaN -> NaN), discarding the other two columns and
    # leaving a float where a string is expected. Treat missing as empty text.
    lowered = [data[column].fillna('').astype(str).str.lower() for column in keyword_columns]
    data['Keywords'] = lowered[0] + ", " + lowered[1] + ", " + lowered[2]
    return data

def filter_data(data, searchwords = None):
    """Keep the rows whose 'Keywords' text contains at least one search word.

    Matching is a case-insensitive substring test: each search word is
    lowercased and looked up with ``in`` inside the row's 'Keywords' string.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Keywords' column.
    searchwords : list of str, optional
        Words to look for. ``None`` (the default) disables filtering.

    Returns
    -------
    pandas.DataFrame or str
        * ``data`` itself when ``searchwords`` is ``None``;
        * the string 'please pass searchwords in list format' when
          ``searchwords`` is not a ``list`` (kept for backward compatibility);
        * otherwise the matching rows, with their original index labels.
    """
    if searchwords is None: # if no searchwords are given
        return data
    elif type(searchwords) != list: # if search words are not in list format
        return 'please pass searchwords in list format'
    words = [word.lower() for word in searchwords] # convert searchwords to lowercase

    def _matches(keywords):
        # Rows with missing keywords hold NaN (a float); `word in NaN` would
        # raise TypeError, so such rows simply never match.
        if not isinstance(keywords, str):
            return False
        # Test the membership result, not the truthiness of the word itself.
        return any(word in keywords for word in words)

    # astype(bool) keeps the mask boolean even for an empty frame, where
    # apply() would otherwise return an object-dtype Series.
    mask = data['Keywords'].apply(_matches).astype(bool)
    return data[mask]

def get_url(cellar_ref, doctype="03"):
    """Build the publications.europa.eu resource URL for one Cellar reference.

    Parameters
    ----------
    cellar_ref : str
        Cellar identifier of the document.
    doctype : str, optional
        Document-type code placed after the language code (default "03").

    Returns
    -------
    str
        ``http://publications.europa.eu/resource/cellar/<ref>.0006.<doctype>/DOC_1``
    """
    # for further information, see Documentation Page 37: https://op.europa.eu/en/publication-detail/-/publication/50ecce27-857e-11e8-ac6a-01aa75ed71a1/language-en/format-PDF/source-73059305
    base = "http://publications.europa.eu/resource"
    # identifier scheme; other options: cellar, celex, oj, com, genpub, ep, jurisprudence, dd, mtf, consolidation, eurostat, eesc, cor, nim, pegase, agent, uriserv, join, swd, comnat,mdr, legissum, ecli, procedure, procedure-event, eli, immc, planjo
    scheme = "cellar"
    language = "0006" # language code
    document = "DOC_1"
    resource_id = f"{cellar_ref}.{language}.{doctype}"
    return f"{base}/{scheme}/{resource_id}/{document}"

def get_content(URL):
    """Download one document and return its text after the 'Whereas:' marker.

    The page is requested with an English ``Accept-Language`` header. If the
    response body is a PDF, the doctype segment of the URL is switched to
    '02' and the request is repeated. The text of all ``<p class="oj-normal">``
    elements is then joined with single spaces.

    Parameters
    ----------
    URL : str
        Resource URL in the layout produced by ``get_url`` (it must end in
        ``<doctype>/DOC_1`` with a two-character doctype, because the retry
        rewrites the URL by position).

    Returns
    -------
    str
        The joined paragraph text following the first 'Whereas:'. If the
        marker does not occur, the complete joined text is returned instead
        of raising ``IndexError``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within the timeout.
    """
    headers = {"Accept-Language": "en-US"}
    timeout = 30  # seconds; without a timeout a stalled connection blocks the whole scrape

    response = requests.get(URL, headers=headers, timeout=timeout)
    response.raise_for_status()

    # Detect a PDF from the raw bytes instead of parsing binary data as HTML
    # (parsing it makes BeautifulSoup emit "could not be decoded" warnings).
    if response.content.startswith(b"%PDF"):
        # in some (few) cases, the doctype is not 03 but 02. change it for these cases
        URL = URL[:-8] + '02' + URL[-6:]
        response = requests.get(URL, headers=headers, timeout=timeout)
        response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")
    content = ' '.join(item.text for item in soup.find_all("p", class_="oj-normal"))

    # only return text without the head; fall back to everything when the
    # document has no 'Whereas:' marker
    _, marker, body = content.partition('Whereas:')
    return body if marker else content

def run_all(data, searchwords = None):
    """Download the text of every document in ``data`` and return the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Metadata frame as produced by ``read_metadata`` (an earlier version
        of this function took the CSV filename instead). It must contain a
        'Cellar reference' column and the columns listed under Returns.
    searchwords : list of str, optional
        If given, ``data`` is first narrowed with ``filter_data``.

    Returns
    -------
    pandas.DataFrame
        One row per processed document, restricted to the columns
        'Date of document', 'Title', 'Subtitle', 'CELEX number',
        'EUROVOC descriptor', 'Subject matter', 'Directory code', 'Author',
        'In force indicator' and 'Content'.

    Raises
    ------
    TypeError
        If ``filter_data`` rejects ``searchwords`` (it reports this by
        returning a message string, which is re-raised here).

    Notes
    -----
    The 'Content' column is written into the frame being processed; when no
    ``searchwords`` are given that is the frame passed in by the caller.
    """
    if searchwords is not None:
        data = filter_data(data, searchwords)
        if isinstance(data, str):
            # filter_data signals bad input with a message string; indexing
            # that string below would fail with an unrelated TypeError.
            raise TypeError(data)

    # Assign the whole column at once. Writing row by row with
    # data.loc[position, ...] treats the enumerate() position as an index
    # *label*, which hits the wrong rows (or appends new ones) whenever the
    # index is not 0..n-1, e.g. after filtering.
    data['Content'] = [get_content(get_url(ref)) for ref in data['Cellar reference']]

    # omit unnecessary columns
    return data[['Date of document', 'Title', 'Subtitle', 'CELEX number', 'EUROVOC descriptor', 'Subject matter', 'Directory code', 'Author', 'In force indicator', 'Content']]

In [303]:
# Path to the exported search-results CSV, relative to the notebook's working directory.
filename = "../raw_data/Search results 20220531.csv"
# Load the metadata and derive the Content/Keywords helper columns.
data = read_metadata(filename)
# Download the text for every row (no search-word filter); this issues at
# least one HTTP request per document.
data_with_content = run_all(data)
# Show the resulting frame as the cell output.
data_with_content
# Optional: persist the scraped result (currently disabled).
#data_with_content.to_csv("../raw_data/test_data_scraped.csv")

**Test Area**

In [327]:
# Reload the metadata; `filename` must already be defined by an earlier cell.
data = read_metadata(filename)
# Keep rows whose Keywords text contains the substring 'a'. Matching is by
# substring, so this single letter is a very broad filter.
data = filter_data(data, ['a'])
# Download the text for the filtered rows (no further filtering inside run_all).
data_with_content_filtered = run_all(data)
# Show the resulting frame as the cell output.
data_with_content_filtered

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Unnamed: 0,Date of document,Title,Subtitle,CELEX number,EUROVOC descriptor,Subject matter,Directory code,Author,In force indicator,Content
0,2022-01-25,Regulation (EU) 2022/123 of the European Parli...,PE/76/2021/REV/1,32022R0123,"medical device, European Medicines Agency, inf...","Public health, Provisions governing the Instit...","General, financial and institutional matters, ...","European Parliament, Council of the European U...",True,(1) Pursuant to Articles 9 and 168 of the Tre...
1,2021-12-15,Regulation (EU) 2021/2303 of the European Parl...,PE/61/2021/REV/1,32021R2303,"EU office or agency, operation of the Institut...",Asylum policy,"Area of freedom, security and justice, Free mo...","Council of the European Union, European Parlia...",True,(1) The objective of the Union’s policy on as...
2,2021-12-15,Regulation (EU) 2021/2282 of the European Parl...,PE/80/2021/INIT,32021R2282,"public health, scientific cooperation, medical...","Research and training, Public health","Science, information, education and culture, E...","European Parliament, Council of the European U...",True,(1) The development of health technologies is...
3,2021-12-02,Regulation (EU) 2021/2115 of the European Parl...,PE/64/2021/REV/1,32021R2115,"aid to agriculture, EAGF, common agricultural ...",Agricultural structural funds,"Agriculture, Agriculture, Agricultural structu...","Council of the European Union, European Parlia...",True,(1) The Commission communication of 29 Novemb...
4,2021-12-02,Regulation (EU) 2021/2116 of the European Parl...,PE/65/2021/INIT,32021R2116,"aid to agriculture, EAGF, rural development, E...",Agricultural structural funds,"Agriculture, Agricultural structures, Social a...","Council of the European Union, European Parlia...",True,(1) The Commission communication of 29 Novemb...
5,2021-10-20,Regulation (EU) 2021/1873 of the European Parl...,PE/50/2021/REV/2,32021R1873,"fruit-growing, industrial property, floricultu...","Intellectual, industrial and commercial proper...","Law relating to undertakings, Agriculture, Int...","European Parliament, Council of the European U...",True,"(1) Technical difficulties in breeding, due t..."
6,2021-10-06,Regulation (EU) 2021/1755 of the European Parl...,PE/59/2021/REV/1,32021R1755,"economic and social cohesion, allocation of re...","Financial provisions, Economic, social and ter...","General, financial and institutional matters, ...","European Parliament, Council of the European U...",True,(1) The Agreement on the withdrawal of the Un...
7,2021-09-15,Regulation (EU) 2021/1529 of the European Parl...,PE/67/2021/INIT,32021R1529,"EU aid, political reform, EU financial instrum...","Financial provisions, External relations, Acce...","External relations, External relations, Action...","Council of the European Union, European Parlia...",True,(1) Regulation (EU) No 231/2014 of the Europe...
8,2021-07-14,Regulation (EU) 2021/1229 of the European Parl...,PE/33/2021/REV/1,32021R1229,"green economy, public sector, climate change, ...","Environment, Economic policy",Economic and monetary policy and free movement...,"European Parliament, Council of the European U...",True,"(1) On 11 December 2019, the Commission adopt..."
9,2021-07-14,Regulation (EU) 2021/1232 of the European Parl...,PE/38/2021/REV/1,32021R1232,"personal data, social media, regulation of tel...","Human rights, Justice and home affairs, Data p...","Industrial policy and internal market, Area of...","Council of the European Union, European Parlia...",True,(1) Directive 2002/58/EC of the European Parl...


In [298]:
def filter_data(searchwords = None):
    """Filter the module-level ``data`` frame by search words.

    NOTE(review): this is a second definition of ``filter_data``. It reads
    the global ``data`` and takes only ``searchwords``, so running this cell
    replaces any two-argument ``filter_data(data, searchwords)`` defined
    earlier in the notebook. Keep only one of the two definitions.

    Matching is a case-insensitive substring test: each search word is
    lowercased and looked up with ``in`` inside the row's 'Keywords' string.

    Parameters
    ----------
    searchwords : list of str, optional
        Words to look for. ``None`` (the default) disables filtering.

    Returns
    -------
    pandas.DataFrame or str
        * the global ``data`` itself when ``searchwords`` is ``None``;
        * the string 'please pass searchwords in list format' when
          ``searchwords`` is not a ``list``;
        * otherwise the matching rows of ``data``.
    """
    if searchwords is None: # if no searchwords are given
        return data
    elif type(searchwords) != list: # if search words are not in list format
        return 'please pass searchwords in list format'
    words = [word.lower() for word in searchwords] # convert searchwords to lowercase

    def _matches(keywords):
        # Missing keywords are NaN (a float); `word in NaN` would raise
        # TypeError, so such rows simply never match.
        if not isinstance(keywords, str):
            return False
        # Test the membership result, not the truthiness of the word itself.
        return any(word in keywords for word in words)

    # astype(bool) keeps the mask boolean even when `data` is empty.
    mask = data['Keywords'].apply(_matches).astype(bool)
    return data[mask]