In [1]:
!pip install requests --quiet
!pip install beautifulsoup4 --quiet
!pip install pandas --quiet
!pip install datetime --quiet

**Imports**

In [2]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

**Functions**

In [3]:
def read_metadata(filename):
    data = pd.read_csv(filename)
    data['Keywords'] = data['EUROVOC descriptor'].str.lower() + ", " + data['Subject matter'].str.lower() + ", " + data['Directory code'].str.lower()
    data['Content'] = None
    return data

def filter_data(data, searchwords = None):
    if searchwords == None: # if no searchwords are given
        print('no filtering done')
        return data
    elif type(searchwords) != list: # if search words are not in list format
        return 'please pass searchwords in list format'
    searchwords = list(map(lambda x: x.lower(), searchwords)) # convert searchwords to lowercase
    data_filtered = data[data['Keywords'].apply(lambda x: any(item for item in searchwords if item in x))]
    print(f"filtered on {searchwords}")
    return data_filtered.reset_index().drop(columns = 'index')

def get_url(cellar_ref, doctype="03"):
    psid = cellar_ref
    psname = "cellar" # other options: cellar, celex, oj, com, genpub, ep, jurisprudence, dd, mtf, consolidation, eurostat, eesc, cor, nim, pegase, agent, uriserv, join, swd, comnat,mdr, legissum, ecli, procedure, procedure-event, eli, immc, planjo
    lancode = "0006" # language code
    doctype = doctype # default: 03
    docnum = "DOC_1"
    # for further information, see Documentation Page 37: https://op.europa.eu/en/publication-detail/-/publication/50ecce27-857e-11e8-ac6a-01aa75ed71a1/language-en/format-PDF/source-73059305
    return f"http://publications.europa.eu/resource/{psname}/{psid}.{lancode}.{doctype}/{docnum}"

def get_content(URL):
    response = requests.get(URL, headers={"Accept-Language":"en-US"})
    soup = BeautifulSoup(response.content, "html.parser")
    if str(soup)[1:4] == "PDF":
        '''
        in some (few) cases, the doctype is not 03 but 02. change it for these cases
        '''
        URL = URL[:-8] + '02' + URL[-6:]
        response = requests.get(URL, headers={"Accept-Language":"en-US"})
        soup = BeautifulSoup(response.content, "html.parser")
    else:
        pass
    content = ' '.join([item.text for item in soup.find_all("p", class_="oj-normal")])
    return content.split('Whereas:', 1)[1] # only return text without the head

def get_all_content(data):
    cellar_references = data['Cellar reference']    
    for index, ref in enumerate(cellar_references):
        data.loc[index, 'Content'] = get_content(get_url(ref))
    # omit unnecessary columns
    return data[['Date of document', 'Title', 'Subtitle', 'CELEX number', 'EUROVOC descriptor', 'Subject matter', 'Directory code', 'Author', 'In force indicator', 'Content']]

**Functions to generate meta**

In [25]:
URL = 'https://eur-lex.europa.eu/EURLexWebService'

body = '''<soap:Envelope xmlns:soap="http://www.w3.org/2003/05/soap-envelope" xmlns:sear="http://eur-lex.europa.eu/search">
  <soap:Header>
    <wsse:Security xmlns:wsse="http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-wssecurity-secext-1.0.xsd" soap:mustUnderstand="true">
      <wsse:UsernameToken xmlns:wsu="http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-wssecurity-utility-1.0.xsd" wsu:Id="UsernameToken-1">
        <wsse:Username>n00a1zjc</wsse:Username>
        <wsse:Password Type="http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-username-token-profile-1.0#PasswordText">h1iouDqcN5H</wsse:Password>
      </wsse:UsernameToken>
    </wsse:Security>
  </soap:Header>
  <soap:Body>
    <sear:searchRequest>
      <sear:expertQuery><![CDATA[DTS_SUBDOM = LEGISLATION AND (FM_CODED = REG OR REG_DEL OR REG_FINANC OR REG_IMPL) AND DTS_SUBDOM = LEGISLATION AND ((DD >= 01/01/2020  <= 31/12/2021 OR PD >= 01/01/2020  <= 31/12/2021) OR (PD >= 01/01/2020  <= 31/12/2021 OR IF >= 01/01/2020  <= 31/12/2021 OR EV >= 01/01/2020  <= 31/12/2021 OR NF >= 01/01/2020  <= 31/12/2021 OR SG >= 01/01/2020  <= 31/12/2021 OR TP >= 01/01/2020  <= 31/12/2021 OR DL >= 01/01/2020  <= 31/12/2021))]]></sear:expertQuery>
      <sear:page>1</sear:page>
      <sear:pageSize>20</sear:pageSize>
      <sear:searchLanguage>en</sear:searchLanguage>
    </sear:searchRequest>
  </soap:Body>
</soap:Envelope>'''
    
    
headerdict = {'Content-Type':'application/soap+xml',
              'Content-Length':'0',
              'Accept':'*/*',
              'Connection':'keep-alive'
             }

test = requests.post(URL,headers=headerdict,data=body)
test

<Response [200]>

In [None]:
def get_pages(request):
    

In [287]:
#requests_list = []
for i in range(100,300):
    page_num = i
    body = '''<soap:Envelope xmlns:soap="http://www.w3.org/2003/05/soap-envelope" xmlns:sear="http://eur-lex.europa.eu/search">
      <soap:Header>
        <wsse:Security xmlns:wsse="http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-wssecurity-secext-1.0.xsd" soap:mustUnderstand="true">
          <wsse:UsernameToken xmlns:wsu="http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-wssecurity-utility-1.0.xsd" wsu:Id="UsernameToken-1">
            <wsse:Username>n00a1zjc</wsse:Username>
            <wsse:Password Type="http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-username-token-profile-1.0#PasswordText">h1iouDqcN5H</wsse:Password>
          </wsse:UsernameToken>
        </wsse:Security>
      </soap:Header>
      <soap:Body>
        <sear:searchRequest>
          <sear:expertQuery><![CDATA[DTS_SUBDOM = LEGISLATION AND (FM_CODED = REG OR REG_DEL OR REG_FINANC OR REG_IMPL) AND DTS_SUBDOM = LEGISLATION AND ((DD >= 01/01/2020  <= 31/12/2021 OR PD >= 01/01/2020  <= 31/12/2021) OR (PD >= 01/01/2020  <= 31/12/2021 OR IF >= 01/01/2020  <= 31/12/2021 OR EV >= 01/01/2020  <= 31/12/2021 OR NF >= 01/01/2020  <= 31/12/2021 OR SG >= 01/01/2020  <= 31/12/2021 OR TP >= 01/01/2020  <= 31/12/2021 OR DL >= 01/01/2020  <= 31/12/2021))]]></sear:expertQuery>
          <sear:page>''' + str(page_num) + '''</sear:page>
          <sear:pageSize>20</sear:pageSize>
          <sear:searchLanguage>en</sear:searchLanguage>
        </sear:searchRequest>
      </soap:Body>
    </soap:Envelope>'''
    headerdict = {'Content-Type':'application/soap+xml',
              'Content-Length':'0',
              'Accept':'*/*',
              'Connection':'keep-alive'
             }
    print(i)
    temp = requests.post(URL,headers=headerdict,data=body)
    requests_list.append(temp)

100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299


In [320]:
newroot = et.fromstring(requests_list[1].content)
newroot.find(".//{http://eur-lex.europa.eu/search}totalhits").text
newroot.find(".//{http://eur-lex.europa.eu/search}numhits").text

'20'

In [308]:
requests_list = requests_list[0:219]
df = pd.DataFrame(parse_xml(requests_list[0]))
count = 1
for i in requests_list[1:]:
    print(count)
    count += 1
    tempdf = pd.DataFrame(parse_xml(i))
    df = pd.concat([df,tempdf])
df = df.reset_index(drop=True)
df

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218


Unnamed: 0,title,cellar,date,dir_code,dir_1,dir_2,dir_3,dir_4,dir_5,dir_6
0,Regulation (EU) 2020/1503 of the European Parl...,5f5898ca-1299-11eb-9a54-01aa75ed71a1,2020-10-20,062020,Right of establishment and freedom to provide ...,Sectoral application,Service activities,,,
1,Regulation (EU) 2020/2224 of the European Parl...,0713d9a7-48af-11eb-b59f-01aa75ed71a1,2020-12-28,072030,Transport policy,Inland transport,Market operation,,,
2,Commission Implementing Regulation (EU) 2020/1...,a3902456-0d1c-11eb-bc07-01aa75ed71a1,2020-10-13,11604020,External relations,Commercial policy,Trade protection,Anti-dumping measures,,
3,Regulation (EU) 2021/691 of the European Parli...,8c5e37a3-ac09-11eb-9767-01aa75ed71a1,2021-05-03,05203020,Freedom of movement for workers and social policy,Social policy,Employment and unemployment,Protection of workers,,
4,Regulation (EU) 2021/953 of the European Parli...,141deaf1-cd73-11eb-ac72-01aa75ed71a1,2021-06-15,1530,"Environment, consumers and health protection",Health protection,,,,
...,...,...,...,...,...,...,...,...,...,...
4375,Commission Regulation (EC) No 1749/96 of 9 Sep...,61904dd3-5365-41ca-9604-54fae9552b79,1996-09-10,1007,Economic and monetary policy and free movement...,Statistics,,,,
4376,Commission Delegated Regulation (EU) 2019/981 ...,a1686aa8-9195-11e9-9369-01aa75ed71a1,2019-06-18,06202010,Right of establishment and freedom to provide ...,Sectoral application,Service activities,Insurance,,
4377,Commission Implementing Regulation (EU) 2015/7...,622467af-a13c-11e4-872e-01aa75ed71a1,2015-01-21,06202020,Right of establishment and freedom to provide ...,Sectoral application,Service activities,Banks,,
4378,Commission Implementing Regulation (EU) No 137...,655026a7-8a6c-11e4-b8a5-01aa75ed71a1,2014-12-23,11604020,External relations,Commercial policy,Trade protection,Anti-dumping measures,,


In [294]:
from xml.etree import ElementTree as et

def parse_xml(request):

    ''' Takes an XML returned from request and turns it into a dataframe with:
    - Cellar ID
    - Date
    - Dir code
    - Name of the dirs
    
    '''
    
    root = et.fromstring(request.content)
    
    pd_dict = {'title':[],'cellar':[],'date':[],'dir_code':[],'dir_1':[],'dir_2':[],'dir_3':[],'dir_4':[],'dir_5':[],'dir_6':[]}

    for child in root[1][0].findall('{http://eur-lex.europa.eu/search}result'):
            # Get reference
            raw_ref = child.find("./{http://eur-lex.europa.eu/search}reference").text
            ref = str.replace(raw_ref,"eng_cellar:","")
            pd_dict['cellar'].append(ref[0:-3])

            # Get date

            pd_dict['date'].append(child.find(".//{http://eur-lex.europa.eu/search}DATE_PUBLICATION")[0].text)

            # Get dir_code

            dirs = child.find(".//{http://eur-lex.europa.eu/search}RESOURCE_LEGAL_IS_ABOUT_CONCEPT_DIRECTORY-CODE")
            if dirs == None:
                pd_dict['dir_code'].append("")
                for i in range (1,6+1):
                    entry = 'dir_'+str(i)
                    pd_dict[entry].append("")
            else:
                pd_dict['dir_code'].append(dirs[-1][0].text)
                for i in range(0,len(dirs)):
                    entry = 'dir_'+str(i+1)
                    pd_dict[entry].append(dirs[i][2].text)
                    #print(dirs[i].tag)
                for i in range (len(dirs)+1,6+1):
                    entry = 'dir_'+str(i)
                    pd_dict[entry].append("")
                    #print(i)
                    #print("Empty")
            # Get title
            title = child.find(".//{http://eur-lex.europa.eu/search}EXPRESSION_TITLE")
            if title == None:
                pd_dict['title'].append("")
            else:
                if len(title) == 1:
                    pd_dict['title'].append(title[0].text)
                else:
                    pd_dict['title'].append(title[1].text)

            # Get dirs names

            
    return pd_dict

In [208]:
pd.DataFrame(pd_dict)

Unnamed: 0,cellar,date,dir_code,dir_1,dir_2,dir_3,dir_4,dir_5,dir_6
0,5f5898ca-1299-11eb-9a54-01aa75ed71a1,2020-10-20,62020,Right of establishment and freedom to provide ...,Sectoral application,Service activities,,,
1,0713d9a7-48af-11eb-b59f-01aa75ed71a1,2020-12-28,72030,Transport policy,Inland transport,Market operation,,,
2,a3902456-0d1c-11eb-bc07-01aa75ed71a1,2020-10-13,11604020,External relations,Commercial policy,Trade protection,Anti-dumping measures,,
3,8c5e37a3-ac09-11eb-9767-01aa75ed71a1,2021-05-03,5203020,Freedom of movement for workers and social policy,Social policy,Employment and unemployment,Protection of workers,,
4,141deaf1-cd73-11eb-ac72-01aa75ed71a1,2021-06-15,1530,"Environment, consumers and health protection",Health protection,,,,
5,365a2e8e-e04f-11eb-895a-01aa75ed71a1,2021-07-09,15102030,"Environment, consumers and health protection",Environment,Pollution and nuisances,Monitoring of atmospheric pollution,,
6,a0dfa793-d98d-11eb-895a-01aa75ed71a1,2021-06-30,160,"General, financial and institutional matters",Financial and budgetary provisions,,,,
7,85656db7-5c53-11eb-b487-01aa75ed71a1,2021-01-22,6202020,Right of establishment and freedom to provide ...,Sectoral application,Service activities,Banks,,
8,e5ba36a8-b454-11ea-bb7a-01aa75ed71a1,2020-06-22,103020,Economic and monetary policy and free movement...,Economic policy,Instruments of economic policy,,,
9,04b1a86c-74b0-11eb-9ac9-01aa75ed71a1,2021-02-22,730,Transport policy,Shipping,,,,


In [269]:
pd_dict = {'cellar':[],'date':[],'dir_code':[],'dir_1':[],'dir_2':[],'dir_3':[],'dir_4':[],'dir_5':[],'dir_6':[]}

root = et.fromstring(requests_list[11].content)

for child in root[1][0].findall('{http://eur-lex.europa.eu/search}result'):
    for j in child.iter():
        print(j.tag)

{http://eur-lex.europa.eu/search}result
{http://eur-lex.europa.eu/search}reference
{http://eur-lex.europa.eu/search}rank
{http://eur-lex.europa.eu/search}document_link
{http://eur-lex.europa.eu/search}document_link
{http://eur-lex.europa.eu/search}content
{http://eur-lex.europa.eu/search}DTS_SUBDOM
{http://eur-lex.europa.eu/search}DTS_SUBDOM
{http://eur-lex.europa.eu/search}DTS_SUBDOM
{http://eur-lex.europa.eu/search}DTS_SUBDOM
{http://eur-lex.europa.eu/search}NOTICE
{http://eur-lex.europa.eu/search}EXPRESSION
{http://eur-lex.europa.eu/search}EXPRESSION_SUBTITLE
{http://eur-lex.europa.eu/search}VALUE
{http://eur-lex.europa.eu/search}EXPRESSION_TITLE
{http://eur-lex.europa.eu/search}LANG
{http://eur-lex.europa.eu/search}VALUE
{http://eur-lex.europa.eu/search}EXPRESSION_USES_LANGUAGE
{http://eur-lex.europa.eu/search}URI
{http://eur-lex.europa.eu/search}IDENTIFIER
{http://eur-lex.europa.eu/search}MANIFESTATION
{http://eur-lex.europa.eu/search}SAMEAS
{http://eur-lex.europa.eu/search}URI
{h

In [207]:
for i in pd_dict:
    print(i)
    print(len(pd_dict[i]))

cellar
20
date
20
dir_code
20
dir_1
20
dir_2
20
dir_3
20
dir_4
20
dir_5
20
dir_6
20


**Workflow**

In [126]:
get_url('5f5898ca-1299-11eb-9a54-01aa75ed71a1_en')
get_content('http://publications.europa.eu/resource/cellar/5f5898ca-1299-11eb-9a54-01aa75ed71a1.0006.03/DOC_1')



In [4]:
#retrieve metadata
filename = "../raw_data/Search results 20220531.csv"
metadata = read_metadata(filename)

#filter for keywords
metadata_filtered = filter_data(metadata, ['medical'])

#get content for filtered data
data_with_content = get_all_content(metadata_filtered)
data_with_content

#export data to csv
#data_with_content.to_csv("../raw_data/test_data_scraped.csv")

FileNotFoundError: [Errno 2] No such file or directory: '../raw_data/Search results 20220531.csv'

In [None]:
df = pd.read_csv('../raw_data/test_data_scraped_new.csv')

In [None]:
df


**Test Area**

In [None]:
metadata = read_metadata(filename)

In [None]:
metadata_filtered = filter_data(metadata, ['a'])
metadata_filtered

In [None]:
def filter_data(data, searchwords = None):
    if searchwords == None: # if no searchwords are given
        print('no filtering done')
        return data
    elif type(searchwords) != list: # if search words are not in list format
        return 'please pass searchwords in list format'
    searchwords = list(map(lambda x: x.lower(), searchwords)) # convert searchwords to lowercase
    data_filtered = data[data['Keywords'].apply(lambda x: any(item for item in searchwords if item in x))]
    print(f"filtered on {searchwords}")
    return data_filtered.reset_index().drop(columns = 'index')

In [None]:
metadata['Date of document'][0]