In [None]:
!pip install requests --quiet
!pip install beautifulsoup4 --quiet
!pip install pandas --quiet
!pip install datetime --quiet

**Imports**

In [1]:
import pandas as pd
import numpy as np
import requests
import datetime
from bs4 import BeautifulSoup

**Functions**

In [2]:
def read_metadata(filename):
    data = pd.read_csv(filename).drop(columns = 'Unnamed: 0')
    data['Content'] = None
    return data

def get_url(cellar_ref, doctype="03"):
    psid = cellar_ref
    psname = "cellar" # other options: cellar, celex, oj...
    lancode = "0006" # language code
    doctype = doctype # default: 03
    docnum = "DOC_1"
    # for further information, see Documentation Page 37: https://op.europa.eu/en/publication-detail/-/publication/50ecce27-857e-11e8-ac6a-01aa75ed71a1/language-en/format-PDF/source-73059305
    return f"http://publications.europa.eu/resource/{psname}/{psid}.{lancode}.{doctype}/{docnum}"

def get_content(URL):
    response = requests.get(URL, headers={"Accept-Language":"en-US"})
    # one minor bug still in there: some requests (for example number 58 in 20220601_larger_data_b) are a valid request but have to download many mb first. the solution would be to stop the request.get if it runs longer than x seconds
    try:
        soup = BeautifulSoup(response.content, "html.parser")
        if str(soup)[1:4] == "PDF":
            '''
            in some (few) cases, the doctype is not 03 but 02. change it for these cases
            '''
            print("pdf detected, but fixed")
            doctype = '02'
            URL = URL[:-8] + doctype + URL[-6:]
            response = requests.get(URL, headers={"Accept-Language":"en-US"})
            soup = BeautifulSoup(response.content, "html.parser")
        else:
            print("no problem here")
            doctype = '03'
    except:
        '''
        in case there is an error
        '''
        print("yes problem here")
        URL = URL[:-8] + '02' + URL[-6:]
        response = requests.get(URL, headers={"Accept-Language":"en-US"})
        soup = BeautifulSoup(response.content, "html.parser")
        
    if soup.find("p", class_="oj-normal") == None:
        content = ' '.join([item.text for item in soup.find_all("p", class_="normal")])
    else:
        content = ' '.join([item.text for item in soup.find_all("p", class_="oj-normal")])
    return content

def clean_data(data):
    data = data[data['Content'] != ""]
    data = data[data['Content'].str[0:3] == 'THE'] #remove content in other languages
    data = data[data['Content'].str.contains('Whereas: ')] # contains the split word
    data.loc[:, 'Content'] = data['Content'].apply(lambda x: x.split('Whereas: ', 1)[1]) # split off header
    data = data[data['Content'].str[0:3] == "(1)"] #gotta make sure it's standardized!
    return data.reset_index().drop(columns = "index")

def get_all_content(data):
    cellar_references = data['cellar']    
    for index, ref in enumerate(cellar_references):
        data.loc[index, 'Content'] = get_content(get_url(ref))
        print(f'Row {index} with cellar-number {ref} done')
    return data

**Workflow**

In [3]:
%%time
#retrieve metadata
filename = "../raw_data/20220602.csv"
metadata = read_metadata(filename)

# subset metadata
metadata_subset = metadata.iloc[6851:]

# get content
data_with_content = get_all_content(metadata)#_subset)

# clean content
data_with_content_clean = clean_data(data_with_content8)

# export data to csv
#data_b_with_content.to_csv("../raw_data/20220601_larger_data_b_scraped.csv")

no problem here
Row 0 with cellar-number 3cb645b2-2a23-43c9-b842-b03665a6733a done
no problem here
Row 1 with cellar-number 3b729ddf-f1f7-11e3-8cd4-01aa75ed71a1 done
no problem here
Row 2 with cellar-number 8917d52e-4432-11e2-9b3b-01aa75ed71a1 done




no problem here
Row 3 with cellar-number ff333821-8325-4fcd-aad0-0a6f3913a242 done
no problem here
Row 4 with cellar-number e351eb07-9713-11e9-9369-01aa75ed71a1 done
no problem here
Row 5 with cellar-number 3cc00bdc-955e-11e3-8c34-01aa75ed71a1 done


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 6 with cellar-number fa9532bb-12e6-11e8-9253-01aa75ed71a1 done
no problem here
Row 7 with cellar-number 4763e956-79a1-42db-851f-c9521271ad5c done
no problem here
Row 8 with cellar-number e0c803af-9e0f-11e4-872e-01aa75ed71a1 done
no problem here
Row 9 with cellar-number 61d76918-8b7a-11eb-b85c-01aa75ed71a1 done
no problem here
Row 10 with cellar-number b3c20c11-85b9-11e4-b8a5-01aa75ed71a1 done
no problem here
Row 11 with cellar-number 2736e514-b31b-11eb-8aca-01aa75ed71a1 done
no problem here
Row 12 with cellar-number 1b011b40-2003-11eb-b57e-01aa75ed71a1 done
no problem here
Row 13 with cellar-number 6bf01c54-d572-4952-beaa-159de150f119 done
no problem here
Row 14 with cellar-number de093874-a22d-40ef-a7bf-5998d8920152 done


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 15 with cellar-number 03deb38e-9dda-11eb-b85c-01aa75ed71a1 done
no problem here
Row 16 with cellar-number 77c92212-b70d-11eb-8aca-01aa75ed71a1 done
no problem here
Row 17 with cellar-number 2c192686-2d63-11eb-b27b-01aa75ed71a1 done
no problem here
Row 18 with cellar-number bfe37913-06b3-11ea-8c1f-01aa75ed71a1 done
no problem here
Row 19 with cellar-number 78a94f7f-66b8-11ec-9136-01aa75ed71a1 done
no problem here
Row 20 with cellar-number 6692d62f-e094-11e7-9749-01aa75ed71a1 done
no problem here
Row 21 with cellar-number 2b672621-cd0a-11ec-a95f-01aa75ed71a1 done
no problem here
Row 22 with cellar-number bc77a477-81d7-11e9-9f05-01aa75ed71a1 done
no problem here
Row 23 with cellar-number 6624060c-9488-11e8-8bc1-01aa75ed71a1 done
no problem here
Row 24 with cellar-number 6f9ed10a-f734-11e9-8c1f-01aa75ed71a1 done
no problem here
Row 25 with cellar-number c6980a60-04e7-11e9-adde-01aa75ed71a1 done
no problem here
Row 26 with cellar-number 66a2c066-71c5-11eb-9ac9-01



no problem here
Row 76 with cellar-number ef447c04-e2cb-11e6-ad7c-01aa75ed71a1 done
no problem here
Row 77 with cellar-number f71cc663-5714-11e7-a5ca-01aa75ed71a1 done
no problem here
Row 78 with cellar-number eb4667ef-98ab-11e9-b2f2-01aa75ed71a1 done
no problem here
Row 79 with cellar-number a0ba9c80-7461-11e2-9294-01aa75ed71a1 done
no problem here
Row 80 with cellar-number 310526eb-014e-11eb-af6b-01aa75ed71a1 done
no problem here
Row 81 with cellar-number 5a88e1f0-37f6-11e4-8c3c-01aa75ed71a1 done
no problem here
Row 82 with cellar-number 22871978-6418-11e5-9317-01aa75ed71a1 done
no problem here
Row 83 with cellar-number 384f1677-93ed-11e9-9369-01aa75ed71a1 done
no problem here
Row 84 with cellar-number 3cf3e93a-f6c8-11e8-9982-01aa75ed71a1 done
no problem here
Row 85 with cellar-number f329b852-85d0-11e9-9f05-01aa75ed71a1 done
no problem here
Row 86 with cellar-number 797fe9c0-e463-11e6-ad7c-01aa75ed71a1 done
no problem here
Row 87 with cellar-number 0d1b72f2-d6dc-11e5-8fea-01aa75ed71

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 137 with cellar-number f0325987-4814-11ec-91ac-01aa75ed71a1 done
no problem here
Row 138 with cellar-number 687f1f03-cec0-41ec-a24f-91ad342ccb92 done
no problem here
Row 139 with cellar-number 04720e76-b749-11e9-9d01-01aa75ed71a1 done
no problem here
Row 140 with cellar-number 07d0b1ba-7dca-11e3-b889-01aa75ed71a1 done
no problem here
Row 141 with cellar-number 21441948-d032-11e3-8cd4-01aa75ed71a1 done
no problem here
Row 142 with cellar-number 3d5a993c-85d0-11e9-9f05-01aa75ed71a1 done
no problem here
Row 143 with cellar-number 15a11d93-ec06-11e9-9c4e-01aa75ed71a1 done
no problem here
Row 144 with cellar-number 5f14c14d-c271-11ea-b3a4-01aa75ed71a1 done
no problem here
Row 145 with cellar-number 9e989104-db52-11e8-afb3-01aa75ed71a1 done
no problem here
Row 146 with cellar-number 8bae35e4-f32d-11eb-aeb9-01aa75ed71a1 done
no problem here
Row 147 with cellar-number 91f56e8a-9ad1-11e3-82a3-01aa75ed71a1 done
no problem here
Row 148 with cellar-number be51c655-ffa4-

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 152 with cellar-number bc279c77-f135-11e3-8cd4-01aa75ed71a1 done
no problem here
Row 153 with cellar-number 1c5f8c62-750b-11e2-9294-01aa75ed71a1 done
no problem here
Row 154 with cellar-number 71f586ec-092c-11ec-b5d3-01aa75ed71a1 done
no problem here
Row 155 with cellar-number d8f2b15c-bac5-11ec-b6f4-01aa75ed71a1 done
no problem here
Row 156 with cellar-number e0329d17-882d-11e9-9369-01aa75ed71a1 done
no problem here
Row 157 with cellar-number b71381bf-e5b8-11e5-8a50-01aa75ed71a1 done
no problem here
Row 158 with cellar-number eb38e82c-1727-11ea-8c1f-01aa75ed71a1 done
no problem here
Row 159 with cellar-number 015749f2-52c9-11e5-9f5a-01aa75ed71a1 done
no problem here
Row 160 with cellar-number 41198264-ca4f-11eb-84ce-01aa75ed71a1 done
no problem here
Row 161 with cellar-number 7fcd0b09-022d-11eb-8919-01aa75ed71a1 done
no problem here
Row 162 with cellar-number bb65c657-89f9-11e6-b955-01aa75ed71a1 done
no problem here
Row 163 with cellar-number 8e3bafc2-40d5-

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


no problem here
Row 200 with cellar-number ecd4c241-ac00-11e2-ab01-01aa75ed71a1 done
no problem here
Row 201 with cellar-number 90978f80-4872-11e3-ae03-01aa75ed71a1 done
no problem here
Row 202 with cellar-number ba82626f-6dd8-11e7-b2f2-01aa75ed71a1 done
no problem here
Row 203 with cellar-number 980fd462-8977-4c03-9958-7187c934ae8b done
no problem here
Row 204 with cellar-number eb1e22bb-1d3e-11ea-95ab-01aa75ed71a1 done
no problem here
Row 205 with cellar-number d0664e08-cb7e-11e3-b74e-01aa75ed71a1 done
no problem here
Row 206 with cellar-number af17b43a-2fbe-11eb-b27b-01aa75ed71a1 done
no problem here
Row 207 with cellar-number 809194a4-5e23-11e7-954d-01aa75ed71a1 done
no problem here
Row 208 with cellar-number fd4ff663-64bd-11e4-9cbe-01aa75ed71a1 done
no problem here
Row 209 with cellar-number 14b081f1-ac16-11e8-99ee-01aa75ed71a1 done
no problem here
Row 210 with cellar-number 6136e47b-fae9-11e6-8a35-01aa75ed71a1 done
no problem here
Row 211 with cellar-number 04f6376d-0074-11e6-b71

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 270 with cellar-number 5dff5202-52ea-11ea-aece-01aa75ed71a1 done
no problem here
Row 271 with cellar-number b02085c1-f3bc-11e4-a3bf-01aa75ed71a1 done
no problem here
Row 272 with cellar-number fe984524-31f1-11e6-b497-01aa75ed71a1 done
no problem here
Row 273 with cellar-number 25f4b9d7-55dc-11e6-89bd-01aa75ed71a1 done
no problem here
Row 274 with cellar-number da3abd15-1361-11eb-9a54-01aa75ed71a1 done
no problem here
Row 275 with cellar-number 46413811-c31b-11e4-bbe1-01aa75ed71a1 done
no problem here
Row 276 with cellar-number c2874d66-403a-11eb-b27b-01aa75ed71a1 done
no problem here
Row 277 with cellar-number bfb7aed7-0abf-11ec-adb1-01aa75ed71a1 done
no problem here
Row 278 with cellar-number 895cf9e1-1a23-11e9-8d04-01aa75ed71a1 done
no problem here
Row 279 with cellar-number d43433ee-d019-11e7-a7df-01aa75ed71a1 done
no problem here
Row 280 with cellar-number 3f55cc04-53b3-11ea-aece-01aa75ed71a1 done
no problem here
Row 281 with cellar-number 12d807d5-a9c1-

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 305 with cellar-number 7cf5bc2e-ea71-11e3-8cd4-01aa75ed71a1 done


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 306 with cellar-number da30bda8-fd5b-11ea-b44f-01aa75ed71a1 done
no problem here
Row 307 with cellar-number df9d0ea7-e2eb-11e2-9165-01aa75ed71a1 done
no problem here
Row 308 with cellar-number 8f6574c4-d8e4-11e2-bfa7-01aa75ed71a1 done
no problem here
Row 309 with cellar-number f8016ccb-4d65-11ea-aece-01aa75ed71a1 done
no problem here
Row 310 with cellar-number 56e16600-7acc-11e9-9f05-01aa75ed71a1 done
no problem here
Row 311 with cellar-number a126d75b-0d1a-11eb-bc07-01aa75ed71a1 done
no problem here
Row 312 with cellar-number 76782d81-d22d-11ea-adf7-01aa75ed71a1 done
no problem here
Row 313 with cellar-number d8402b3b-a305-11eb-9585-01aa75ed71a1 done
no problem here
Row 314 with cellar-number e92bf9b0-0431-11e5-a4c8-01aa75ed71a1 done
no problem here
Row 315 with cellar-number 1c4aca71-c4cf-11eb-a925-01aa75ed71a1 done
no problem here
Row 316 with cellar-number a10e3583-0248-11e2-8e28-01aa75ed71a1 done
no problem here
Row 317 with cellar-number 5a006217-dc8e-

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 359 with cellar-number 88556ee2-8dec-11ea-812f-01aa75ed71a1 done
no problem here
Row 360 with cellar-number 2dd1aa06-6675-11e5-9317-01aa75ed71a1 done
no problem here
Row 361 with cellar-number 738e8fdb-bcc6-11e9-9d01-01aa75ed71a1 done
no problem here
Row 362 with cellar-number 6c9ad693-66c1-11eb-aeb5-01aa75ed71a1 done
no problem here
Row 363 with cellar-number 7c85881b-a0b6-11eb-b85c-01aa75ed71a1 done
no problem here
Row 364 with cellar-number c92e9dc9-b3fd-11e3-86f9-01aa75ed71a1 done
no problem here
Row 365 with cellar-number ccf42f53-a9c0-11e8-99ee-01aa75ed71a1 done
no problem here
Row 366 with cellar-number fd346af1-4955-11ea-b81b-01aa75ed71a1 done
no problem here
Row 367 with cellar-number 8d6cc612-e463-11e6-ad7c-01aa75ed71a1 done
no problem here
Row 368 with cellar-number 22cdcce7-bc66-11ea-811c-01aa75ed71a1 done
no problem here
Row 369 with cellar-number 83d740ff-d874-11eb-895a-01aa75ed71a1 done
no problem here
Row 370 with cellar-number 23c6f7fb-c020-

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 432 with cellar-number b8556188-0475-11eb-a511-01aa75ed71a1 done
no problem here
Row 433 with cellar-number 966c2257-46fc-11e5-9f5a-01aa75ed71a1 done
no problem here
Row 434 with cellar-number cfffa323-cac2-11e3-9fe4-01aa75ed71a1 done
no problem here
Row 435 with cellar-number a06a0604-6753-11e9-9f05-01aa75ed71a1 done
no problem here
Row 436 with cellar-number 20e7d666-f7e4-11ea-991b-01aa75ed71a1 done
no problem here
Row 437 with cellar-number d56c718d-e689-11e5-8a50-01aa75ed71a1 done
no problem here
Row 438 with cellar-number 36a71d6e-3eb0-11eb-b27b-01aa75ed71a1 done
no problem here
Row 439 with cellar-number 331ea02b-fc38-11e3-831f-01aa75ed71a1 done
no problem here
Row 440 with cellar-number 1087cad6-31f2-11e6-b497-01aa75ed71a1 done
no problem here
Row 441 with cellar-number 64313b4a-be4b-11e4-bbe1-01aa75ed71a1 done
no problem here
Row 442 with cellar-number 4544cb54-7d88-11e7-b2f2-01aa75ed71a1 done
no problem here
Row 443 with cellar-number 7698058f-377e-

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 444 with cellar-number 8f345857-14e5-11ea-8c1f-01aa75ed71a1 done
no problem here
Row 445 with cellar-number 7a1cf350-faf4-11e5-b713-01aa75ed71a1 done
no problem here
Row 446 with cellar-number bcf6d0f3-2a8b-11e4-8c3c-01aa75ed71a1 done
no problem here
Row 447 with cellar-number 79ad1475-0abf-11ec-adb1-01aa75ed71a1 done
no problem here
Row 448 with cellar-number 6b18ad97-47ea-11ea-b81b-01aa75ed71a1 done
no problem here
Row 449 with cellar-number d7b67cc7-8e8a-11e5-b8b7-01aa75ed71a1 done
no problem here
Row 450 with cellar-number 171eeb05-a662-11eb-9585-01aa75ed71a1 done
no problem here
Row 451 with cellar-number 3e630983-0a28-11ec-adb1-01aa75ed71a1 done
no problem here
Row 452 with cellar-number 3b273fee-1d3d-11ea-95ab-01aa75ed71a1 done
no problem here
Row 453 with cellar-number 9dfbef60-d563-4bd7-8d3c-be0cad8f9e74 done
no problem here
Row 454 with cellar-number 996b850e-2059-11ea-95ab-01aa75ed71a1 done
no problem here
Row 455 with cellar-number d0777556-7f20-

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 467 with cellar-number 77502da7-0475-11eb-a511-01aa75ed71a1 done
no problem here
Row 468 with cellar-number 0619bcb8-b43e-11e5-8d3c-01aa75ed71a1 done
no problem here
Row 469 with cellar-number 02e93e89-17f8-11ea-8c1f-01aa75ed71a1 done
no problem here
Row 470 with cellar-number 305fe7c1-cf35-11eb-ac72-01aa75ed71a1 done
no problem here
Row 471 with cellar-number b3771f32-315c-11e7-9412-01aa75ed71a1 done
no problem here
Row 472 with cellar-number ff195bf4-58e3-11e4-a0cb-01aa75ed71a1 done
no problem here
Row 473 with cellar-number bbefbe95-fbbd-11e5-b713-01aa75ed71a1 done
no problem here
Row 474 with cellar-number dce9da48-c402-11e5-8d08-01aa75ed71a1 done
no problem here
Row 475 with cellar-number 838243ec-7c01-11e2-9294-01aa75ed71a1 done
no problem here
Row 476 with cellar-number 3e69e913-3e77-11ea-ba6e-01aa75ed71a1 done
no problem here
Row 477 with cellar-number d2dd6ecd-c425-11e6-a6db-01aa75ed71a1 done
no problem here
Row 478 with cellar-number 9e602008-f0b4-

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 519 with cellar-number ca13564b-8dea-11ea-812f-01aa75ed71a1 done
no problem here
Row 520 with cellar-number 95e5065f-dc8e-11e6-ad7c-01aa75ed71a1 done
no problem here
Row 521 with cellar-number 783a8119-e85b-11eb-93a8-01aa75ed71a1 done
no problem here
Row 522 with cellar-number 23470ba8-5cc8-11e3-914b-01aa75ed71a1 done
no problem here
Row 523 with cellar-number e3ba6da3-d09d-11ea-adf7-01aa75ed71a1 done
no problem here
Row 524 with cellar-number f1fa1074-d0c3-11ec-a95f-01aa75ed71a1 done
no problem here
Row 525 with cellar-number dd4fae63-d1c2-11ec-a95f-01aa75ed71a1 done
no problem here
Row 526 with cellar-number 2f7c8dbd-0a98-11ea-8c1f-01aa75ed71a1 done
no problem here
Row 527 with cellar-number 9d61e14a-c2b6-11e2-ab01-01aa75ed71a1 done
no problem here
Row 528 with cellar-number a1daca74-03f8-11e3-a352-01aa75ed71a1 done


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 529 with cellar-number eee72218-37b3-11ea-ba6e-01aa75ed71a1 done
no problem here
Row 530 with cellar-number afb4fe76-5e23-11e7-954d-01aa75ed71a1 done
no problem here
Row 531 with cellar-number 0502047f-b49e-11e7-837e-01aa75ed71a1 done
no problem here
Row 532 with cellar-number ca6d1278-9888-47ca-bb9b-f0fd4cb4211b done
no problem here
Row 533 with cellar-number e452ea0e-d109-11e8-9424-01aa75ed71a1 done
no problem here
Row 534 with cellar-number 97525f8e-92c5-11e7-b92d-01aa75ed71a1 done
no problem here
Row 535 with cellar-number c4c452d0-55db-11e9-a8ed-01aa75ed71a1 done
no problem here
Row 536 with cellar-number aac840b0-476c-11e7-aea8-01aa75ed71a1 done
no problem here
Row 537 with cellar-number b39dd145-377e-11e6-a825-01aa75ed71a1 done
no problem here
Row 538 with cellar-number 391ea49a-3a1e-11e9-8d04-01aa75ed71a1 done
no problem here
Row 539 with cellar-number 3099783b-e305-4969-ac70-8e3d33b0e7ef done
no problem here
Row 540 with cellar-number 5123bf5f-46bc-

no problem here
Row 626 with cellar-number 2892bd22-47c6-11e5-9f5a-01aa75ed71a1 done
no problem here
Row 627 with cellar-number d3363a21-9264-11e6-8e27-01aa75ed71a1 done
no problem here
Row 628 with cellar-number 72cd534b-613d-11eb-8146-01aa75ed71a1 done
no problem here
Row 629 with cellar-number a05ba122-2fad-11ec-bd8e-01aa75ed71a1 done
no problem here
Row 630 with cellar-number 7a87f4d9-99da-11e7-b92d-01aa75ed71a1 done
no problem here
Row 631 with cellar-number 753120ea-7c4c-11e4-97c9-01aa75ed71a1 done
no problem here
Row 632 with cellar-number 093501be-053f-11ec-b5d3-01aa75ed71a1 done
no problem here
Row 633 with cellar-number 2a3aac56-8ac4-11e6-b955-01aa75ed71a1 done
no problem here
Row 634 with cellar-number 77556e55-7fdd-11ec-8c40-01aa75ed71a1 done
no problem here
Row 635 with cellar-number 5d1a61f3-b356-11e9-9d01-01aa75ed71a1 done
no problem here
Row 636 with cellar-number b31fcb92-3743-11e5-98a0-01aa75ed71a1 done
no problem here
Row 637 with cellar-number 6aec9219-27c8-11e6-914

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 640 with cellar-number 24ec3887-25bc-11e8-ac73-01aa75ed71a1 done
no problem here
Row 641 with cellar-number 48c9d019-9561-11e5-983e-01aa75ed71a1 done
no problem here
Row 642 with cellar-number a7759586-81dc-11e9-9f05-01aa75ed71a1 done
no problem here
Row 643 with cellar-number b07a44a4-0248-11e2-8e28-01aa75ed71a1 done
no problem here
Row 644 with cellar-number c3de12a4-639f-11e7-b2f2-01aa75ed71a1 done
no problem here
Row 645 with cellar-number 9cb6777c-7bff-11e7-b2f2-01aa75ed71a1 done
no problem here
Row 646 with cellar-number e687f771-8dd6-11eb-b85c-01aa75ed71a1 done
no problem here
Row 647 with cellar-number 09ea1f84-c272-11ea-b3a4-01aa75ed71a1 done
no problem here
Row 648 with cellar-number eaab107e-47ea-11ea-b81b-01aa75ed71a1 done
no problem here
Row 649 with cellar-number 28afdffa-b4bd-11e3-86f9-01aa75ed71a1 done
no problem here
Row 650 with cellar-number 4d8710c0-dc8f-11e6-ad7c-01aa75ed71a1 done
no problem here
Row 651 with cellar-number b090c948-7546-

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 664 with cellar-number a1b3f1be-47d8-11e4-a0cb-01aa75ed71a1 done
no problem here
Row 665 with cellar-number 0835ebf8-68ff-407a-a1ae-36a3733cedf6 done
no problem here
Row 666 with cellar-number cb8ad0a5-8af7-11ec-8c40-01aa75ed71a1 done
no problem here
Row 667 with cellar-number e85c3e66-7b66-11e8-ac6a-01aa75ed71a1 done
no problem here
Row 668 with cellar-number c2cb3f77-46bd-11e3-ae03-01aa75ed71a1 done
no problem here
Row 669 with cellar-number c73754bc-ae35-11e5-b528-01aa75ed71a1 done
no problem here
Row 670 with cellar-number 7ce59900-cd1c-11e8-9424-01aa75ed71a1 done
no problem here
Row 671 with cellar-number 9a0da788-252e-11e9-8d04-01aa75ed71a1 done
no problem here
Row 672 with cellar-number e355591a-a566-11ea-bb7a-01aa75ed71a1 done
no problem here
Row 673 with cellar-number 93057a1b-4c35-11ec-91ac-01aa75ed71a1 done
no problem here
Row 674 with cellar-number 128ece53-613d-11eb-8146-01aa75ed71a1 done
no problem here
Row 675 with cellar-number f7be62f6-820c-

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 759 with cellar-number e7233b7a-14f9-11eb-b57e-01aa75ed71a1 done
no problem here
Row 760 with cellar-number b9c9e1d7-3a5a-11e3-a247-01aa75ed71a1 done
no problem here
Row 761 with cellar-number bbb21466-4bb2-11e5-9f5a-01aa75ed71a1 done
no problem here
Row 762 with cellar-number 24ba5346-f976-11eb-b520-01aa75ed71a1 done
no problem here
Row 763 with cellar-number 3a40deb4-5e25-11e7-954d-01aa75ed71a1 done
no problem here
Row 764 with cellar-number ab61a9e6-e8b0-11e4-baa4-01aa75ed71a1 done
no problem here
Row 765 with cellar-number 21eb3af6-32c2-4bb4-be0f-99f6c51e9278 done
no problem here
Row 766 with cellar-number 692e4154-2a86-11e3-8d1c-01aa75ed71a1 done
no problem here
Row 767 with cellar-number f82caa68-f75f-11e7-b8f5-01aa75ed71a1 done
no problem here
Row 768 with cellar-number f0874dc5-e8f2-11ea-b3c6-01aa75ed71a1 done
no problem here
Row 769 with cellar-number ede3082e-6bdd-11e3-9afb-01aa75ed71a1 done
no problem here
Row 770 with cellar-number 7880b901-f8ab-

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 839 with cellar-number 872494cc-4814-11ec-91ac-01aa75ed71a1 done
no problem here
Row 840 with cellar-number 90517a3a-efc2-11e4-a3bf-01aa75ed71a1 done
no problem here
Row 841 with cellar-number a6b46cf9-6be2-11e3-9afb-01aa75ed71a1 done
no problem here
Row 842 with cellar-number eef99115-c270-11ea-b3a4-01aa75ed71a1 done
no problem here
Row 843 with cellar-number fff924a9-3f79-11e7-a08e-01aa75ed71a1 done
no problem here
Row 844 with cellar-number 40d43548-cead-11e3-b682-01aa75ed71a1 done
no problem here
Row 845 with cellar-number 64e521df-1d3d-11ea-95ab-01aa75ed71a1 done
no problem here
Row 846 with cellar-number 9fb1fbc3-e866-11e8-b690-01aa75ed71a1 done
no problem here
Row 847 with cellar-number 9beb01e6-4d27-11e3-ae03-01aa75ed71a1 done
no problem here
Row 848 with cellar-number 74af2237-6c44-11e7-b2f2-01aa75ed71a1 done
no problem here
Row 849 with cellar-number 06ab9386-193b-11e4-933d-01aa75ed71a1 done
no problem here
Row 850 with cellar-number 2cdbfbfb-1a8b-

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


no problem here
Row 858 with cellar-number 2810c3a9-4a92-11eb-b59f-01aa75ed71a1 done
no problem here
Row 859 with cellar-number 1ef9ef33-24ea-11e3-8d1c-01aa75ed71a1 done
no problem here
Row 860 with cellar-number f637a1bb-4334-11e6-9c64-01aa75ed71a1 done
no problem here
Row 861 with cellar-number 38d76961-5cc9-11e3-914b-01aa75ed71a1 done
no problem here
Row 862 with cellar-number 1b1b08a4-2580-11eb-9d7e-01aa75ed71a1 done
no problem here
Row 863 with cellar-number 7fecdda2-e400-11e4-b1d3-01aa75ed71a1 done
no problem here
Row 864 with cellar-number 23310a36-4a2c-48c4-9db2-f1ae37b9b22c done
no problem here
Row 865 with cellar-number 9d1db170-edaf-11ea-991b-01aa75ed71a1 done
no problem here
Row 866 with cellar-number a3d32cfb-5a09-11e6-89bd-01aa75ed71a1 done
no problem here
Row 867 with cellar-number 0cf0fd3e-7d26-11e9-9f05-01aa75ed71a1 done
no problem here
Row 868 with cellar-number 3e1f1f4c-d053-11e8-9424-01aa75ed71a1 done
no problem here
Row 869 with cellar-number e34d7d54-ceac-11e3-b68

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 930 with cellar-number 2b65e8aa-afc0-11e7-837e-01aa75ed71a1 done
no problem here
Row 931 with cellar-number e9244998-c01f-11ea-855a-01aa75ed71a1 done
no problem here
Row 932 with cellar-number 5708101f-19c4-11e7-808e-01aa75ed71a1 done
no problem here
Row 933 with cellar-number 4fc43ec0-46c6-11e8-be1d-01aa75ed71a1 done


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


no problem here
Row 934 with cellar-number fd2047c5-abfe-11e2-ab01-01aa75ed71a1 done
no problem here
Row 935 with cellar-number 7f0f54dc-efde-11e9-a32c-01aa75ed71a1 done
no problem here
Row 936 with cellar-number b7931ffb-2058-11e3-8d1c-01aa75ed71a1 done
no problem here
Row 937 with cellar-number 14646556-9bbf-11e4-872e-01aa75ed71a1 done
no problem here
Row 938 with cellar-number 1269d5d4-a0b6-11eb-b85c-01aa75ed71a1 done
no problem here
Row 939 with cellar-number 11603f76-515d-11ea-aece-01aa75ed71a1 done
no problem here
Row 940 with cellar-number 8e73e314-1185-11e5-8817-01aa75ed71a1 done
no problem here
Row 941 with cellar-number b983cde7-678b-11eb-aeb5-01aa75ed71a1 done
no problem here
Row 942 with cellar-number 187fec81-f783-11e3-831f-01aa75ed71a1 done
no problem here
Row 943 with cellar-number 63f90a69-2117-11e8-ac73-01aa75ed71a1 done
no problem here
Row 944 with cellar-number 9976a2f7-c10d-11e1-b84a-01aa75ed71a1 done
no problem here
Row 945 with cellar-number c850bafa-a339-11ec-83e

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 982 with cellar-number 9fdee272-9e92-11eb-b85c-01aa75ed71a1 done
no problem here
Row 983 with cellar-number 89b29d34-57c9-11ec-91ac-01aa75ed71a1 done
no problem here
Row 984 with cellar-number 79e16e2a-76f5-11e5-86db-01aa75ed71a1 done


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 985 with cellar-number 88129a74-0474-11eb-a511-01aa75ed71a1 done
no problem here
Row 986 with cellar-number 23bd42cb-ee10-11e3-8cd4-01aa75ed71a1 done
no problem here
Row 987 with cellar-number 3a5bb359-fcce-11e7-b8f5-01aa75ed71a1 done
no problem here
Row 988 with cellar-number dc9def33-1ee6-11e9-8d04-01aa75ed71a1 done
no problem here
Row 989 with cellar-number 64824594-c82b-11eb-a925-01aa75ed71a1 done
no problem here
Row 990 with cellar-number f086ca7e-d8e2-11e2-bfa7-01aa75ed71a1 done
no problem here
Row 991 with cellar-number a458c63f-339f-11eb-b27b-01aa75ed71a1 done
no problem here
Row 992 with cellar-number 071fdfda-909c-11ec-b4e4-01aa75ed71a1 done
no problem here
Row 993 with cellar-number cdffe646-71b6-4497-b05b-18344c1f5ac7 done
no problem here
Row 994 with cellar-number 85dd7a55-0367-11e9-adde-01aa75ed71a1 done
no problem here
Row 995 with cellar-number 8e43a02d-a636-11e6-aab7-01aa75ed71a1 done
no problem here
Row 996 with cellar-number b881212a-c594-

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 1012 with cellar-number 8f68ed1b-e9a3-11e9-9c4e-01aa75ed71a1 done
no problem here
Row 1013 with cellar-number bd10bf26-7315-11ea-a07e-01aa75ed71a1 done
no problem here
Row 1014 with cellar-number 196a24e1-0562-11e7-8a35-01aa75ed71a1 done
no problem here
Row 1015 with cellar-number e5911edf-8115-11e9-9f05-01aa75ed71a1 done
no problem here
Row 1016 with cellar-number f53f0c53-6673-11e5-9317-01aa75ed71a1 done
no problem here
Row 1017 with cellar-number 49232971-f55f-11e9-8c1f-01aa75ed71a1 done
no problem here
Row 1018 with cellar-number c6ccb753-03f6-11e3-a352-01aa75ed71a1 done
no problem here
Row 1019 with cellar-number d397518e-da59-11e2-bfa7-01aa75ed71a1 done
no problem here
Row 1020 with cellar-number a9049655-2ef4-11eb-b27b-01aa75ed71a1 done


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 1021 with cellar-number 20692300-dded-11ea-adf7-01aa75ed71a1 done
no problem here
Row 1022 with cellar-number c8d21be5-25b2-11e9-8d04-01aa75ed71a1 done
no problem here
Row 1023 with cellar-number 71328854-5fc5-418b-90b8-0e6470e66e74 done
no problem here
Row 1024 with cellar-number c8b536d5-2581-11eb-9d7e-01aa75ed71a1 done
no problem here
Row 1025 with cellar-number c2426808-3a3f-11e4-8c3c-01aa75ed71a1 done
no problem here
Row 1026 with cellar-number 0892e96c-f3a1-11e8-9982-01aa75ed71a1 done
no problem here
Row 1027 with cellar-number 0d85a8f6-5e4e-11e3-ab0f-01aa75ed71a1 done
no problem here
Row 1028 with cellar-number 00ef7b4c-2146-11e5-a342-01aa75ed71a1 done
no problem here
Row 1029 with cellar-number 0aff9d2f-8900-11e9-9369-01aa75ed71a1 done
no problem here
Row 1030 with cellar-number 83bfb441-9f53-11e7-b92d-01aa75ed71a1 done
no problem here
Row 1031 with cellar-number 7e706f36-b428-11e9-9d01-01aa75ed71a1 done
no problem here
Row 1032 with cellar-number 61

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 1051 with cellar-number fa04223c-8dea-11ea-812f-01aa75ed71a1 done
no problem here
Row 1052 with cellar-number 20efb123-ec9d-11e8-b690-01aa75ed71a1 done
no problem here
Row 1053 with cellar-number 30a0f6e0-fef1-11ea-b44f-01aa75ed71a1 done
no problem here
Row 1054 with cellar-number 9eefdbe7-6be1-11e3-9afb-01aa75ed71a1 done
no problem here
Row 1055 with cellar-number 57f2ac0b-3ac1-11e7-a08e-01aa75ed71a1 done
no problem here
Row 1056 with cellar-number 360a766c-4930-11e3-ae03-01aa75ed71a1 done
no problem here
Row 1057 with cellar-number 6690bab2-da69-45aa-9e07-4ee4f6fbde58 done
no problem here
Row 1058 with cellar-number e005fe7e-ac15-11e8-99ee-01aa75ed71a1 done
no problem here
Row 1059 with cellar-number f1759cfa-e3ff-11e4-b1d3-01aa75ed71a1 done
no problem here
Row 1060 with cellar-number 88d41e74-f32e-11eb-aeb9-01aa75ed71a1 done
no problem here
Row 1061 with cellar-number 2e9e75e5-5415-11e3-8945-01aa75ed71a1 done
no problem here
Row 1062 with cellar-number c6

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 1072 with cellar-number 57ecef1a-4b38-11ec-91ac-01aa75ed71a1 done
no problem here
Row 1073 with cellar-number 7efe0059-a278-11e7-a56f-01aa75ed71a1 done
no problem here
Row 1074 with cellar-number c0d110e4-6be0-11e3-9afb-01aa75ed71a1 done
no problem here
Row 1075 with cellar-number 06d19f17-d032-11e3-8cd4-01aa75ed71a1 done
no problem here
Row 1076 with cellar-number 44a754e3-4330-11eb-b27b-01aa75ed71a1 done
no problem here
Row 1077 with cellar-number fe947e94-477f-11e3-ae03-01aa75ed71a1 done
no problem here
Row 1078 with cellar-number 10fed4f8-71b8-11ec-9136-01aa75ed71a1 done
no problem here
Row 1079 with cellar-number 870253c8-68c9-11e5-9317-01aa75ed71a1 done
no problem here
Row 1080 with cellar-number ca41b42c-bd8e-11e9-9d01-01aa75ed71a1 done
no problem here
Row 1081 with cellar-number 5bca5cde-94bb-11e2-ab01-01aa75ed71a1 done
no problem here
Row 1082 with cellar-number 6ca3e752-74eb-11ec-9136-01aa75ed71a1 done
no problem here
Row 1083 with cellar-number 32

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 1108 with cellar-number a7bb6ed7-dad1-11ea-adf7-01aa75ed71a1 done
no problem here
Row 1109 with cellar-number ddae4500-7339-474a-afe6-93b5a75a7bb8 done
no problem here
Row 1110 with cellar-number b6ddfaf6-4334-11e6-9c64-01aa75ed71a1 done
no problem here
Row 1111 with cellar-number b947e442-de13-11e6-ad7c-01aa75ed71a1 done
no problem here
Row 1112 with cellar-number 7485d62b-b3fe-11e3-86f9-01aa75ed71a1 done
no problem here
Row 1113 with cellar-number 8b01fc9a-bfe6-4a3b-a9d3-cc5494f23fa3 done
no problem here
Row 1114 with cellar-number 76cc08cd-8fdf-11e5-983e-01aa75ed71a1 done
no problem here
Row 1115 with cellar-number 923faae1-ceb3-11e2-ba1b-01aa75ed71a1 done
no problem here
Row 1116 with cellar-number ee0808cf-0561-11e7-8a35-01aa75ed71a1 done
no problem here
Row 1117 with cellar-number 6f5ca6c0-0248-11e2-8e28-01aa75ed71a1 done
no problem here
Row 1118 with cellar-number 77fdc2fa-6418-11e5-9317-01aa75ed71a1 done
no problem here
Row 1119 with cellar-number 19

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


no problem here
Row 1169 with cellar-number 448c2a6d-ac00-11e2-ab01-01aa75ed71a1 done
no problem here
Row 1170 with cellar-number b95d3093-5dfe-11ea-b735-01aa75ed71a1 done
no problem here
Row 1171 with cellar-number e1c0bc1e-b0d4-11e8-99ee-01aa75ed71a1 done
no problem here
Row 1172 with cellar-number 09e45e31-91c8-11eb-b85c-01aa75ed71a1 done
no problem here
Row 1173 with cellar-number 6d411fe9-dc8e-11e6-ad7c-01aa75ed71a1 done
no problem here
Row 1174 with cellar-number 8bf45c88-e9ef-4b87-8fbe-f7c39a786d91 done
no problem here
Row 1175 with cellar-number b94b11ff-7a3c-11e6-b076-01aa75ed71a1 done
no problem here
Row 1176 with cellar-number 2702f6c9-49c2-11ea-8aa5-01aa75ed71a1 done
no problem here
Row 1177 with cellar-number 6700c7d4-1a8b-11e7-808e-01aa75ed71a1 done
no problem here
Row 1178 with cellar-number 25d43b72-c1e6-11e1-b84a-01aa75ed71a1 done
no problem here
Row 1179 with cellar-number 5801648a-00cb-11ec-8f47-01aa75ed71a1 done
no problem here
Row 1180 with cellar-number 141deaf1-c

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 1185 with cellar-number 3ac2ed56-0475-11eb-a511-01aa75ed71a1 done
no problem here
Row 1186 with cellar-number b46df43a-71c6-11eb-9ac9-01aa75ed71a1 done
no problem here
Row 1187 with cellar-number 1906f664-27e6-11eb-9d7e-01aa75ed71a1 done
no problem here
Row 1188 with cellar-number 914d5e9e-a568-11ea-bb7a-01aa75ed71a1 done
no problem here
Row 1189 with cellar-number a30fd9d8-cd73-11eb-ac72-01aa75ed71a1 done
no problem here
Row 1190 with cellar-number bf6624ef-e4d5-11e9-9c4e-01aa75ed71a1 done
no problem here
Row 1191 with cellar-number fbe1003c-9fd7-11e5-8781-01aa75ed71a1 done
no problem here
Row 1192 with cellar-number 1f17ab2e-2fcf-11e7-9412-01aa75ed71a1 done
no problem here
Row 1193 with cellar-number d691bb22-983b-11ec-b4e4-01aa75ed71a1 done
no problem here
Row 1194 with cellar-number a483c535-8750-11e4-b8a5-01aa75ed71a1 done
no problem here
Row 1195 with cellar-number e12b2fa5-2f6e-11e5-9f85-01aa75ed71a1 done
no problem here
Row 1196 with cellar-number 94

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


pdf detected, but fixed
Row 1228 with cellar-number 398b4d22-0474-11eb-a511-01aa75ed71a1 done
no problem here
Row 1229 with cellar-number a974b890-424a-11e5-9f5a-01aa75ed71a1 done
no problem here
Row 1230 with cellar-number c76d1ce8-f783-11e3-831f-01aa75ed71a1 done
no problem here
Row 1231 with cellar-number 0e48eb0a-9d00-11e7-b92d-01aa75ed71a1 done
no problem here
Row 1232 with cellar-number 0d1f031a-8564-11e8-ac6a-01aa75ed71a1 done
no problem here
Row 1233 with cellar-number 1287754d-dc8f-11e6-ad7c-01aa75ed71a1 done
no problem here
Row 1234 with cellar-number a974594e-71c5-11e7-b2f2-01aa75ed71a1 done
no problem here
Row 1235 with cellar-number 6e43713e-ce6d-11e7-a5d5-01aa75ed71a1 done
no problem here
Row 1236 with cellar-number 844c308c-45b7-11ec-89db-01aa75ed71a1 done
no problem here
Row 1237 with cellar-number 0c001d4e-36da-441b-bd68-e78d4e40f360 done
no problem here
Row 1238 with cellar-number bea818f1-4d1b-11e3-ae03-01aa75ed71a1 done
no problem here
Row 1239 with cellar-number bf

KeyboardInterrupt: 

In [4]:
data_with_content

NameError: name 'data_with_content' is not defined

In [11]:
len(metadata)

23960

**Read already processed Data**

In [None]:
# data_with_content: Zeile 0 - 1242 (what's up with that?). 
# data_with_content3: Zeile 2833 - 3855
# data_with_content4: Zeile 3855 - 4721
# data_with_content5: Zeile 4721 - 5147
# data_with_content6: Zeile 5147 - 6851
# data_with_content8: Zeile 6851 - 8839


In [None]:
data = pd.read_csv("../raw_data/20220601_larger_data_b_scraped_clean.csv")
data = clean_data(data)

In [26]:
len(data_with_content8)

NameError: name 'data_with_content8' is not defined

In [None]:
data.to_csv("../raw_data/20220601_larger_data_b_scraped_clean_v3.csv")

In [None]:
get_url(data['cellar'][100])

**Test Area**

In [None]:
metadata.iloc[3498:3502]

In [None]:
data = read_metadata(filename)

In [None]:
URL = get_url(metadata['cellar'][58], "03")

In [None]:
response = requests.get(URL, headers={"Accept-Language":"en-US"})
try:
    soup = BeautifulSoup(response.content, "html.parser")
    if str(soup)[1:4] == "PDF":
        print("pdf detected, but fixed")
        '''
        in some (few) cases, the doctype is not 03 but 02. change it for these cases
        '''
        URL = URL[:-8] + '02' + URL[-6:]
        response = requests.get(URL, headers={"Accept-Language":"en-US"})
        soup = BeautifulSoup(response.content, "html.parser")
    else:
        print("no problem here")
except:
    '''
    in case there is an error
    '''
    print("yes problem here")
    URL = URL[:-8] + '02' + URL[-6:]
    response = requests.get(URL, headers={"Accept-Language":"en-US"})
    soup = BeautifulSoup(response.content, "html.parser")

if soup.find("p", class_="oj-normal") == None:
    content = ' '.join([item.text for item in soup.find_all("p", class_="normal")])
else:
    content = ' '.join([item.text for item in soup.find_all("p", class_="oj-normal")])

print(URL)

content#.split('Whereas:', 1)[1] # only return text without the head