In [1]:
import os
import pandas as pd
import pickle
import time
import re
import requests
import tqdm

In [2]:
# Load the previously saved entries mapping from disk.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that this project wrote itself.
with open("./klimaatadaptatie/entries.pickle", "rb") as f:
    entries = pickle.load(f)

In [3]:
# Count the entries that carry a truthy "text" value.
total = sum(1 for key in entries if entries[key].get("text"))
print(total)

20309


In [4]:
# Paths of the per-source metadata exports (read below as tab-separated files).
bekendmaking = "./klimaatadaptatie/metadata/sources/bekendmaking.tsv"
notubiz = "./klimaatadaptatie/metadata/sources/notubiz.tsv"
overheid = "./klimaatadaptatie/metadata/sources/overheid.tsv"
bestuurlijke = "./klimaatadaptatie/metadata/sources/bestuurlijke.tsv"

In [5]:
def _read_source(path, label):
    """Read one tab-separated metadata export and tag every row with its origin."""
    frame = pd.read_csv(path, sep="\t")
    frame["from"] = label
    return frame


# NOTE: the label "bekendmakig" is misspelled, but the same spelling is used as a
# dictionary key when missing files are bucketed per source, so it must stay as is.
df_b = _read_source(bekendmaking, "bekendmakig")
df_n = _read_source(notubiz, "notubiz")
df_o = _read_source(overheid, "overheid")
df_b2 = _read_source(bestuurlijke, "bestuurlijke")

In [6]:
# Compare the column sets of the four source frames before they are concatenated.
df_b.columns, df_n.columns, df_o.columns, df_b2.columns

(Index(['name', 'url', 'old_id', 'id', 'date', 'organization', 'from'], dtype='object'),
 Index(['name', 'url', 'date', 'format', 'type_doc', 'id', 'organization',
        'description', 'from', 'exists'],
       dtype='object'),
 Index(['page', 'name', 'url', 'old_id', 'id', 'date', 'organization', 'from'], dtype='object'),
 Index(['url', 'name', 'date', 'text', 'organization', 'id', 'description',
        'from'],
       dtype='object'))

In [7]:
# Stack the four source frames into one; a column that a source lacks is filled
# with NaN for that source's rows. Row labels are carried over from each frame,
# so they repeat until the index is reset.
df = pd.concat([df_b, df_n, df_o, df_b2])
df.shape

(26419, 13)

In [8]:
# Replace the repeated per-source row labels with a fresh 0..n-1 index.
# drop=True discards the old labels directly instead of first materialising them
# as an "index" column and then removing that column in place.
df = df.reset_index(drop=True)

In [9]:
# Normalise organization names to lowercase.
df["organization"] = df["organization"].str.lower()

In [10]:
# Spot-check a single record by its id.
df[df["id"] == "zaanstad-9999555"]

Unnamed: 0,name,url,old_id,id,date,organization,from,format,type_doc,description,exists,page,text
24912,Accountantsverslag Zaanstad 2022.pdf,https://zaanstad.bestuurlijkeinformatie.nl/lib...,,zaanstad-9999555,2023-05-25,zaanstad,bestuurlijke,,,,,,Raadsvergadering - donderdag 25 mei 2023 00:00...


In [11]:
def get_invalid_date(x):
    """Map the ``"-1"`` placeholder to the sentinel date ``"01-01-1900"``.

    Any other value is returned unchanged.
    """
    return "01-01-1900" if x == "-1" else x

# Swap the "-1" placeholder in the date column for the sentinel date.
df["date"] = df["date"].apply(get_invalid_date)

In [12]:
def replace_month(text):
    """
    Reduce a raw date value to a single date token.

    A Dutch month name surrounded by spaces is replaced by ``-<month number>-``
    (``"25 mei 2023"`` becomes ``"25-5-2023"``). After that only the part before
    the first space is kept, which drops a trailing time such as ``" 00:00"``.

    Parameters
    ----------
    text : str, float or None
        Raw date value. ``None`` and any float (presumably NaN for a missing
        date) are treated as "no date".

    Returns
    -------
    str
        The date token, or the sentinel ``"01-01-1900"`` for a missing value.
    """
    # isinstance (rather than an exact type comparison) also covers float
    # subclasses such as numpy.float64, and None is handled explicitly so it
    # no longer fails on the substring test below.
    if text is None or isinstance(text, float):
        return "01-01-1900"

    months = (
        "januari", "februari", "maart", "april", "mei", "juni",
        "juli", "augustus", "september", "oktober", "november", "december",
    )
    for number, name in enumerate(months, start=1):
        padded = f" {name} "
        if padded in text:
            return text.replace(padded, f"-{number}-").strip().split(" ", 1)[0]

    # Strip first so that leading whitespace does not produce an empty token.
    return text.strip().split(" ", 1)[0]

def to_datetime(x):
    """Return the result of ``x.to_pydatetime()`` (the native ``datetime`` form of ``x``)."""
    native = x.to_pydatetime()
    return native

In [13]:
# 1. Reduce every raw value to a single date token (Dutch month names -> numbers).
date_tokens = df["date"].apply(replace_month)
# 2. Parse the tokens day-first; values that cannot be parsed become NaT.
parsed_dates = pd.to_datetime(date_tokens, format="mixed", dayfirst=True, errors="coerce")
# 3. Pass every parsed value through to_datetime and store the result.
df["date"] = parsed_dates.apply(to_datetime)

In [14]:
# Inspect the column names of the combined frame.
df.columns

Index(['name', 'url', 'old_id', 'id', 'date', 'organization', 'from', 'format',
       'type_doc', 'description', 'exists', 'page', 'text'],
      dtype='object')

In [16]:
# Write the same tab-separated export to both output locations.
for destination in (
    "./klimaatadaptatie/metadata/metadata.tsv",
    "../../klimaat_research/data/metadata.tsv",
):
    df.to_csv(destination, sep="\t")

In [17]:
# Convert the frame to {row_label: {column: value}} so the rows can be re-keyed.
df_dict = df.to_dict(orient="index")

In [18]:
# Re-key the rows by their document id.
# Each row dict is shallow-copied: a later cell removes the "id" key from the
# values of tbr, and without the copy that removal would also strip "id" from
# the dicts held by df_dict, making this cell fail with KeyError when re-run.
tbr = {row["id"]: dict(row) for row in df_dict.values()}

In [19]:
# The id is now the dictionary key, so drop the redundant copy stored in each entry.
# pop(..., None) keeps the cell safe to re-run once the key is already gone.
for entry in tbr.values():
    entry.pop("id", None)

In [20]:
def split_paragraphs(text, min_length=30):
    """
    Split ``text`` on newlines and keep only the sufficiently long pieces.

    A piece is kept when it is strictly longer than ``min_length`` characters;
    pieces of ``min_length`` characters or fewer are omitted.
    """
    return [piece for piece in text.split("\n") if len(piece) > min_length]

In [21]:
# Per source, the (id, url) pairs whose extracted text file is missing on disk.
still_to_check = {
    source: []
    for source in ("notubiz", "bekendmakig", "overheid", "bestuurlijke")
}
for i in tbr:
    entry = tbr[i]
    try:
        with open(f"./klimaatadaptatie/txt/{i}.txt", "r") as f:
            text = f.read()
    except FileNotFoundError:
        # No text file for this id: remember it under its source and move on.
        still_to_check[entry["from"]].append((i, entry["url"]))
        continue

    # Turn every newline that is not followed by another newline into a space,
    # then squeeze each run of two or more whitespace characters into one space.
    new_text = re.sub(r'\n(?!\n)', ' ', text).strip()
    new_text = re.sub(r'\s{2,}', ' ', new_text).strip()
    # Texts of 50 characters or fewer are replaced by the non-string marker 0.01.
    entry["text"] = new_text if len(new_text) > 50 else 0.01

In [22]:
# An entry is kept only when its "text" is a non-empty string; every other
# entry (missing, empty, or non-string text) is queued for removal.
to_delete = [
    key
    for key, entry in tbr.items()
    if not (type(entry.get("text")) is str and entry.get("text"))
]
total = len(tbr) - len(to_delete)
print(total, len(to_delete))

20309 4769


In [23]:
# Remove every entry that was queued for deletion.
for stale_id in to_delete:
    tbr.pop(stale_id)

In [24]:
# Number of entries left after the clean-up.
len(tbr)

20309

In [25]:
# Persist the filtered entries.
# NOTE(review): this is the same path the notebook loads "entries" from at the
# top, so that file is overwritten here.
with open("./klimaatadaptatie/entries.pickle", "wb") as f:
    pickle.dump(tbr, f)

In [31]:
# Write a second copy of the same mapping to ../../klimaat_research/data.
with open("../../klimaat_research/data/entries.pickle", "wb") as f:
    pickle.dump(tbr, f)

In [32]:
# Save the per-source (id, url) lists of entries whose text file was not found.
with open("./klimaatadaptatie/entries_to_check.pickle", "wb") as f:
    pickle.dump(still_to_check, f)

In [33]:
# Reload the saved lists (presumably so the download part can be run without
# redoing the steps above).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that this project wrote itself.
with open("./klimaatadaptatie/entries_to_check.pickle", "rb") as f:
    still_to_check = pickle.load(f)

In [34]:
# Ids (file names without their extension) of the PDFs already on disk.
# splitext removes only the final extension, so an id that itself contains a dot
# is kept intact; a set gives constant-time membership tests in the download loop.
already_existing = {
    os.path.splitext(file_name)[0]
    for file_name in os.listdir("./klimaatadaptatie/pdf")
}

In [35]:
# Show the (id, url) pairs still missing for the "overheid" source.
# NOTE(review): this displays the entire list; consider slicing it to keep the output short.
still_to_check["overheid"]

[('tweede_kamer-7566098',
  'https://open.overheid.nl/Details/oep-49380fa5d652b27ea130a02a71cee78947566098/2?hit=26&text=klimaatadaptatie&page=1&count=50'),
 ('tweede_kamer-aed2ac5',
  'https://open.overheid.nl/Details/oep-5eaaf276-f7f4-47d7-b864-b14e1aed2ac5/1?hit=27&text=klimaatadaptatie&page=1&count=50'),
 ('tweede_kamer-f01cc05',
  'https://open.overheid.nl/Details/oep-fef6df48b305df08343f1351f32538cc0f01cc05/2?hit=28&text=klimaatadaptatie&page=1&count=50'),
 ('infrastructuur_en_waterstaat-ad179f6',
  'https://open.overheid.nl/Details/ronl-e95aeb70-d42a-4558-b52f-76badad179f6/1?hit=35&text=klimaatadaptatie&page=1&count=50'),
 ('tweede_kamer-5b116fd',
  'https://open.overheid.nl/Details/oep-27d68244222162527c5f1b5afcae1be885b116fd/2?hit=40&text=klimaatadaptatie&page=1&count=50'),
 ('tweede_kamer-bbf4332',
  'https://open.overheid.nl/Details/oep-efe5e858a05685471b29753053da5d7d8bbf4332/1?hit=41&text=klimaatadaptatie&page=1&count=50'),
 ('tweede_kamer-a92f14a',
  'https://open.overhei

In [None]:
def get_files_notubiz(tuples, out_dir="./klimaatadaptatie/pdf", timeout=60, delay=5):
    """
    Download the PDF variant of a document URL and store it as ``<id>.pdf``.

    The URL is rewritten by replacing ``".html"`` with ``".pdf"`` before the
    request is made. Downloading is best-effort: when the request fails the
    document is skipped and nothing is written.

    Parameters
    ----------
    tuples : sequence
        ``(id, url)`` pair; only the first two items are used.
    out_dir : str
        Directory the PDF is written to. The default is the directory the
        notebook lists to decide which ids were already downloaded; the file
        was previously written to ``./pdf/``, which that listing never saw.
    timeout : float
        Seconds to wait for the server before the request is abandoned.
    delay : float
        Seconds to sleep after a successful download.

    Returns
    -------
    None
    """
    doc_id, url = tuples[0], tuples[1]
    pdf_url = url.replace(".html", ".pdf")
    try:
        # Without a timeout requests can wait on an unresponsive server forever.
        response = requests.get(pdf_url, timeout=timeout)
        # Treat 4xx/5xx answers as failures so error pages are not saved as PDFs.
        response.raise_for_status()
    except requests.exceptions.RequestException as exc:
        # Covers TooManyRedirects as well as connection errors, timeouts and HTTP errors.
        print(f"Skipping {doc_id}: {exc}")
        return

    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, f"{doc_id}.pdf"), "wb") as f:
        f.write(response.content)
    # Pause between downloads to avoid hammering the server.
    time.sleep(delay)
    return

In [None]:
# Fetch every "overheid" document whose id has no PDF on disk yet.
# NOTE(review): this reuses the notubiz downloader, which only rewrites ".html"
# to ".pdf" in the URL - confirm that is the right treatment for these URLs.
for doc_ref in tqdm.tqdm(still_to_check["overheid"]):
    if doc_ref[0] in already_existing:
        continue
    get_files_notubiz(doc_ref)

In [7]:
# Scratch request against a single detail page.
# A timeout is passed because requests waits indefinitely by default.
response = requests.get(
    "https://open.overheid.nl/Details/ronl-02a120f0-60c4-42a4-99bf-dd682a0f21b2/1?hit=9&text=klimaatadaptatie&page=1&count=50",
    timeout=60,
)