In [1]:
import os
import pandas as pd
import pickle
import time
import re
import requests
import tqdm

In [2]:
# Read the previously pickled entries dictionary. A later cell in this
# notebook writes `tbr` back to this same path.
# NOTE(review): pickle.load can run arbitrary code; only open trusted files.
with open("./klimaatadaptatie/entries.pickle", "rb") as entries_file:
    entries = pickle.load(entries_file)

In [3]:
# Count the entries whose "text" value is present and truthy.
total = sum(1 for key in entries if entries[key].get("text"))
print(total)

20309


In [7]:
# Locations of the per-source metadata listings; each is read as a
# tab-separated file in the next cell.
bekendmaking = "./klimaatadaptatie/metadata/sources/bekendmaking.tsv"
notubiz = "./klimaatadaptatie/metadata/sources/notubiz.tsv"
overheid = "./klimaatadaptatie/metadata/sources/overheid.tsv"
bestuurlijke = "./klimaatadaptatie/metadata/sources/bestuurlijke.tsv"

In [8]:
# Load each source listing and tag its rows with a "from" label naming the
# source they came from.
# NOTE: the label "bekendmakig" (sic) is reproduced exactly; a later cell uses
# the same spelling as a dictionary key.
df_b = pd.read_csv(bekendmaking, sep="\t").assign(**{"from": "bekendmakig"})
df_n = pd.read_csv(notubiz, sep="\t").assign(**{"from": "notubiz"})
df_o = pd.read_csv(overheid, sep="\t").assign(**{"from": "overheid"})
df_b2 = pd.read_csv(bestuurlijke, sep="\t").assign(**{"from": "bestuurlijke"})

In [9]:
# Compare the column sets of the four sources side by side before combining them.
df_b.columns, df_n.columns, df_o.columns, df_b2.columns

(Index(['name', 'url', 'old_id', 'id', 'date', 'organization', 'from'], dtype='object'),
 Index(['name', 'url', 'date', 'format', 'type_doc', 'id', 'organization',
        'description', 'from', 'exists'],
       dtype='object'),
 Index(['page', 'name', 'url', 'old_id', 'id', 'date', 'organization', 'from'], dtype='object'),
 Index(['url', 'name', 'date', 'text', 'organization', 'id', 'description',
        'from'],
       dtype='object'))

In [10]:
# Stack the four source tables row-wise into one frame and show its size.
source_frames = [df_b, df_n, df_o, df_b2]
df = pd.concat(source_frames)
df.shape

(26419, 13)

In [11]:
# The concatenated frame still carries each source's own row labels, so give
# it a fresh 0..n-1 index. drop=True discards the old labels directly instead
# of materialising them as an "index" column and removing that column in place.
df = df.reset_index(drop=True)

In [12]:
# Lower-case the organization names.
df["organization"] = df["organization"].str.lower()

In [13]:
# Spot check: display the row(s) belonging to one specific document id.
df.loc[df["id"] == "zaanstad-9999555"]

Unnamed: 0,name,url,old_id,id,date,organization,from,format,type_doc,description,exists,page,text
24912,Accountantsverslag Zaanstad 2022.pdf,https://zaanstad.bestuurlijkeinformatie.nl/lib...,,zaanstad-9999555,2023-05-25,zaanstad,bestuurlijke,,,,,,Raadsvergadering - donderdag 25 mei 2023 00:00...


In [14]:
def get_invalid_date(x):
    """Swap the "-1" date marker for the placeholder date "01-01-1900".

    Every other value is handed back untouched.
    """
    return "01-01-1900" if x == "-1" else x

# Replace the "-1" markers in the date column with the placeholder date.
df["date"] = df["date"].apply(get_invalid_date)

In [15]:
def replace_month(text):
    """Reduce a raw date value to a single numeric-looking date token.

    A Dutch month name surrounded by spaces (e.g. " mei ") is replaced by
    "-<month number>-", so "25 mei 2023 00:00" becomes "25-5-2023". Only the
    part before the first remaining space is kept, which drops a trailing
    time component.

    Parameters
    ----------
    text : str or missing value
        The raw date. Anything that is not a string (NaN, None, ...) is
        treated as missing.

    Returns
    -------
    str
        The first space-separated token after month replacement, or the
        placeholder "01-01-1900" for missing input.

    Notes
    -----
    Matching is case-sensitive, and only the first token is returned, so any
    word placed before the day number would itself become the result.
    """
    # Previously only plain `float` (NaN) was recognised as missing; other
    # non-string values reached the `in` test below and raised TypeError.
    if not isinstance(text, str):
        return "01-01-1900"

    month_names = (
        "januari", "februari", "maart", "april", "mei", "juni",
        "juli", "augustus", "september", "oktober", "november", "december",
    )
    for number, name in enumerate(month_names, start=1):
        token = f" {name} "
        if token in text:
            return text.replace(token, f"-{number}-").strip().split(" ", 1)[0]
    # No month name found: keep just the leading token (drops e.g. a time).
    return text.split(" ", 1)[0]

def to_datetime(x):
    """Return whatever ``x.to_pydatetime()`` yields.

    NOTE(review): presumably ``x`` is a pandas Timestamp produced by
    ``pd.to_datetime`` — confirm against the caller.
    """
    as_python = x.to_pydatetime()
    return as_python

In [16]:
# Date clean-up in three steps: rewrite Dutch month names to numbers, parse
# the strings day-first (unparseable values are coerced rather than raising),
# then pass every parsed value through to_datetime.
date_tokens = df["date"].apply(replace_month)
date_parsed = pd.to_datetime(date_tokens, format="mixed", dayfirst=True, errors="coerce")
df["date"] = date_parsed.apply(to_datetime)

In [17]:
# Show the columns of the combined frame.
df.columns

Index(['name', 'url', 'old_id', 'id', 'date', 'organization', 'from', 'format',
       'type_doc', 'description', 'exists', 'page', 'text'],
      dtype='object')

In [20]:
# Save the combined metadata table as a tab-separated file.
# NOTE(review): `index=` is left at its default here — confirm whether the
# row index is wanted in the output file.
df.to_csv("./klimaatadaptatie/metadata/metadata.tsv", sep="\t")

In [21]:
# Convert the frame into a dict keyed by row index, each value being a
# {column: value} dict for that row.
df_dict = df.to_dict(orient="index")

In [22]:
# Re-key the row records by their "id" value. When the same id appears more
# than once, the record encountered last replaces the earlier one.
tbr = {record["id"]: record for record in df_dict.values()}

In [23]:
# The id now serves as the dictionary key, so remove the duplicate "id"
# field from every record.
for record in tbr.values():
    del record["id"]

In [24]:
# Attach the extracted text file to every record. Records whose text file is
# missing are left as they are and listed per source as (id, url) pairs.
# The "bekendmakig" key matches the "from" label assigned when loading.
still_to_check = {
    "notubiz": [],
    "bekendmakig": [],
    "overheid": [],
    "bestuurlijke": [],
}
for doc_id, record in tbr.items():
    try:
        with open(f"./klimaatadaptatie/txt/{doc_id}.txt", "r") as txt_file:
            raw_text = txt_file.read()
    except FileNotFoundError:
        still_to_check[record["from"]].append((doc_id, record["url"]))
        continue

    # Replace each newline that is not directly followed by another newline
    # with a space, then collapse every run of 2+ whitespace characters.
    flattened = re.sub(r'\n(?!\n)', ' ', raw_text).strip()
    flattened = re.sub(r'\s{2,}', ' ', flattened).strip()

    # Texts of 50 characters or fewer are marked with False instead of stored.
    record["text"] = flattened if len(flattened) > 50 else False

In [25]:
# Tally records holding a non-empty string as "text"; every other record
# (missing, falsy or non-string text) is queued for removal.
total = 0
to_delete = []
for doc_id, record in tbr.items():
    text = record.get("text")
    if text and type(text) == str:
        total += 1
    else:
        to_delete.append(doc_id)
print(total, len(to_delete))

20309 4769


In [26]:
# Remove every record that was queued for deletion.
for doc_id in to_delete:
    tbr.pop(doc_id)

In [27]:
# Number of records left after the removal.
len(tbr)

20309

In [28]:
# Persist the remaining records. This overwrites the same pickle file that
# the first cell of the notebook loads.
with open("./klimaatadaptatie/entries.pickle", "wb") as out_file:
    pickle.dump(tbr, out_file)