In [1]:
import pandas as pd
import os
import requests

from datetime import date
from dateutil.relativedelta import relativedelta

In [2]:
# Suppress DtypeWarning
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Folder layout of the project; everything lives below a single root folder.
root_dir = "analysis_data"
data_dir = os.path.join(root_dir, "data")          # raw data from the API
csv_dir = os.path.join(root_dir, "csv")            # formatted CSV data
location_dir = os.path.join(root_dir, "location")  # data split by location

# Create the folders up front; folders that already exist are left untouched.
for _directory in (root_dir, data_dir, csv_dir, location_dir):
    os.makedirs(_directory, exist_ok=True)

In [54]:
def get_data(start: str, end: str, timeout: float = 120) -> str:
    """Download one export from the dust-monitoring site as CSV text.

    Args:
        start: Value sent as the ``from`` query parameter. The caller in this
            notebook passes a date formatted as ``YYYY-MM-DD``.
        end: Value sent as the ``to`` query parameter, same format as ``start``.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a stalled connection would block the notebook
            indefinitely.

    Returns:
        The response body decoded as text (``type=csv-semicolon`` is requested).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error page is never returned (and later saved) as if it were data.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(
        f"https://ilm2.site.dustmonitoring.nl/download?from={start}&to={end}&interval=600&align=1&type=csv-semicolon&p=531&p=521&p=542&p=543&p=553&p=544&p=545&p=532&p=533&p=554&p=534&p=535&p=546&p=536&p=556&p=522&p=557&p=547&p=549&p=524&p=537&p=525&p=526&p=539&p=551&p=540&p=558&p=527&p=528&p=529&p=530&p=560&p=561&p=562&p=563&p=564&p=565&p=566&p=567&p=568&p=569&p=570&p=571&p=574&p=575&p=576&p=577&p=578&s=10&s=11&s=128&s=129&s=130&s=145&s=146",
        timeout=timeout,
    )
    # Fail loudly on HTTP errors instead of handing back the error body
    response.raise_for_status()
    return response.text

# Download the raw export in consecutive two-month windows, one file per window.
# NOTE(review): each window's `to` date is the next window's `from` date; if the
# API treats `to` as inclusive, boundary rows are downloaded twice — TODO confirm.
_date = date(2020, 11, 1)
_last_window_start = date(2024, 3, 1)

while _date <= _last_window_start:
    start_date = _date
    end_date = _date + relativedelta(months=2)
    data = get_data(start_date.strftime("%Y-%m-%d"), end_date.strftime("%Y-%m-%d"))

    # Files are named after the first day of their window
    file_path = os.path.join(data_dir, f"data_{start_date.strftime('%Y-%m-%d')}.csv")

    # Plain write mode is enough (nothing is read back through this handle).
    # UTF-8 is pinned because pandas.read_csv decodes as UTF-8 by default, while
    # open() would otherwise use the platform's locale encoding. newline=""
    # writes the downloaded line endings unchanged.
    with open(file_path, "w", encoding="utf-8", newline="") as file:
        file.write(data)

    _date = end_date

In [20]:
# Reformat every raw download: merge the header rows into single
# "<first header row>-<second header row>" column names, turn decimal commas
# into dots and save the result in csv_dir under the same file name.
for file in sorted(os.listdir(data_dir)):

    file_path = os.path.join(data_dir, file)

    # Only regular .csv files are raw downloads; skip folders and stray files
    if not os.path.isfile(file_path) or not file.endswith(".csv"):
        continue
    # Read every cell as text: the extra header rows make the columns mixed-type
    # anyway, and dtype=str keeps pandas from guessing types chunk by chunk
    # (which is what triggers DtypeWarning).
    df = pd.read_csv(file_path, index_col=False, sep=";", dtype=str)

    # The first header row became the column labels; pandas appends ".1", ".2", ...
    # to repeated labels, so keep only the part before the first dot
    row_1 = [label.split(".")[0] for label in df.columns.tolist()]
    # The second header row was read as the first data row
    row_2 = df.iloc[0].values

    # Merge these into the new column names; unnamed columns keep only their
    # position number (e.g. "Unnamed: 0" -> "0")
    new_columns = [
        f"{row1.replace('Unnamed: ', '')}-{row2}"
        for row1, row2 in zip(row_1, row_2)
    ]
    # Remove the two leading non-data rows (the one used above and the next one)
    df = df.iloc[2:]
    # Set new column names
    df.columns = new_columns

    csv_file_path = os.path.join(csv_dir, file)
    # Convert all columns to string (missing cells become the text "nan")
    df = df.astype(str)
    # Replace all commas with dots
    df = df.replace(",", ".", regex=True)

    df.to_csv(csv_file_path, index=False, index_label=False)

In [21]:
# Get all reformatted datasets. os.listdir returns names in arbitrary order, so
# sort them: the file names carry the window's start date as YYYY-MM-DD, which
# makes alphabetical order chronological and the joined frame reproducible.
# Anything that is not a .csv file is skipped.
dfs = [
    pd.read_csv(os.path.join(csv_dir, file), index_col=False)
    for file in sorted(os.listdir(csv_dir))
    if file.endswith(".csv")
]

# Join datasets together into one
df = pd.concat(dfs, ignore_index=True)
df.shape

(185121, 287)

In [22]:
# Sanity check: every timestamp is expected to occur only once in the joined data.
duplicate_count = df["0-Tijd"].duplicated().sum()
if duplicate_count > 0:
    print(f"Time should be unique! Found {duplicate_count} duplicates")

Time should be unique! Found 2900 duplicates


In [31]:
# Build the PM10 table: parse the time column, use it as the index and keep
# only the columns whose name contains "PM10".
tijd_as_datetime = pd.to_datetime(df["0-Tijd"])
df_by_time = df.assign(**{"0-Tijd": tijd_as_datetime}).set_index("0-Tijd")
is_pm10_column = df_by_time.columns.str.contains("PM10")
pm10 = df_by_time.loc[:, is_pm10_column]

In [34]:
# For every PM10 column, report the share of hourly bins without any reading.
# Missing values are dropped before resampling, so the hourly bins span only
# the period between that column's first and last available reading.
for column in pm10.columns:
    # Computed outside the f-string: reusing double quotes inside a
    # double-quoted f-string is a SyntaxError before Python 3.12.
    hourly_mean = pm10[column].dropna().resample("1h").mean()
    print(f"Missing fraction of {column}: {hourly_mean.isna().mean():.2%}")

Missing fraction of I02-PM10: 2.04%
Missing fraction of I09-PM10: 2.08%
Missing fraction of I11-PM10: 2.22%
Missing fraction of I12-PM10: 3.38%
Missing fraction of I24-PM10: 0.59%
Missing fraction of I25-PM10: 3.67%
Missing fraction of I29-PM10: 1.88%
Missing fraction of I32-PM10: 0.46%
Missing fraction of I36-PM10: 3.98%
Missing fraction of I37-PM10: 0.12%
Missing fraction of I39-PM10: 7.96%
Missing fraction of I40-PM10: 3.98%
Missing fraction of I04-PM10: 4.42%
Missing fraction of I07-PM10: 0.62%
Missing fraction of I08-PM10: 31.59%
Missing fraction of I14-PM10: 1.63%
Missing fraction of I17-PM10: 6.93%
Missing fraction of I19-PM10: 0.41%
Missing fraction of I22-PM10: 10.95%
Missing fraction of I23-PM10: 1.41%
Missing fraction of I28-PM10: 0.19%
Missing fraction of I30-PM10: 1.34%
Missing fraction of I03-PM10: 0.05%
Missing fraction of I05-PM10: 5.15%
Missing fraction of I10-PM10: 16.18%
Missing fraction of I16-PM10: 2.33%
Missing fraction of I33-PM10: 0.14%
Missing fraction of I41-P