Notebook used to transform the `Air Quality` dataset to a single JSON file
suitable for a web application.


The JSON File has the following structure:

```json
{
    "Year": {
        "Pollutant": [
            {
                "Paese": "Country Name",
                "Concentrazione": "Value"
            }
        ]
    },
    ...
}
```

In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
import json
import airbase
import pandas as pd

In [43]:
country_code_to_name = {
    'SI': 'Slovenia',
    'DE': 'Germany',
    'IS': 'Iceland',
    'XK': 'Kosovo',
    'CH': 'Switzerland',
    'AT': 'Austria',
    'DK': 'Denmark',
    'AL': 'Albania',
    'HR': 'Croatia',
    'LU': 'Luxembourg',
    'SK': 'Slovakia',
    'PL': 'Poland',
    'GB': 'United Kingdom',
    'IE': 'Ireland',
    'BA': 'Bosnia and Herzegovina',
    'BE': 'Belgium',
    'TR': 'Turkey',
    'LT': 'Lithuania',
    'IT': 'Italy',
    'ES': 'Spain',
    'MT': 'Malta',
    'RO': 'Romania',
    'BG': 'Bulgaria',
    'RS': 'Serbia',
    'NL': 'Netherlands',
    'CZ': 'Czech Republic',
    'PT': 'Portugal',
    'MK': 'North Macedonia',
    'SE': 'Sweden',
    'ME': 'Montenegro',
    'FR': 'France',
    'NO': 'Norway',
    'LV': 'Latvia',
    'AD': 'Andorra',
    'CY': 'Cyprus',
    'HU': 'Hungary',
    'GR': 'Greece',
    'EE': 'Estonia',
    'FI': 'Finland',
}

In [4]:
DATA_PATH = '../data/air_quality/'

In [None]:
from tqdm import tqdm

client = airbase.AirbaseClient()
countries = client.countries

for country in tqdm(countries):
    print(f"Converting data for {country}")
    df = pd.read_csv(f"../data/aggregated_air_quality_monthly/{country}.csv", sep=",", low_memory=False)
    df.to_json(f"../data/aggregated_air_quality_monthly_json/{country}.json", orient='records', lines=True)

  0%|          | 0/39 [00:00<?, ?it/s]

Converting data for SI
Converting data for DE


  5%|▌         | 2/39 [00:00<00:16,  2.26it/s]

Converting data for IS
Converting data for XK
Converting data for CH
Converting data for AT


 28%|██▊       | 11/39 [00:01<00:02, 11.08it/s]

Converting data for DK
Converting data for AL
Converting data for HR
Converting data for LU
Converting data for SK
Converting data for PL


 36%|███▌      | 14/39 [00:01<00:02,  9.19it/s]

Converting data for GB
Converting data for IE
Converting data for BA
Converting data for BE


 41%|████      | 16/39 [00:01<00:02, 10.10it/s]

Converting data for TR


 46%|████▌     | 18/39 [00:02<00:02,  9.45it/s]

Converting data for LT
Converting data for IT
Converting data for ES


 56%|█████▋    | 22/39 [00:04<00:04,  3.93it/s]

Converting data for MT
Converting data for RO
Converting data for BG


 64%|██████▍   | 25/39 [00:04<00:02,  5.53it/s]

Converting data for RS
Converting data for NL
Converting data for CZ


 69%|██████▉   | 27/39 [00:04<00:02,  5.93it/s]

Converting data for PT
Converting data for MK
Converting data for SE
Converting data for ME


 77%|███████▋  | 30/39 [00:04<00:01,  8.28it/s]

Converting data for FR


 95%|█████████▍| 37/39 [00:05<00:00,  9.69it/s]

Converting data for NO
Converting data for LV
Converting data for AD
Converting data for CY
Converting data for HU
Converting data for GR
Converting data for EE
Converting data for FI


100%|██████████| 39/39 [00:05<00:00,  7.01it/s]


In [None]:
df = pd.DataFrame()
for country in tqdm(countries):
    print(f"Converting data for {country}")
    df_country = pd.read_csv(f"../data/aggregated_air_quality_monthly/{country}.csv", sep=",", low_memory=False)    
    df_country['Country'] = country_code_to_name[country]
    df = pd.concat([df, df_country], ignore_index=True)
df.to_json(f"../data/aggregated_air_quality_monthly_json/all.json", orient='records', lines=True)

  0%|          | 0/39 [00:00<?, ?it/s]

Converting data for SI
Converting data for DE


 15%|█▌        | 6/39 [00:00<00:02, 12.79it/s]

Converting data for IS
Converting data for XK
Converting data for CH
Converting data for AT
Converting data for DK


 26%|██▌       | 10/39 [00:00<00:01, 18.80it/s]

Converting data for AL
Converting data for HR
Converting data for LU
Converting data for SK
Converting data for PL


 33%|███▎      | 13/39 [00:00<00:01, 15.50it/s]

Converting data for GB
Converting data for IE
Converting data for BA
Converting data for BE


 46%|████▌     | 18/39 [00:01<00:01, 14.96it/s]

Converting data for TR
Converting data for LT
Converting data for IT
Converting data for ES


 51%|█████▏    | 20/39 [00:02<00:02,  6.38it/s]

Converting data for MT
Converting data for RO


 62%|██████▏   | 24/39 [00:02<00:01,  7.56it/s]

Converting data for BG
Converting data for RS
Converting data for NL


 67%|██████▋   | 26/39 [00:02<00:01,  7.42it/s]

Converting data for CZ
Converting data for PT


 72%|███████▏  | 28/39 [00:03<00:01,  7.64it/s]

Converting data for MK
Converting data for SE


 74%|███████▍  | 29/39 [00:03<00:01,  7.67it/s]

Converting data for ME
Converting data for FR


 82%|████████▏ | 32/39 [00:03<00:01,  6.66it/s]

Converting data for NO
Converting data for LV


 87%|████████▋ | 34/39 [00:03<00:00,  7.74it/s]

Converting data for AD
Converting data for CY


 92%|█████████▏| 36/39 [00:04<00:00,  8.26it/s]

Converting data for HU
Converting data for GR


 97%|█████████▋| 38/39 [00:04<00:00,  8.89it/s]

Converting data for EE
Converting data for FI


100%|██████████| 39/39 [00:04<00:00,  8.83it/s]


Unnamed: 0,Samplingpoint,PollutantCode,PollutantName,Month,Latitude,Longitude,Value,Unit,Country
0,SPO-SI0001A_00001_100,1,SO2,2021-01-01,46.23451,15.26248,3.07953,ug.m-3,Slovenia
1,SPO-SI0001A_00001_100,1,SO2,2021-02-01,46.23451,15.26248,4.090818,ug.m-3,Slovenia
2,SPO-SI0001A_00001_100,1,SO2,2021-03-01,46.23451,15.26248,4.257419,ug.m-3,Slovenia
3,SPO-SI0001A_00001_100,1,SO2,2021-04-01,46.23451,15.26248,2.892125,ug.m-3,Slovenia
4,SPO-SI0001A_00001_100,1,SO2,2021-05-01,46.23451,15.26248,2.136048,ug.m-3,Slovenia


In [None]:
import pandas as pd
import json

df['Month'] = pd.to_datetime(df['Month'])
df['Year'] = df['Month'].dt.year

grouped = df.groupby(['Year', 'Samplingpoint', 'PollutantName', 'Country'], as_index=False).agg(
    avg_value=('Value', 'mean'),
    Unit=('Unit', 'first'),
    Latitude=('Latitude', 'first'),
    Longitude=('Longitude', 'first')
)

display(grouped.head())

grouped['avg_value'] = grouped['avg_value'].round(4)
result = {}

for _, row in grouped.iterrows():
    year = str(row['Year'])
    pollutant = row['PollutantName']
    samplingpoint = row['Samplingpoint']
    country = row['Country']
    value = row['avg_value']
    unit = row['Unit']
    longitude = row['Longitude']
    latitude = row['Latitude']

    if year not in result:
        result[year] = {}
    if pollutant not in result[year]:
        result[year][pollutant] = {}
    if country not in result[year][pollutant]:
        result[year][pollutant][country] = []

    result[year][pollutant][country].append({
        "Country": country,
        "Samplingpoint": samplingpoint, 
        "Concentration": value,
        "Unit": unit,
        "Longitude": longitude,
        "Latitude": latitude,
    })

with open("../data/aggregated_air_quality_yearly_json/all.json", "w") as f:
    json.dump(result, f, indent=2)

Unnamed: 0,Year,Samplingpoint,PollutantName,Country,avg_value,Unit,Latitude,Longitude
0,1990,SPO.04.S156.3945.1.1,SO2,Austria,5.385018,ug.m-3,48.25747,13.03923
1,1990,SPO.04.S156.3950.7.1,O3,Austria,0.0,ug.m-3,48.25747,13.03923
2,1990,SPO.04.S173.56394.8.1,NO2,Austria,42.835243,ug.m-3,48.27975,14.3665
3,1990,SPO.04.S173.56395.1.1,SO2,Austria,11.854724,ug.m-3,48.27975,14.3665
4,1990,SPO.06.119.4942.1.1,SO2,Austria,18.019247,ug.m-3,47.21037,14.82528


In [None]:
import json
import pandas as pd

with open("../data/aggregated_air_quality_yearly_json/all.json", "r") as f:
    data = json.load(f)

rows = []
for year, year_data in data.items():
    for pollutant, pollutant_data in year_data.items():
        for country, records in pollutant_data.items():
            for record in records:
                flat_record = {
                    "Year": int(year),
                    "Pollutant": pollutant,
                    "Country": country,
                    **record  # includes Samplingpoint, Concentration, Latitude, Longitude, etc.
                }
                rows.append(flat_record)

with open("../data/aggregated_air_quality_yearly_json/all_flat.json", "w") as f:
    json.dump(rows, f, indent=2)

df = pd.DataFrame(rows)
display(df.head())

Unnamed: 0,Year,Pollutant,Country,Samplingpoint,Concentration,Unit,Longitude,Latitude
0,1990,SO2,Austria,SPO.04.S156.3945.1.1,5.385,ug.m-3,13.03923,48.25747
1,1990,SO2,Austria,SPO.04.S173.56395.1.1,11.8547,ug.m-3,14.3665,48.27975
2,1990,SO2,Austria,SPO.06.119.4942.1.1,18.0192,ug.m-3,14.82528,47.21037
3,1990,SO2,Austria,SPO.06.156.1813.1.1,5.8418,ug.m-3,15.88222,47.34806
4,1990,SO2,Austria,SPO.06.170.5937.1.1,16.317,ug.m-3,15.43308,47.04172
