## ML Process - Air Quality
---
Data Pipeline

In [1]:
import os
import yaml
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split

## 1 - Configuration file
---

- Create two functions: load_config() and update_config().

In [2]:
def load_config(config_path):
    """
    Load the configuration file (config.yaml).

    Parameters:
    ----------
    config_path : str
        Configuration file location.

    Returns:
    -------
    params : dict
        The configuration parameters, as parsed by yaml.safe_load.

    Raises:
    ------
    RuntimeError
        If no file exists at config_path. The underlying
        FileNotFoundError is attached as the exception's cause.
    """

    # Try to load config.yaml file.
    try:
        with open(config_path, 'r') as file:
            params = yaml.safe_load(file)
    except FileNotFoundError as err:
        # Chain the original error explicitly so the traceback reports it as
        # the direct cause of the RuntimeError.
        raise RuntimeError(f"Configuration file not found in {config_path}") from err

    return params

In [3]:
def update_config(key, value, params, config_path):
    """
    Set one configuration parameter, write the result to disk and return
    the configuration as reloaded from the file.

    Parameters:
    ----------
    key : str
        Name of the parameter to set.

    value : any type supported in Python
        New value stored under key.

    params : dict
        Currently loaded configuration parameters (left unmodified).

    config_path : str
        Configuration file location; the file is overwritten.

    Returns:
    -------
    config : dict
        Configuration parameters read back from config_path.
    """

    # Work on a shallow copy so the caller's dict is not changed.
    updated_params = params.copy()
    updated_params[key] = value

    # Overwrite the configuration file with the updated parameters.
    with open(config_path, 'w') as stream:
        yaml.dump(updated_params, stream)

    print(f"Params Updated! \nKey: {key} \nValue: {value}\n")

    # Read the file back so the returned dict reflects what is on disk.
    return load_config(config_path)

In [4]:
# Load the configuration file. PATH_CONFIG is kept as a constant because the
# same path is passed to update_config() when the configuration is rewritten.
PATH_CONFIG = "../config/config.yaml"
config = load_config(PATH_CONFIG)

In [5]:
# Display the loaded configuration parameters to check their contents.
config

{'datetime_columns': ['tanggal'],
 'features': ['stasiun', 'pm10', 'pm25', 'so2', 'co', 'o3', 'no2'],
 'int32_columns': ['pm10', 'pm25', 'so2', 'co', 'o3', 'no2', 'max'],
 'label': 'categori',
 'label_categories': ['BAIK', 'SEDANG', 'TIDAK SEHAT'],
 'label_categories_new': ['BAIK', 'TIDAK BAIK'],
 'object_columns': ['stasiun', 'critical', 'categori'],
 'path_joined_data': '../data/interim/joined_dataset.pkl',
 'path_raw_data': '../data/raw/',
 'range_co': [0, 100],
 'range_no2': [0, 100],
 'range_o3': [0, 140],
 'range_pm10': [0, 800],
 'range_pm25': [0, 400],
 'range_so2': [0, 500],
 'range_stasiun': ['DKI1 (Bunderan HI)',
  'DKI2 (Kelapa Gading)',
  'DKI3 (Jagakarsa)',
  'DKI4 (Lubang Buaya)',
  'DKI5 (Kebon Jeruk) Jakarta Barat']}

## 2 - Data Collection
---
- Create load_data() function.
- It receives one argument: data_path
- This function loads all raw CSV files and returns them joined into a single dataframe.

In [6]:
def load_data(data_path):
    """
    Load csv files and join into one dataframe.

    Every entry listed in data_path is read with pd.read_csv. The frames
    are stacked so that the most recently listed file comes first, and each
    file keeps its own row index (the index is not reset here).

    Parameters:
    ----------
    data_path : str
        Raw dataset location (directory containing the csv files).

    Returns:
    -------
    raw_dataset : pd.DataFrame
        Loaded and joined data. An empty dataframe when the directory
        contains no files.
    """

    # Read every file first and concatenate once afterwards: concatenating
    # inside the loop re-copies all previously loaded rows on each iteration.
    # os.path.join also works when data_path has no trailing separator.
    frames = [
        pd.read_csv(os.path.join(data_path, file_name))
        for file_name in tqdm(os.listdir(data_path))
    ]

    # pd.concat raises on an empty list; return an empty dataframe instead.
    if not frames:
        return pd.DataFrame()

    # Reverse the list so the last file listed ends up on top of the result.
    raw_dataset = pd.concat(frames[::-1])

    return raw_dataset

In [7]:
# Load the raw dataset: read and join every file found in the directory
# given by the "path_raw_data" configuration entry.
PATH_RAW_DATA = config["path_raw_data"]
raw_dataset = load_data(PATH_RAW_DATA)

100%|██████████| 12/12 [00:00<00:00, 158.28it/s]


In [8]:
# Preview the joined raw dataset.
raw_dataset

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori
0,2021-08-01,DKI1 (Bunderan HI),51,68,25,8,29,22,68,PM25,SEDANG
1,2021-08-02,DKI1 (Bunderan HI),47,63,24,10,25,28,63,PM25,SEDANG
2,2021-08-03,DKI1 (Bunderan HI),50,68,26,11,19,35,68,PM25,SEDANG
3,2021-08-04,DKI1 (Bunderan HI),52,70,29,8,24,26,70,PM25,SEDANG
4,2021-08-05,DKI1 (Bunderan HI),52,66,29,9,21,27,66,PM25,SEDANG
...,...,...,...,...,...,...,...,...,...,...,...
135,2021-02-24,DKI5 (Kebon Jeruk) Jakarta Barat,24,40,28,4,11,7,40,PM25,BAIK
136,2021-02-25,DKI5 (Kebon Jeruk) Jakarta Barat,28,52,31,7,13,23,52,PM25,SEDANG
137,2021-02-26,DKI5 (Kebon Jeruk) Jakarta Barat,24,49,21,7,22,18,49,PM25,BAIK
138,2021-02-27,DKI5 (Kebon Jeruk) Jakarta Barat,39,64,27,10,25,24,64,PM25,SEDANG


- We found that:
1. The index only ranges from 0 to 149, while there are 1830 rows.
2. The dates only range from month 2 to month 11, while there are 12 months.

In [9]:
# The joined dataframe still carries each file's own row index, so index
# values repeat. Replace them with one continuous index and drop the old one.
raw_dataset = raw_dataset.reset_index(drop=True)
raw_dataset

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori
0,2021-08-01,DKI1 (Bunderan HI),51,68,25,8,29,22,68,PM25,SEDANG
1,2021-08-02,DKI1 (Bunderan HI),47,63,24,10,25,28,63,PM25,SEDANG
2,2021-08-03,DKI1 (Bunderan HI),50,68,26,11,19,35,68,PM25,SEDANG
3,2021-08-04,DKI1 (Bunderan HI),52,70,29,8,24,26,70,PM25,SEDANG
4,2021-08-05,DKI1 (Bunderan HI),52,66,29,9,21,27,66,PM25,SEDANG
...,...,...,...,...,...,...,...,...,...,...,...
1825,2021-02-24,DKI5 (Kebon Jeruk) Jakarta Barat,24,40,28,4,11,7,40,PM25,BAIK
1826,2021-02-25,DKI5 (Kebon Jeruk) Jakarta Barat,28,52,31,7,13,23,52,PM25,SEDANG
1827,2021-02-26,DKI5 (Kebon Jeruk) Jakarta Barat,24,49,21,7,22,18,49,PM25,BAIK
1828,2021-02-27,DKI5 (Kebon Jeruk) Jakarta Barat,39,64,27,10,25,24,64,PM25,SEDANG


In [10]:
# Serialize the joined dataset to the interim data folder.
# The path is a plain string: it has no placeholders, so no f-string is needed.
PATH_JOINED_DATA = "../data/interim/joined_dataset.pkl"
joblib.dump(raw_dataset, PATH_JOINED_DATA)

['../data/interim/joined_dataset.pkl']

In [11]:
# Store the joined dataset location in the configuration file and keep
# working with the reloaded configuration.
config = update_config(
    key="path_joined_data",
    value=PATH_JOINED_DATA,
    params=config,
    config_path=PATH_CONFIG,
)

Params Updated! 
Key: path_joined_data 
Value: ../data/interim/joined_dataset.pkl



## 3 - Data Validation
---

In [12]:
# Check the data type of each column in the joined dataset so it can be
# compared with the column types listed in the configuration.
raw_dataset.dtypes

tanggal        str
stasiun        str
pm10        object
pm25        object
so2         object
co          object
o3          object
no2         object
max         object
critical       str
categori       str
dtype: object

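- Several columns don't match the data types specified in the configuration: `tanggal` is listed in `datetime_columns` but was loaded as `str`, and the `int32_columns` (`pm10`, `pm25`, `so2`, `co`, `o3`, `no2`, `max`) were loaded as `object`.
- We need to fix the data types of those columns.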