# **ML PROCESS - Air Quality**
---
**3 - Data Preprocessing**

In [1]:
# Import the required libraries
import yaml
import joblib
import numpy as np
import pandas as pd

from sklearn.preprocessing import (
    OneHotEncoder,
    LabelEncoder,
    StandardScaler
)

from imblearn.under_sampling import RandomUnderSampler as RUS
from imblearn.over_sampling import (
    RandomOverSampler as ROS,
    SMOTE
)

## **1 - Configuration File**
---

In [2]:
# Function to load configuration parameter.
def load_config(path_config):
    """
    Load the configuration file.

    Parameters:
    -----------
    path_config : str
        Configuration file location.

    Returns:
    --------
    params : dict
        Content parsed from the YAML configuration file.

    Raises:
    -------
    RuntimeError
        If no file exists at path_config. The original FileNotFoundError
        is attached as the cause of the raised error.
    """

    # Try to load config.yaml file.
    try:
        with open(path_config, 'r') as file:
            params = yaml.safe_load(file)
    except FileNotFoundError as err:
        # Chain explicitly so the traceback keeps the underlying OS error.
        raise RuntimeError(f"Configuration file not found in {path_config}") from err

    return params

In [3]:
# Function to update configuration parameter.
def update_config(key, value, params, path_config):
    """
    Set one configuration entry, write the result to disk, and reload it.

    Parameters:
    ----------
    key : str
        The key to be updated (added if it does not exist yet).

    value : any type supported in Python
        The new value stored under key.

    params : dict
        Loaded configuration parameters. The passed object itself is not
        modified; a shallow copy is updated instead.

    path_config : str
        Configuration file location. The file is overwritten.

    Returns:
    -------
    config : dict
        Configuration re-read from path_config after the write.
    """

    # Work on a shallow copy so the caller's dict keeps its old entry.
    updated_params = params.copy()
    updated_params[key] = value

    # Overwrite the configuration file with the updated parameters.
    with open(path_config, 'w') as stream:
        yaml.dump(updated_params, stream)

    print(f"Params Updated! \nKey: {key} - \nValue: {value}\n")

    # Return what is actually on disk rather than the in-memory copy.
    return load_config(path_config)

In [4]:
# Load the config.yaml
PATH_CONFIG = "../config/config.yaml"
config = load_config(PATH_CONFIG)

In [5]:
# Check the configuration parameters.
config

{'columns_datetime': ['tanggal'],
 'columns_int': ['pm10', 'pm25', 'so2', 'co', 'o3', 'no2', 'max'],
 'columns_object': ['stasiun', 'critical', 'category'],
 'features': ['stasiun', 'pm10', 'pm25', 'so2', 'co', 'o3', 'no2'],
 'impute_co': 11,
 'impute_o3': 29,
 'impute_pm10': {'BAIK': 28, 'TIDAK BAIK': 55},
 'impute_pm25': {'BAIK': 39, 'TIDAK BAIK': 82},
 'impute_so2': 35,
 'imputer_no2': 18,
 'label': 'category',
 'label_categories': ['BAIK', 'SEDANG', 'TIDAK SEHAT'],
 'label_categories_new': ['BAIK', 'TIDAK BAIK'],
 'path_clean_test': ['../data/processed/X_test_clean.pkl',
  '../data/processed/y_test_clean.pkl'],
 'path_clean_train': ['../data/processed/X_train_clean.pkl',
  '../data/processed/y_train_clean.pkl'],
 'path_clean_valid': ['../data/processed/X_valid_clean.pkl',
  '../data/processed/y_valid_clean.pkl'],
 'path_data_raw': '../data/raw/',
 'path_data_test': ['../data/interim/X_test.pkl',
  '../data/interim/y_test.pkl'],
 'path_data_train': ['../data/interim/X_train.pkl',
  

## **2 - Load Data**
---

In [6]:
# Function to load the data.
def load_data(config):
    """
    Load the train, valid, and test sets and join features with label.

    Parameters:
    ----------
    config : dict
        The loaded configuration file. The keys path_data_train,
        path_data_valid, and path_data_test are read; each holds the
        features path at position 0 and the label path at position 1.

    Returns:
    -------
    data_train, data_valid, data_test : pd.DataFrame
        One frame per set, with X and y concatenated column-wise.
    """

    # Deserialize the (X, y) pair of every set, X first and then y.
    X_train, y_train = (joblib.load(config["path_data_train"][i]) for i in (0, 1))
    X_valid, y_valid = (joblib.load(config["path_data_valid"][i]) for i in (0, 1))
    X_test, y_test = (joblib.load(config["path_data_test"][i]) for i in (0, 1))

    # Put the label next to its features for each set.
    data_train, data_valid, data_test = (
        pd.concat(pair, axis=1)
        for pair in ([X_train, y_train], [X_valid, y_valid], [X_test, y_test])
    )

    # Report how the rows are distributed over the three sets.
    num_all_data = sum(
        int(frame.shape[0]) for frame in (data_train, data_valid, data_test)
    )

    print(f"Data train proportion : {len(X_train) / num_all_data}")
    print(f"Data valid proportion : {len(X_valid) / num_all_data}")
    print(f"Data test proportion : {len(X_test) / num_all_data}")

    return data_train, data_valid, data_test

In [7]:
# Load the data.
data_train, data_valid, data_test = load_data(config)

Data train proportion : 0.7997793712079426
Data valid proportion : 0.09983452840595698
Data test proportion : 0.10038610038610038


In [8]:
# Sanity check the train data.
data_train.head()

Unnamed: 0,stasiun,pm10,pm25,so2,co,o3,no2,category
224,DKI3 (Jagakarsa),50,76,20,8,71,15,SEDANG
1712,DKI2 (Kelapa Gading),33,58,42,11,43,19,SEDANG
1028,DKI4 (Lubang Buaya),51,78,37,12,23,19,SEDANG
391,DKI3 (Jagakarsa),46,-1,17,17,41,10,BAIK
1547,DKI1 (Bunderan HI),68,102,31,13,20,30,TIDAK SEHAT


## **3 - Join Categories**
---
`SEDANG` + `TIDAK SEHAT` => `TIDAK BAIK`

In [9]:
# Function to join categories.
def join_categories(data, config):
    """
    Join categories SEDANG & TIDAK SEHAT -> TIDAK BAIK.

    Parameters:
    ----------
    data : pd.DataFrame
        The loaded data.

    config : dict
        The loaded configuration file. config["label"] names the label
        column whose values are rewritten.

    Returns:
    -------
    data : pd.DataFrame
        A copy of the data with the categories joined. The input frame
        is left unchanged.

    Raises:
    ------
    RuntimeError
        If the configured label column is not present in the data.
    """

    # Use the configured label for both the check and the rewrite, so the
    # column that is validated is the column that is modified.
    label = config["label"]

    # Check if label found in data.
    if label not in data.columns:
        raise RuntimeError("Label is not detected in the dataset.")

    # Ensure raw data immutable.
    data = data.copy()

    # Map both source categories straight to the merged category.
    data[label] = data[label].replace(
        {"SEDANG": "TIDAK BAIK", "TIDAK SEHAT": "TIDAK BAIK"}
    )

    return data

In [10]:
# Update the configuration parameter.
config = update_config(
    key = "label_categories_new",
    value = ["BAIK", "TIDAK BAIK"],
    params = config,
    path_config = PATH_CONFIG
)

Params Updated! 
Key: label_categories_new - 
Value: ['BAIK', 'TIDAK BAIK']



Join categories in train data.

In [11]:
data_train["category"].value_counts()

category
SEDANG         1044
TIDAK SEHAT     255
BAIK            151
Name: count, dtype: int64

In [12]:
data_train = join_categories(data_train, config)

In [13]:
data_train["category"].value_counts()

category
TIDAK BAIK    1299
BAIK           151
Name: count, dtype: int64

Join categories in valid data.

In [14]:
data_valid["category"].value_counts()

category
SEDANG         130
TIDAK SEHAT     32
BAIK            19
Name: count, dtype: int64

In [15]:
data_valid = join_categories(data_valid, config)

In [16]:
data_valid["category"].value_counts()

category
TIDAK BAIK    162
BAIK           19
Name: count, dtype: int64

Join categories in test data.

In [17]:
data_test["category"].value_counts()

category
SEDANG         131
TIDAK SEHAT     32
BAIK            19
Name: count, dtype: int64

In [18]:
data_test = join_categories(data_test, config)

In [19]:
data_test["category"].value_counts()

category
TIDAK BAIK    163
BAIK           19
Name: count, dtype: int64

## **4 - Handling Missing Value**
---
- Create the `nan_replace()` function.

In [20]:
# Function to replace -1 with NaN.
def nan_replace(data):
    """
    Turn every -1 in the data into NaN (Not a Number).

    Parameters:
    ----------
    data : pd.DataFrame
        The loaded data, in which -1 marks a missing value.

    Returns:
    -------
    data : pd.DataFrame
        A new frame with all -1 entries replaced; the input is untouched.
    """

    # Copy first, then swap the -1 marker for NaN in a single expression.
    return data.copy().replace(-1, np.nan)

In [21]:
# Replace the -1 on data_train.
data_train = nan_replace(data_train)
data_train.isnull().sum()

stasiun      0
pm10        45
pm25        67
so2         73
co          12
o3          38
no2         17
category     0
dtype: int64

In [22]:
# Replace the -1 on data_valid.
data_valid = nan_replace(data_valid)
data_valid.isnull().sum()

stasiun      0
pm10         2
pm25        12
so2          6
co           2
o3           7
no2          2
category     0
dtype: int64

In [23]:
# Replace the -1 on data_test.
data_test = nan_replace(data_test)
data_test.isnull().sum()

stasiun      0
pm10         6
pm25         7
so2         18
co           2
o3           3
no2          0
category     0
dtype: int64

### 4.1. `pm10` Imputation

- Create the `calculate_class_mean()` and `impute_class_mean()` function.

In [24]:
# Function to calculate class mean for pm10 and pm25.
def calculate_class_mean(data, column):
    """
    Compute the per-class mean of a column (used for pm10 and pm25).

    Parameters:
    ----------
    data : pd.DataFrame
        The loaded data; must contain the "category" column with the
        classes BAIK and TIDAK BAIK.

    column : str
        The column name.

    Returns:
    -------
    impute_baik, impute_tidak_baik : int
        The mean of the column for each class, truncated to an integer.
    """

    # Select the rows of each class through a boolean mask on the label.
    categories = data["category"]
    values_baik = data.loc[categories == "BAIK", column]
    values_tidak_baik = data.loc[categories == "TIDAK BAIK", column]

    # int() truncates the mean toward zero.
    impute_baik = int(values_baik.mean())
    impute_tidak_baik = int(values_tidak_baik.mean())

    print(f"MEAN {column} class BAIK: {impute_baik}")
    print(f"MEAN {column} class TIDAK BAIK: {impute_tidak_baik}")

    return impute_baik, impute_tidak_baik

In [25]:
# Function to impute missing values in column pm10 and pm25 using class mean.
def impute_class_mean(data, column, impute_baik, impute_tidak_baik):
    """
    Impute the missing value for column pm10 and pm25.

    Parameters:
    ----------
    data : pd.DataFrame
        The loaded data; must contain the "category" column.

    column : str
        The column name.

    impute_baik : float
        The value written into missing entries of class BAIK.

    impute_tidak_baik : float
        The value written into missing entries of class TIDAK BAIK.

    Returns:
    -------
    data : pd.DataFrame
        A copy of the data with the missing entries of the column filled.
        Only rows that are missing are modified.
    """

    # Ensure raw data immutable.
    data = data.copy()

    # Boolean condition for each class.
    data_baik = data["category"] == "BAIK"
    data_tidak_baik = data["category"] == "TIDAK BAIK"

    # Boolean condition for missing values.
    missing_values = data[column].isnull()

    # Row masks: missing AND belonging to the given class.
    mask_baik = data_baik & missing_values
    mask_tidak_baik = data_tidak_baik & missing_values

    # Slice the missing values for each class.
    missing_baik = data[mask_baik]
    missing_tidak_baik = data[mask_tidak_baik]

    print(f"Num of missing values in {column} class BAIK before imputation : {len(missing_baik)}")
    print(f"Num of missing values in {column} class TIDAK BAIK before imputation : {len(missing_tidak_baik)}\n")

    # Impute through the boolean masks rather than through index labels:
    # a label-based assignment would also overwrite non-missing rows that
    # share an index label with a missing row.
    data.loc[mask_baik, column] = impute_baik
    data.loc[mask_tidak_baik, column] = impute_tidak_baik


    print(f"Num of missing values in {column} class BAIK after imputation : {data[data_baik][column].isnull().sum()}")
    print(f"Num of missing values in {column} class TIDAK BAIK after imputation : {data[data_tidak_baik][column].isnull().sum()}\n")

    return data

Impute the `pm10` column in train, valid, and test set.

In [26]:
# Calculate the class mean.
column = "pm10"

impute_baik, impute_tidak_baik = calculate_class_mean(
    data = data_train,
    column = column
)

# Update the configuration parameter.
config = update_config(
    key = f"impute_{column}",
    value = {"BAIK": impute_baik,
            "TIDAK BAIK": impute_tidak_baik},
    params = config,
    path_config = PATH_CONFIG
)

MEAN pm10 class BAIK: 28
MEAN pm10 class TIDAK BAIK: 55
Params Updated! 
Key: impute_pm10 - 
Value: {'BAIK': 28, 'TIDAK BAIK': 55}



In [27]:
# Impute the missing value on data_train.
data_train = impute_class_mean(
    data = data_train,
    column = column,
    impute_baik = impute_baik,
    impute_tidak_baik = impute_tidak_baik
)

Num of missing values in pm10 class BAIK before imputation : 9
Num of missing values in pm10 class TIDAK BAIK before imputation : 36

Num of missing values in pm10 class BAIK after imputation : 0
Num of missing values in pm10 class TIDAK BAIK after imputation : 0



In [28]:
# Impute the missing value on data_valid.
data_valid = impute_class_mean(
    data = data_valid,
    column = column,
    impute_baik = impute_baik,
    impute_tidak_baik = impute_tidak_baik
)

Num of missing values in pm10 class BAIK before imputation : 1
Num of missing values in pm10 class TIDAK BAIK before imputation : 1

Num of missing values in pm10 class BAIK after imputation : 0
Num of missing values in pm10 class TIDAK BAIK after imputation : 0



In [29]:
# Impute the missing value on data_test.
data_test = impute_class_mean(
    data = data_test,
    column = column,
    impute_baik = impute_baik,
    impute_tidak_baik = impute_tidak_baik
)

Num of missing values in pm10 class BAIK before imputation : 0
Num of missing values in pm10 class TIDAK BAIK before imputation : 6

Num of missing values in pm10 class BAIK after imputation : 0
Num of missing values in pm10 class TIDAK BAIK after imputation : 0



### 4.2. `pm25` Imputation

Impute the `pm25` column in train, valid, and test set.

In [30]:
# Calculate the class mean.
column = "pm25"

impute_baik, impute_tidak_baik = calculate_class_mean(
    data = data_train,
    column = column
)

# Update the configuration parameter.
config = update_config(
    key = f"impute_{column}",
    value = {"BAIK": impute_baik,
            "TIDAK BAIK": impute_tidak_baik},
    params = config,
    path_config = PATH_CONFIG
)

MEAN pm25 class BAIK: 39
MEAN pm25 class TIDAK BAIK: 82
Params Updated! 
Key: impute_pm25 - 
Value: {'BAIK': 39, 'TIDAK BAIK': 82}



In [31]:
# Impute the missing value on data_train.
data_train = impute_class_mean(
    data = data_train,
    column = column,
    impute_baik = impute_baik,
    impute_tidak_baik = impute_tidak_baik
)

Num of missing values in pm25 class BAIK before imputation : 41
Num of missing values in pm25 class TIDAK BAIK before imputation : 26

Num of missing values in pm25 class BAIK after imputation : 0
Num of missing values in pm25 class TIDAK BAIK after imputation : 0



In [32]:
# Impute the missing value on data_valid.
data_valid = impute_class_mean(
    data = data_valid,
    column = column,
    impute_baik = impute_baik,
    impute_tidak_baik = impute_tidak_baik
)

Num of missing values in pm25 class BAIK before imputation : 6
Num of missing values in pm25 class TIDAK BAIK before imputation : 6

Num of missing values in pm25 class BAIK after imputation : 0
Num of missing values in pm25 class TIDAK BAIK after imputation : 0



In [33]:
# Impute the missing value on data_test.
data_test = impute_class_mean(
    data = data_test,
    column = column,
    impute_baik = impute_baik,
    impute_tidak_baik = impute_tidak_baik
)

Num of missing values in pm25 class BAIK before imputation : 3
Num of missing values in pm25 class TIDAK BAIK before imputation : 4

Num of missing values in pm25 class BAIK after imputation : 0
Num of missing values in pm25 class TIDAK BAIK after imputation : 0



### 4.3. `so2`, `co`, `o3`, and `no2` Imputation

- Create the `calculate_impute_values()` and `impute_missing_values()` function

In [34]:
# Function to calculate impute values for the other columns.
def calculate_impute_values(data):
    """
    Derive the impute values for column so2, co, o3, and no2.
        - so2 uses the column mean.
        - co, o3, and no2 use the column median.

    Parameters:
    ----------
    data : pd.DataFrame
        The loaded data containing the four columns.

    Returns:
    -------
    impute_values : dict
        Column name -> impute value, each truncated to an integer.
    """

    # so2 is the only column summarised by its mean.
    impute_values = {"so2": int(data["so2"].mean())}

    # The remaining columns are summarised by their median.
    impute_values.update(
        (name, int(data[name].median())) for name in ("co", "o3", "no2")
    )

    return impute_values

In [35]:
# Function to impute missing values for the other columns.
def impute_missing_values(data, impute_values):
    """
    Fill the missing values of column so2, co, o3, and no2.

    Parameters:
    ----------
    data : pd.DataFrame
        The loaded data.

    impute_values : dict
        Column name -> value used to fill that column's missing entries.

    Returns:
    --------
    data : pd.DataFrame
        A new frame with the missing values filled; the input is untouched.
    """

    # Report the per-column missing counts before filling.
    print(f"Num of missing values before imputation :\n{data.isnull().sum()}\n")

    # fillna returns a new frame, so the caller's frame is not modified.
    data = data.fillna(value = impute_values)

    # Report again so the effect of the imputation is visible.
    print(f"Num of missing values after imputation :\n{data.isnull().sum()}\n")

    return data

Impute the other columns in train, valid, and test set.

In [36]:
# Calculate the impute values (computed from the train set only).
impute_values = calculate_impute_values(data_train)

# Update the configuration parameter.
# NOTE(review): the no2 key is spelled 'imputer_no2' while the other keys use
# the 'impute_' prefix. It matches the key already shown in the loaded config,
# so a rename would have to be coordinated with whatever reads it — confirm.
cols = ['so2', 'co', 'o3', 'no2']
param_keys = ['impute_so2', 'impute_co', 'impute_o3', 'imputer_no2']

# Persist each value under its config key; every call rewrites the config
# file and returns the reloaded configuration.
for col, param_key in zip(cols, param_keys):
    config = update_config(
        key = param_key,
        value = impute_values[col],
        params = config,
        path_config = PATH_CONFIG
    )

Params Updated! 
Key: impute_so2 - 
Value: 35

Params Updated! 
Key: impute_co - 
Value: 11

Params Updated! 
Key: impute_o3 - 
Value: 29

Params Updated! 
Key: imputer_no2 - 
Value: 18



In [37]:
# Impute the missing value on data_train.
data_train = impute_missing_values(
    data = data_train,
    impute_values = impute_values
)

Num of missing values before imputation :
stasiun      0
pm10         0
pm25         0
so2         73
co          12
o3          38
no2         17
category     0
dtype: int64

Num of missing values after imputation :
stasiun     0
pm10        0
pm25        0
so2         0
co          0
o3          0
no2         0
category    0
dtype: int64



In [38]:
# Impute the missing value on data_valid.
data_valid = impute_missing_values(
    data = data_valid,
    impute_values = impute_values
)

Num of missing values before imputation :
stasiun     0
pm10        0
pm25        0
so2         6
co          2
o3          7
no2         2
category    0
dtype: int64

Num of missing values after imputation :
stasiun     0
pm10        0
pm25        0
so2         0
co          0
o3          0
no2         0
category    0
dtype: int64



In [39]:
# Impute the missing value on data_test.
data_test = impute_missing_values(
    data = data_test,
    impute_values = impute_values
)

Num of missing values before imputation :
stasiun      0
pm10         0
pm25         0
so2         18
co           2
o3           3
no2          0
category     0
dtype: int64

Num of missing values after imputation :
stasiun     0
pm10        0
pm25        0
so2         0
co          0
o3          0
no2         0
category    0
dtype: int64



## **5 - Encoding `stasiun`**
---

- Create the `fit_ohe_encoder()` and `transform_ohe_encoder()` function.

In [40]:
# Function to fit the encoder.
def fit_ohe_encoder(data, path_ohe):
    """
    Fit a one-hot encoder on the given categories and save it to disk.

    Parameters:
    ----------
    data : pd.Series or list
        Categorical input data (one-dimensional).

    path_ohe : str
        Location where the fitted OHE encoder is serialized.

    Returns:
    -------
    ohe_encoder : sklearn.preprocessing.OneHotEncoder
        Fitted OHE encoder object.
    """

    # The encoder is fitted on a single-column 2D array, hence the reshape.
    ohe_encoder = OneHotEncoder(sparse_output=False, dtype=int).fit(
        np.array(data).reshape(-1, 1)
    )

    # Serialize the fitted encoder so later steps can reload it.
    joblib.dump(ohe_encoder, path_ohe)

    return ohe_encoder

In [41]:
# Function to encode the data.
def transform_ohe_encoder(data, encoder):
    """
    One-hot encode the "stasiun" column with a fitted encoder.

    Parameters:
    ----------
    data : pd.DataFrame
        Data to be transformed; must contain the "stasiun" column.

    encoder : sklearn.preprocessing.OneHotEncoder
        The fitted encoder.

    Returns:
    -------
    data : pd.DataFrame
        New frame with the OHE columns first, followed by the original
        columns without "stasiun". All column names are strings.
    """

    column = "stasiun"

    # Work on a copy so the caller's frame stays as it was.
    source = data.copy()

    # The encoder receives the column as a single-column 2D array.
    encoded = encoder.transform(source[column].to_numpy().reshape(-1, 1))

    # Wrap the encoded matrix in a frame aligned to the source index, one
    # column per category known to the encoder.
    onehot = pd.DataFrame(
        encoded.tolist(),
        columns = list(encoder.categories_[0]),
        index = source.index
    )

    # Put the OHE features in front, then remove the raw categorical column.
    combined = pd.concat([onehot, source], axis = 1).drop(columns = column)

    # Force every column name to be a string.
    combined.columns = list(map(str, combined.columns.tolist()))

    return combined

In [42]:
# Fit the ohe_encoder.
PATH_ENCODER_STASIUN = "../models/ohe_stasiun.pkl"

# The encoder is fitted on config["range_stasiun"] rather than on data_train.
# NOTE(review): presumably this is the full list of station names; the config
# printout earlier in the notebook is truncated before this key — confirm.
ohe_stasiun = fit_ohe_encoder(
    data = config["range_stasiun"],
    path_ohe = PATH_ENCODER_STASIUN
)

# Update the configuration parameter (stores where the encoder was saved).
config = update_config(
    key = "path_fitted_encoder_stasiun",
    value = PATH_ENCODER_STASIUN,
    params = config,
    path_config = PATH_CONFIG
)

Params Updated! 
Key: path_fitted_encoder_stasiun - 
Value: ../models/ohe_stasiun.pkl



Encode the `stasiun` column in train, valid, and test set.

In [43]:
# Encode the categorical column on data_train.
data_train = transform_ohe_encoder(
    data = data_train,
    encoder = ohe_stasiun
)

data_train.head()

Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,pm10,pm25,so2,co,o3,no2,category
224,0,0,1,0,0,50.0,76.0,20.0,8.0,71.0,15.0,TIDAK BAIK
1712,0,1,0,0,0,33.0,58.0,42.0,11.0,43.0,19.0,TIDAK BAIK
1028,0,0,0,1,0,51.0,78.0,37.0,12.0,23.0,19.0,TIDAK BAIK
391,0,0,1,0,0,46.0,39.0,17.0,17.0,41.0,10.0,BAIK
1547,1,0,0,0,0,68.0,102.0,31.0,13.0,20.0,30.0,TIDAK BAIK


In [44]:
# Encode the categorical column on data_valid.
data_valid = transform_ohe_encoder(
    data = data_valid,
    encoder = ohe_stasiun
)

data_valid.head()

Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,pm10,pm25,so2,co,o3,no2,category
82,0,0,1,0,0,81.0,118.0,27.0,15.0,37.0,16.0,TIDAK BAIK
1150,0,0,0,1,0,68.0,118.0,40.0,16.0,26.0,27.0,TIDAK BAIK
549,0,0,1,0,0,63.0,83.0,42.0,12.0,27.0,11.0,TIDAK BAIK
850,0,0,1,0,0,27.0,54.0,25.0,6.0,24.0,7.0,TIDAK BAIK
388,0,0,1,0,0,58.0,86.0,22.0,11.0,43.0,9.0,TIDAK BAIK


In [45]:
# Encode the categorical column on data_test.
data_test = transform_ohe_encoder(
    data = data_test,
    encoder = ohe_stasiun
)

data_test.head()

Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,pm10,pm25,so2,co,o3,no2,category
1317,0,0,0,1,0,63.0,114.0,39.0,11.0,15.0,21,TIDAK BAIK
1648,0,0,0,0,1,66.0,115.0,35.0,11.0,36.0,20,TIDAK BAIK
803,0,1,0,0,0,30.0,38.0,37.0,11.0,57.0,9,TIDAK BAIK
1125,0,0,1,0,0,73.0,108.0,48.0,13.0,29.0,15,TIDAK BAIK
1503,0,0,0,0,1,70.0,107.0,35.0,11.0,39.0,16,TIDAK BAIK


## **6 - Scaling Data**
---

- Create the `fit_scaler()` and `transform_scaler()` function.

In [46]:
# Function to fit the scaler.
def fit_scaler(data, path_scaler, config):
    """
    Fit the scaler.

    Parameters:
    ----------
    data : pd.DataFrame
        Input data (all features must be in numeric form). The label
        column is excluded before fitting.

    path_scaler : str
        Location where the fitted scaler is serialized.

    config : dict
        The loaded configuration file. config["label"] names the label
        column to exclude.

    Returns:
    -------
    scaler : sklearn.preprocessing.StandardScaler
        Fitted scaler object (storing the mean & std of all features)
    """

    # Keep only the features; the label is not scaled and is not needed here.
    X = data.drop(columns = config["label"])

    # Create and fit the scaler on the features.
    scaler = StandardScaler().fit(X)

    # Serialize the scaler.
    joblib.dump(scaler, path_scaler)

    return scaler

In [47]:
# Function to scale the data.
def transform_scaler(data, scaler, config):
    """
    Scale the feature columns with a fitted scaler and re-attach the label.

    Parameters:
    ----------
    data : pd.DataFrame
        Input data (all features must be in numeric form)

    scaler : sklearn.preprocessing.StandardScaler
        Fitted scaler object (storing the mean & std of all features)

    config : dict
        The loaded configuration file. config["label"] names the label
        column, which is left unscaled.

    Returns:
    -------
    data : pd.DataFrame
        New frame with the scaled features followed by the label column.
    """

    label = config["label"]

    # Copy so the caller's frame is never modified.
    frame = data.copy()

    # Separate the label from the features; only the features are scaled.
    target = frame[label]
    features = frame.drop(columns = label)

    # Rebuild a frame around the scaled matrix, reusing the feature names
    # and the row index of the input.
    scaled_features = pd.DataFrame(
        scaler.transform(features),
        columns = features.columns,
        index = features.index
    )

    # Label goes back on as the last column.
    return pd.concat([scaled_features, target], axis = 1)

In [48]:
# Fit the scaler.
PATH_SCALER = "../models/scaler.pkl"

scaler = fit_scaler(
    data = data_train,
    path_scaler = PATH_SCALER,
    config = config
)

# Update the configuration parameter.
config = update_config(
    key = "path_fitted_scaler",
    value = PATH_SCALER,
    params = config,
    path_config = PATH_CONFIG
)

Params Updated! 
Key: path_fitted_scaler - 
Value: ../models/scaler.pkl



Scale the data in train, valid, and test set.

In [49]:
# Scale the data on data_train.
data_train = transform_scaler(
    data = data_train,
    scaler = scaler,
    config = config
)

print(f"Data shape : {data_train.shape}")
data_train.head()

Data shape : (1450, 12)


Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,pm10,pm25,so2,co,o3,no2,category
224,-0.510754,-0.496765,2.013024,-0.494606,-0.501077,-0.185736,-0.091678,-1.237053,-0.759346,2.73799,-0.478647,TIDAK BAIK
1712,-0.510754,2.013024,-0.496765,-0.494606,-0.501077,-1.345994,-0.814575,0.576125,-0.137985,0.776381,-0.031817,TIDAK BAIK
1028,-0.510754,-0.496765,-0.496765,2.021811,-0.501077,-0.117485,-0.011356,0.164039,0.069135,-0.624768,-0.031817,TIDAK BAIK
391,-0.510754,-0.496765,2.013024,-0.494606,-0.501077,-0.458738,-1.577633,-1.484305,1.104737,0.636267,-1.037184,BAIK
1547,1.95789,-0.496765,-0.496765,-0.494606,-0.501077,1.042774,0.952507,-0.330464,0.276256,-0.83494,1.196965,TIDAK BAIK


In [50]:
# Scale the data on data_valid.
data_valid = transform_scaler(
    data = data_valid,
    scaler = scaler,
    config = config
)

print(f"Data shape : {data_valid.shape}")
data_valid.head()

Data shape : (181, 12)


Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,pm10,pm25,so2,co,o3,no2,category
82,-0.510754,-0.496765,2.013024,-0.494606,-0.501077,1.930031,1.595083,-0.660133,0.690496,0.356037,-0.36694,TIDAK BAIK
1150,-0.510754,-0.496765,-0.496765,2.021811,-0.501077,1.042774,1.595083,0.41129,0.897617,-0.414595,0.861842,TIDAK BAIK
549,-0.510754,-0.496765,2.013024,-0.494606,-0.501077,0.701521,0.189449,0.576125,0.069135,-0.344538,-0.925477,TIDAK BAIK
850,-0.510754,-0.496765,2.013024,-0.494606,-0.501077,-1.755498,-0.975219,-0.824968,-1.173587,-0.55471,-1.372307,TIDAK BAIK
388,-0.510754,-0.496765,2.013024,-0.494606,-0.501077,0.360269,0.309932,-1.072219,-0.137985,0.776381,-1.148892,TIDAK BAIK


In [51]:
# Scale the data on data_test.
data_test = transform_scaler(
    data = data_test,
    scaler = scaler,
    config = config
)

print(f"Data shape : {data_test.shape}")
data_test.head()

Data shape : (182, 12)


Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,pm10,pm25,so2,co,o3,no2,category
1317,-0.510754,-0.496765,-0.496765,2.021811,-0.501077,0.701521,1.434439,0.328873,-0.137985,-1.185227,0.191598,TIDAK BAIK
1648,-0.510754,-0.496765,-0.496765,-0.494606,1.9957,0.906273,1.4746,-0.000796,-0.137985,0.285979,0.07989,TIDAK BAIK
803,-0.510754,2.013024,-0.496765,-0.494606,-0.501077,-1.550746,-1.617794,0.164039,-0.137985,1.757186,-1.148892,TIDAK BAIK
1125,-0.510754,-0.496765,2.013024,-0.494606,-0.501077,1.384027,1.193473,1.070628,0.276256,-0.204423,-0.478647,TIDAK BAIK
1503,-0.510754,-0.496765,-0.496765,-0.494606,1.9957,1.179275,1.153312,-0.000796,-0.137985,0.496152,-0.36694,TIDAK BAIK


## **7 - Label Encoding**
---

- Create the `fit_label_encoder()` and `transform_label_encoder()` function.

In [52]:
# Function to fit label encoder.
def fit_label_encoder(label, path_le):
    """
    Fit a label encoder on the given label and save it to disk.

    Parameters:
    ----------
    label : pd.Series
        Categorical label.

    path_le : str
        Location where the fitted label encoder is serialized.

    Returns:
    -------
    label_encoder : sklearn.preprocessing.LabelEncoder
        Fitted label encoder object.
    """

    # Create the encoder and fit it on the label values in one step.
    label_encoder = LabelEncoder().fit(label)

    # Serialize the fitted encoder so later steps can reload it.
    joblib.dump(label_encoder, path_le)

    return label_encoder

In [53]:
# Function to encode the label.
def transform_label_encoder(label, encoder):
    """
    Encode the categorical label with a fitted label encoder.

    Parameters:
    ----------
    label : pd.Series
        Categorical label.

    encoder : sklearn.preprocessing.LabelEncoder
        Fitted label encoder object.

    Returns:
    -------
    encoded_label : pd.Series
        The encoded label, named "category" and carrying the same index
        as the input.
    """

    # Copy so the caller's series is never modified.
    source = label.copy()

    # Encode first, then wrap the result to restore the original index.
    codes = encoder.transform(source)

    return pd.Series(codes, index = source.index, name = "category")

In [54]:
# Fit the label_encoder.
PATH_ENCODER_LABEL = "../models/label_encoder.pkl"

label = config["label"]
y_train = data_train[label]

label_encoder = fit_label_encoder(
    label = y_train,
    path_le = PATH_ENCODER_LABEL
)

# Update the configuration parameter.
config = update_config(
    key = "path_fitted_encoder_label",
    value = PATH_ENCODER_LABEL,
    params = config,
    path_config = PATH_CONFIG
)

Params Updated! 
Key: path_fitted_encoder_label - 
Value: ../models/label_encoder.pkl



In [55]:
# Encode the label.
data_train["category"] = transform_label_encoder(
    label = data_train["category"],
    encoder = label_encoder
)

data_valid["category"] = transform_label_encoder(
    label = data_valid["category"],
    encoder = label_encoder
)

data_test["category"] = transform_label_encoder(
    label = data_test["category"],
    encoder = label_encoder
)

In [56]:
# Sanity check the train data.
data_train

Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,pm10,pm25,so2,co,o3,no2,category
224,-0.510754,-0.496765,2.013024,-0.494606,-0.501077,-0.185736,-0.091678,-1.237053,-0.759346,2.737990,-0.478647,1
1712,-0.510754,2.013024,-0.496765,-0.494606,-0.501077,-1.345994,-0.814575,0.576125,-0.137985,0.776381,-0.031817,1
1028,-0.510754,-0.496765,-0.496765,2.021811,-0.501077,-0.117485,-0.011356,0.164039,0.069135,-0.624768,-0.031817,1
391,-0.510754,-0.496765,2.013024,-0.494606,-0.501077,-0.458738,-1.577633,-1.484305,1.104737,0.636267,-1.037184,0
1547,1.957890,-0.496765,-0.496765,-0.494606,-0.501077,1.042774,0.952507,-0.330464,0.276256,-0.834940,1.196965,1
...,...,...,...,...,...,...,...,...,...,...,...,...
132,-0.510754,-0.496765,-0.496765,-0.494606,1.995700,-1.209493,-0.653931,-0.330464,-0.966466,-1.395399,-0.366940,1
1296,-0.510754,-0.496765,2.013024,-0.494606,-0.501077,0.292018,0.109127,0.740959,-0.345105,0.075807,-0.925477,1
1431,-0.510754,-0.496765,2.013024,-0.494606,-0.501077,-0.731740,-0.533448,1.317879,-0.966466,0.285979,-1.037184,1
189,-0.510754,2.013024,-0.496765,-0.494606,-0.501077,1.042774,0.149288,1.565131,0.069135,0.496152,-0.255232,1


## **8 - Label Balancing**
---

In [57]:
# Check the label distribution.
data_train["category"].value_counts(normalize=True)

category
1    0.895862
0    0.104138
Name: proportion, dtype: float64

- The label is highly imbalanced, so we need to balance it.
- Note that: class `1 (TIDAK BAIK)` and class `0 (BAIK)`

In [58]:
# Function to balance the label.
def label_balancer(data, balancer_type, config, random_state=123):
    """
    Balancing the category label.

    Parameters:
    ----------
    data : pd.DataFrame
        The scaled data.

    balancer_type : str
        The balancer type.

    config : dict
        The loaded configuration file.

    random_state : int, default = 123
        For reproducibility.

    Returns:
    -------
    X_balanced : pd.DataFrame
        The features with balanced label.

    y_balanced : pd.Series
        The label with balanced label.
    """

    # Ensure the raw data immutable.
    data = data.copy()

    # Split  input-output, imblearn-style similar to sklearn-style.
    label = config["label"]
    y = data[label]
    X = data.drop(columns = label)

    # Set the balancer.
    list_balancer = ["rus", "ros", "sm"]

    if str(balancer_type).lower() not in list_balancer:
        raise RecursionError("The balancer type is invalid.")
    else:
        if str(balancer_type).lower() == "rus":
            balancer = RUS(random_state = random_state)
        elif str(balancer_type).lower() == "ros":
            balancer = ROS(random_state = random_state)
        else:
            balancer = SMOTE(random_state = random_state)

        # Fit resample the balancer.
        X_balanced, y_balanced = balancer.fit_resample(X, y)

        print(f"The label are balanced using {balancer.__class__.__name__}")

        # Check the label distribution.
        print(y_balanced.value_counts())

        return X_balanced, y_balanced

In [59]:
# Balance the train label with the "rus" balancer (RandomUnderSampler).
X_rus, y_rus = label_balancer(data=data_train, balancer_type="rus", config=config)

The label are balanced using RandomUnderSampler
category
0    151
1    151
Name: count, dtype: int64


In [60]:
# Balance the train label with the "ros" balancer (RandomOverSampler).
X_ros, y_ros = label_balancer(data=data_train, balancer_type="ros", config=config)

The label are balanced using RandomOverSampler
category
1    1299
0    1299
Name: count, dtype: int64


In [61]:
# Balance the train label with the "sm" balancer (SMOTE).
X_sm, y_sm = label_balancer(data=data_train, balancer_type="sm", config=config)

The label are balanced using SMOTE
category
1    1299
0    1299
Name: count, dtype: int64


## **9 - Data Serialization**
---

In [62]:
# Define data configuration.
label = config["label"]

# Balanced train sets, keyed by the balancing strategy that produced them.
X_train = dict(Undersampling=X_rus, Oversampling=X_ros, SMOTE=X_sm)
y_train = dict(Undersampling=y_rus, Oversampling=y_ros, SMOTE=y_sm)

# Valid and test sets are only split into features (X) and label (y).
X_valid, y_valid = data_valid.drop(columns = label), data_valid[label]
X_test, y_test = data_test.drop(columns = label), data_test[label]

# Key order matters: each split lists its X object first, then its y object.
data_configuration = {
    "train": {"X_train": X_train, "y_train": y_train},
    "valid": {"X_valid": X_valid, "y_valid": y_valid},
    "test": {"X_test": X_test, "y_test": y_test},
}

In [63]:
# Serialize the preprocessed data.
PATH_PROCESSED_DATA = "../data/processed/"

for split_name, split_objects in data_configuration.items():
    # Paths of the pickles written for this split, in dict order.
    split_paths = []

    for object_name, object_data in split_objects.items():
        # One pickle per object, e.g. "../data/processed/X_train_clean.pkl".
        pickle_path = f"{PATH_PROCESSED_DATA}{object_name}_clean.pkl"

        # Serialize the preprocessed data.
        joblib.dump(object_data, pickle_path)
        split_paths.append(pickle_path)

    # Update the configuration parameters; `config` is rebound to the
    # reloaded configuration so the next iteration builds on it.
    config = update_config(
        key = f"path_clean_{split_name}",
        value = split_paths,
        params = config,
        path_config = PATH_CONFIG
    )

Params Updated! 
Key: path_clean_train - 
Value: ['../data/processed/X_train_clean.pkl', '../data/processed/y_train_clean.pkl']

Params Updated! 
Key: path_clean_valid - 
Value: ['../data/processed/X_valid_clean.pkl', '../data/processed/y_valid_clean.pkl']

Params Updated! 
Key: path_clean_test - 
Value: ['../data/processed/X_test_clean.pkl', '../data/processed/y_test_clean.pkl']



In [64]:
# Check the configuration parameters; the path_clean_train, path_clean_valid
# and path_clean_test keys written during serialization should be present.
config

{'columns_datetime': ['tanggal'],
 'columns_int': ['pm10', 'pm25', 'so2', 'co', 'o3', 'no2', 'max'],
 'columns_object': ['stasiun', 'critical', 'category'],
 'features': ['stasiun', 'pm10', 'pm25', 'so2', 'co', 'o3', 'no2'],
 'impute_co': 11,
 'impute_o3': 29,
 'impute_pm10': {'BAIK': 28, 'TIDAK BAIK': 55},
 'impute_pm25': {'BAIK': 39, 'TIDAK BAIK': 82},
 'impute_so2': 35,
 'imputer_no2': 18,
 'label': 'category',
 'label_categories': ['BAIK', 'SEDANG', 'TIDAK SEHAT'],
 'label_categories_new': ['BAIK', 'TIDAK BAIK'],
 'path_clean_test': ['../data/processed/X_test_clean.pkl',
  '../data/processed/y_test_clean.pkl'],
 'path_clean_train': ['../data/processed/X_train_clean.pkl',
  '../data/processed/y_train_clean.pkl'],
 'path_clean_valid': ['../data/processed/X_valid_clean.pkl',
  '../data/processed/y_valid_clean.pkl'],
 'path_data_raw': '../data/raw/',
 'path_data_test': ['../data/interim/X_test.pkl',
  '../data/interim/y_test.pkl'],
 'path_data_train': ['../data/interim/X_train.pkl',
  