# Preprocessing

## Import Libraries

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json

import joblib

import sys
sys.path.append('../src')  
import utils as util

## Load Configuration File

In [18]:
# Load the project configuration through the local `utils` helper; later cells index it like a nested dict.
config = util.load_config()

## Load Raw Dataset

In [19]:
def read_raw_data(config: dict, data_dir: str) -> pd.DataFrame:
    """Read the raw CSV and keep only the payload column.

    Parameters
    ----------
    config : dict
        Project configuration. ``config['data_source']['data_column']`` names the
        column to keep and ``config['data_source']['data_to_remove']`` lists the
        row (index) labels to discard.
    data_dir : str
        Path of the CSV file to read (decoded as UTF-8).

    Returns
    -------
    pd.DataFrame
        Single-column frame without the discarded rows; the surviving rows keep
        their original index labels.
    """
    loaded = pd.read_csv(data_dir, encoding='utf-8')
    source_cfg = config['data_source']

    # Double brackets keep the selection a DataFrame instead of a Series.
    payload_only = loaded[[source_cfg['data_column']]]

    # `drop` without `columns=` removes rows by index label.
    return payload_only.drop(source_cfg['data_to_remove'])

In [20]:
# Build the CSV location relative to the parent directory of this notebook.
source_cfg = config['data_source']
raw_dataset_path = '../' + source_cfg['directory'] + source_cfg['file_name']

raw_data = read_raw_data(config=config, data_dir=raw_dataset_path)

In [21]:
# Preview the first rows of the loaded frame.
raw_data.head()

Unnamed: 0,MESSAGE
0,"{""ID"":""2"",""sensor"":{""ZH03B"":{""PM1.0"":34,""PM2.5..."
1,"{""ID"":""1"",""sensor"":{""ZH03B"":{""PM1.0"":32,""PM2.5..."
2,"{""ID"":""2"",""sensor"":{""ZH03B"":{""PM1.0"":34,""PM2.5..."
3,"{""ID"":""1"",""sensor"":{""ZH03B"":{""PM1.0"":32,""PM2.5..."
4,"{""ID"":""2"",""sensor"":{""ZH03B"":{""PM1.0"":34,""PM2.5..."


In [22]:
# Summarise the index range, column dtypes and non-null counts.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25930 entries, 0 to 25938
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   MESSAGE  25930 non-null  object
dtypes: object(1)
memory usage: 405.2+ KB


In [23]:
# Count missing values per column.
raw_data.isnull().sum()

MESSAGE    0
dtype: int64

In [24]:
# Show one raw JSON payload. Use positional access: rows were dropped by label when
# loading, so a label lookup such as `[0]` would raise KeyError if row 0 were removed.
raw_data[config['data_source']['data_column']].iloc[0]

'{"ID":"2","sensor":{"ZH03B":{"PM1.0":34,"PM2.5":42,"PM10.0":49},"BME280":{"Temperature":37.93999863,"Pressure":908.9143677,"Altitude":886.4772339,"Humidity":32.13378906},"DHT22":{"Temperature":25.69999886,"Humidity":56,"HeatIndex":25.78777313,"DewPoint":16.28425217,"ComfortRatio":100,"ComfortStatus":0},"MQ7":{"PPM":7.910358906}},"dateTime":{"date":"2024-07-02","time":"13:13:00"}}'

## Preprocessing

### Extracting Features

In [25]:
def extract_features(json_str, config: dict):
    """Parse one JSON payload and pull out the fields used downstream.

    The key names inside the payload are not hard-coded; every lookup goes
    through ``config['data_mapping']``, which maps a logical name (e.g.
    ``'pm25'``) to the actual key found in the JSON document.

    Parameters
    ----------
    json_str : str
        JSON document for a single sensor reading.
    config : dict
        Project configuration containing the ``'data_mapping'`` section.

    Returns
    -------
    tuple
        ``(id_sensor, datetime, pm1, pm25, pm10, co, temperature, humidity,
        pressure)`` where ``datetime`` is the payload's date and time strings
        joined by a single space.

    Raises
    ------
    json.JSONDecodeError
        If ``json_str`` is not valid JSON.
    KeyError
        If a mapped key is missing from the config or from the payload.
    """
    payload = json.loads(json_str)
    mapping = config['data_mapping']

    id_sensor = payload[mapping['id_sensor']]

    # Date and time live in separate fields; join them into one string.
    stamp = payload[mapping['dateTime']]
    timestamp = stamp[mapping['date']] + ' ' + stamp[mapping['time']]

    readings = payload[mapping['sensor']]

    # Particulate matter values come from the ZH03B entry.
    particulate = readings[mapping['ZH03B']]
    pm1 = particulate[mapping['pm1']]
    pm25 = particulate[mapping['pm25']]
    pm10 = particulate[mapping['pm10']]

    # Carbon monoxide value comes from the MQ7 entry.
    co = readings[mapping['MQ7']][mapping['co']]

    # Temperature and humidity are taken from the DHT22 entry (not BME280).
    climate = readings[mapping['DHT22']]
    temperature = climate[mapping['temperature']]
    humidity = climate[mapping['humidity']]

    # Pressure is taken from the BME280 entry.
    pressure = readings[mapping['BME280']][mapping['pressure']]

    return id_sensor, timestamp, pm1, pm25, pm10, co, temperature, humidity, pressure

In [26]:
# Parse every JSON payload once; each row yields a tuple of extracted values.
feature_columns = config['data_source']['columns']
extracted = raw_data[config['data_source']['data_column']].apply(lambda x: extract_features(x, config))

# Expand the tuples into columns with a single DataFrame construction.
# `.apply(pd.Series)` would build one Series per row, which is very slow on
# tens of thousands of rows. Reusing the original index keeps the assignment aligned.
raw_data[feature_columns] = pd.DataFrame(
    extracted.tolist(), index=raw_data.index, columns=feature_columns
)

raw_data.head()

Unnamed: 0,MESSAGE,id_sensor,datetime,pm1,pm25,pm10,co,temperature,humidity,pressure
0,"{""ID"":""2"",""sensor"":{""ZH03B"":{""PM1.0"":34,""PM2.5...",2,2024-07-02 13:13:00,34,42,49,7.910359,25.699999,56.0,908.914368
1,"{""ID"":""1"",""sensor"":{""ZH03B"":{""PM1.0"":32,""PM2.5...",1,2024-07-02 13:13:04,32,40,45,9.578706,25.9,58.299999,924.873779
2,"{""ID"":""2"",""sensor"":{""ZH03B"":{""PM1.0"":34,""PM2.5...",2,2024-07-02 13:13:30,34,42,48,7.933297,25.699999,56.299999,908.919983
3,"{""ID"":""1"",""sensor"":{""ZH03B"":{""PM1.0"":32,""PM2.5...",1,2024-07-02 13:13:34,32,40,45,9.578706,25.9,58.299999,924.897522
4,"{""ID"":""2"",""sensor"":{""ZH03B"":{""PM1.0"":34,""PM2.5...",2,2024-07-02 13:14:00,34,42,47,7.956267,25.6,56.700001,908.917175


### Cleaning Features

In [27]:
def clean_features(config: dict, data: pd.DataFrame):
    """Return a cleaned copy of the extracted feature frame.

    Three steps are applied, all driven by ``config['data_source']``:

    1. drop the raw payload column named by ``'data_column'``;
    2. convert the column named by ``'datetime_feature'`` to pandas datetimes;
    3. cast the column named by ``'id_sensor'`` to ``int``.

    The input frame is not modified; ``drop`` returns a new frame and the
    conversions are applied to that new frame.

    Parameters
    ----------
    config : dict
        Project configuration containing the ``'data_source'`` section.
    data : pd.DataFrame
        Frame holding the raw payload column plus the extracted features.

    Returns
    -------
    pd.DataFrame
        The cleaned frame.
    """
    source_cfg = config['data_source']

    cleaned = data.drop(columns=source_cfg['data_column'])

    datetime_col = source_cfg['datetime_feature']
    cleaned[datetime_col] = pd.to_datetime(cleaned[datetime_col])

    id_col = source_cfg['id_sensor']
    cleaned[id_col] = cleaned[id_col].astype(int)

    return cleaned

In [28]:
# Replace `raw_data` with its cleaned version (payload column dropped, dtypes fixed).
# NOTE: because the name is rebound, this cell is not idempotent; running it a second
# time passes a frame that no longer has the payload column to `clean_features`.
raw_data = clean_features(config, raw_data)

raw_data.head()

Unnamed: 0,id_sensor,datetime,pm1,pm25,pm10,co,temperature,humidity,pressure
0,2,2024-07-02 13:13:00,34,42,49,7.910359,25.699999,56.0,908.914368
1,1,2024-07-02 13:13:04,32,40,45,9.578706,25.9,58.299999,924.873779
2,2,2024-07-02 13:13:30,34,42,48,7.933297,25.699999,56.299999,908.919983
3,1,2024-07-02 13:13:34,32,40,45,9.578706,25.9,58.299999,924.897522
4,2,2024-07-02 13:14:00,34,42,47,7.956267,25.6,56.700001,908.917175


In [29]:
# Check the column dtypes after cleaning.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25930 entries, 0 to 25938
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id_sensor    25930 non-null  int64         
 1   datetime     25930 non-null  datetime64[ns]
 2   pm1          25930 non-null  int64         
 3   pm25         25930 non-null  int64         
 4   pm10         25930 non-null  int64         
 5   co           25930 non-null  float64       
 6   temperature  25930 non-null  float64       
 7   humidity     25930 non-null  float64       
 8   pressure     25930 non-null  float64       
dtypes: datetime64[ns](1), float64(4), int64(4)
memory usage: 3.0 MB


In [30]:
# Split the cleaned readings into one frame per sensor. The n-th key holds the rows
# whose sensor id equals the n-th entry of config['data_source']['sensor'].
sensor_id_column = config['data_source']['id_sensor']
data = {
    frame_key: raw_data[
        raw_data[sensor_id_column] == config['data_source']['sensor'][slot]
    ].reset_index(drop=True)
    for slot, frame_key in enumerate(('sensor_data_1', 'sensor_data_2', 'sensor_data_3'))
}

In [31]:
# Preview the rows selected for the first configured sensor.
data['sensor_data_1'].head()

Unnamed: 0,id_sensor,datetime,pm1,pm25,pm10,co,temperature,humidity,pressure
0,1,2024-07-02 13:13:04,32,40,45,9.578706,25.9,58.299999,924.873779
1,1,2024-07-02 13:13:34,32,40,45,9.578706,25.9,58.299999,924.897522
2,1,2024-07-02 13:14:04,34,42,47,9.578706,25.9,58.299999,924.843933
3,1,2024-07-02 13:14:34,33,41,46,9.578706,25.9,58.299999,924.890137
4,1,2024-07-02 13:15:04,32,40,45,9.578706,25.9,58.400002,924.88501


### Resampling Data to Hourly Means

In [32]:
# Aggregate each sensor's readings into hourly means, then keep only the configured columns.
# After this step every frame is indexed by `datetime` (one row per hour).
hourly_columns = config['data_source']['num_features']
for sensor_key in ('sensor_data_1', 'sensor_data_2', 'sensor_data_3'):
    hourly_mean = data[sensor_key].resample('h', on='datetime').mean()
    data[sensor_key] = hourly_mean[hourly_columns]

data['sensor_data_1'].head()

Unnamed: 0_level_0,id_sensor,pm1,pm25,pm10,co,temperature,humidity,pressure
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2024-07-02 13:00:00,1.0,29.212766,36.478723,40.840426,9.531837,25.879787,58.532979,924.74208
2024-07-02 14:00:00,1.0,22.733333,28.408333,31.775,9.783101,25.271666,60.930833,924.435543
2024-07-02 15:00:00,1.0,14.808333,18.825,20.825,9.860055,25.4,60.045,924.39799
2024-07-02 16:00:00,1.0,18.166667,22.775,25.341667,10.148229,25.318333,59.451667,924.676247
2024-07-02 17:00:00,1.0,27.208333,33.941667,37.891667,10.477759,24.514166,60.936667,925.327504


## Dump Data

In [16]:
# Build both output paths from the configured directory, relative to the parent folder.
output_cfg = config['train_test_data']
data_path_pkl = '../' + output_cfg['directory'] + output_cfg['file_name_pkl']
data_path_csv = '../' + output_cfg['directory'] + output_cfg['file_name_csv']

# The pickle receives the per-sensor dict `data`; the CSV receives the `raw_data` frame.
# NOTE(review): `raw_data` is the cleaned but non-resampled frame — confirm that is what the CSV should hold.
util.dump_data_pickle(data, data_path_pkl)
util.dump_data_csv(raw_data, data_path_csv)