# Preprocessing

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json

import sys
sys.path.append('../src')  
import utils as util

In [31]:
# Load the project configuration through the local `utils` helper; the cells
# below read their settings from its 'data_source', 'data_mapping' and
# 'train_test_data' sections.
config = util.load_config()

In [21]:
def read_raw_data(config: dict, data_dir: str) -> pd.DataFrame:
    """Read the raw CSV and keep only the configured message column.

    Parameters
    ----------
    config : dict
        Project configuration. ``config['data_source']['data_column']`` names
        the column to keep and ``config['data_source']['data_to_remove']``
        holds the row index labels to drop.
    data_dir : str
        Path of the CSV file to read (a file path, despite the name).

    Returns
    -------
    pd.DataFrame
        Single-column frame without the dropped rows. The index is not reset,
        so it keeps the original row labels with gaps where rows were removed.
    """
    csv_frame = pd.read_csv(data_dir, encoding='utf-8')

    source_cfg = config['data_source']
    message_frame = csv_frame[[source_cfg['data_column']]]

    # The labels are matched against the row index, not against columns.
    return message_frame.drop(index=source_cfg['data_to_remove'])

In [22]:
# The pieces are joined without inserting a separator, so the configured
# directory has to end with one.
raw_dataset_dir = ''.join([
    '../',
    config['data_source']['directory'],
    config['data_source']['file_name'],
])

raw_data = read_raw_data(config, raw_dataset_dir)

In [23]:
# Preview the first rows of the loaded frame.
raw_data.head()

Unnamed: 0,MESSAGE
0,"{""ID"":""2"",""sensor"":{""ZH03B"":{""PM1.0"":34,""PM2.5..."
1,"{""ID"":""1"",""sensor"":{""ZH03B"":{""PM1.0"":32,""PM2.5..."
2,"{""ID"":""2"",""sensor"":{""ZH03B"":{""PM1.0"":34,""PM2.5..."
3,"{""ID"":""1"",""sensor"":{""ZH03B"":{""PM1.0"":32,""PM2.5..."
4,"{""ID"":""2"",""sensor"":{""ZH03B"":{""PM1.0"":34,""PM2.5..."


In [24]:
# Summarize the index range, non-null counts, dtypes and memory usage.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25933 entries, 0 to 25938
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   MESSAGE  25933 non-null  object
dtypes: object(1)
memory usage: 405.2+ KB


In [25]:
# Count missing values per column.
raw_data.isnull().sum()

MESSAGE    0
dtype: int64

In [26]:
# Show one raw message to inspect its JSON structure. Positional access is
# used because read_raw_data drops rows without resetting the index, so the
# label 0 is not guaranteed to exist.
raw_data[config['data_source']['data_column']].iloc[0]

'{"ID":"2","sensor":{"ZH03B":{"PM1.0":34,"PM2.5":42,"PM10.0":49},"BME280":{"Temperature":37.93999863,"Pressure":908.9143677,"Altitude":886.4772339,"Humidity":32.13378906},"DHT22":{"Temperature":25.69999886,"Humidity":56,"HeatIndex":25.78777313,"DewPoint":16.28425217,"ComfortRatio":100,"ComfortStatus":0},"MQ7":{"PPM":7.910358906}},"dateTime":{"date":"2024-07-02","time":"13:13:00"}}'

In [27]:
def extract_data(json_str, config: dict):
    """Parse one raw sensor message and return its readings as a flat tuple.

    Every JSON key is looked up through ``config['data_mapping']``, so the
    payload field names come from the configuration rather than the code.

    Parameters
    ----------
    json_str : str
        JSON document holding a sensor id, a nested sensor section and a
        nested date/time section.
    config : dict
        Project configuration providing the ``data_mapping`` section.

    Returns
    -------
    tuple
        ``(id_sensor, date, time, pm1, pm25, pm10, co, temperature, humidity,
        pressure)``. The PM values are read from the entry mapped by
        ``ZH03B``, CO from ``MQ7``, temperature and humidity from ``DHT22``,
        and pressure from ``BME280``.

    Raises
    ------
    KeyError
        If a mapped key is missing from the configuration or the message.
    json.JSONDecodeError
        If ``json_str`` is not valid JSON.
    """
    payload = json.loads(json_str)
    keys = config['data_mapping']

    sensor_id = payload[keys['id_sensor']]

    timestamp = payload[keys['dateTime']]
    date_value = timestamp[keys['date']]
    time_value = timestamp[keys['time']]

    sensors = payload[keys['sensor']]

    # Particulate matter readings.
    particulate = sensors[keys['ZH03B']]
    pm1 = particulate[keys['pm1']]
    pm25 = particulate[keys['pm25']]
    pm10 = particulate[keys['pm10']]

    # Carbon monoxide reading.
    co = sensors[keys['MQ7']][keys['co']]

    # Temperature and humidity come from DHT22, not from BME280.
    climate = sensors[keys['DHT22']]
    temperature = climate[keys['temperature']]
    humidity = climate[keys['humidity']]

    pressure = sensors[keys['BME280']][keys['pressure']]

    return (
        sensor_id, date_value, time_value,
        pm1, pm25, pm10,
        co, temperature, humidity, pressure,
    )

In [28]:
# Parse every raw message into a tuple of readings and rebuild the frame from
# those tuples in one step. This avoids `.apply(pd.Series)`, which constructs
# a Series object for every row.
# NOTE: raw_data holds only the raw message column at this point, and this
# cell rebinds `raw_data` without it, so the loading cell must be re-run
# before this cell is executed again.
message_column = config['data_source']['data_column']
parsed_rows = [extract_data(message, config) for message in raw_data[message_column]]
raw_data = pd.DataFrame(
    parsed_rows,
    columns=config['data_source']['columns'],
    index=raw_data.index,  # keep the original row labels
)

raw_data.head()

Unnamed: 0,id_sensor,date,time,pm1,pm25,pm10,co,temperature,humidity,pressure
0,2,2024-07-02,13:13:00,34,42,49,7.910359,25.699999,56.0,908.914368
1,1,2024-07-02,13:13:04,32,40,45,9.578706,25.9,58.299999,924.873779
2,2,2024-07-02,13:13:30,34,42,48,7.933297,25.699999,56.299999,908.919983
3,1,2024-07-02,13:13:34,32,40,45,9.578706,25.9,58.299999,924.897522
4,2,2024-07-02,13:14:00,34,42,47,7.956267,25.6,56.700001,908.917175


In [29]:
# Check the dtypes pandas inferred for the parsed columns.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25933 entries, 0 to 25938
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id_sensor    25933 non-null  object 
 1   date         25933 non-null  object 
 2   time         25933 non-null  object 
 3   pm1          25933 non-null  int64  
 4   pm25         25933 non-null  int64  
 5   pm10         25933 non-null  int64  
 6   co           25933 non-null  float64
 7   temperature  25933 non-null  float64
 8   humidity     25933 non-null  float64
 9   pressure     25933 non-null  float64
dtypes: float64(4), int64(3), object(3)
memory usage: 3.2+ MB


In [34]:
def dump_dataset(config: dict, data: pd.DataFrame, data_dir: str):
    """Write ``data`` to a CSV file without the index column.

    Parameters
    ----------
    config : dict
        Project configuration. Currently unused; kept so existing callers
        keep working.
    data : pd.DataFrame
        Frame to persist.
    data_dir : str
        Destination file path (a file path, despite the name). Missing
        parent directories are created.
    """
    import os  # local import keeps this cell self-contained

    # to_csv does not create directories, so writing into an output folder
    # that does not exist yet would otherwise fail here.
    if isinstance(data_dir, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(data_dir))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    data.to_csv(data_dir, index=False)

In [36]:
# The pieces are joined without inserting a separator, so the configured
# directory has to end with one.
data_dir = ''.join([
    '../',
    config['train_test_data']['directory'],
    config['train_test_data']['file_name'],
])

dump_dataset(config, raw_data, data_dir)