In [1]:
import pandas as pd
import sys
# Put the project's src directory on the module search path (presumably where the local
# `util` module imported below lives).
# NOTE(review): this is an absolute, machine-specific path, so it only resolves where this
# exact directory exists — consider a repository-relative path or an environment variable.
sys.path.insert(1, '/Users/dabiyyu/Downloads/mlprocess/src')
import util as utils
from sklearn.model_selection import train_test_split

In [2]:
# Load the project configuration; later cells index it by string key
# (file paths, column lists, allowed value ranges).
config = utils.load_config()

# Load Dataset

In [3]:
# Read the dataset from the Excel file named by config["dataset_path"].
dataset = pd.read_excel(config["dataset_path"])
# Bare last expression: the notebook renders the frame as this cell's output.
dataset

Unnamed: 0,AT,V,AP,RH,PE
0,14.96,41.76,1024.07,73.17,463.26
1,25.18,62.96,1020.04,59.08,444.37
2,5.11,39.40,1012.16,92.14,488.56
3,20.86,57.32,1010.24,76.64,446.48
4,10.82,37.50,1009.23,96.62,473.90
...,...,...,...,...,...
9563,16.65,49.69,1014.01,91.00,460.03
9564,13.19,39.18,1023.67,66.78,469.62
9565,31.32,74.33,1012.92,36.48,429.57
9566,24.48,69.45,1013.86,62.39,435.74


# Data Validation

In [4]:
# Count the missing values in each column.
dataset.isnull().sum()

AT    0
V     0
AP    0
RH    0
PE    0
dtype: int64

In [5]:
# Show the dtype of each column.
dataset.dtypes

AT    float64
V     float64
AP    float64
RH    float64
PE    float64
dtype: object

In [6]:
# Summary statistics per column: count, mean, std, min, quartiles and max.
dataset.describe()

Unnamed: 0,AT,V,AP,RH,PE
count,9568.0,9568.0,9568.0,9568.0,9568.0
mean,19.651231,54.305804,1013.259078,73.308978,454.365009
std,7.452473,12.707893,5.938784,14.600269,17.066995
min,1.81,25.36,992.89,25.56,420.26
25%,13.51,41.74,1009.1,63.3275,439.75
50%,20.345,52.08,1012.94,74.975,451.55
75%,25.72,66.54,1017.26,84.83,468.43
max,37.11,81.56,1033.3,100.16,495.76


# Data Defense

In [7]:
def check_data(input_data, config):
    """Validate the float columns and the value ranges of ``input_data``.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame to validate.
    config : mapping
        Must provide ``"float_columns"`` (the expected float column names, in
        the same order as they appear in the frame) and the keys ``"range_at"``,
        ``"range_v"``, ``"range_ap"``, ``"range_rh"`` and ``"range_pe"``.
        Elements 0 and 1 of each range are used as the inclusive lower and
        upper bounds for the column at the matching position of
        ``"float_columns"``.

    Raises
    ------
    AssertionError
        If the frame's float columns differ from ``config["float_columns"]``,
        or if any value of a checked column lies outside its configured range.
    """
    # Row count that every per-column "in range" tally has to reach.
    n_rows = len(input_data)

    # The frame's float columns must match the configured list exactly, order included.
    actual_float_columns = input_data.select_dtypes("float").columns.to_list()
    assert actual_float_columns == config["float_columns"], "an error occurs in float column(s)."

    # (position in config["float_columns"], config key holding the bounds, failure message)
    range_checks = (
        (0, "range_at", "an error occurs in AT range."),
        (1, "range_v", "an error occurs in V range."),
        (2, "range_ap", "an error occurs in AP range."),
        (3, "range_rh", "an error occurs in RH range."),
        (4, "range_pe", "an error occurs in pe range."),
    )
    for position, range_key, failure_message in range_checks:
        column = input_data[config["float_columns"][position]]
        within_bounds = column.between(config[range_key][0], config[range_key][1])
        # All rows must be inside the bounds, i.e. the in-range count equals the row count.
        assert within_bounds.sum() == n_rows, failure_message

In [8]:
# Run the defensive checks on the loaded frame; an AssertionError here means the data
# does not match the configured float columns or value ranges.
check_data(dataset, config)

# Data Splitting

In [9]:
# Select the predictor column(s) and the label column(s) named in the config.
# .copy() makes X and y independent objects rather than selections tied to `dataset`.
X = dataset[config["predictors"]].copy()
y = dataset[config["label"]].copy()

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [11]:
X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size = 0.5, random_state = 42)

In [12]:
# Dump the validated full dataset to its configured path.
utils.pickle_dump(dataset, config["dataset_cleaned_path"])

# Dump each split in train, valid, test order: element 0 of the configured path pair
# receives the predictors, element 1 the labels.
for _path_key, _X_part, _y_part in (
    ("train_set_path", X_train, y_train),
    ("valid_set_path", X_valid, y_valid),
    ("test_set_path", X_test, y_test),
):
    utils.pickle_dump(_X_part, config[_path_key][0])
    utils.pickle_dump(_y_part, config[_path_key][1])