**1. Required Libraries**

In [1]:
# Libraries
import pandas as pd
import src.util as utils
from sklearn.model_selection import train_test_split

**2. Load Config File**

In [2]:
config = utils.load_config()

**3. Load Dataset**

In [3]:
dataset = pd.read_csv(config["dataset_path"])
dataset

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0
1,3.716080,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
...,...,...,...,...,...,...,...,...,...,...
3271,4.668102,193.681735,47580.991603,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821,1
3272,7.808856,193.553212,17329.802160,8.061362,,392.449580,19.903225,,2.798243,1
3273,9.419510,175.762646,33155.578218,7.350233,,432.044783,11.039070,69.845400,3.298875,1
3274,5.126763,230.603758,11983.869376,6.303357,,402.883113,11.168946,77.488213,4.708658,1


**4. Data Validation**

In [5]:
# Check missing values
dataset.isnull().sum()

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64

In [6]:
# Check data types in each column
dataset.dtypes

ph                 float64
Hardness           float64
Solids             float64
Chloramines        float64
Sulfate            float64
Conductivity       float64
Organic_carbon     float64
Trihalomethanes    float64
Turbidity          float64
Potability           int64
dtype: object

In [7]:
dataset.describe()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
count,2785.0,3276.0,3276.0,3276.0,2495.0,3276.0,3276.0,3114.0,3276.0,3276.0
mean,7.080795,196.369496,22014.092526,7.122277,333.775777,426.205111,14.28497,66.396293,3.966786,0.39011
std,1.59432,32.879761,8768.570828,1.583085,41.41684,80.824064,3.308162,16.175008,0.780382,0.487849
min,0.0,47.432,320.942611,0.352,129.0,181.483754,2.2,0.738,1.45,0.0
25%,6.093092,176.850538,15666.690297,6.127421,307.699498,365.734414,12.065801,55.844536,3.439711,0.0
50%,7.036752,196.967627,20927.833607,7.130299,333.073546,421.884968,14.218338,66.622485,3.955028,0.0
75%,8.062066,216.667456,27332.762127,8.114887,359.95017,481.792304,16.557652,77.337473,4.50032,1.0
max,14.0,323.124,61227.196008,13.127,481.030642,753.34262,28.3,124.0,6.739,1.0


**5. Data Defense**

In [8]:
def check_data(input_data, config):
    # Measure the range of input data
    len_input_data = len(input_data)

    # Check data types
    assert input_data.select_dtypes("int").columns.to_list() == config["int_columns"], "an error occurs in int column(s)."
    assert input_data.select_dtypes("float").columns.to_list() == config["float_columns"], "an error occurs in float column(s)."

    # Check range of data
    assert input_data[config["float_columns"][0]].between(config["range_ph"][0], config["range_ph"][1]).sum() == len_input_data, "an error occurs in temperature range."
    assert input_data[config["float_columns"][1]].between(config["range_hardness"][0], config["range_hardness"][1]).sum() == len_input_data, "an error occurs in humidity range."
    assert input_data[config["float_columns"][2]].between(config["range_solids"][0], config["range_solids"][1]).sum() == len_input_data, "an error occurs in pressure range."
    assert input_data[config["float_columns"][3]].between(config["range_chloramines"][0], config["range_chloramines"][1]).sum() == len_input_data, "an error occurs in pm1 range."
    assert input_data[config["float_columns"][4]].between(config["range_sulfates"][0], config["range_sulfates"][1]).sum() == len_input_data, "an error occurs in pm25 range."
    assert input_data[config["float_columns"][5]].between(config["range_org_carbon"][0], config["range_org_carbon"][1]).sum() == len_input_data, "an error occurs in nc05 range."
    assert input_data[config["float_columns"][6]].between(config["range_trihalomethanes"][0], config["range_trihalomethanes"][1]).sum() == len_input_data, "an error occurs in nc1 range."
    assert input_data[config["float_columns"][7]].between(config["range_turbidity"][0], config["range_turbidity"][1]).sum() == len_input_data, "an error occurs in nc25 range."
    assert input_data[config["int_columns"][0]].between(config["range_potability"][0], config["range_potability"][1]).sum() == len_input_data, "an error occurs in tvoc range."
    

**6. Data Splitting**

In [9]:
# Split input output
x = dataset[config["predictors"]].copy()
y = dataset[config["label"]].copy()

In [10]:
# sanity check
x

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135
1,3.716080,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771
4,9.092223,181.101509,17978.986339,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075
...,...,...,...,...,...,...,...,...,...
3271,4.668102,193.681735,47580.991603,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821
3272,7.808856,193.553212,17329.802160,8.061362,,392.449580,19.903225,,2.798243
3273,9.419510,175.762646,33155.578218,7.350233,,432.044783,11.039070,69.845400,3.298875
3274,5.126763,230.603758,11983.869376,6.303357,,402.883113,11.168946,77.488213,4.708658


In [11]:
# sanity check
y

0       0
1       0
2       0
3       0
4       0
       ..
3271    1
3272    1
3273    1
3274    1
3275    1
Name: Potability, Length: 3276, dtype: int64

In [12]:
# Split the data to Training and Test Data 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42, stratify = y)

In [13]:
# Split Test data into Validation and Test data
x_valid, x_test, y_valid, y_test = train_test_split(x_test, y_test, test_size = 0.5, random_state = 42, stratify = y_test)

**7. Check Missing Values**

In [14]:
# Check % missing values in train data
print("Missing values proportion:")
print(x_train.isna().sum().sort_values(ascending = False)/dataset.shape[0])

Missing values proportion:
Sulfate            0.167582
ph                 0.105006
Trihalomethanes    0.035409
Hardness           0.000000
Solids             0.000000
Chloramines        0.000000
Conductivity       0.000000
Organic_carbon     0.000000
Turbidity          0.000000
dtype: float64


**Dump to pickle**

In [15]:
utils.pickle_dump(dataset, config["dataset_cleaned_path"])

utils.pickle_dump(x_train, config["train_set_path"][0])
utils.pickle_dump(y_train, config["train_set_path"][1])

utils.pickle_dump(x_valid, config["valid_set_path"][0])
utils.pickle_dump(y_valid, config["valid_set_path"][1])

utils.pickle_dump(x_test, config["test_set_path"][0])
utils.pickle_dump(y_test, config["test_set_path"][1])