### **1. Load Libraries**

In [1]:
import src.util as utils
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

### **2. Load Config File**

In [2]:
config = utils.load_config()

### **3. Load Dataset**

In [3]:
def load_dataset(config_data: dict) -> pd.DataFrame:
    # Load every set of data
    x_train = utils.pickle_load(config_data["train_set_path"][0])
    y_train = utils.pickle_load(config_data["train_set_path"][1])

    x_valid = utils.pickle_load(config_data["valid_set_path"][0])
    y_valid = utils.pickle_load(config_data["valid_set_path"][1])

    x_test = utils.pickle_load(config_data["test_set_path"][0])
    y_test = utils.pickle_load(config_data["test_set_path"][1])

    # Return 6 set of data
    return x_train, y_train, x_valid, y_valid, x_test, y_test

In [4]:
x_train, y_train, x_valid, y_valid, x_test, y_test = load_dataset(config)

### **4. Handling Missing Values**

In [5]:
# Columns with missing values
missing_column= ["ph", "Sulfate", "Trihalomethanes"]

# make impute function with median
def imputationMedian(data, column_with_null):

    for columns in column_with_null:
        median_value = data[columns].median()
        data[columns].fillna(median_value, inplace = True)

    return data

Impute with median

In [6]:
# Impute in Train set
x_train = imputationMedian(x_train, missing_column)

# Impute in Test set
x_test= imputationMedian(x_test, missing_column)

# Impute in Validation set
x_valid = imputationMedian(x_valid, missing_column)

Check result

In [7]:
# Check imputation result
x_train.isna().any()

ph                 False
Hardness           False
Solids             False
Chloramines        False
Sulfate            False
Conductivity       False
Organic_carbon     False
Trihalomethanes    False
Turbidity          False
dtype: bool

In [8]:
# check imputation result
x_test.isna().any()

ph                 False
Hardness           False
Solids             False
Chloramines        False
Sulfate            False
Conductivity       False
Organic_carbon     False
Trihalomethanes    False
Turbidity          False
dtype: bool

In [9]:
# check imputation result
x_valid.isna().any()

ph                 False
Hardness           False
Solids             False
Chloramines        False
Sulfate            False
Conductivity       False
Organic_carbon     False
Trihalomethanes    False
Turbidity          False
dtype: bool

### **4. Balance Dataset**

In [10]:
# balance data with smote
x_train_smote, y_train_smote = SMOTE(sampling_strategy = "minority",
            random_state = 123).fit_resample(x_train, y_train)


In [11]:
# check balanced data
y_train_smote.value_counts()

1    1398
0    1398
Name: Potability, dtype: int64

### **5. Scaling Dataset**

In [12]:
# Check the values in predictor variables
x_train_smote.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
0,7.031684,237.083948,35434.280062,7.617649,381.337527,365.032503,13.453872,33.439868,3.825937
1,6.789821,197.300014,22527.681207,5.166942,399.981755,422.482029,15.916605,47.935105,4.512068
2,5.636924,159.13941,27283.780655,6.918727,328.907287,317.830981,13.611408,36.335199,3.007138
3,6.279842,175.89064,11582.505249,7.059985,333.076588,430.322092,15.674785,86.760257,3.278584
4,6.703487,182.129514,24316.875146,7.009806,341.333952,479.537497,20.088643,53.925045,4.223884


In [13]:
# Standard scaler function
def scaler_transform(X, scaler = None):
    """
    Fungsi untuk melakukan standardisasi
    :param X: <pandas DataFrame> sampel data
    :param scaler: <sklearn object> scaler, default None
    :return X_scaled: <pandas Dataframe> sampel data OHE
    :param scaler: <sklearn object> scaler, default None
    """
    if scaler != None:
        pass
    else:
        # Buat & fit encoder
        scaler = StandardScaler()
        scaler.fit(X)

    # Tranform data
    X_scaled = scaler.transform(X)
    X_scaled = pd.DataFrame(X_scaled,
                            columns = X.columns,
                            index = X.index)
    
    return X_scaled, scaler

In [14]:
# Scaling Train, Validation, Test data
x_train_clean, scaler = scaler_transform(X = x_train_smote)

x_test_clean, scaler = scaler_transform(X = x_test)

x_valid_clean, scaler = scaler_transform(X = x_valid)

In [15]:
# Check result
x_train_clean.head(3)

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
0,-0.041083,1.252736,1.535609,0.317545,1.295205,-0.787365,-0.259992,-2.140418,-0.194425
1,-0.211849,0.008282,0.079171,-1.248129,1.809437,-0.060075,0.513382,-1.195754,0.710196
2,-1.025842,-1.185393,0.615871,-0.128973,-0.150889,-1.384919,-0.210521,-1.951728,-1.27396


In [16]:
# Check result
x_valid_clean.head(3)

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
1205,-1.811385,1.877846,-1.580365,1.193226,-0.017864,0.114231,0.618712,0.001374,0.433179
2285,-0.006669,1.270897,1.360558,-1.482844,0.796206,1.486361,1.439262,1.276686,-0.228583
1177,-1.097549,0.075253,-0.381316,-1.458139,-1.08187,-0.970489,1.329499,0.570689,0.658335


In [17]:
# Check result
x_test_clean.head(3)

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
160,-0.000621,1.503628,-0.310828,1.045218,-0.041799,0.219244,0.637657,-0.131883,-0.233559
1067,-0.000621,0.587566,0.096845,1.709769,1.421283,-0.621837,1.326157,-0.478159,0.537095
3070,-0.000621,0.574936,-0.154073,0.110978,-0.790681,-1.829574,0.234458,2.465344,-0.131067


**Dump to pickle**

In [18]:
utils.pickle_dump(x_train_clean, config["train_feng_set_path"][0])
utils.pickle_dump(y_train_smote, config["train_feng_set_path"][1])

utils.pickle_dump(x_valid_clean, config["valid_feng_set_path"][0])
utils.pickle_dump(y_valid, config["valid_feng_set_path"][1])

utils.pickle_dump(x_test_clean, config["test_feng_set_path"][0])
utils.pickle_dump(y_test, config["test_feng_set_path"][1])