# Data Preprocessing Tools

## Importing the libraries

In [12]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Inspecting the dataset

In [3]:
# Parameters specific to the sample dataset used in this notebook
DATASET_URL = 'Data.csv'
# The two selectors below index into the feature matrix, not the raw dataset.
NUMERIC_FEATURE_COLS = np.s_[:, -2:] # all rows, last two feature columns
NOMINAL_FEATURE_COLS = [0] # first feature column

## Importing the dataset

_**Independent vars (features/inputs)**_ & _**dependent var (output)**_ need to be separated from the dataset to feed the training models.

In [18]:
def import_dataset(dataset_url, feature_cols, dependent_col):
    """Read a CSV dataset and separate the features from the dependent variable.

    Args:
        dataset_url: Passed straight to ``pandas.read_csv`` as the source.
        feature_cols: Positional (``.iloc``) indexer selecting the features.
        dependent_col: Positional (``.iloc``) indexer selecting the
            dependent variable.

    Returns:
        A ``(X, y)`` tuple holding the ``.values`` of the two selections.
    """
    frame = pd.read_csv(dataset_url)
    by_position = frame.iloc
    feature_values = by_position[feature_cols].values
    dependent_values = by_position[dependent_col].values
    return feature_values, dependent_values

In [21]:
# debug
# X, y = import_dataset(dataset_url = DATASET_URL, feature_cols = np.s_[:, :-1], dependent_col = np.s_[:, -1])
# print("X", X)
# print("\ny", y)

## Handle missing data

There are several ways to fill in the missing data in the dataset, such as using the _**mean (average), median**_, etc. of the whole column. Among them, the _**mean**_ strategy is the most commonly used.

We only deal with numeric inputs:

In [5]:
def handle_missing_data(features, numeric_feature_cols, strategy):
    """Replace NaN entries of the numeric feature columns, in place.

    Args:
        features: Feature array; the selected columns are overwritten.
        numeric_feature_cols: Index expression selecting the numeric part of
            ``features``, or ``None`` to leave ``features`` untouched.
        strategy: Imputation strategy forwarded to ``SimpleImputer``
            (e.g. ``'mean'``).

    Returns:
        None. The result is written back into ``features``.
    """
    if numeric_feature_cols is None:
        return

    from sklearn.impute import SimpleImputer

    numeric_part = features[numeric_feature_cols]
    filler = SimpleImputer(missing_values=np.nan, strategy=strategy)
    # Learn the per-column fill values, then write the completed columns back.
    features[numeric_feature_cols] = filler.fit(numeric_part).transform(numeric_part)

In [22]:
# debug
# handle_missing_data(features = X, numeric_feature_cols = NUMERIC_FEATURE_COLS, strategy = 'mean')
# print(X)

## Encoding categorical data

For an ML model to compute the correlation between independent & dependent variables, categorical (non-continuous) data such as _strings_ needs to be encoded as numbers.

### Encoding the Independent Variable

Nominal independent variables have no inherent order, hence they should not be encoded as ordinal numbers (1, 2, 3...) by ```LabelEncoder```, but in an unordered form such as a matrix of 0/1 indicator columns produced by ```OneHotEncoder```.

In [6]:
def encode_nominal_features(features, nominal_feature_cols):
    """One-hot encode the nominal (unordered categorical) feature columns.

    Args:
        features: 2-D feature matrix.
        nominal_feature_cols: Column selector handed to ``ColumnTransformer``
            for the columns to encode, or ``None`` to skip encoding.

    Returns:
        ``features`` itself when ``nominal_feature_cols`` is ``None``.
        Otherwise a new dense array in which the one-hot columns come first
        and the remaining (passthrough) columns follow on the right.
    """
    if nominal_feature_cols is None:
        return features

    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder

    transformer = ColumnTransformer(
        transformers=[('encoder', OneHotEncoder(), nominal_feature_cols)],
        remainder='passthrough',  # keep the non-encoded features in the output
        # Force a dense result. With the default threshold a high-cardinality
        # column makes ColumnTransformer return a SciPy sparse matrix, and
        # np.array() would wrap that in a useless 0-d object array.
        sparse_threshold=0,
    )

    return np.array(transformer.fit_transform(features))

In [23]:
# debug
# X = encode_nominal_features(features = X, nominal_feature_cols = NOMINAL_FEATURE_COLS)
# print(X)

### Encoding the Dependent Variable

The dependent variable can be encoded as ordinal numbers.

In [7]:
def encode_independent_var(independent_var):
    """Encode a column of labels as integer codes with ``LabelEncoder``.

    Note: despite the name, the preprocessing pipeline in this file applies
    this function to the dependent variable ``y``; the name is kept so
    existing callers continue to work.

    Args:
        independent_var: Label values to encode.

    Returns:
        The result of ``LabelEncoder().fit_transform`` on the input.
    """
    from sklearn.preprocessing import LabelEncoder

    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(independent_var)
    return encoded_labels

In [24]:
# debug
# y = encode_independent_var(independent_var = y)
# print(y)

## Splitting the dataset into the Training set and Test set

The training set should be considerably larger than the test set in order to provide a sufficient amount of input to train the model. Here we use 80% of the dataset for training and 20% for testing.

In [16]:
def split_dataset(features, independent_var, test_size, random_state=1):
    """Split features and labels into training and test subsets.

    Args:
        features: Feature matrix.
        independent_var: Values aligned with ``features`` (the pipeline in
            this file passes the dependent variable ``y`` here).
        test_size: Forwarded to ``train_test_split`` (e.g. ``0.2`` for 20%).
        random_state: Seed for the shuffling. The default of 1 keeps the
            split identical on every run; pass another value or ``None``
            for a different or non-reproducible split.

    Returns:
        ``[X_train, X_test, y_train, y_test]`` as returned by
        ``train_test_split``.
    """
    from sklearn.model_selection import train_test_split

    return train_test_split(
        features,
        independent_var,
        test_size=test_size,
        random_state=random_state,
    )

In [25]:
# debug
# X_train, X_test, y_train, y_test = split_dataset(features = X, independent_var = y, test_size = 0.2)
# print("X_train", X_train)
# print("\nX_test", X_test)
# print("\ny_train", y_train)
# print("\ny_test", y_test)

## Feature Scaling

In certain (not all) models, feature scaling puts all the _non-categorical features_ on the same scale to prevent some features from dominating others. The scaler should be fitted on the training set only (_after dataset splitting_), since we treat the test set as future data which we don't have yet.

In [9]:
def scale_features(X_train, X_test, numeric_feature_cols):
    """Standardize the numeric feature columns of both subsets, in place.

    The scaler is fitted on the training subset only and then reused for the
    test subset.

    Args:
        X_train: Training feature array; selected columns are overwritten.
        X_test: Test feature array; selected columns are overwritten.
        numeric_feature_cols: Index expression selecting the numeric part of
            both arrays, or ``None`` to skip scaling.

    Returns:
        None. Both arrays are modified in place.
    """
    if numeric_feature_cols is None:
        return

    from sklearn.preprocessing import StandardScaler

    standardizer = StandardScaler()
    # Statistics come from the training rows only.
    standardizer.fit(X_train[numeric_feature_cols])
    for subset in (X_train, X_test):
        subset[numeric_feature_cols] = standardizer.transform(subset[numeric_feature_cols])

In [26]:
# debug
# scale_features(X_train = X_train, X_test = X_test, numeric_feature_cols = NUMERIC_FEATURE_COLS)
# print("Scaled X_train", X_train)
# print("\nScaled X_test", X_test)

# Tool

In [42]:
def preprocess_dataset(
    dataset_url,
    feature_cols=np.s_[:, :-1],  # all rows, all columns except the last (from dataset)
    numeric_feature_cols=None,  # (from features)
    nominal_feature_cols=None,  # (from features)
    dependent_col=np.s_[:, -1],  # all rows, last column (from dataset)
    is_dependent_categorical=False,
    missing_data_handling_strategy='mean',
    test_size=0.2,
    verbose=True):
    """Run the full preprocessing pipeline on a CSV dataset.

    Steps, in order: import the dataset, impute missing numeric values,
    one-hot encode nominal features, optionally label-encode the dependent
    variable, split into training/test subsets, and scale numeric features.

    Args:
        dataset_url: Source passed to ``import_dataset``.
        feature_cols: ``.iloc`` indexer selecting the features in the dataset.
        numeric_feature_cols: Index expression for the numeric columns of
            the feature matrix, or ``None`` to skip imputation and scaling.
            It is applied both before and after one-hot encoding, so it must
            address the numeric columns in both layouts (e.g. negative
            indices counted from the end, as in ``np.s_[:, -2:]``).
        nominal_feature_cols: Columns to one-hot encode, or ``None`` to skip.
        dependent_col: ``.iloc`` indexer selecting the dependent variable.
        is_dependent_categorical: When true, the dependent variable is
            label-encoded.
        missing_data_handling_strategy: Imputation strategy for missing
            numeric values.
        test_size: Share of the dataset reserved for the test subset.
        verbose: When true (the default, matching the previous behaviour),
            print the four resulting arrays for debugging.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    X, y = import_dataset(dataset_url, feature_cols, dependent_col)
    handle_missing_data(X, numeric_feature_cols, missing_data_handling_strategy)
    X = encode_nominal_features(X, nominal_feature_cols)
    if is_dependent_categorical:
        y = encode_independent_var(y)
    X_train, X_test, y_train, y_test = split_dataset(X, y, test_size)
    # Scaling happens after the split so the scaler only sees training data.
    scale_features(X_train, X_test, numeric_feature_cols)

    if verbose:
        # debug
        print("\nX_train\n", X_train)
        print("\nX_test\n", X_test)
        print("\ny_train\n", y_train)
        print("\ny_test\n", y_test)

    return X_train, X_test, y_train, y_test

In [43]:
# debug: run the whole pipeline on the sample dataset, reusing the
# dataset-specific params declared near the top instead of repeating literals
X_train, X_test, y_train, y_test = preprocess_dataset(
    dataset_url=DATASET_URL,
    numeric_feature_cols=NUMERIC_FEATURE_COLS,
    nominal_feature_cols=NOMINAL_FEATURE_COLS,
)


X_train
 [[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]

X_test
 [[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]

y_train
 ['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes']

y_test
 ['No' 'Yes']
