# Data Preprocessing Tools

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

The _**independent variables (features/inputs)**_ and the _**dependent variable (output)**_ need to be extracted separately from the dataset so that they can be fed to the training models.

In [1]:
def import_dataset(dataset_url, feature_cols, dependent_col):
    """Read a CSV and pull out the feature matrix and the dependent vector.

    Args:
        dataset_url: Anything ``pd.read_csv`` accepts as its first argument.
        feature_cols: Positional ``iloc`` indexer selecting the features,
            e.g. ``np.s_[:, :-1]``.
        dependent_col: Positional ``iloc`` indexer selecting the dependent
            variable, e.g. ``np.s_[:, -1]``.

    Returns:
        A tuple ``(dataset, X, y)``: the full DataFrame followed by the
        selected features and dependent values as NumPy arrays.
    """
    frame = pd.read_csv(dataset_url)
    selected_features = frame.iloc[feature_cols].values
    selected_target = frame.iloc[dependent_col].values
    return frame, selected_features, selected_target

## Handle missing numeric data

There are several ways to fill in missing data in the dataset, such as using the _**mean (average)**_ or _**median**_ of the whole column. Among them, the _**mean**_ strategy is the most commonly used.

We only deal with numeric inputs:

In [1]:
def handle_missing_numeric_data(features, numeric_feature_cols, strategy):
    """Fill NaN entries of the selected feature columns, in place.

    Args:
        features: Indexable feature container; the selected region is
            overwritten with the imputed values.
        numeric_feature_cols: Index applied to ``features`` to pick the
            columns to impute. ``None`` turns the call into a no-op.
        strategy: Forwarded to ``SimpleImputer(strategy=...)``.

    Returns:
        None. ``features`` is modified in place.
    """
    if numeric_feature_cols is None:
        return

    # Imported lazily so scikit-learn is only needed when imputation runs.
    from sklearn.impute import SimpleImputer

    numeric_block = features[numeric_feature_cols]
    fitted_imputer = SimpleImputer(missing_values=np.nan, strategy=strategy).fit(numeric_block)
    features[numeric_feature_cols] = fitted_imputer.transform(numeric_block)

## Encoding categorical data

For an ML model to compute the correlation between the independent and dependent variables, categorical (non-continuous) data such as _strings_ need to be encoded as numbers.

### Encoding the Independent Variable

Categorical independent variables have no inherent order, hence they should not be encoded as ordinal numbers (1, 2, 3...) by ```LabelEncoder```, but in an unordered form such as a matrix of numbers by ```OneHotEncoder```.

In [7]:
def encode_nominal_features(features, nominal_feature_cols):
    """One-hot encode the nominal feature columns.

    Args:
        features: Feature matrix handed to ``ColumnTransformer.fit_transform``.
        nominal_feature_cols: Column selector for the columns to encode.
            ``None`` means there is nothing to encode.

    Returns:
        ``features`` itself when ``nominal_feature_cols`` is ``None``;
        otherwise a new dense NumPy array holding the encoded columns
        together with the untouched remaining columns.
    """
    if nominal_feature_cols is None:
        return features

    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder

    # Drop the first encoded column of every feature to avoid the
    # "dummy variable trap" (redundant, perfectly collinear columns).
    encoder = OneHotEncoder(drop='first')
    transformer = ColumnTransformer(
        transformers=[('encoder', encoder, nominal_feature_cols)],
        # Keep the columns that are not encoded in the output.
        remainder='passthrough',
        # Always produce a dense result. With the default threshold a
        # low-density output comes back as a SciPy sparse matrix, which
        # np.array() would wrap into a useless 0-d object array.
        sparse_threshold=0,
    )

    return np.array(transformer.fit_transform(features))

### Encoding the Dependent Variable

The dependent variable can be encoded as ordinal numbers.

In [9]:
def encode_independent_var(independent_var):
    """Encode the given labels as integers with ``LabelEncoder``.

    Args:
        independent_var: Labels passed to ``LabelEncoder.fit_transform``.

    Returns:
        The result of ``LabelEncoder().fit_transform(independent_var)``.
    """
    from sklearn.preprocessing import LabelEncoder

    label_encoder = LabelEncoder()
    return label_encoder.fit_transform(independent_var)

## Splitting the dataset into the Training set and Test set

The training set should be considerably larger than the test set in order to provide a sufficient amount of input to train the model. Here we use 80% of the dataset for training and 20% for testing.

In [11]:
def split_dataset(features, independent_var, test_size):
    """Split the features and the target into training and test parts.

    Args:
        features: Feature matrix to split.
        independent_var: Target values, split in step with ``features``.
        test_size: Forwarded to ``train_test_split(test_size=...)``.

    Returns:
        Whatever ``train_test_split`` returns for the two inputs.
    """
    from sklearn.model_selection import train_test_split

    # A fixed seed keeps the split reproducible: every execution yields the
    # same training and test sets.
    split_options = {"test_size": test_size, "random_state": 0}
    return train_test_split(features, independent_var, **split_options)

## Feature Scaling

Certain (not all) models need feature scaling, which puts all the _non-categorical features_ on the same scale so that no feature dominates the others. The scaler should be fitted on the training set only (_after dataset splitting_), since we treat the test set as future data which we don't have yet.

In [13]:
def scale_features(X_train, X_test, numeric_feature_cols):
    """Standardize the selected feature columns in place.

    The scaler is fitted on ``X_train`` and the same fitted scaler is then
    applied to ``X_test``.

    Args:
        X_train: Training features, or ``None``.
        X_test: Test features, or ``None``.
        numeric_feature_cols: Index applied to both arrays to pick the
            columns to scale. ``None`` turns the call into a no-op.

    Returns:
        None. The arrays are modified in place.
    """
    if numeric_feature_cols is None:
        return

    from sklearn.preprocessing import StandardScaler

    def _has_data(array):
        # Explicit None/empty test: plain ``if array:`` raises ValueError
        # for any NumPy array holding more than one element.
        return array is not None and np.size(array) > 0

    scaler = StandardScaler()
    if _has_data(X_train):
        X_train[numeric_feature_cols] = scaler.fit_transform(X_train[numeric_feature_cols])
    if _has_data(X_test):
        X_test[numeric_feature_cols] = scaler.transform(X_test[numeric_feature_cols])

# Tool

In [2]:
def preprocess_dataset(
    dataset_url,
    feature_cols=np.s_[:, :-1],  # (from dataset) all rows, all columns except the last
    numeric_feature_cols=None,  # (from features) if provided, handle missing data & scale. TODO: separate scale
    nominal_feature_cols=None,  # (from features) if provided, encode w/ OneHotEncoder
    dependent_col=np.s_[:, -1],  # (from dataset) all rows, last column
    is_dependent_categorical=False,  # if True, encode w/ LabelEncoder
    missing_data_handling_strategy='mean',
    test_size=0.2,  # split x% of dataset into test set
    debug=False,
):
    """Run the whole preprocessing pipeline on a CSV dataset.

    Steps, in order: load the data, impute missing numeric values, one-hot
    encode nominal features, optionally label-encode the dependent
    variable, split into training/test sets, and scale the numeric features.

    Args:
        dataset_url: Location of the CSV, passed on to ``import_dataset``.
        feature_cols: Dataset indexer selecting the features.
        numeric_feature_cols: Feature indexer used for both imputation and
            scaling; ``None`` skips both steps.
        nominal_feature_cols: Feature columns to one-hot encode; ``None``
            skips encoding.
        dependent_col: Dataset indexer selecting the dependent variable.
        is_dependent_categorical: When true, the dependent variable is
            label-encoded.
        missing_data_handling_strategy: Imputation strategy name.
        test_size: Share of the data reserved for the test set. A falsy
            value disables the split; the test outputs are then ``None``.
        debug: When true, print the dataset and every resulting split.

    Returns:
        A tuple ``(dataset, X_train, X_test, y_train, y_test)``.
    """
    dataset, X, y = import_dataset(dataset_url, feature_cols, dependent_col)

    # Imputation works on X in place; encoding hands back the array to use
    # from here on.
    handle_missing_numeric_data(X, numeric_feature_cols, missing_data_handling_strategy)
    X = encode_nominal_features(X, nominal_feature_cols)
    if is_dependent_categorical:
        y = encode_independent_var(y)

    # Without a test size everything is treated as training data.
    X_train, X_test, y_train, y_test = (
        split_dataset(X, y, test_size) if test_size else (X, None, y, None)
    )

    # NOTE(review): one-hot encoding may move columns around, so
    # numeric_feature_cols might not address the same columns here as it
    # did during imputation -- confirm against the encoder's output order.
    scale_features(X_train, X_test, numeric_feature_cols)

    if debug:
        debug_dump = (
            ("\ndataset\n", dataset),
            ("\nX_train\n", X_train),
            ("\nX_test\n", X_test),
            ("\ny_train\n", y_train),
            ("\ny_test\n", y_test),
        )
        for heading, value in debug_dump:
            print(heading, value)

    return dataset, X_train, X_test, y_train, y_test

NameError: name 'np' is not defined