# Ignite ML Preprocessing!

This document contains examples of the Ignite ML Preprocessing Python API.

In [1]:
from sklearn.datasets import make_regression
from sklearn.datasets import make_classification
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Let's prepare a classification dataset using scikit-learn.

In [2]:
# Fix the seed so the generated dataset and the train/test split (and hence
# every score computed from them) are identical on each Restart & Run All.
# NOTE: recorded outputs further down were produced without a seed; re-run
# the notebook to refresh them.
SEED = 42

# Synthetic classification problem with scikit-learn's default sizes.
x, y = make_classification(random_state=SEED)

# Hold out part of the data for evaluation (default split proportions).
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=SEED)

### 1.1 Normalization preprocessor

In [3]:
from ggml.preprocessing import NormalizationTrainer

# Fit the normalization preprocessor on the training features.
normalization_trainer = NormalizationTrainer()
normalizer = normalization_trainer.fit(x_train)

# Apply the fitted preprocessor to the held-out features so a model trained
# on normalized data can be evaluated on matching input.
x_test_normalized = normalizer.transform(x_test)

In [4]:
from ggml.classification import DecisionTreeClassificationTrainer

# Train two decision trees with the same trainer: one on the raw training
# features and one on the normalized training features.
trainer = DecisionTreeClassificationTrainer()
model_without_normalization = trainer.fit(x_train, y_train)
x_train_normalized = normalizer.transform(x_train)
model_with_normalization = trainer.fit(x_train_normalized, y_train)

# Evaluate each model on test features preprocessed the same way as the
# features it was trained on, and report the accuracy.
predictions_without_normalization = model_without_normalization.predict(x_test)
accuracy_without_normalization = accuracy_score(y_test, predictions_without_normalization)
print("Without normalization: %f" % accuracy_without_normalization)

predictions_with_normalization = model_with_normalization.predict(x_test_normalized)
accuracy_with_normalization = accuracy_score(y_test, predictions_with_normalization)
print("With normalization: %f" % accuracy_with_normalization)

Without normalization: 0.880000
With normalization: 0.880000


### 1.2 Binarization preprocessor

In [5]:
from ggml.preprocessing import BinarizationTrainer

# The trainer is fitted on a dataset holding a single empty row; presumably
# this is enough because the threshold is supplied explicitly -- TODO confirm
# against the ggml documentation.
binarizer = BinarizationTrainer(threshold=0.5).fit([[]])

# Draw the demo values from a seeded generator so the displayed result is
# the same on every run (an unseeded np.random.rand call is not reproducible).
# NOTE: the recorded output below predates the seed; re-run to refresh it.
rng = np.random.RandomState(42)
binarizer.transform(rng.rand(10))

array([1., 1., 1., 1., 0., 1., 1., 1., 1., 0.])

### 1.3 Imputing preprocessor

In [6]:
from ggml.preprocessing import ImputerTrainer

# Rows the imputer is fitted on.
imputer_training_rows = [[1, 1, 1], [2, 2, 2]]
imputer = ImputerTrainer().fit(imputer_training_rows)

# Each row is missing exactly one value (None), each in a different column.
rows_with_missing_values = [
    [None, 4, 5],
    [4, None, 5],
    [4, 5, None],
]
imputer.transform(rows_with_missing_values)

array([[1.5, 4. , 5. ],
       [4. , 1.5, 5. ],
       [4. , 5. , 1.5]])

### 1.4 One hot encoding preprocessor

In [7]:
from ggml.preprocessing import EncoderTrainer

# Single-column dataset; column 0 is the feature to be encoded.
encoder_training_rows = [[42], [43], [42], [43]]
encoder_trainer = EncoderTrainer(encoded_features=[0])
encoder = encoder_trainer.fit(encoder_training_rows)

# Encode a separate list holding the same values the encoder was fitted on.
rows_to_encode = [[value] for value in (42, 43, 42, 43)]
encoder.transform(rows_to_encode)

array([[1., 0., 1.],
       [0., 1., 0.],
       [1., 0., 1.],
       [0., 1., 0.]])

### 1.6 MinMax scaling preprocessor

In [9]:
from ggml.preprocessing import MinMaxScalerTrainer

# Fit on two rows, so every column has minimum 1 and maximum 2.
minmax_training_rows = [[1, 1, 1], [2, 2, 2]]
scaler = MinMaxScalerTrainer().fit(minmax_training_rows)

# Transform the column minimum, the midpoint and the column maximum.
minmax_samples = [[value] * 3 for value in (1, 1.5, 2)]
scaler.transform(minmax_samples)

array([[0. , 0. , 0. ],
       [0.5, 0.5, 0.5],
       [1. , 1. , 1. ]])

### 1.7 MaxAbs scaling preprocessor

In [10]:
from ggml.preprocessing import MaxAbsScalerTrainer

# Fit on two rows, so the largest absolute value in every column is 2.
maxabs_training_rows = [[1, 1, 1], [2, 2, 2]]
scaler = MaxAbsScalerTrainer().fit(maxabs_training_rows)

# Transform three rows spanning the fitted value range.
maxabs_samples = [[value] * 3 for value in (1, 1.5, 2)]
scaler.transform(maxabs_samples)

array([[0.5 , 0.5 , 0.5 ],
       [0.75, 0.75, 0.75],
       [1.  , 1.  , 1.  ]])