### Min Max Scaler

In [1]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Toy dataset: 10 samples x 2 features, random integers drawn from [0, 100).
data = np.random.randint(low=0, high=100, size=(10, 2))
data

array([[92, 37],
       [90, 89],
       [91, 87],
       [67, 90],
       [74, 96],
       [57, 27],
       [58, 17],
       [19, 78],
       [26, 42],
       [ 5, 56]])

In [2]:
# Fit the scaler first (in a real workflow: on the training set only);
# fit() returns the fitted estimator, so it can be chained onto the constructor.
scaler_model = MinMaxScaler().fit(data)
# The same fitted scaler is then used to transform both the train and the test data.
scaled_data = scaler_model.transform(data)
scaled_data



array([[ 1.        ,  0.25316456],
       [ 0.97701149,  0.91139241],
       [ 0.98850575,  0.88607595],
       [ 0.71264368,  0.92405063],
       [ 0.79310345,  1.        ],
       [ 0.59770115,  0.12658228],
       [ 0.6091954 ,  0.        ],
       [ 0.16091954,  0.7721519 ],
       [ 0.24137931,  0.3164557 ],
       [ 0.        ,  0.49367089]])

In [3]:
# Try to reverse engineer the result by hand:
#     original = scaled * (col_max - col_min) + col_min      (applied per column)
# NOTE: despite the 'max:' / 'min:' labels, the two printed lines are the
# recovered original values of column 0 and column 1 respectively.
data_max = data.max(axis=0)  # per-column maximum
data_min = data.min(axis=0)  # per-column minimum
print('max:', scaled_data[:, 0] * (data_max[0] - data_min[0]) + data_min[0])
print('min:', scaled_data[:, 1] * (data_max[1] - data_min[1]) + data_min[1])

max: [ 92.  90.  91.  67.  74.  57.  58.  19.  26.   5.]
min: [ 37.  89.  87.  90.  96.  27.  17.  78.  42.  56.]


In [4]:
# If the data is a DataFrame, we need to rebuild the DataFrame after scaling (transform returns an np.ndarray)
# X_train = pd.DataFrame(data=scaler.transform(X_train),columns = X_train.columns,index=X_train.index)

### Train Test Split

In [5]:
from sklearn.model_selection import train_test_split
import pandas as pd
# Synthetic frame: 50 rows of random integers in [0, 100];
# three feature columns plus one 'label' column.
data = pd.DataFrame(
    np.random.randint(0, 101, size=(50, 4)),
    columns=['f1', 'f2', 'f3', 'label'],
)

# Separate the target column from the features.
y_data = data['label']
X_data = data.drop(labels='label', axis='columns')

# Hold out 30% of the rows for testing; the fixed random_state makes the
# shuffle deterministic for a given input.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.3, random_state=101
)

X_train.shape

(35, 3)

### One Hot

In [None]:
# One-hot encoding of string categories.
# CategoricalEncoder only ever existed in scikit-learn development builds and was
# never part of a stable release, so importing it raises ImportError. Its one-hot
# behaviour was folded into OneHotEncoder (scikit-learn >= 0.20), which accepts
# string columns directly.
from sklearn.preprocessing import OneHotEncoder
# handle_unknown='ignore': categories not seen during fit do not raise at transform time.
enc = OneHotEncoder(handle_unknown='ignore')

In [8]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# define example
data = ['cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'warm', 'cold', 'warm', 'hot']
values = array(data)
print(values)

# integer encode: map every distinct string to an integer class id
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print(integer_encoded)

# binary encode
# The encoder expects a 2-D (n_samples, n_features) array, so turn the ids into a single column.
integer_encoded = integer_encoded.reshape(-1, 1)
# Do not pass `sparse=False`: that keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4. Keeping the default sparse output and
# densifying with .toarray() gives the same dense array on old and new releases.
onehot_encoder = OneHotEncoder()
onehot_encoded = onehot_encoder.fit_transform(integer_encoded).toarray()
print(onehot_encoded)

# invert first example: argmax finds the column holding the 1, which is the class id
inverted = label_encoder.inverse_transform([argmax(onehot_encoded[0, :])])
print(inverted)

['cold' 'cold' 'warm' 'cold' 'hot' 'hot' 'warm' 'cold' 'warm' 'hot']
[0 0 2 0 1 1 2 0 2 1]
[[ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 0.  0.  1.]
 [ 1.  0.  0.]
 [ 0.  1.  0.]
 [ 0.  1.  0.]
 [ 0.  0.  1.]
 [ 1.  0.  0.]
 [ 0.  0.  1.]
 [ 0.  1.  0.]]
['cold']
