# Scaling and clipping the data

In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to local modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import transformers

In [13]:
# Seed the global NumPy RNG so the random matrix below is the same on every run.
np.random.seed(1234)

# 20 rows x 10 columns (columns are treated as features later on),
# drawn uniformly from [0, 1).
X = np.random.rand(20, 10)

# Clipping is switched on and no clip_range is supplied, so the scaler is
# fitted on X. NOTE(review): scaler_str=None presumably means "no scaling
# step, clip only" -- confirm against GeneralScaler.
scaler = transformers.GeneralScaler(
    scaler_str=None,
    clip=True,
)
scaler.fit(X, y=None)

In [14]:
# Overall (min, max) of the data the scaler was fitted on.
X.min(), X.max()

(0.006208516587129398, 0.9946538286442945)

In [15]:
# Transform the very data the scaler was fitted on and show its (min, max);
# the displayed range matches the range of X itself.
X_aux = scaler.transform(X)
(np.min(X_aux), np.max(X_aux))

(0.006208516587129398, 0.9946538286442945)

In [16]:
# Out-of-range test data: -2*X stacked on top of 2*X, so values fall
# both below and above the range seen during fit.
X_test = np.vstack([-2 * X, 2 * X])
X_test.min(), X_test.max()

(-1.989307657288589, 1.989307657288589)

Notice that the scaler clips both ends: the minimum and maximum of the transformed test data match the range of the data used in `fit`.

In [17]:
# Apply the fitted scaler to the out-of-range data and show the resulting (min, max).
X_test_scaled = scaler.transform(X_test)
(np.min(X_test_scaled), np.max(X_test_scaled))

(0.006208516587129398, 0.9946538286442945)

We can also specify the `clip_range` manually.

Suppose we know that every feature lies between -10 and 10. We can then pass two vectors, `[-10, -10, ...]` and `[10, 10, ...]`, as the lower and upper clipping bounds.

In [60]:
# Fresh random data; the global NumPy RNG is not reseeded in this cell,
# so the draw continues from whatever state the RNG is currently in.
X = np.random.rand(20, 10)
n_features = X.shape[1]

# One lower and one upper bound per feature: every feature is limited to [-10, 10].
min_all_data = np.full(n_features, -10.0)
max_all_data = np.full(n_features, 10.0)

# The bounds are passed explicitly as [lower_bounds, upper_bounds].
scaler = transformers.GeneralScaler(
    scaler_str=None,
    clip=True,
    clip_range=[min_all_data, max_all_data],
)

In [61]:
# Test data spanning roughly [-2, 2]. The scaler is used without a prior
# fit() call here; NOTE(review): presumably the explicit clip_range makes
# fitting unnecessary -- confirm against GeneralScaler.
X_test = np.vstack([-2 * X, 2 * X])
X_test_scaled = scaler.transform(X_test)

In [64]:
# Test data far outside the [-10, 10] clipping range.
X_test = np.vstack([-200 * X, 200 * X])
raw_min, raw_max = np.min(X_test), np.max(X_test)
# Report the raw range BEFORE calling transform; the order of these
# statements is kept on purpose.
print(f"min={raw_min}, max={raw_max}")

X_test_scaled = scaler.transform(X_test)
clipped_min, clipped_max = np.min(X_test_scaled), np.max(X_test_scaled)
print(f"min={clipped_min}, max={clipped_max}")

min=-197.856513643292, max=197.856513643292
min=-10.0, max=10.0


We can also use a different clipping range for each feature.


In [102]:
# Per-feature clipping bounds for three features:
# feature 0 -> [0, 1], feature 1 -> [0, 0.5], feature 2 -> [0, 2].
min_all_data = np.array([0, 0, 0])
max_all_data = np.array([1, 0.5, 2])

scaler = transformers.GeneralScaler(
    scaler_str=None,
    clip=True,
    clip_range=[min_all_data, max_all_data],
)

In [103]:
# Reseed so this three-feature example is reproducible on its own.
np.random.seed(1234)
X = np.random.rand(20, 3)

# -2*X stacked on 2*X: values on both sides of the per-feature bounds.
X_test = np.vstack([-2 * X, 2 * X])
X_test_scaled = scaler.transform(X_test)

# Shape of the stacked test matrix (rows, columns).
X_test.shape

(40, 3)

In [104]:
# Per-feature maxima of the *clipped* data; each value should not exceed the
# corresponding upper bound in max_all_data ([1, 0.5, 2]).
# The original cell inspected X_test, which only shows clipped values if
# transform() happens to modify its input in place; reading X_test_scaled
# shows the clipped result regardless of that.
np.max(X_test_scaled, axis=0)

array([1.        , 0.5       , 1.91627871])

In [105]:
# Print the clipped (min, max) of every feature column, one line per column.
n_features = X_test.shape[1]
for col_id in range(n_features):
    column = X_test_scaled[:, col_id]
    col_min, col_max = np.min(column), np.max(column)
    # The second literal starts with 12 spaces to reproduce the original
    # output layout exactly.
    print(
        f"min col {col_id} is {col_min},"
        f"            max col {col_id} is {col_max}"
    )

min col 0 is 0.0,            max col 0 is 1.0
min col 1 is 0.0,            max col 1 is 0.5
min col 2 is 0.0,            max col 2 is 1.9162787073674103
