In [24]:
import ray
from ray.data.preprocessors import StandardScaler

import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

In [25]:
data_raw = load_breast_cancer()
dataset_df = pd.DataFrame(data_raw["data"], columns=data_raw["feature_names"])
dataset_df["target"] = data_raw["target"]
train_df, test_df = train_test_split(dataset_df, test_size=0.3)

Create Ray data from the pands df

In [26]:
train_dataset = ray.data.from_pandas(train_df)
valid_dataset = ray.data.from_pandas(test_df)
test_dataset = ray.data.from_pandas(test_df.drop("target", axis=1))

In [27]:
train_dataset.take(1), train_dataset.count()

([PandasRow({'mean radius': 10.9,
             'mean texture': 12.96,
             'mean perimeter': 68.69,
             'mean area': 366.8,
             'mean smoothness': 0.07515,
             'mean compactness': 0.03718,
             'mean concavity': 0.00309,
             'mean concave points': 0.006588,
             'mean symmetry': 0.1442,
             'mean fractal dimension': 0.05743,
             'radius error': 0.2818,
             'texture error': 0.7614,
             'perimeter error': 1.808,
             'area error': 18.54,
             'smoothness error': 0.006142,
             'compactness error': 0.006134,
             'concavity error': 0.001835,
             'concave points error': 0.003576,
             'symmetry error': 0.01637,
             'fractal dimension error': 0.002665,
             'worst radius': 12.36,
             'worst texture': 18.2,
             'worst perimeter': 78.07,
             'worst area': 470.0,
             'worst smoothness': 0.1171,
    

Create a preprocessor to scale some columns

In [41]:
columns_to_scale = ["mean radius", "mean texture"]
preprocessor = StandardScaler(columns=columns_to_scale)
transformed_ds = preprocessor.fit(train_dataset)

Shuffle Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 50.78it/s]
Shuffle Reduce: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 259.15it/s]


In [42]:
transformed_ds.stats_

ArrowRow({'mean(mean radius)': 14.095836683417085,
          'mean(mean texture)': 19.298768844221105,
          'std(mean radius)': 3.6256362347170907,
          'std(mean texture)': 4.33583539640955})

Let's scale all the features

In [43]:
preprocessor = StandardScaler(data_raw["feature_names"])
preprocessor

StandardScaler(columns=['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension'], ddof=0, stats=None)

In [44]:
transformed_ds = preprocessor.fit(train_dataset)

Shuffle Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 49.66it/s]
Shuffle Reduce: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 68.02it/s]


In [45]:
transformed_ds, transformed_ds.stats_

(StandardScaler(columns=['mean radius' 'mean texture' 'mean perimeter' 'mean area'
  'mean smoothness' 'mean compactness' 'mean concavity'
  'mean concave points' 'mean symmetry' 'mean fractal dimension'
  'radius error' 'texture error' 'perimeter error' 'area error'
  'smoothness error' 'compactness error' 'concavity error'
  'concave points error' 'symmetry error' 'fractal dimension error'
  'worst radius' 'worst texture' 'worst perimeter' 'worst area'
  'worst smoothness' 'worst compactness' 'worst concavity'
  'worst concave points' 'worst symmetry' 'worst fractal dimension'], ddof=0, stats={'mean(mean radius)': 14.095836683417085, 'mean(mean texture)': 19.298768844221105, 'mean(mean perimeter)': 91.78675879396987, 'mean(mean area)': 655.0567839195979, 'mean(mean smoothness)': 0.09567917085427136, 'mean(mean compactness)': 0.10374748743718593, 'mean(mean concavity)': 0.08858220778894473, 'mean(mean concave points)': 0.048591806532663324, 'mean(mean symmetry)': 0.1812886934673367, '

### Compare it to scikit-learn preprocessor

In [32]:
import sklearn
sckit_preprocessor  = sklearn.preprocessing.StandardScaler()
transformed_ds_scikit= sckit_preprocessor.fit(train_df)

In [33]:
print(transformed_ds_scikit.feature_names_in_)

['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension'
 'target']


In [46]:
print(transformed_ds_scikit.mean_)

[1.40958367e+01 1.92987688e+01 9.17867588e+01 6.55056784e+02
 9.56791709e-02 1.03747487e-01 8.85822078e-02 4.85918065e-02
 1.81288693e-01 6.28974874e-02 4.13853518e-01 1.23734925e+00
 2.95375201e+00 4.14821005e+01 7.08565578e-03 2.57469925e-02
 3.22460065e-02 1.18536457e-02 2.09743769e-02 3.88469874e-03
 1.62071683e+01 2.56024372e+01 1.07077487e+02 8.77387688e+02
 1.31264246e-01 2.50892387e-01 2.69024035e-01 1.13090399e-01
 2.88766583e-01 8.38249749e-02 6.25628141e-01]


In [47]:
print(transformed_ds_scikit.scale_)

[3.62563623e+00 4.33583540e+00 2.50317303e+01 3.64389348e+02
 1.40150966e-02 5.35671053e-02 8.11709029e-02 3.90213135e-02
 2.77667021e-02 7.21478787e-03 2.91757470e-01 5.74675669e-01
 2.16239353e+00 4.93857480e+01 2.83995351e-03 1.81512332e-02
 3.00282431e-02 6.25792495e-03 8.73985649e-03 2.80078066e-03
 4.89076557e+00 6.17509916e+00 3.42533460e+01 5.74364757e+02
 2.28811782e-02 1.58237769e-01 2.08300007e-01 6.56490870e-02
 6.24174040e-02 1.84666335e-02 4.83960298e-01]


In [48]:
import pandas as pd
import ray
from ray.data.preprocessors import MinMaxScaler

Generate two datasets 

In [67]:
dataset = ray.data.range_table(8)
dataset1, dataset2 = dataset.split(2)

Read progress: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 3908.50it/s]


In [68]:
print(dataset1.take())

[{'value': 0}, {'value': 1}, {'value': 2}, {'value': 3}]


In [69]:
print(dataset2.take())

[{'value': 4}, {'value': 5}, {'value': 6}, {'value': 7}]


Create preprocessor and transform the datasets

In [81]:
preprocessor = MinMaxScaler(["value"])

In [82]:
dataset1_transformed = preprocessor.fit_transform(dataset1)
print(dataset1_transformed.take())

Shuffle Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 682.72it/s]
Shuffle Reduce: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 344.44it/s]
Map_Batches: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 311.21it/s]

[{'value': 0.0}, {'value': 0.3333333333333333}, {'value': 0.6666666666666666}, {'value': 1.0}]





In [84]:
# we have already fitted on this preprocessor, so no need to fit
dataset2_transformed = preprocessor.transform(dataset2)
print(dataset2_transformed.take())

Map_Batches: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 253.85it/s]

[{'value': 1.3333333333333333}, {'value': 1.6666666666666667}, {'value': 2.0}, {'value': 2.3333333333333335}]





Finally, let's call `transform_batch`

In [89]:
batch = pd.DataFrame({"value": list(range(8, 12))})
batch

Unnamed: 0,value
0,8
1,9
2,10
3,11


In [90]:
batch_transformed = preprocessor.transform_batch(batch)
print(batch_transformed)

      value
0  2.666667
1  3.000000
2  3.333333
3  3.666667
