In [346]:
#Preprocessing Layers in TensorFlow: Numerical Features

In [347]:
import tensorflow as tf
import numpy as np

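**Normalization** is a crucial preprocessing step in machine learning, especially when dealing with numerical features. The `tf.keras.layers.Normalization` layer standardizes the data to zero mean and unit variance by computing `(x - mean) / sqrt(variance)` with statistics learned in `adapt()`. The result is centered around 0 but is not confined to a fixed range such as [-1, 1] or [0, 1]. Putting features on a comparable scale improves model convergence and performance.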

In [348]:
# Create sample data: 5 rows x 3 columns drawn from a standard normal distribution.
# The op-level seed makes the draw repeatable after a kernel restart, so the
# values shown below can be reproduced with Restart & Run All.
input_data = tf.random.normal((5, 3), seed=42)
input_data

<tf.Tensor: shape=(5, 3), dtype=float32, numpy=
array([[ 0.6901324 ,  1.1230894 , -0.4044459 ],
       [-0.33085328, -1.759656  ,  0.51515794],
       [ 0.8582079 ,  0.71087277,  0.74407357],
       [ 1.3581398 ,  0.25683084, -0.894563  ],
       [-0.5311769 , -1.7492015 ,  2.863261  ]], dtype=float32)>

In [349]:
# Create normalization layer.
# axis=-1 keeps a separate mean/variance for each index along the last axis,
# i.e. one pair of statistics per column of the (5, 3) sample.
normalization_layer = tf.keras.layers.Normalization(axis=-1)

In [350]:
# Adapt layer to data: compute the mean and variance of input_data and store
# them in the layer so they are reused on every later call.
normalization_layer.adapt(input_data)

In [351]:
# Apply layer to data: each value becomes (x - mean) / sqrt(variance),
# using the statistics learned by adapt().
normalized_data = normalization_layer(input_data)

In [352]:
# Show the standardized values; every column now has mean ~0 and variance ~1.
print(normalized_data)

tf.Tensor(
[[ 0.3890413   1.1419976  -0.7483021 ]
 [-1.0232832  -1.1982902  -0.03825024]
 [ 0.62153935  0.80734944  0.13850193]
 [ 1.3130927   0.43874636 -1.1267353 ]
 [-1.30039    -1.189803    1.774786  ]], shape=(5, 3), dtype=float32)


In [353]:
# Create some example data: a single feature column holding the values 1..5,
# shape (5, 1), stored as float32.
data = np.array([[1.], [2.], [3.], [4.], [5.]], dtype=np.float32)

In [354]:
# axis=None: a single scalar mean and a single scalar variance are computed
# over every element of the input instead of per-feature statistics.
norm_layers = tf.keras.layers.Normalization(axis=None)

In [355]:
# Learn the scalar statistics from the example data
# (for the values 1..5 that is mean 3 and population variance 2).
norm_layers.adapt(data)

In [356]:
# Standardize the example data with the adapted mean and variance.
normalized_data2 = norm_layers(data)

In [357]:
# Display the result: (x - 3) / sqrt(2), symmetric around 0.
normalized_data2

<tf.Tensor: shape=(5, 1), dtype=float32, numpy=
array([[-1.4142135 ],
       [-0.70710677],
       [ 0.        ],
       [ 0.70710677],
       [ 1.4142135 ]], dtype=float32)>

In [358]:
# Normalizing a 3-D tensor

In [359]:
# Sample 3-D input of shape (3, 5, 4) drawn from a standard normal distribution.
# The op-level seed makes the draw repeatable after a kernel restart.
input_data_3d = tf.random.normal((3, 5, 4), seed=42)

In [360]:
# Inspect the raw 3-D tensor before it is normalized.
input_data_3d

<tf.Tensor: shape=(3, 5, 4), dtype=float32, numpy=
array([[[ 1.1394294e-01, -7.9690677e-01, -7.5026549e-02, -2.0668525e-02],
        [ 1.2138311e+00,  1.9032490e-01,  6.3382113e-01, -2.6454967e-01],
        [ 6.0395432e-01,  2.6996925e-01, -4.6654311e-01,  2.2336443e-01],
        [-1.1920804e+00,  5.3192407e-02, -2.5607584e+00,  5.6495464e-01],
        [ 4.8633823e-01,  4.0342247e-01, -1.4581898e-01, -1.1946557e+00]],

       [[-1.0027706e-01, -2.5753412e-01,  8.9890547e-02,  1.5033059e+00],
        [-3.2993096e-01, -4.6450320e-01, -2.4872078e-01,  7.9795748e-01],
        [ 1.8046671e-01, -5.3034770e-01, -3.7006596e-01,  1.2195622e+00],
        [ 8.7622964e-01, -8.2825997e-04, -4.7032577e-01,  6.7932314e-01],
        [ 1.8567688e+00,  1.8781151e-01,  1.5186881e+00, -4.4864246e-01]],

       [[-4.1385281e-01,  4.4842219e-01,  1.8187754e+00, -8.7808192e-01],
        [ 1.2056322e-01, -1.0891794e+00,  1.0221947e-01, -1.3304704e-01],
        [ 7.5128150e-01, -4.4769579e-01, -2.0908813e-01, 

In [361]:
# axis=None again: one scalar mean/variance is shared by all elements of the
# tensor, regardless of its rank.
norm_layers_3d = tf.keras.layers.Normalization(axis=None)

In [362]:
# Learn the mean and variance from the 3-D sample.
norm_layers_3d.adapt(input_data_3d)

In [363]:
# Standardize the 3-D sample; the output keeps the input shape (3, 5, 4).
normalized_3d_data = norm_layers_3d(input_data_3d)

In [364]:
# Display the standardized 3-D tensor.
normalized_3d_data

<tf.Tensor: shape=(3, 5, 4), dtype=float32, numpy=
array([[[ 0.04275896, -1.0330054 , -0.18042456, -0.1162247 ],
        [ 1.3417882 ,  0.1329703 ,  0.6567641 , -0.40426195],
        [ 0.6214897 ,  0.22703472, -0.6428274 ,  0.17199184],
        [-1.4997274 , -0.02899082, -3.1162126 ,  0.5754289 ],
        [ 0.48257855,  0.3846504 , -0.26403436, -1.5027689 ]],

       [[-0.21024682, -0.3959762 ,  0.01435173,  1.6836742 ],
        [-0.4814809 , -0.6404182 , -0.38556713,  0.85061824],
        [ 0.12132724, -0.71818423, -0.52888256,  1.348557  ],
        [ 0.94306207, -0.09279224, -0.647295  ,  0.7105045 ],
        [ 2.1011336 ,  0.13000184,  1.7018414 , -0.62168574]],

       [[-0.5805973 ,  0.4377976 ,  2.0562613 , -1.1288778 ],
        [ 0.05057787, -1.3781955 ,  0.02891288, -0.24895   ],
        [ 0.79549134, -0.62056774, -0.33875874, -1.8172241 ],
        [ 0.22502117,  1.9747344 ,  2.2433414 , -0.2059314 ],
        [ 0.32863188, -0.72268003, -0.67686754, -1.1536391 ]]],
      dtype=f

The **tf.keras.layers.Discretization** layer in TensorFlow is used to convert continuous numerical features into discrete categories (bins). This can be useful for tasks like:

1. **Feature engineering:** Creating new categorical features from continuous ones.

2. **Categorical modeling:** Using categorical features as input to models like decision trees, random forests, or gradient boosting machines.

**Key arguments:**


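*   **num_bins**: The number of bins to create. When this is used instead of explicit boundaries, call `adapt()` on sample data so the layer can learn the bin edges.
*   **bin_boundaries**: A list or array specifying the edges of the bins. `n` boundaries produce `n + 1` bins, because the first and last bins extend to -inf and +inf. When boundaries are provided, `adapt()` is not needed.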



In [365]:
# Ten samples drawn uniformly from [0, 100), shape (10, 1).
# NOTE: this rebinds `input_data`, which held the (5, 3) normal sample earlier.
# The op-level seed makes the draw repeatable after a kernel restart.
input_data = tf.random.uniform((10, 1), minval=0, maxval=100, seed=42)

In [366]:
# Define the bin boundaries for discretization.
# Four interior edges give five bins over the sampled range:
#   (-inf, 20), [20, 40), [40, 60), [60, 80), [80, +inf)
# An extra edge at 100 would only add a sixth bin, [100, +inf), that data
# drawn from [0, 100) can never fall into, so it is omitted.
bin_boundaries = [20, 40, 60, 80]

In [367]:
# Create a Discretization layer with specified bin boundaries.
# With explicit boundaries no adapt() call is needed; n boundaries yield
# n + 1 bins because the outermost bins are open-ended.
discretization_layer = tf.keras.layers.Discretization(bin_boundaries=bin_boundaries)

In [368]:
# Apply the layer to the input data: every value is replaced by the integer
# index of the bin it falls into (0 for values below the first boundary).
discretized_data = discretization_layer(input_data)

In [369]:
# Print the raw samples and their bin indexes so the two columns can be
# compared row by row.
print("Input Data:\n", input_data.numpy())
print("Discretized Data (Bin Indexes):\n", discretized_data.numpy())

Input Data:
 [[11.892486 ]
 [29.69886  ]
 [42.734016 ]
 [82.27815  ]
 [44.752716 ]
 [79.05449  ]
 [ 3.440833 ]
 [ 5.6203723]
 [84.64247  ]
 [90.46111  ]]
Discretized Data (Bin Indexes):
 [[0]
 [1]
 [2]
 [4]
 [2]
 [3]
 [0]
 [0]
 [4]
 [4]]


In [370]:
import tensorflow as tf

# Create a sample input tensor with a different range: ten values drawn
# uniformly from [-50, 300). The op-level seed makes the draw repeatable
# after a kernel restart.
input_data = tf.random.uniform((10, 1), minval=-50, maxval=300, seed=42)

# Discretize the input data into 5 bins. Four boundaries split the real line as
#   (-inf, 20), [20, 60), [60, 150), [150, 280), [280, +inf)
# so the bins are deliberately of unequal width.
discretization_layer = tf.keras.layers.Discretization(bin_boundaries=[20, 60, 150, 280])
discretized_data = discretization_layer(input_data)

print("Input Data:\n", input_data.numpy())
print("Discretized Data:\n", discretized_data.numpy())


Input Data:
 [[-48.07038  ]
 [235.02579  ]
 [-15.926788 ]
 [-26.573605 ]
 [ 26.909676 ]
 [  7.0105247]
 [251.46362  ]
 [102.75955  ]
 [-28.914278 ]
 [ 76.64446  ]]
Discretized Data:
 [[0]
 [3]
 [0]
 [0]
 [1]
 [0]
 [3]
 [2]
 [0]
 [2]]


The **tf.data.Dataset** API provides a way to build complex input pipelines from simple, reusable pieces. It allows you to load, preprocess, and iterate through data in an efficient and scalable manner.

In [371]:
# A 1-D tensor of seven integers used as the source for the dataset examples.
# NOTE: this rebinds `data`, which earlier held the NumPy array used in the
# scalar normalization example.
data = tf.constant([3,2,5,6,1,7,9])

In [372]:
# View the tensor's contents as a NumPy array.
data.numpy()

array([3, 2, 5, 6, 1, 7, 9], dtype=int32)

In [373]:
# Build a dataset by slicing `data` along its first axis: each of the seven
# scalars becomes one dataset element.
dataset = tf.data.Dataset.from_tensor_slices(data)

In [374]:
# Iterate over the dataset; every element is a scalar tensor, converted to a
# plain NumPy value for printing.
for element in dataset:
  print(element.numpy())

3
2
5
6
1
7
9


In [375]:
# A minimal element-wise transformation for demonstrating Dataset.map
def preprocess(x):
    """Return ``x`` multiplied by two.

    Args:
        x: Any value that supports ``*`` with an integer (a number, a NumPy
            array, a tensor, ...).

    Returns:
        The result of ``x * 2``.
    """
    doubled = x * 2
    return doubled

In [376]:

# Apply transformations.
# NOTE(review): `dataset` is rebound to its own mapped version, so this cell is
# not idempotent -- running it a second time doubles the values again.
dataset = dataset.map(preprocess)  # Apply preprocess to every element



# Print elements of the transformed dataset (each original value times two)
for element in dataset:
    print(element.numpy())

6
4
10
12
2
14
18


In [377]:
# Group consecutive elements into batches of two. The dataset holds seven
# elements, so the final batch contains a single element.
# NOTE(review): `dataset` is rebound again; re-running this cell would batch
# the already-batched dataset.
dataset = dataset.batch(2)          # Batch elements
# Print elements of the transformed dataset (one array per batch)
for element in dataset:
    print(element.numpy())


[6 4]
[10 12]
[ 2 14]
[18]


In [378]:
# Shuffle dataset.
# Because `dataset` was batched in the previous cell, shuffle() reorders whole
# batches here; the elements inside each batch keep their original pairing.
# To mix individual elements, shuffle would have to be applied before batch().
dataset = dataset.shuffle(buffer_size=10)  # Shuffle dataset
# Print elements of the transformed dataset (batches in shuffled order)
for element in dataset:
    print(element.numpy())

[10 12]
[6 4]
[ 2 14]
[18]
