# Pruning in ML/AI

Pruning as a concept was originally introduced to the field of deep learning by Yann LeCun in the eerily titled paper “Optimal Brain Damage”. The word pruning means trimming or cutting away the excess; in the context of machine learning and artificial intelligence, it involves removing the redundant or least important parts of a model or search space.

# Using Pruning to Regularize a Decision Tree Classifier

We’ll be training a DecisionTreeClassifier model on the [Titanic dataset](https://www.kaggle.com/c/titanic/) available on Kaggle. In this example, we’ll use pruning as a regularization technique for the overfitting-prone DecisionTreeClassifier.

Load, clean, and split the data.

In [None]:
!python -m pip install pip --upgrade --user -q
!python -m pip install numpy pandas seaborn matplotlib scipy sklearn statsmodels tensorflow keras --user -q

In [None]:
import IPython

# Fetch the running IPython application and ask its kernel to shut down with
# the restart flag set (presumably so the freshly installed packages can be
# imported by the following cells).
running_app = IPython.Application.instance()
running_app.kernel.do_shutdown(True)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.preprocessing import LabelEncoder

In [None]:
# Download the Titanic training CSV and keep only the target ("Survived")
# plus the six feature columns used below.
data = pd.read_csv("https://raw.githubusercontent.com/agconti/kaggle-titanic/master/data/train.csv")
# The labels are passed as a list: pandas uses tuples for multi-level keys,
# so a list is the unambiguous way to select several columns.
data = data.loc[:, ["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]]

In [None]:
# Discard every row that has a missing value in any of the kept columns.
# inplace=True mutates `data` itself instead of returning a new frame.
data.dropna(inplace=True)

# Replace the string labels in "Sex" with the integer codes LabelEncoder assigns.
le = LabelEncoder()
data["Sex"] = le.fit_transform(data["Sex"])

In [None]:
x = data.iloc[:, 1:]   # every column after the first: the features
y = data.iloc[:, 0]    # first column (Survived): the target
# Hold out 30% of the rows for testing. The split is seeded with the same
# random_state=42 the classifiers use, so accuracies are reproducible run to run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [None]:
# Show the target values as the cell output.
y

In [None]:
# List the distinct values present in the target.
y.unique()

  Create a baseline model, train and evaluate it.

In [None]:
from sklearn.tree import DecisionTreeClassifier

Let’s train the model and evaluate it, and then visualize the tree.

In [None]:
# Fit a tree with no depth limit set (fixed seed) on the training split
# (features, target), then predict labels for the held-out rows.
dt_classifier = DecisionTreeClassifier(random_state=42).fit(x_train, y_train)
pred = dt_classifier.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score

# Compare the true test labels with the labels predicted for the test rows;
# the resulting score is shown as the cell output.
accuracy_score(y_true=y_test, y_pred=pred)

In [None]:
# Draw the fitted tree on a large canvas so the node text stays legible.
fig = plt.figure(figsize=(25, 20))
_ = tree.plot_tree(
    dt_classifier,
    # Pass a plain list: some scikit-learn releases reject a pandas Index here.
    feature_names=list(x.columns),
    class_names=["Died", "Survived"],
    filled=True,
)

Prune the tree by searching for the optimum depth.

In [None]:
from sklearn.model_selection import cross_val_score

# Score each candidate depth (1..29) with 5-fold cross-validation on the
# training split only. Picking the depth by test-set accuracy would leak the
# test data into model selection and make the final test score optimistic.
max_depth = []   # candidate depths, in the order they were tried
acc = []         # mean cross-validated accuracy for the matching depth
for candidate_depth in range(1, 30):
    candidate = DecisionTreeClassifier(max_depth=candidate_depth, random_state=42)
    fold_scores = cross_val_score(candidate, x_train, y_train, cv=5, scoring="accuracy")
    acc.append(float(fold_scores.mean()))
    max_depth.append(candidate_depth)

In [None]:
# Best accuracy found across the candidate depths.
print(max(acc))

In [None]:
# Accuracy for every candidate depth, in the order the depths were tried.
print(acc)

In [None]:
# Look up the depth that produced the best accuracy. Reading it from the
# recorded `max_depth` list avoids a hard-coded "+ 1" offset that is only
# right while the search starts at depth 1. list.index returns the first
# match, so ties resolve to the earliest (shallowest) depth tried.
depth = max_depth[acc.index(max(acc))]

In [None]:
# Show the selected depth as the cell output.
depth

Let’s visualize the pruned tree.

In [None]:
# Refit with the selected depth cap, predict the held-out rows, and show the
# resulting test accuracy as the cell output.
dt_classifier = DecisionTreeClassifier(max_depth=depth, random_state=42).fit(x_train, y_train)
pred = dt_classifier.predict(x_test)
accuracy_score(y_true=y_test, y_pred=pred)

In [None]:
# Draw the depth-limited tree on a large canvas so the node text stays legible.
fig = plt.figure(figsize=(25, 20))
_ = tree.plot_tree(
    dt_classifier,
    # Pass a plain list: some scikit-learn releases reject a pandas Index here.
    feature_names=list(x.columns),
    class_names=["Died", "Survived"],
    filled=True,
)

# Compressing a Neural Network 

In this section, we illustrate the use of pruning for compressing a convolutional neural network model.

  Install tensorflow-model-optimization and create the baseline model

In [None]:
! pip install -q tensorflow-model-optimization
import tempfile
import os
import tensorflow as tf
import numpy as np
from tensorflow import keras

# Fetch MNIST as (images, labels) pairs for the training and test sets.
mnist = keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Divide by 255 so every pixel value lies between 0 and 1.
train_images, test_images = train_images / 255.0, test_images / 255.0

# Architecture: 28x28 input reshaped to a single-channel image, one 3x3
# convolution with 12 filters, 2x2 max-pooling, then a dense layer producing
# 10 outputs with no activation.
model = keras.Sequential()
model.add(keras.layers.InputLayer(input_shape=(28, 28)))
model.add(keras.layers.Reshape(target_shape=(28, 28, 1)))
model.add(keras.layers.Conv2D(filters=12, kernel_size=(3, 3), activation='relu'))
model.add(keras.layers.MaxPooling2D(pool_size=(2, 2)))
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(10))

# Compile with Adam and a sparse categorical cross-entropy that takes logits,
# tracking accuracy as the metric.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)
# Train for 4 epochs, holding out 10% of the training data for validation.
model.fit(train_images, train_labels, epochs=4, validation_split=0.1)

  Evaluate and save the baseline model

In [None]:
# Score the baseline on the test set; the second returned value is the
# accuracy metric the model was compiled with.
_, baseline_model_accuracy = model.evaluate(test_images, test_labels, verbose=0)
print('Baseline test accuracy:', baseline_model_accuracy)

# mkstemp returns an already-open OS file descriptor together with the path.
# Only the path is needed here, so close the descriptor instead of leaking it.
keras_fd, keras_file = tempfile.mkstemp('.h5')
os.close(keras_fd)
tf.keras.models.save_model(model, keras_file, include_optimizer=False)

Prune the neural network.

In [None]:
import tensorflow_model_optimization as tfmot

prune_low_magnitude = tfmot.sparsity.keras.prune_low_magnitude

# Pruning should finish after 2 epochs, so compute the step at which that
# happens: batches per epoch (over the non-validation share of the training
# images, rounded up) multiplied by the number of epochs.
batch_size, epochs, validation_split = 128, 2, 0.1
num_images = train_images.shape[0] * (1 - validation_split)
batches_per_epoch = np.ceil(num_images / batch_size).astype(np.int32)
end_step = batches_per_epoch * epochs

# Wrap the trained model for pruning. The schedule starts at 50% sparsity on
# step 0 and reaches 80% sparsity at end_step.
pruning_params = dict(
    pruning_schedule=tfmot.sparsity.keras.PolynomialDecay(
        initial_sparsity=0.50,
        final_sparsity=0.80,
        begin_step=0,
        end_step=end_step,
    ),
)
model_for_pruning = prune_low_magnitude(model, **pruning_params)

# The wrapped model has to be compiled again before it can be trained.
model_for_pruning.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# Callbacks: one advances the pruning step during training, the other writes
# pruning summaries into a fresh temporary log directory.
logdir = tempfile.mkdtemp()
callbacks = [
    tfmot.sparsity.keras.UpdatePruningStep(),
    tfmot.sparsity.keras.PruningSummaries(log_dir=logdir),
]

# Fine-tune with pruning active, using the batch size, epoch count and
# validation share defined for the schedule.
model_for_pruning.fit(
    train_images,
    train_labels,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=validation_split,
    callbacks=callbacks,
)

In [None]:
 # Strip the pruning wrappers before export. They carry training-only
 # variables that would otherwise be written to the file and inflate the size
 # that is later compared against the baseline.
 model_for_export = tfmot.sparsity.keras.strip_pruning(model_for_pruning)
 # mkstemp returns an open file descriptor together with the path; only the
 # path is needed, so close the descriptor instead of leaking it.
 pruned_fd, pruned_keras_file = tempfile.mkstemp('.h5')
 os.close(pruned_fd)
 tf.keras.models.save_model(model_for_export, pruned_keras_file, include_optimizer=False)

Evaluate and compare with baseline.

In [None]:
# Score the pruned model on the same test set, then print both accuracies
# one after the other for comparison.
_, model_for_pruning_accuracy = model_for_pruning.evaluate(test_images, test_labels, verbose=0)
print('Baseline test accuracy:', baseline_model_accuracy)
print('Pruned test accuracy:', model_for_pruning_accuracy)

There is a very small drop in performance. Now let’s compare the size of the two models.

In [None]:
def get_gzipped_model_size(file):
  """Return the size in bytes of `file` after DEFLATE compression.

  The file is written into a temporary ZIP archive (despite the function's
  name, the container is ZIP rather than gzip), the archive's on-disk size is
  measured, and the archive is then deleted.

  Args:
    file: Path of the file to compress.

  Returns:
    Size of the compressed archive in bytes.
  """
  import os
  import tempfile
  import zipfile

  # mkstemp returns an open descriptor plus the path; only the path is needed,
  # so close the descriptor right away rather than leaking it.
  fd, zipped_file = tempfile.mkstemp('.zip')
  os.close(fd)
  try:
    with zipfile.ZipFile(zipped_file, 'w', compression=zipfile.ZIP_DEFLATED) as archive:
      archive.write(file)
    return os.path.getsize(zipped_file)
  finally:
    # The archive exists only to be measured; do not leave it in the temp dir.
    os.remove(zipped_file)

# Report the compressed size of each saved model, baseline first.
baseline_zip_size = get_gzipped_model_size(keras_file)
print("Size of gzipped baseline Keras model: %.2f bytes" % baseline_zip_size)
pruned_zip_size = get_gzipped_model_size(pruned_keras_file)
print("Size of gzipped pruned Keras model: %.2f bytes" % pruned_zip_size)