https://www.tensorflow.org/decision_forests/tutorials/beginner_colab

In [3]:
!pip install tensorflow_decision_forests

Collecting wurlitzer (from tensorflow_decision_forests)
  Downloading wurlitzer-3.1.1-py3-none-any.whl.metadata (2.5 kB)
Collecting keras<2.16,>=2.15.0 (from tensorflow~=2.15.0->tensorflow_decision_forests)
  Downloading keras-2.15.0-py3-none-any.whl.metadata (2.4 kB)
Downloading wurlitzer-3.1.1-py3-none-any.whl (8.6 kB)
Downloading keras-2.15.0-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: wurlitzer, keras
  Attempting uninstall: keras
    Found existing installation: keras 3.3.3
    Uninstalling keras-3.3.3:
      Successfully uninstalled keras-3.3.3
Successfully installed keras-2.15.0 wurlitzer-3.1.1


In [4]:
# Intentionally left without a command: the install cell above already
# provides tensorflow_decision_forests, so repeating the same pip invocation
# here did no additional work.



In [5]:
import os
# Keep using Keras 2
os.environ['TF_USE_LEGACY_KERAS'] = '1'

import tensorflow_decision_forests as tfdf

import numpy as np
import pandas as pd
import tensorflow as tf
import tf_keras
import math

2024-06-28 19:51:03.726112: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-28 19:51:03.726240: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-28 19:51:03.867798: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [6]:
# Report which TensorFlow Decision Forests release is active in this kernel.
print(f"Found TensorFlow Decision Forests v{tfdf.__version__}")

Found TensorFlow Decision Forests v1.8.1


In [7]:
# Download the dataset
!wget -q https://storage.googleapis.com/download.tensorflow.org/data/palmer_penguins/penguins.csv -O /tmp/penguins.csv

# Load a dataset into a Pandas Dataframe.
dataset_df = pd.read_csv("/tmp/penguins.csv")

# Display the first 3 examples.
dataset_df.head(3)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007


In [8]:
# Turn the string class labels into integer codes.
#
# Why:
# Keras expects integer classification labels, so a string-valued label column
# has to be encoded first. This step can be skipped when the dataset is built
# with `pd_dataframe_to_tf_dataset` (see below).

# Column that holds the classification target.
label = "species"

# Distinct label values in order of first appearance; the position of a value
# in this list is the integer code it is replaced with.
classes = dataset_df[label].unique().tolist()
print(f"Label classes: {classes}")

# Look up every label's position in `classes` and overwrite the column with
# the resulting integer codes.
encoded_labels = dataset_df[label].map(classes.index)
dataset_df[label] = encoded_labels

Label classes: ['Adelie', 'Gentoo', 'Chinstrap']


In [9]:
# Split the dataset into a training and a testing dataset.

def split_dataset(dataset, test_ratio=0.30, seed=None):
  """Randomly splits a pandas dataframe into a training and a testing part.

  Every row is assigned to the testing part independently with probability
  `test_ratio`, so the resulting sizes are only approximately proportional to
  `test_ratio`.

  Args:
    dataset: Dataframe to split.
    test_ratio: Probability for a row to be placed in the testing part.
    seed: Optional integer seed. When given, a dedicated random generator is
      used and the split is reproducible from run to run. When None (the
      default), NumPy's global random state is used, as in earlier versions
      of this function.

  Returns:
    A `(train, test)` tuple of dataframes. Each row of `dataset` appears in
    exactly one of the two.
  """
  # A private RandomState keeps a seeded split independent of (and harmless
  # to) the global NumPy random state; without a seed, fall back to the global
  # generator so existing callers see unchanged behavior.
  rng = np.random if seed is None else np.random.RandomState(seed)
  test_indices = rng.rand(len(dataset)) < test_ratio
  return dataset[~test_indices], dataset[test_indices]


# Partition the full dataframe and report how many rows ended up on each side.
train_ds_pd, test_ds_pd = split_dataset(dataset_df)
print(f"{len(train_ds_pd)} examples in training, {len(test_ds_pd)} examples for testing.")

243 examples in training, 101 examples for testing.


In [10]:
# Convert both pandas splits (training first, then testing) into TensorFlow
# datasets, marking the `label` column as the prediction target.
train_ds, test_ds = (
    tfdf.keras.pd_dataframe_to_tf_dataset(split_df, label=label)
    for split_df in (train_ds_pd, test_ds_pd)
)

In [11]:

# Specify the model: a random forest from TF-DF's Keras API.
# verbose=2 makes training emit detailed logs.
model_1 = tfdf.keras.RandomForestModel(verbose=2)

# Train the model. Only the dataset is passed: the label column was attached
# when the dataset was built with `label=label`.
model_1.fit(train_ds)

Use 4 thread(s) for training
Use /tmp/tmp_jkrp8w0 as temporary training directory
Reading training dataset...
Training tensor examples:
Features: {'island': <tf.Tensor 'data:0' shape=(None,) dtype=string>, 'bill_length_mm': <tf.Tensor 'data_1:0' shape=(None,) dtype=float64>, 'bill_depth_mm': <tf.Tensor 'data_2:0' shape=(None,) dtype=float64>, 'flipper_length_mm': <tf.Tensor 'data_3:0' shape=(None,) dtype=float64>, 'body_mass_g': <tf.Tensor 'data_4:0' shape=(None,) dtype=float64>, 'sex': <tf.Tensor 'data_5:0' shape=(None,) dtype=string>, 'year': <tf.Tensor 'data_6:0' shape=(None,) dtype=int64>}
Label: Tensor("data_7:0", shape=(None,), dtype=int64)
Weights: None
Normalized tensor features:
 {'island': SemanticTensor(semantic=<Semantic.CATEGORICAL: 2>, tensor=<tf.Tensor 'data:0' shape=(None,) dtype=string>), 'bill_length_mm': SemanticTensor(semantic=<Semantic.NUMERICAL: 1>, tensor=<tf.Tensor 'Cast:0' shape=(None,) dtype=float32>), 'bill_depth_mm': SemanticTensor(semantic=<Semantic.NUMERIC

<tf_keras.src.callbacks.History at 0x7b20f0cc3e50>

In [12]:
# Attach the accuracy metric, then score the trained model on the test
# dataset. `return_dict=True` yields a mapping of metric name to value.
model_1.compile(metrics=["accuracy"])
evaluation = model_1.evaluate(test_ds, return_dict=True)

# Blank line to separate the evaluation progress output from the report.
print()

# One "name: value" line per metric, with four decimal places.
for metric_name, metric_value in evaluation.items():
  print(f"{metric_name}: {metric_value:.4f}")


loss: 0.0000
accuracy: 0.9406


In [13]:
# Persist the trained model to a local directory under /tmp.
model_1.save("/tmp/my_saved_model")

In [14]:
# Plot the first tree of the forest (tree_idx=0), limiting the drawing to the
# top 3 levels (max_depth=3).
tfdf.model_plotter.plot_model_in_colab(model_1, tree_idx=0, max_depth=3)

In [15]:
# Print the model summary: model type and task, input features, and the
# variable importance tables.
model_1.summary()

Model: "random_forest_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1 (1.00 Byte)
Trainable params: 0 (0.00 Byte)
Non-trainable params: 1 (1.00 Byte)
_________________________________________________________________
Type: "RANDOM_FOREST"
Task: CLASSIFICATION
Label: "__LABEL"

Input Features (7):
	bill_depth_mm
	bill_length_mm
	body_mass_g
	flipper_length_mm
	island
	sex
	year

No weights

Variable Importance: INV_MEAN_MIN_DEPTH:
    1. "flipper_length_mm"  0.457025 ################
    2.    "bill_length_mm"  0.438146 ##############
    3.     "bill_depth_mm"  0.306332 #####
    4.            "island"  0.295918 ####
    5.       "body_mass_g"  0.282664 ###
    6.               "sex"  0.237997 
    7.              "year"  0.237412 

Variable Importance: NUM_AS_ROOT:
    1. "flipper_length_mm" 154.000000 ################
    2.    "bill_length_mm" 81.000000 ########
    3.     "bill

In [16]:
# Build an inspector for the trained model and list its input features.
model_1_inspector = model_1.make_inspector()
model_1_inspector.features()

["bill_depth_mm" (1; #1),
 "bill_length_mm" (1; #2),
 "body_mass_g" (1; #3),
 "flipper_length_mm" (1; #4),
 "island" (4; #5),
 "sex" (4; #6),
 "year" (1; #7)]

In [17]:
# Build an inspector for the trained model and show its variable importances,
# grouped by importance measure.
model_1_inspector = model_1.make_inspector()
model_1_inspector.variable_importances()

{'INV_MEAN_MIN_DEPTH': [("flipper_length_mm" (1; #4), 0.4570245263234256),
  ("bill_length_mm" (1; #2), 0.43814621175935337),
  ("bill_depth_mm" (1; #1), 0.30633172527879127),
  ("island" (4; #5), 0.2959181589810922),
  ("body_mass_g" (1; #3), 0.282663964003),
  ("sex" (4; #6), 0.23799699153556922),
  ("year" (1; #7), 0.23741235814247647)],
 'SUM_SCORE': [("bill_length_mm" (1; #2), 26560.16192469187),
  ("flipper_length_mm" (1; #4), 25006.0942095723),
  ("island" (4; #5), 8526.813015668653),
  ("bill_depth_mm" (1; #1), 7670.761390669271),
  ("body_mass_g" (1; #3), 5453.9790897630155),
  ("sex" (4; #6), 126.21945172548294),
  ("year" (1; #7), 62.408071715326514)],
 'NUM_AS_ROOT': [("flipper_length_mm" (1; #4), 154.0),
  ("bill_length_mm" (1; #2), 81.0),
  ("bill_depth_mm" (1; #1), 44.0),
  ("body_mass_g" (1; #3), 15.0),
  ("island" (4; #5), 6.0)],
 'NUM_NODES': [("bill_length_mm" (1; #2), 628.0),
  ("flipper_length_mm" (1; #4), 390.0),
  ("bill_depth_mm" (1; #1), 370.0),
  ("body_mass_g