In [14]:
import tensorflow_decision_forests as tfdf

import os
import numpy as np
import pandas as pd
import tensorflow as tf
import math

try:
  from wurlitzer import sys_pipes
except:
  from colabtools.googlelog import CaptureLog as sys_pipes

from IPython.core.magic import register_line_magic
from IPython.display import Javascript

In [4]:
print("Found TensorFlow Decision Forests v" + tfdf.__version__)


Found TensorFlow Decision Forests v1.5.0


In [8]:
# Download the dataset
!wget -q https://storage.googleapis.com/download.tensorflow.org/data/palmer_penguins/penguins.csv -O penguins.csv

# Load a dataset into a Pandas Dataframe.
dataset_df = pd.read_csv("penguins.csv")

# Display the first 3 examples.
dataset_df.head(3)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007


In [9]:
# Encode the categorical label into an integer.
#
# Details:
# This stage is necessary if your classification label is represented as a
# string. Note: Keras expected classification labels to be integers.

# Name of the label column.
label = "species"

classes = dataset_df[label].unique().tolist()
print(f"Label classes: {classes}")

dataset_df[label] = dataset_df[label].map(classes.index)

dataset_df

Label classes: ['Adelie', 'Gentoo', 'Chinstrap']


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,0,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,0,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,0,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,0,Torgersen,,,,,,2007
4,0,Torgersen,36.7,19.3,193.0,3450.0,female,2007
...,...,...,...,...,...,...,...,...
339,2,Dream,55.8,19.8,207.0,4000.0,male,2009
340,2,Dream,43.5,18.1,202.0,3400.0,female,2009
341,2,Dream,49.6,18.2,193.0,3775.0,male,2009
342,2,Dream,50.8,19.0,210.0,4100.0,male,2009


In [10]:
# Split the dataset into a training and a testing dataset.

def split_dataset(dataset, test_ratio=0.30):
  """Splits a panda dataframe in two."""
  test_indices = np.random.rand(len(dataset)) < test_ratio
  return dataset[~test_indices], dataset[test_indices]


train_ds_pd, test_ds_pd = split_dataset(dataset_df)
print("{} examples in training, {} examples for testing.".format(
    len(train_ds_pd), len(test_ds_pd)))

234 examples in training, 110 examples for testing.


In [11]:
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_ds_pd, label=label)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_ds_pd, label=label)

2023-09-22 15:50:30.594344: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-09-22 15:50:30.690695: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-09-22 15:50:30.690935: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [13]:
# Specify the model.
model_1 = tfdf.keras.RandomForestModel()

# Optionally, add evaluation metrics.
model_1.compile(
    metrics=["accuracy"])

model_1.fit(x=train_ds)

Use /tmp/tmpjogpm51w as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.194934. Found 234 examples.
Training model...
Model trained in 0:00:00.055773
Compiling model...
Model compiled.


[INFO 23-09-22 15:52:10.0521 -03 kernel.cc:1243] Loading model from path /tmp/tmpjogpm51w/model/ with prefix 8b5d49c4e7b64919
[INFO 23-09-22 15:52:10.0592 -03 decision_forest.cc:660] Model loaded with 300 root(s), 4372 node(s), and 7 input feature(s).
[INFO 23-09-22 15:52:10.0592 -03 abstract_model.cc:1311] Engine "RandomForestGeneric" built
[INFO 23-09-22 15:52:10.0592 -03 kernel.cc:1075] Use fast generic engine


<keras.src.callbacks.History at 0x7f4ba78d2ca0>

In [17]:
y_pred = model_1.predict(x=test_ds)



In [22]:
import sys
sys.path.append('../') 
import utils_exec_models as ut

In [26]:
from sklearn.metrics import classification_report

print(classification_report(
    test_ds_pd.species.values, 
    y_pred, 
#     target_names=target_names
))


ValueError: Classification metrics can't handle a mix of multiclass and continuous-multioutput targets

In [27]:
test_ds_pd.species

3      0
4      0
6      0
7      0
8      0
      ..
330    2
331    2
333    2
337    2
343    2
Name: species, Length: 110, dtype: int64

In [28]:
y_pred

array([[0.63333285, 0.15999992, 0.20666654],
       [0.99999917, 0.        , 0.        ],
       [0.99666584, 0.        , 0.00333333],
       [0.99666584, 0.00333333, 0.        ],
       [0.99999917, 0.        , 0.        ],
       [0.9933325 , 0.        , 0.00666667],
       [0.99666584, 0.        , 0.00333333],
       [0.99999917, 0.        , 0.        ],
       [0.99999917, 0.        , 0.        ],
       [0.99999917, 0.        , 0.        ],
       [0.99999917, 0.        , 0.        ],
       [0.9899992 , 0.00666667, 0.00333333],
       [0.99666584, 0.        , 0.00333333],
       [0.99666584, 0.        , 0.00333333],
       [0.76666605, 0.00333333, 0.22999986],
       [0.9933325 , 0.        , 0.00666667],
       [0.99666584, 0.00333333, 0.        ],
       [0.9399992 , 0.        , 0.06000001],
       [0.99999917, 0.        , 0.        ],
       [0.99999917, 0.        , 0.        ],
       [0.9933325 , 0.00333333, 0.00333333],
       [0.99999917, 0.        , 0.        ],
       [0.