In [11]:
import pathlib
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental import preprocessing as pp

In [2]:
# Project-local helper; presumably adjusts TensorFlow's log verbosity —
# TODO confirm what level 2 maps to in freeman.utils.support_tf.
from freeman.utils.support_tf import LogLevelManager as llm
llm.set(2)

In [3]:
# Dataset location. The archive name, the CSV name and the download URL are
# all derived from one base name so they cannot drift apart.
FILE_NAME_BASE = "petfinder-mini"
FILE_NAME_ZIP = f"{FILE_NAME_BASE}.zip"
FILE_NAME_CSV = f"{FILE_NAME_BASE}.csv"
# Use HTTPS: the archive is downloaded and then extracted locally, so it should
# not travel over an unauthenticated plain-HTTP connection.
WEB_PATH = "https://storage.googleapis.com/download.tensorflow.org/data"
DATA_URL = f"{WEB_PATH}/{FILE_NAME_ZIP}"

# Download the archive (cached under ./datasets because cache_dir=".") and
# unpack it next to the zip. The returned path is shown as the cell output.
tf.keras.utils.get_file(
    FILE_NAME_ZIP, DATA_URL, extract=True, cache_dir="."
)

'./datasets/petfinder-mini.zip'

In [4]:
# Location of the CSV inside the extracted archive directory.
FILE_PATH_CSV = "/".join((".", "datasets", FILE_NAME_BASE, FILE_NAME_CSV))
# Load the raw table and print its column/dtype summary.
dataframe = pd.read_csv(FILE_PATH_CSV)
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11537 entries, 0 to 11536
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Type           11537 non-null  object
 1   Age            11537 non-null  int64 
 2   Breed1         11537 non-null  object
 3   Gender         11537 non-null  object
 4   Color1         11537 non-null  object
 5   Color2         11537 non-null  object
 6   MaturitySize   11537 non-null  object
 7   FurLength      11537 non-null  object
 8   Vaccinated     11537 non-null  object
 9   Sterilized     11537 non-null  object
 10  Health         11537 non-null  object
 11  Fee            11537 non-null  int64 
 12  Description    11528 non-null  object
 13  PhotoAmt       11537 non-null  int64 
 14  AdoptionSpeed  11537 non-null  int64 
dtypes: int64(4), object(11)
memory usage: 1.3+ MB


In [5]:
# Preview the first rows to sanity-check the parsed columns.
dataframe.head()

Unnamed: 0,Type,Age,Breed1,Gender,Color1,Color2,MaturitySize,FurLength,Vaccinated,Sterilized,Health,Fee,Description,PhotoAmt,AdoptionSpeed
0,Cat,3,Tabby,Male,Black,White,Small,Short,No,No,Healthy,100,Nibble is a 3+ month old ball of cuteness. He ...,1,2
1,Cat,1,Domestic Medium Hair,Male,Black,Brown,Medium,Medium,Not Sure,Not Sure,Healthy,0,I just found it alone yesterday near my apartm...,2,0
2,Dog,1,Mixed Breed,Male,Brown,White,Medium,Medium,Yes,No,Healthy,0,Their pregnant mother was dumped by her irresp...,7,3
3,Dog,4,Mixed Breed,Female,Black,Brown,Medium,Short,Yes,No,Healthy,150,"Good guard dog, very alert, active, obedience ...",8,2
4,Dog,1,Mixed Breed,Male,Black,No Color,Medium,Short,No,No,Healthy,0,This handsome yet cute boy is up for adoption....,3,2


In [6]:
# An AdoptionSpeed value of 4 means the pet was not adopted.
# Binary label: 1 = adopted (any other speed), 0 = not adopted.
dataframe["target"] = np.where(dataframe["AdoptionSpeed"] != 4, 1, 0)

In [7]:
# Remove the raw label (now encoded in "target") and the free-text column.
dataframe = dataframe.drop(["AdoptionSpeed", "Description"], axis="columns")

In [8]:
# Fixed seed so that "Restart Kernel -> Run All" reproduces the same partitions;
# without it every run shuffles the rows differently.
SPLIT_SEED = 42

# First hold out 20% for testing, then carve 20% of the remainder off for
# validation.
train, test = train_test_split(
    dataframe, test_size=0.2, random_state=SPLIT_SEED
)
train, val = train_test_split(train, test_size=0.2, random_state=SPLIT_SEED)
print(
    f"train      shape: {train.shape}\n"
    f"validation shape: {val.shape}\n"
    f"test       shape: {test.shape}\n"
)

train      shape: (7383, 14)
validation shape: (1846, 14)
test       shape: (2308, 14)



In [9]:
def df_to_ds(df, shuffle=True, batch_size=32):
    """Convert a DataFrame with a "target" column into a batched tf.data.Dataset.

    Args:
        df: DataFrame containing the feature columns plus a "target" column.
            The caller's frame is not modified; a copy is used internally.
        shuffle: When true, shuffle with a buffer as large as the frame.
        batch_size: Number of rows per batch; also passed as the prefetch
            buffer size.

    Returns:
        A dataset yielding ``(features_dict, labels)`` batches, where
        ``features_dict`` maps each remaining column name to its values.
    """
    features = df.copy()
    targets = features.pop("target")
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), targets))
    if not shuffle:
        return dataset.batch(batch_size).prefetch(batch_size)
    shuffled = dataset.shuffle(buffer_size=len(features))
    return shuffled.batch(batch_size).prefetch(batch_size)

In [21]:
# Small batches keep the preview below readable.
train_ds = df_to_ds(train, batch_size=5)
# Pull exactly one batch; the list pattern unpacks the single element.
[(train_features, train_labels)] = train_ds.take(1)
# The middle line shows the PhotoAmt column, so it is labelled as photos
# (it was previously mislabelled as "ages").
print(
    f"Every feature     : {list(train_features.keys())}\n"
    f"A batch of photos : {train_features['PhotoAmt']}\n"
    f"A batch of targets: {train_labels}"
)

Every feature     : ['Type', 'Age', 'Breed1', 'Gender', 'Color1', 'Color2', 'MaturitySize', 'FurLength', 'Vaccinated', 'Sterilized', 'Health', 'Fee', 'PhotoAmt']
A batch of ages   : [4 4 1 2 3]
A batch of targets: [1 1 1 1 1]


In [14]:
def get_normalization_layer(name, dataset):
    """Build a Normalization layer adapted to one feature of ``dataset``.

    Args:
        name: Key of the feature inside each batch's feature dict.
        dataset: Dataset yielding ``(features_dict, labels)`` pairs.

    Returns:
        A ``pp.Normalization(axis=None)`` layer after ``adapt`` has been run
        on the selected feature.
    """
    # Keep only the requested feature; labels are not needed for adapt().
    single_feature = dataset.map(lambda features, _labels: features[name])
    layer = pp.Normalization(axis=None)
    layer.adapt(single_feature)
    return layer

In [22]:
# Apply a normalizer adapted on the training data to the preview batch.
photo_count_col = train_features["PhotoAmt"]
layer = get_normalization_layer(name="PhotoAmt", dataset=train_ds)
layer(photo_count_col)

<tf.Tensor: shape=(5,), dtype=float32, numpy=
array([ 0.11851166,  0.11851166, -0.81862754, -0.5062478 , -0.19386807],
      dtype=float32)>

In [23]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    """Build a callable that index-encodes then category-encodes one feature.

    Args:
        name: Key of the feature inside each batch's feature dict.
        dataset: Dataset yielding ``(features_dict, labels)`` pairs; used to
            adapt the lookup vocabulary.
        dtype: ``"string"`` selects ``pp.StringLookup``; any other value
            selects ``pp.IntegerLookup``.
        max_tokens: Forwarded to the lookup layer to cap its vocabulary.

    Returns:
        A callable taking a feature tensor and returning the output of
        ``pp.CategoryEncoding`` applied to the looked-up indices.
    """
    lookup_cls = pp.StringLookup if dtype == "string" else pp.IntegerLookup
    index = lookup_cls(max_tokens=max_tokens)

    # Learn the vocabulary from the selected feature only.
    index.adapt(dataset.map(lambda features, _labels: features[name]))

    # Size the encoder from the vocabulary the lookup actually learned.
    encoder = pp.CategoryEncoding(num_tokens=index.vocabulary_size())

    def encode(feature):
        return encoder(index(feature))

    return encode

In [24]:
# Encode the string-valued "Type" feature of the preview batch.
type_col = train_features["Type"]
layer = get_category_encoding_layer(
    name="Type", dataset=train_ds, dtype="string"
)
layer(type_col)

<tf.Tensor: shape=(3,), dtype=float32, numpy=array([0., 1., 1.], dtype=float32)>

In [25]:
# Encode the integer "Age" feature, capping the lookup vocabulary at 5 tokens.
# NOTE(review): `type_col` now holds the Age column and overwrites the "Type"
# column bound in the previous cell; consider a dedicated name such as
# `age_col` once it is confirmed no later cell relies on `type_col`.
type_col = train_features["Age"]
layer = get_category_encoding_layer("Age", train_ds, "int64", 5)
layer(type_col)

<tf.Tensor: shape=(5,), dtype=float32, numpy=array([1., 1., 1., 1., 0.], dtype=float32)>