# Classify structured data with feature columns

In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf

from tensorflow import feature_column
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

## Use Pandas to create a dataframe

In [None]:
# Download the PetFinder mini archive over HTTPS rather than plain HTTP.
# The unzip cell that follows expects the file at datasets/petfinder_mini.zip
# relative to the working directory (cache_dir='.').
dataset_url = 'https://storage.googleapis.com/download.tensorflow.org/data/petfinder-mini.zip'
tf.keras.utils.get_file(fname='petfinder_mini.zip',
                        origin=dataset_url,
                        cache_dir='.')

In [None]:
!unzip datasets/petfinder_mini.zip -d datasets/

In [2]:
# Load the extracted PetFinder CSV into a DataFrame.
df = pd.read_csv('datasets/petfinder-mini/petfinder-mini.csv')

In [47]:
# Preview the first five rows.
# NOTE(review): the saved output already contains `target` and lacks
# `AdoptionSpeed`/`Description`, so it appears to have been captured after the
# later target/drop cells ran; a clean top-to-bottom run should differ.
df.head()

Unnamed: 0,Type,Age,Breed1,Gender,Color1,Color2,MaturitySize,FurLength,Vaccinated,Sterilized,Health,Fee,PhotoAmt,target
0,Cat,3,Tabby,Male,Black,White,Small,Short,No,No,Healthy,100,1,1
1,Cat,1,Domestic Medium Hair,Male,Black,Brown,Medium,Medium,Not Sure,Not Sure,Healthy,0,2,1
2,Dog,1,Mixed Breed,Male,Brown,White,Medium,Medium,Yes,No,Healthy,0,7,1
3,Dog,4,Mixed Breed,Female,Black,Brown,Medium,Short,Yes,No,Healthy,150,8,1
4,Dog,1,Mixed Breed,Male,Black,No Color,Medium,Short,No,No,Healthy,0,3,1


In [48]:
# (rows, columns) of the frame.
# NOTE(review): the saved 14-column result matches the frame after the
# target/drop cells further down; presumably captured out of order - confirm
# on a clean run.
df.shape

(11537, 14)

## Create target variable

El objetivo en el dataset original es predecir cuánto tiempo pasará hasta que una mascota sea adoptada (ej.: primera semana, primer mes, etc.). Para simplificar el problema, se transforma en un problema de clasificación binaria: si la mascota fue adoptada o no.

In [5]:
# Binary label: 0 where AdoptionSpeed equals 4, 1 for every other value.
df['target'] = np.where(df['AdoptionSpeed'].eq(4), 0, 1)

In [6]:
# Class balance of the binary label.
df['target'].value_counts()

1    8457
0    3080
Name: target, dtype: int64

In [7]:
# The original speed label and the free-text description are not used as features.
df = df.drop(['AdoptionSpeed', 'Description'], axis='columns')

## Split the dataframe into train, validation, and test

In [8]:
# Hold out 20% of the rows for testing, then 20% of the remainder for validation.
# A fixed random_state makes the split, and every output derived from it,
# reproducible across kernel restarts.
train, test = train_test_split(df, test_size=0.2, random_state=42)
train, val = train_test_split(train, test_size=0.2, random_state=42)

In [9]:
# Report how many rows ended up in each split.
print(f'{len(train)} train examples')
print(f'{len(val)} validation examples')
print(f'{len(test)} test examples')

7383 train examples
1846 validation examples
2308 test examples


## Create an input pipeline using tf.data

In [10]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    """Turn a DataFrame with a 'target' column into a batched tf.data.Dataset.

    Args:
        dataframe: source frame; it is copied first, so the caller's frame
            keeps its 'target' column.
        shuffle: when truthy, shuffle with a buffer as large as the frame.
        batch_size: number of rows per emitted batch.

    Returns:
        A dataset yielding (feature dict, labels) batches.
    """
    features = dataframe.copy()
    targets = features.pop('target')
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), targets))
    if not shuffle:
        return dataset.batch(batch_size)
    return dataset.shuffle(buffer_size=len(features)).batch(batch_size)

In [11]:
batch_size = 5  # A small batch size is used for demonstration purposes
# Only the training split is shuffled; validation and test keep row order.
train_ds = df_to_dataset(train, shuffle=True, batch_size=batch_size)
val_ds = df_to_dataset(val, batch_size=batch_size, shuffle=False)
test_ds = df_to_dataset(test, batch_size=batch_size, shuffle=False)

## Understand the input pipeline

In [12]:
# Look at a single batch: the feature names, one feature tensor, and the labels.
for feature_batch, label_batch in train_ds.take(1):
    print('Every feature:', [*feature_batch.keys()])
    print('A batch of ages:', feature_batch['Age'])
    print('A batch of targets:', label_batch)

Every feature: ['Type', 'Age', 'Breed1', 'Gender', 'Color1', 'Color2', 'MaturitySize', 'FurLength', 'Vaccinated', 'Sterilized', 'Health', 'Fee', 'PhotoAmt']
A batch of ages: tf.Tensor([ 1  1 48  2  1], shape=(5,), dtype=int64)
A batch of targets: tf.Tensor([1 1 1 1 1], shape=(5,), dtype=int64)


## Demonstrate several types of feature columns

In [13]:
# Take the feature dict (element 0) of the first batch from the shuffled training dataset.
# NOTE(review): `example_batch` is reassigned from df.iloc[10:15] a few cells
# below, before demo() is first called in notebook order, so this value looks
# unused on a top-to-bottom run - confirm before relying on it.
example_batch = next(iter(train_ds))[0]

In [50]:
# Show rows at positions 10-14, the same slice the next cell turns into the demo batch.
df.iloc[10:15]

Unnamed: 0,Type,Age,Breed1,Gender,Color1,Color2,MaturitySize,FurLength,Vaccinated,Sterilized,Health,Fee,PhotoAmt,target
10,Cat,3,Domestic Long Hair,Female,Black,Brown,Large,Long,Yes,No,Healthy,50,2,1
11,Dog,2,Mixed Breed,Male,Brown,Cream,Medium,Long,Yes,No,Healthy,0,1,1
12,Dog,3,Mixed Breed,Female,Brown,Cream,Medium,Medium,Not Sure,Not Sure,Healthy,0,2,1
13,Dog,78,Terrier,Male,Black,White,Medium,Medium,Not Sure,Not Sure,Healthy,0,2,0
14,Cat,6,Domestic Short Hair,Female,Brown,No Color,Small,Short,Yes,Yes,Healthy,0,1,1


In [51]:
# Fixed demo batch: rows at positions 10-14 as a {column name: Series} mapping.
example_batch = df.iloc[10:15].to_dict(orient='series')

In [52]:
def demo(feature_column):
    """Print the dense values a feature column produces for `example_batch`.

    Reads the notebook-level `example_batch`. Inside this function the
    parameter name shadows the imported `feature_column` module.
    """
    dense_values = layers.DenseFeatures(feature_column)(example_batch)
    print(dense_values.numpy())

### Numeric columns

In [53]:
# A numeric column feeds the raw PhotoAmt value through as a float feature.
photo_count = feature_column.numeric_column(key='PhotoAmt')
demo(photo_count)

[[2.]
 [1.]
 [2.]
 [2.]
 [1.]]


### Bucketized columns

In [17]:
# Three boundaries split Age into four ranges, one-hot encoded by the bucketized column.
age = feature_column.numeric_column(key='Age')
age_buckets = feature_column.bucketized_column(source_column=age,
                                               boundaries=[1, 3, 5])
demo(age_buckets)

[[0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]]


In [18]:
# Inspect the training rows at the high end of the Age range (96 and above).
train.loc[train['Age'] >= 96]

Unnamed: 0,Type,Age,Breed1,Gender,Color1,Color2,MaturitySize,FurLength,Vaccinated,Sterilized,Health,Fee,PhotoAmt,target
8380,Dog,96,Poodle,Female,White,No Color,Small,Medium,Yes,Not Sure,Healthy,0,2,1
1715,Cat,120,Domestic Short Hair,Female,Black,No Color,Small,Short,Not Sure,Yes,Healthy,0,1,0
11260,Cat,180,Domestic Medium Hair,Male,Black,White,Medium,Medium,Yes,Yes,Healthy,0,1,1
5155,Dog,132,Schnauzer,Female,Black,No Color,Small,Short,Yes,Yes,Healthy,0,0,0
6185,Cat,112,Domestic Short Hair,Male,Black,White,Small,Short,No,No,Healthy,0,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
796,Dog,120,Silky Terrier,Female,Golden,Gray,Small,Medium,Not Sure,Not Sure,Healthy,0,1,1
7920,Dog,120,Pug,Male,Black,Cream,Small,Short,Not Sure,No,Healthy,0,3,1
3001,Dog,96,Old English Sheepdog,Male,Black,Golden,Medium,Long,Yes,Yes,Healthy,0,9,1
2442,Dog,96,Mixed Breed,Male,Black,No Color,Small,Short,Yes,Yes,Healthy,0,1,0


In [55]:
# Frequency of each distinct Age value in the training split.
train['Age'].value_counts()

2      1633
3       941
1       871
4       588
12      554
       ... 
66        1
88        1
46        1
100       1
255       1
Name: Age, Length: 91, dtype: int64

In [56]:
# Derive bucket boundaries from percentiles of the training ages.
# np.unique de-duplicates and already returns ascending values, so no extra
# sort is needed; .tolist() yields plain Python floats.
# (The variable name is kept as-is because later cells reference it.)
age_bondaries = np.unique(
    np.percentile(train['Age'], q=[10, 25, 50, 75, 90, 99])
).tolist()

In [57]:
# Display the computed bucket boundaries.
age_bondaries

[1.0, 2.0, 4.0, 12.0, 36.0, 87.18000000000029]

In [24]:
# Same bucketization as before, now using the percentile-derived boundaries.
age = feature_column.numeric_column(key='Age')
age_buckets = feature_column.bucketized_column(source_column=age,
                                               boundaries=age_bondaries)
demo(age_buckets)

[[0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]]


### Categorical columns

In [27]:
# Distinct primary colours seen in the training split, in order of first appearance.
train['Color1'].unique().tolist()

['Brown', 'Black', 'Golden', 'Yellow', 'Cream', 'Gray', 'White']

In [28]:
# Build the vocabulary from the colours present in the training split,
# then wrap the categorical column in an indicator column to one-hot encode it.
color1_type = feature_column.categorical_column_with_vocabulary_list(
    key='Color1',
    vocabulary_list=list(train['Color1'].unique()),
)

color1_type_one_hot = feature_column.indicator_column(categorical_column=color1_type)
demo(color1_type_one_hot)

[[0. 0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]]


### Hashed feature columns

In [44]:
# Hash Breed1 into 10 buckets instead of enumerating a vocabulary, then one-hot it.
breed1_hashed = feature_column.categorical_column_with_hash_bucket(
    key='Breed1',
    hash_bucket_size=10,
)
breed1_hashed_ind = feature_column.indicator_column(categorical_column=breed1_hashed)
demo(breed1_hashed_ind)

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]]


### Embedding columns

In [30]:
# Frequency of each breed in the training split.
train['Breed1'].value_counts()

Mixed Breed             2964
Domestic Short Hair     1684
Domestic Medium Hair     551
Tabby                    159
Domestic Long Hair       146
                        ... 
Foxhound                   1
Chinese Crested Dog        1
Papillon                   1
Toy Fox Terrier            1
Dilute Calico              1
Name: Breed1, Length: 143, dtype: int64

In [32]:
# Number of distinct Breed1 values in the training split.
len(train['Breed1'].unique())

143

In [33]:
# Map every breed seen in training to a 5-dimensional embedding vector.
breed1 = feature_column.categorical_column_with_vocabulary_list(
    key='Breed1',
    vocabulary_list=train['Breed1'].unique(),
)
breed1_embedding = feature_column.embedding_column(categorical_column=breed1,
                                                   dimension=5)
demo(breed1_embedding)

[[-0.17369679  0.5931823   0.18107922 -0.22624251 -0.75781304]
 [ 0.49188387  0.67511773  0.14235087 -0.5880355  -0.3434984 ]
 [-0.04414639  0.4697199   0.66400033  0.39997977  0.47212583]
 [-0.04414639  0.4697199   0.66400033  0.39997977  0.47212583]
 [ 0.3559675  -0.04685352  0.12526591 -0.48353416 -0.75784826]]
