# Classify structured data with feature columns

In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf

from tensorflow import feature_column
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

## Use Pandas to create a dataframe

In [None]:
# Download the PetFinder mini archive. With cache_dir='.', get_file stores it
# under its default 'datasets' cache subdirectory, i.e. at
# ./datasets/petfinder_mini.zip. The archive is left zipped here and is
# unpacked in the next cell.
# HTTPS is used so the downloaded archive cannot be tampered with in transit.
dataset_url = 'https://storage.googleapis.com/download.tensorflow.org/data/petfinder-mini.zip'
tf.keras.utils.get_file('petfinder_mini.zip',
                        dataset_url,
                        cache_dir='.')

In [None]:
!unzip datasets/petfinder_mini.zip -d datasets/

In [2]:
# Load the extracted PetFinder CSV into a DataFrame.
csv_path = 'datasets/petfinder-mini/petfinder-mini.csv'
df = pd.read_csv(csv_path)

In [60]:
# Preview the first rows of the DataFrame.
df.head()

Unnamed: 0,Type,Age,Breed1,Gender,Color1,Color2,MaturitySize,FurLength,Vaccinated,Sterilized,Health,Fee,PhotoAmt,target
0,Cat,3,Tabby,Male,Black,White,Small,Short,No,No,Healthy,100,1,1
1,Cat,1,Domestic Medium Hair,Male,Black,Brown,Medium,Medium,Not Sure,Not Sure,Healthy,0,2,1
2,Dog,1,Mixed Breed,Male,Brown,White,Medium,Medium,Yes,No,Healthy,0,7,1
3,Dog,4,Mixed Breed,Female,Black,Brown,Medium,Short,Yes,No,Healthy,150,8,1
4,Dog,1,Mixed Breed,Male,Black,No Color,Medium,Short,No,No,Healthy,0,3,1


In [4]:
# (number of rows, number of columns) of the DataFrame.
df.shape

(11537, 15)

## Create target variable

El objetivo en el dataset original es predecir cuánto tiempo pasará hasta que una mascota sea adoptada (ej.: primera semana, primer mes, etc.). Para simplificar, el problema se transforma en uno de clasificación binaria: si la mascota fue adoptada o no. En la celda siguiente, `AdoptionSpeed == 4` se codifica como 0 (no adoptada) y cualquier otro valor como 1 (adoptada).

In [5]:
# Binary label: rows whose AdoptionSpeed is 4 get 0, every other row gets 1.
speed_is_four = df['AdoptionSpeed'] == 4
df['target'] = np.where(speed_is_four, 0, 1)

In [6]:
# Count rows per target class to check the class balance.
df['target'].value_counts()

1    8457
0    3080
Name: target, dtype: int64

In [7]:
# Remove the columns that are not used as model inputs: the original label
# (already folded into 'target') and the free-text description.
unused_columns = ['AdoptionSpeed', 'Description']
df = df.drop(columns=unused_columns)

## Split the dataframe into train, validation, and test

In [8]:
# Fix the seed so the three splits -- and everything derived from them, such
# as the vocabularies built from `train` and the reported metrics -- are the
# same on every run.
SEED = 42

# Hold out 20% of the rows for testing, then 20% of the remainder for
# validation.
train, test = train_test_split(df, test_size=0.2, random_state=SEED)
train, val = train_test_split(train, test_size=0.2, random_state=SEED)

In [9]:
# Report how many rows ended up in each split.
for split_frame, split_label in (
    (train, 'train examples'),
    (val, 'validation examples'),
    (test, 'test examples'),
):
    print(len(split_frame), split_label)

7383 train examples
1846 validation examples
2308 test examples


## Create an input pipeline using tf.data

In [10]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32, label_column='target'):
    """Convert a pandas DataFrame into a batched ``tf.data.Dataset``.

    Args:
        dataframe: DataFrame holding the feature columns plus the label
            column. It is copied first, so the caller's frame is not modified.
        shuffle: If True, shuffle the examples using a buffer as large as the
            frame.
        batch_size: Number of examples per batch.
        label_column: Name of the column to split off as the labels.

    Returns:
        A dataset yielding ``(features, labels)`` batches, where ``features``
        is a dict keyed by column name.

    Raises:
        KeyError: If ``label_column`` is not a column of ``dataframe``.
    """
    features = dataframe.copy()
    labels = features.pop(label_column)
    ds = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    # Skip shuffling for an empty frame: a zero-sized shuffle buffer is invalid.
    if shuffle and len(features) > 0:
        ds = ds.shuffle(buffer_size=len(features))
    return ds.batch(batch_size)

## Choose which columns to use

In [45]:
# Start the feature list with the numeric columns.
numeric_feature_names = ['PhotoAmt', 'Fee', 'Age']
feature_columns = [
    feature_column.numeric_column(name) for name in numeric_feature_names
]

In [46]:
# bucketized cols
age = feature_column.numeric_column('Age')
age_buckets = feature_column.bucketized_column(age, boundaries=sorted(np.unique(np.percentile(train['Age'], q=[10,25,50,75,90,99]))))
feature_columns.append(age_buckets)

In [47]:
# indicator_columns
indicator_column_names = ['Type', 'Color1', 'Color2', 'Gender', 'MaturitySize', 'FurLength', 'Vaccinated', 'Sterilized', 'Health']
for col_name in indicator_column_names:
    categorical_column = feature_column.categorical_column_with_vocabulary_list(col_name, train[col_name].unique())
    indicator_column = feature_column.indicator_column(categorical_column)
    feature_columns.append(indicator_column)

In [48]:
# embedding columns
breed1 = feature_column.categorical_column_with_vocabulary_list('Breed1', train.Breed1.unique())
breed1_embedding = feature_column.embedding_column(breed1, dimension=6)
feature_columns.append(breed1_embedding)

In [None]:
# crossed columns
age_type_feature = feature_column.crossed_column([age_buckets, animal_type], hash_bucket_size=100)
feature_columns.append(feature_column.indicator_column(age_type_feature))

In [61]:
# Inspect the feature columns collected so far.
feature_columns

[NumericColumn(key='PhotoAmt', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='Fee', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='Age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 BucketizedColumn(source_column=NumericColumn(key='Age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(1.0, 2.0, 4.0, 12.0, 36.0, 96.0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='Type', vocabulary_list=('Dog', 'Cat'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='Color1', vocabulary_list=('Golden', 'Brown', 'Black', 'White', 'Gray', 'Yellow', 'Cream'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='Color2', vocabulary_list=('Cream', 'No Color', 'White', 'Brown', 'Go

### Transform dataframe to TF Dataset

In [15]:
# Build one dataset per split; only the training data is shuffled.
batch_size = 32
train_ds = df_to_dataset(train, shuffle=True, batch_size=batch_size)
val_ds = df_to_dataset(val, batch_size=batch_size, shuffle=False)
test_ds = df_to_dataset(test, batch_size=batch_size, shuffle=False)

### Create a feature layer

In [49]:
# Layer that turns the dict of raw features into the model's dense input,
# as described by `feature_columns`.
feature_layer = layers.DenseFeatures(feature_columns)

## Create, compile, and train the model

In [50]:
# Feature layer -> two 128-unit ReLU layers -> dropout -> sigmoid output
# producing a single probability.
model = tf.keras.Sequential()
model.add(feature_layer)
for units in (128, 128):
    model.add(layers.Dense(units, activation='relu'))
model.add(layers.Dropout(0.1))
model.add(layers.Dense(1, activation='sigmoid'))

In [18]:
# Metrics tracked during training and evaluation; the names are the keys
# used in the history and evaluation results.
METRICS = [
    keras.metrics.BinaryAccuracy(name='accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    keras.metrics.AUC(name='auc'),
]

In [62]:
# Binary cross-entropy matches the single sigmoid output of the model.
loss_fn = tf.keras.losses.BinaryCrossentropy()
model.compile(optimizer='adam', loss=loss_fn, metrics=METRICS)

In [64]:
# Train for 5 epochs, reporting the metrics on the validation split after
# each one.
history = model.fit(train_ds, validation_data=val_ds, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [65]:
# Evaluate on the held-out test split; return_dict=True gives a
# {metric name: value} dict instead of a list.
results = model.evaluate(test_ds, return_dict=True)



In [22]:
# Show the test-set loss and metrics.
results

{'loss': 0.5301904678344727,
 'accuracy': 0.7430675625801086,
 'precision': 0.7635843753814697,
 'recall': 0.9429075717926025,
 'auc': 0.7230902910232544}

In [109]:
# Display the test split (pandas truncates the rendering of long frames).
test

Unnamed: 0,Type,Age,Breed1,Gender,Color1,Color2,MaturitySize,FurLength,Vaccinated,Sterilized,Health,Fee,PhotoAmt,target
10846,Dog,7,Mixed Breed,Female,White,No Color,Medium,Short,No,Yes,Healthy,0,5,0
11239,Cat,7,Domestic Short Hair,Male,White,No Color,Small,Short,Yes,Yes,Healthy,30,4,0
354,Dog,3,Mixed Breed,Female,Brown,No Color,Medium,Medium,No,No,Healthy,0,5,0
700,Cat,2,Domestic Medium Hair,Male,Golden,Yellow,Small,Medium,No,No,Healthy,0,3,1
5158,Dog,24,Jack Russell Terrier,Male,Brown,White,Medium,Short,Yes,No,Healthy,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5782,Cat,14,Domestic Medium Hair,Female,Black,Brown,Medium,Medium,No,No,Healthy,0,2,0
11125,Dog,2,Mixed Breed,Female,Brown,No Color,Medium,Short,No,No,Healthy,80,1,1
8061,Dog,8,Labrador Retriever,Female,Golden,No Color,Large,Short,Yes,No,Healthy,300,2,1
2509,Cat,15,Domestic Medium Hair,Female,White,No Color,Medium,Medium,No,No,Healthy,0,1,1


In [110]:
# First test row as a plain {column name: value} dict.
test.iloc[0].to_dict()

{'Type': 'Dog',
 'Age': 7,
 'Breed1': 'Mixed Breed',
 'Gender': 'Female',
 'Color1': 'White',
 'Color2': 'No Color',
 'MaturitySize': 'Medium',
 'FurLength': 'Short',
 'Vaccinated': 'No',
 'Sterilized': 'Yes',
 'Health': 'Healthy',
 'Fee': 0,
 'PhotoAmt': 5,
 'target': 0}

In [112]:
# Wrap the first test row in a one-row DataFrame and convert it to a
# {column name: Series} dict, i.e. a batch containing a single example.
pd.DataFrame([test.iloc[0].to_dict()]).to_dict('series')

{'Type': 0    Dog
 Name: Type, dtype: object, 'Age': 0    7
 Name: Age, dtype: int64, 'Breed1': 0    Mixed Breed
 Name: Breed1, dtype: object, 'Gender': 0    Female
 Name: Gender, dtype: object, 'Color1': 0    White
 Name: Color1, dtype: object, 'Color2': 0    No Color
 Name: Color2, dtype: object, 'MaturitySize': 0    Medium
 Name: MaturitySize, dtype: object, 'FurLength': 0    Short
 Name: FurLength, dtype: object, 'Vaccinated': 0    No
 Name: Vaccinated, dtype: object, 'Sterilized': 0    Yes
 Name: Sterilized, dtype: object, 'Health': 0    Healthy
 Name: Health, dtype: object, 'Fee': 0    0
 Name: Fee, dtype: int64, 'PhotoAmt': 0    5
 Name: PhotoAmt, dtype: int64, 'target': 0    0
 Name: target, dtype: int64}

In [113]:
# Score a single example by calling the model directly on a
# {column name: one-element Series} dict built from the first test row.
sample_row = test.iloc[0].to_dict()
sample_inputs = pd.DataFrame([sample_row]).to_dict('series')
model(sample_inputs)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.508274]], dtype=float32)>