# Predict customer retention by deep learning

# 1. Classify structured data with feature columns

## Dataset

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import sklearn
from sklearn import model_selection,pipeline,compose,preprocessing
from tensorflow import feature_column
from tensorflow.keras import layers
# OOP python for feature engineering
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Download the Telco customer-churn CSV and preview its first rows.
path = 'https://raw.githubusercontent.com/rstudio/keras-customer-churn/master/data/WA_Fn-UseC_-Telco-Customer-Churn.csv'
dataframe = pd.read_csv(path)
dataframe.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
dataframe.shape

(7043, 21)

In [4]:
dataframe.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


## Create target variable

In [5]:
# Encode the label once: "Yes" (customer churned) -> 1, anything else -> 0.
# The guard keeps this cell safe to re-run: without it, a second run would
# compare the already-encoded integers with "Yes" and zero out every label.
if 'Churn' in dataframe.columns:
    dataframe = dataframe.rename(columns={'Churn': 'target'})
    dataframe = dataframe.assign(target=np.where(dataframe['target'] == "Yes", 1, 0))
# Drop the identifier column, which is not used as a feature
# (errors='ignore' tolerates a re-run where it has already been removed).
dataframe = dataframe.drop(columns=['customerID'], errors='ignore')
# TotalCharges is loaded as text; turn single-space placeholders into NaN
# and convert the column to float.
dataframe = dataframe.assign(
    TotalCharges=dataframe["TotalCharges"].replace(' ', np.nan).astype(float))
# Remove the rows that contain missing values.
dataframe = dataframe.dropna()
dataframe.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,target
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


In [6]:
# Fix the seed so the same rows land in each split on every run, and stratify
# on the label so train/validation/test all keep the same churn rate.
SPLIT_SEED = 42
train, test = model_selection.train_test_split(
    dataframe, test_size=0.2, random_state=SPLIT_SEED, stratify=dataframe['target'])
train, val = model_selection.train_test_split(
    train, test_size=0.2, random_state=SPLIT_SEED, stratify=train['target'])
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

4500 train examples
1125 validation examples
1407 test examples


Next, we will wrap the dataframes with tf.data. This will enable us to use feature columns as a bridge to map from the columns in the Pandas dataframe to features used to train the model. If we were working with a very large CSV file (so large that it does not fit into memory), we would use tf.data to read it from disk directly. That is not covered in this tutorial.

In [7]:
# Helper: turn a Pandas DataFrame into a batched tf.data.Dataset
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  """Wrap `dataframe` in a batched `tf.data.Dataset` of (features, label) pairs.

  The 'target' column is split off as the label and the remaining columns
  become a dict of per-column features. The caller's frame is not modified
  because the function works on a copy.
  """
  features = dataframe.copy()
  labels = features.pop('target')
  n_rows = len(features)
  dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
  if shuffle:
    # The shuffle buffer is as large as the number of rows.
    dataset = dataset.shuffle(buffer_size=n_rows)
  return dataset.batch(batch_size)

In [8]:
# Tiny batches keep the demonstration output readable.
batch_size = 5
# Only the training split is shuffled; validation and test keep their row order.
train_ds = df_to_dataset(train, shuffle=True, batch_size=batch_size)
val_ds, test_ds = (
    df_to_dataset(split, shuffle=False, batch_size=batch_size)
    for split in (val, test)
)

In [9]:
# Number of batches in the train, validation and test datasets, in that order
# (each batch holds up to `batch_size` rows).
for split_ds in (train_ds, val_ds, test_ds):
    print(len(list(split_ds)))

900
225
282


In [10]:
# Pull a single (features, labels) batch from the training stream and inspect it.
train_features, label_batch = next(iter(train_ds.take(1)))
print('Every feature:', list(train_features.keys()))
print('A batch of MonthlyCharges:', train_features['MonthlyCharges'])
print('A batch of targets:', label_batch )

Every feature: ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges']
A batch of MonthlyCharges: tf.Tensor([110.15  20.25  71.05  74.    73.  ], shape=(5,), dtype=float64)
A batch of targets: tf.Tensor([0 0 0 0 0], shape=(5,), dtype=int32)


Demonstrate several types of feature columns

In [11]:
# One batch of input features, reused by every feature-column demo below.
# The labels that come with the batch are not needed for the demos.
example_batch, _example_labels = next(iter(train_ds))
example_batch

{'gender': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'Male', b'Male', b'Male', b'Male', b'Male'], dtype=object)>,
 'SeniorCitizen': <tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 1, 0, 0], dtype=int64)>,
 'Partner': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'No', b'No', b'No', b'No', b'No'], dtype=object)>,
 'Dependents': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'No', b'Yes', b'No', b'No', b'No'], dtype=object)>,
 'tenure': <tf.Tensor: shape=(5,), dtype=int64, numpy=array([ 3, 19, 57, 60, 47], dtype=int64)>,
 'PhoneService': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'Yes', b'No', b'Yes', b'Yes', b'Yes'], dtype=object)>,
 'MultipleLines': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'Yes', b'No phone service', b'Yes', b'No', b'No'], dtype=object)>,
 'InternetService': <tf.Tensor: shape=(5,), dtype=string, numpy=
 array([b'Fiber optic', b'DSL', b'Fiber optic', b'DSL', b'DSL'],
       dtype=object)>,
 'OnlineSecurity': <tf.Tensor: s

In [12]:
# Helper: push the example batch through a feature column and show the result
def demo(feature_column):
  """Print `example_batch` as transformed by `feature_column`.

  Relies on the notebook-level `example_batch` defined in an earlier cell.
  """
  dense_values = layers.DenseFeatures(feature_column)(example_batch)
  print(dense_values.numpy())

### Numeric columns
The output of a feature column becomes the input to the model (using the demo function defined above, we will be able to see exactly how each column from the dataframe is transformed). A [numeric column](https://www.tensorflow.org/api_docs/python/tf/feature_column/numeric_column) is the simplest type of column. It is used to represent real valued features. When using this column, your model will receive the column value from the dataframe unchanged.

In [13]:
example_batch['MonthlyCharges']

<tf.Tensor: shape=(5,), dtype=float64, numpy=array([ 81.35,  25.35, 104.9 ,  61.4 ,  86.95])>

In [14]:
MonthlyCharges = feature_column.numeric_column('MonthlyCharges') # feature column from tensorflow
MonthlyCharges

NumericColumn(key='MonthlyCharges', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)

In [15]:
demo(MonthlyCharges) # show one sample from feature column



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

[[ 81.35]
 [ 25.35]
 [104.9 ]
 [ 61.4 ]
 [ 86.95]]


### Bucketized columns
Often, you don't want to feed a number directly into the model, but instead split its value into different categories based on numerical ranges. Consider raw data that represents a customer's monthly charges. Instead of representing the charge as a numeric column, we could split it into several buckets using a [bucketized column](https://www.tensorflow.org/api_docs/python/tf/feature_column/bucketized_column). Notice the one-hot values below describe which MonthlyCharges range each row matches.

In [16]:
example_batch['MonthlyCharges']

<tf.Tensor: shape=(5,), dtype=float64, numpy=array([ 81.35,  25.35, 104.9 ,  61.4 ,  86.95])>

In [17]:
# Bucketize MonthlyCharges at the boundaries 1, 50 and 200; three boundaries
# give the four one-hot slots seen in the demo output.
MonthlyCharges = feature_column.numeric_column(key='MonthlyCharges')
MonthlyCharges_buckets = feature_column.bucketized_column(
    source_column=MonthlyCharges,
    boundaries=[1, 50, 200],
)
MonthlyCharges_buckets

BucketizedColumn(source_column=NumericColumn(key='MonthlyCharges', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(1, 50, 200))

In [18]:
demo(MonthlyCharges_buckets)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

[[0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]]


### Categorical columns
In this dataset, PhoneService is represented as a string ('Yes' or 'No'). We cannot feed strings directly to a model. Instead, we must first map them to numeric values. The categorical vocabulary columns provide a way to represent strings as a one-hot vector (much like you have seen above with the MonthlyCharges buckets). The vocabulary can be passed as a list using [categorical_column_with_vocabulary_list](https://www.tensorflow.org/api_docs/python/tf/feature_column/categorical_column_with_vocabulary_list), or loaded from a file using [categorical_column_with_vocabulary_file](https://www.tensorflow.org/api_docs/python/tf/feature_column/categorical_column_with_vocabulary_file).

In [19]:
example_batch['PhoneService']

<tf.Tensor: shape=(5,), dtype=string, numpy=array([b'Yes', b'No', b'Yes', b'Yes', b'Yes'], dtype=object)>

In [20]:
# Categorical column whose vocabulary is every distinct PhoneService value in the data.
PhoneService_type = feature_column.categorical_column_with_vocabulary_list(
    key='PhoneService',
    vocabulary_list=dataframe['PhoneService'].unique(),
)

PhoneService_type

VocabularyListCategoricalColumn(key='PhoneService', vocabulary_list=('No', 'Yes'), dtype=tf.string, default_value=-1, num_oov_buckets=0)

In [21]:
# Wrap the categorical column in an indicator column to get a one-hot vector per row.
PhoneService_type_one_hot = feature_column.indicator_column(
    categorical_column=PhoneService_type)
demo(PhoneService_type_one_hot)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

[[0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]]


### Embedding columns
Suppose instead of having just a few possible strings, we have thousands (or more) values per category. For a number of reasons, as the number of categories grow large, it becomes infeasible to train a neural network using one-hot encodings. We can use an embedding column to overcome this limitation. Instead of representing the data as a one-hot vector of many dimensions, an [embedding column](https://www.tensorflow.org/api_docs/python/tf/feature_column/embedding_column) represents that data as a lower-dimensional, dense vector in which each cell can contain any number, not just 0 or 1. The size of the embedding (4, in the example below) is a parameter that must be tuned.

Key point: using an embedding column is best when a categorical column has many possible values. We are using one here for demonstration purposes, so you have a complete example you can modify for a different dataset in the future.

In [22]:
example_batch['MultipleLines']

<tf.Tensor: shape=(5,), dtype=string, numpy=array([b'Yes', b'No phone service', b'Yes', b'No', b'No'], dtype=object)>

In [23]:
# The embedding column takes a categorical column as its input (built here
# from the distinct MultipleLines values) and represents each category as a
# dense vector of length 4.
MultipleLines = feature_column.categorical_column_with_vocabulary_list(
    key='MultipleLines',
    vocabulary_list=dataframe['MultipleLines'].unique(),
)
MultipleLines_embedding = feature_column.embedding_column(
    categorical_column=MultipleLines, dimension=4)
demo(MultipleLines_embedding)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

[[ 0.07954893 -0.23234229  0.02463544  0.64123756]
 [ 0.5544977   0.08571963  0.46231398 -0.715725  ]
 [ 0.07954893 -0.23234229  0.02463544  0.64123756]
 [ 0.00909556  0.44260988  0.22604911  0.59846556]
 [ 0.00909556  0.44260988  0.22604911  0.59846556]]


### Hashed feature columns

Another way to represent a categorical column with a large number of values is to use a [categorical_column_with_hash_bucket](https://www.tensorflow.org/api_docs/python/tf/feature_column/categorical_column_with_hash_bucket). This feature column calculates a hash value of the input, then selects one of the `hash_bucket_size` buckets to encode a string. When using this column, you do not need to provide the vocabulary, and you can choose to make the number of hash_buckets significantly smaller than the number of actual categories to save space.

Key point: An important downside of this technique is that there may be collisions in which different strings are mapped to the same bucket. In practice, this can work well for some datasets regardless.

In [24]:
# Hash each MultipleLines string into one of 10 buckets, then one-hot the bucket id.
MultipleLines_hashed = feature_column.categorical_column_with_hash_bucket(
    key='MultipleLines',
    hash_bucket_size=10,
)
demo(feature_column.indicator_column(categorical_column=MultipleLines_hashed))



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

[[0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]]


### Crossed feature columns
Combining features into a single feature, better known as [feature crosses](https://developers.google.com/machine-learning/glossary/#feature_cross), enables a model to learn separate weights for each combination of features. Here, the features to cross are the MonthlyCharges buckets and PhoneService, both shown again below (the crossing cell itself is left commented out). Note that `crossed_column` does not build the full table of all possible combinations (which could be very large). Instead, it is backed by a `hashed_column`, so you can choose how large the table is.

In [25]:
demo(MonthlyCharges_buckets)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

[[0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]]


In [26]:
PhoneService_type_one_hot = feature_column.indicator_column(PhoneService_type)
demo(PhoneService_type_one_hot)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

[[0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]]


In [27]:
#crossed_feature = feature_column.crossed_column([MonthlyCharges_buckets, PhoneService_type], hash_bucket_size=10)
#demo(feature_column.indicator_column(crossed_feature))

## Choose which columns to use
We have seen how to use several types of feature columns. Now we will use them to train a model. The goal of this tutorial is to show you the complete code (e.g. mechanics) needed to work with feature columns. We have selected a few columns to train our model below arbitrarily.

Key point: If your aim is to build an accurate model, try a larger dataset of your own, and think carefully about which features are the most meaningful to include, and how they should be represented.

In [28]:
# Feature-only view of the data. The label is dropped by name rather than by
# position, so this keeps working if 'target' is ever not the last column.
dataframe_remove_target = dataframe.drop(columns=['target'])
dataframe_remove_target.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65


In [29]:
# Names of all numeric feature columns.
numerical_column = dataframe_remove_target.select_dtypes('number').columns.tolist()

In [30]:
# Start the feature-column list with one numeric column per numeric feature.
feature_columns = [
    feature_column.numeric_column(header)
    for header in numerical_column
]

In [31]:
# bucketized cols (bin numeric column, suggest for monthly charge and total charge)
#age = feature_column.numeric_column('Age')
#age_buckets = feature_column.bucketized_column(age, boundaries=[1, 2, 3, 4, 5])
#feature_columns.append(age_buckets)

In [32]:
# indicator_columns: one-hot encode every string-valued feature.
# `categorical_column` stays the list of column names; the per-column
# vocabulary object gets its own name so the loop no longer overwrites the
# list it is iterating over.
categorical_column = list(dataframe_remove_target.select_dtypes('object').columns)
for col_name in categorical_column:
  # Vocabulary = the distinct values observed for this column.
  vocabulary_column = feature_column.categorical_column_with_vocabulary_list(
      col_name, dataframe_remove_target[col_name].unique())
  indicator_column = feature_column.indicator_column(vocabulary_column)  # convert to one-hot
  feature_columns.append(indicator_column)

In [33]:
# embedding columns (for category with a lot of unique item, not apply here)
#breed1 = feature_column.categorical_column_with_vocabulary_list(
#      'Breed1', dataframe.Breed1.unique())
#breed1_embedding = feature_column.embedding_column(breed1, dimension=8)
#feature_columns.append(breed1_embedding)

In [34]:
# crossed columns (no need here)
#age_type_feature = feature_column.crossed_column([age_buckets, animal_type], #hash_bucket_size=100)
#feature_columns.append(feature_column.indicator_column(age_type_feature))

In [35]:
feature_columns

[NumericColumn(key='SeniorCitizen', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='tenure', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='MonthlyCharges', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='TotalCharges', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='gender', vocabulary_list=('Female', 'Male'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='Partner', vocabulary_list=('Yes', 'No'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='Dependents', vocabulary_list=('No', 'Yes'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalC

Now that we have defined our feature columns, we will use a DenseFeatures layer to input them to our Keras model.

In [36]:
# Single input layer that applies every feature column defined above.
feature_layer = tf.keras.layers.DenseFeatures(feature_columns=feature_columns)

Earlier, we used a small batch size to demonstrate how feature columns worked. We create a new input pipeline with a larger batch size.

In [37]:
# Rebuild the three datasets with a batch size suited to training.
batch_size = 32
# Only the training split is shuffled; validation and test keep their row order.
train_ds = df_to_dataset(train, shuffle=True, batch_size=batch_size)
val_ds, test_ds = (
    df_to_dataset(split, shuffle=False, batch_size=batch_size)
    for split in (val, test)
)

## Create, compile, and train the model

In [38]:
train.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
target                int32
dtype: object

In [39]:
# Confusion-matrix counts plus summary metrics for a binary classifier.
# NOTE(review): this list is not passed to `model.compile` in the next cell,
# which uses metrics=['accuracy']. That model is compiled with
# from_logits=True, so presumably these metrics would need logit-aware
# thresholds before being wired in - confirm.
keras_metrics = tf.keras.metrics
METRICS = [
    keras_metrics.TruePositives(name='tp'),
    keras_metrics.FalsePositives(name='fp'),
    keras_metrics.TrueNegatives(name='tn'),
    keras_metrics.FalseNegatives(name='fn'),
    keras_metrics.BinaryAccuracy(name='accuracy'),
    keras_metrics.Precision(name='precision'),
    keras_metrics.Recall(name='recall'),
    keras_metrics.AUC(name='auc'),
]

In [63]:
# Small fully connected classifier on top of the feature-column layer.
# The final Dense layer has no activation, so the model outputs raw logits;
# the loss is told so via from_logits=True.
model = tf.keras.Sequential([
  feature_layer,
  layers.Dense(128, activation='relu'),
  layers.Dense(128, activation='relu'),
  layers.Dropout(.1),
  layers.Dense(1)
])
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              # The 'accuracy' shortcut resolves to binary accuracy with a 0.5
              # cut-off, which is meant for probabilities. On logits the
              # equivalent decision boundary is 0 (sigmoid(0) == 0.5), so the
              # threshold is set explicitly. The metric keeps the name
              # 'accuracy', so `model.evaluate` still returns (loss, accuracy).
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)]
              )

In [58]:

# Train for 10 epochs, evaluating on the validation dataset during training.
model.fit(
    train_ds,
    epochs=10,
    validation_data=val_ds,
)

Epoch 1/10


To change all layers to have dtype float32 by default, call `tf.keras.backend.set_floatx('float32')`. To change just this layer, pass dtype='float32' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2bcffd6fcd0>

In [59]:
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

Accuracy 0.7931769722814499


In [60]:
# The model has a single output unit that emits one logit per row, so argmax
# over the last axis is always 0 and would label every customer "no churn".
# Threshold the logit instead: logit > 0 is the same as sigmoid(logit) > 0.5.
test_logits = model.predict(test_ds)
y_pred = (test_logits.ravel() > 0).astype('int64')
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [61]:
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_features_7 (DenseFeatu multiple                  0         
_________________________________________________________________
dense_27 (Dense)             multiple                  5888      
_________________________________________________________________
dense_28 (Dense)             multiple                  16512     
_________________________________________________________________
dropout_9 (Dropout)          multiple                  0         
_________________________________________________________________
dense_29 (Dense)             multiple                  129       
Total params: 22,529
Trainable params: 22,529
Non-trainable params: 0
_________________________________________________________________


Key point: You will typically see best results with deep learning with much larger and more complex datasets. When working with a small dataset like this one, we recommend using a decision tree or random forest as a strong baseline. The goal of this tutorial is not to train an accurate model, but to demonstrate the mechanics of working with structured data, so you have code to use as a starting point when working with your own datasets in the future.

# 2. Classify structured data using Keras Preprocessing Layers

In [22]:
import numpy as np
import pandas as pd
import tensorflow as tf

import sklearn as sk
from sklearn import model_selection,pipeline,compose,preprocessing
from sklearn.base import BaseEstimator, TransformerMixin # OOP python for feature engineering

from tensorflow import feature_column
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing # same name in sklearn api

In [23]:
# Download the Telco customer-churn CSV and preview its first rows.
path = 'https://raw.githubusercontent.com/rstudio/keras-customer-churn/master/data/WA_Fn-UseC_-Telco-Customer-Churn.csv'
dataframe = pd.read_csv(path)
dataframe.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [24]:
dataframe.shape

(7043, 21)

In [25]:
dataframe.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


## Create target variable

In [26]:
# Encode the label once: "Yes" (customer churned) -> 1, anything else -> 0.
# The guard keeps this cell safe to re-run: without it, a second run would
# compare the already-encoded integers with "Yes" and zero out every label.
if 'Churn' in dataframe.columns:
    dataframe = dataframe.rename(columns={'Churn': 'target'})
    dataframe = dataframe.assign(target=np.where(dataframe['target'] == "Yes", 1, 0))
# Drop the identifier column, which is not used as a feature
# (errors='ignore' tolerates a re-run where it has already been removed).
dataframe = dataframe.drop(columns=['customerID'], errors='ignore')
# TotalCharges is loaded as text; turn single-space placeholders into NaN
# and convert the column to float.
dataframe = dataframe.assign(
    TotalCharges=dataframe["TotalCharges"].replace(' ', np.nan).astype(float))
# Remove the rows that contain missing values.
dataframe = dataframe.dropna()
dataframe.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,target
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


In [27]:
# Fix the seed so the same rows land in each split on every run, and stratify
# on the label so train/validation/test all keep the same churn rate.
SPLIT_SEED = 42
train, test = model_selection.train_test_split(
    dataframe, test_size=0.2, random_state=SPLIT_SEED, stratify=dataframe['target'])
train, val = model_selection.train_test_split(
    train, test_size=0.2, random_state=SPLIT_SEED, stratify=train['target'])
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

4500 train examples
1125 validation examples
1407 test examples


Next, we will wrap the dataframes with tf.data. This will enable us to use feature columns as a bridge to map from the columns in the Pandas dataframe to features used to train the model. If we were working with a very large CSV file (so large that it does not fit into memory), we would use tf.data to read it from disk directly. That is not covered in this tutorial.

In [28]:
# Helper: turn a Pandas DataFrame into a batched tf.data.Dataset
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  """Wrap `dataframe` in a batched `tf.data.Dataset` of (features, label) pairs.

  The 'target' column is split off as the label and the remaining columns
  become a dict of per-column features. The caller's frame is not modified
  because the function works on a copy.
  """
  features = dataframe.copy()
  labels = features.pop('target')
  n_rows = len(features)
  dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
  if shuffle:
    # The shuffle buffer is as large as the number of rows.
    dataset = dataset.shuffle(buffer_size=n_rows)
  return dataset.batch(batch_size)

In [29]:
# Tiny batches keep the demonstration output readable.
batch_size = 5
# Only the training split is shuffled; validation and test keep their row order.
train_ds = df_to_dataset(train, shuffle=True, batch_size=batch_size)
val_ds, test_ds = (
    df_to_dataset(split, shuffle=False, batch_size=batch_size)
    for split in (val, test)
)

In [30]:
# Number of batches in the train, validation and test datasets, in that order
# (each batch holds up to `batch_size` rows).
for split_ds in (train_ds, val_ds, test_ds):
    print(len(list(split_ds)))

900
225
282


In [31]:
# Pull a single (features, labels) batch from the training stream and inspect it.
train_features, label_batch = next(iter(train_ds.take(1)))
print('Every feature:', list(train_features.keys()))
print('A batch of MonthlyCharges:', train_features['MonthlyCharges'])
print('A batch of targets:', label_batch )

Every feature: ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges']
A batch of MonthlyCharges: tf.Tensor([100.2   24.35  25.85  49.9   79.3 ], shape=(5,), dtype=float64)
A batch of targets: tf.Tensor([0 0 0 0 0], shape=(5,), dtype=int64)


Demonstrate several types of feature columns

In [32]:
# One batch of input features kept for the demonstrations below.
# The labels that come with the batch are not needed for the demos.
example_batch, _example_labels = next(iter(train_ds))

In [33]:
# Helper: push the example batch through a feature column and show the result
def demo(feature_column):
  """Print `example_batch` as transformed by `feature_column`.

  Relies on the notebook-level `example_batch` defined in an earlier cell.
  """
  dense_values = layers.DenseFeatures(feature_column)(example_batch)
  print(dense_values.numpy())

## Demonstrate the use of preprocessing layers.

The Keras preprocessing layers API allows you to build Keras-native input processing pipelines. You will use the following four preprocessing layers to demonstrate the feature preprocessing code.

*   [`Normalization`](https://www.tensorflow.org/api_docs/python/tf/keras/layers/experimental/preprocessing/Normalization) - Feature-wise normalization of the data.
*   [`CategoryEncoding`](https://www.tensorflow.org/api_docs/python/tf/keras/layers/experimental/preprocessing/CategoryEncoding) - Category encoding layer.
*   [`StringLookup`](https://www.tensorflow.org/api_docs/python/tf/keras/layers/experimental/preprocessing/StringLookup) - Maps strings from a vocabulary to integer indices.
*   [`IntegerLookup`](https://www.tensorflow.org/api_docs/python/tf/keras/layers/experimental/preprocessing/IntegerLookup) - Maps integers from a vocabulary to integer indices.

You can find a list of available preprocessing layers [here](https://www.tensorflow.org/api_docs/python/tf/keras/layers/experimental/preprocessing).

Numeric column

In [34]:
def get_normalization_layer(name, dataset):
  """Return a Normalization layer adapted to feature `name` of `dataset`.

  `dataset` must yield (features_dict, label) pairs; only the values stored
  under `name` are used to fit the layer.
  """
  # Keep just the requested feature; the labels are not needed here.
  values_only = dataset.map(lambda features, label: features[name])

  scaler = preprocessing.Normalization()
  # adapt() learns the statistics of the feature values.
  scaler.adapt(values_only)
  return scaler

In [35]:
# Fit a normalizer on the training dataset and apply it to one batch of MonthlyCharges.
MonthlyCharges = train_features['MonthlyCharges']
layer = get_normalization_layer(name='MonthlyCharges', dataset=train_ds)
layer(MonthlyCharges)

<tf.Tensor: shape=(5, 1), dtype=float32, numpy=
array([[ 1.1780951 ],
       [-1.3549694 ],
       [-1.3048759 ],
       [-0.5017089 ],
       [ 0.48012498]], dtype=float32)>

Category columns

In [36]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  """Build a callable that maps raw categorical values to encoded vectors.

  Args:
    name: Key of the feature to encode.
    dataset: Dataset whose elements are `(features, label)` pairs, where
      `features` can be indexed by `name`.
    dtype: `'string'` selects a `StringLookup` layer; any other value
      selects an `IntegerLookup` layer.
    max_tokens: Optional vocabulary cap forwarded to the lookup layer.

  Returns:
    A one-argument callable that applies the lookup layer and then the
    `CategoryEncoding` layer, both already adapted to `dataset`.
  """
  # Choose the lookup layer that turns raw values into integer indices.
  lookup = (
      preprocessing.StringLookup(max_tokens=max_tokens)
      if dtype == 'string'
      else preprocessing.IntegerLookup(max_values=max_tokens)
  )

  # Keep only the requested feature and learn its vocabulary from it.
  raw_feature_ds = dataset.map(lambda features, _label: features[name])
  lookup.adapt(raw_feature_ds)

  # The encoder's width equals the size of the learned vocabulary.
  one_hot = preprocessing.CategoryEncoding(max_tokens=lookup.vocab_size())

  # Adapt the encoder on the indices produced by the lookup layer.
  indexed_feature_ds = raw_feature_ds.map(lookup)
  one_hot.adapt(indexed_feature_ds)

  def encode(feature):
    # Closes over both layers so they can be reused in a functional model.
    return one_hot(lookup(feature))

  return encode

In [37]:
# Pull the raw `PhoneService` values out of `train_features` and display them
# before any encoding is applied.
PhoneService_col = train_features['PhoneService']
PhoneService_col

<tf.Tensor: shape=(5,), dtype=string, numpy=array([b'Yes', b'Yes', b'No', b'Yes', b'Yes'], dtype=object)>

In [38]:
# Build the string category encoder for `PhoneService` (max_tokens is left at
# its default of None, so no vocabulary cap is requested) and apply it to the
# raw values shown above.
# NOTE: this rebinds `layer`, which earlier held the normalization layer.
layer = get_category_encoding_layer('PhoneService', train_ds, 'string')
layer(PhoneService_col)

<tf.Tensor: shape=(5, 4), dtype=float32, numpy=
array([[0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.]], dtype=float32)>

Often, you don't want to feed a number directly into the model, but instead use a one-hot encoding of those inputs. Consider raw data that stores a category as an integer, such as the `SeniorCitizen` flag (0 or 1).

## Choose which columns to use
You have seen how to use several types of preprocessing layers. Now you will use them to train a model. You will be using [Keras-functional API](https://www.tensorflow.org/guide/keras/functional) to build the model. The Keras functional API is a way to create models that are more flexible than the [tf.keras.Sequential](https://www.tensorflow.org/api_docs/python/tf/keras/Sequential) API.

The goal of this tutorial is to show you the complete code (i.e., the mechanics) needed to work with preprocessing layers. All numeric and string columns other than the target are used to train our model.

Key point: If your aim is to build an accurate model, try a larger dataset of your own, and think carefully about which features are the most meaningful to include, and how they should be represented.

In [39]:
# Rebuild the three datasets with the batch size used for training. This
# rebinds `train_ds` for every cell below. Only the training set relies on
# df_to_dataset's default shuffle setting; validation and test explicitly
# disable shuffling.
batch_size = 256
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [41]:
# Feature frame: every column except the label. The label is dropped by name
# rather than by position (iloc[:, :-1]) so this stays correct even if
# 'target' is not the last column of `dataframe`.
dataframe_remove_target = dataframe.drop(columns=['target'])
# Names of the numeric feature columns; each one gets a Normalization layer.
numerical_column = list(dataframe_remove_target.select_dtypes('number').columns)

In [42]:
# `all_inputs` collects one Keras Input per feature; `encoded_features`
# collects the matching preprocessed tensor. Re-running this cell resets both.
all_inputs = []
encoded_features = []

# Numeric features: each column gets an Input of shape (1,) named after the
# column, plus its own Normalization layer adapted on train_ds.
for header in numerical_column:
  numeric_col = tf.keras.Input(shape=(1,), name=header)
  normalization_layer = get_normalization_layer(header, train_ds)
  encoded_numeric_col = normalization_layer(numeric_col)
  all_inputs.append(numeric_col)
  encoded_features.append(encoded_numeric_col)

In [None]:
# Categorical features encoded as integers.
#age_col = tf.keras.Input(shape=(1,), name='Age', dtype='int64')
#encoding_layer = get_category_encoding_layer('Age', train_ds, dtype='int64',
#                                             max_tokens=5)
#encoded_age_col = encoding_layer(age_col)
#all_inputs.append(age_col)
#encoded_features.append(encoded_age_col)

In [43]:
# Categorical features encoded as strings: every object-dtype column of the
# target-free frame gets a string Input and its own category encoder.
# NOTE(review): max_tokens=5 caps each column's vocabulary. If that cap also
# counts tokens the lookup layer reserves for itself, columns with more than
# a few distinct values would have their rarest values folded together -
# confirm this is intended for every column.
# NOTE: this cell appends to the lists created in the numeric-features cell,
# so re-running it on its own duplicates the categorical inputs.
categorical_column = list(dataframe_remove_target.select_dtypes('object').columns) # target column already removed
for header in categorical_column:
  categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='string')
  encoding_layer = get_category_encoding_layer(header, train_ds, dtype='string',
                                               max_tokens=5)
  encoded_categorical_col = encoding_layer(categorical_col)
  all_inputs.append(categorical_col)
  encoded_features.append(encoded_categorical_col)


In [44]:
# Display the Keras Input tensors gathered so far: numeric inputs first,
# then the string inputs, in the order they were appended.
all_inputs

[<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'SeniorCitizen')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'tenure')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'MonthlyCharges')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'TotalCharges')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'gender')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'Partner')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'Dependents')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'PhoneService')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'MultipleLines')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'InternetService')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'OnlineSecurity')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'OnlineBackup')>,
 <KerasTensor: shape=(None, 1) dtype=string (c

In [47]:
# Display the preprocessed tensors, in the same order as `all_inputs`.
encoded_features

[<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'normalization_4')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'normalization_5')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'normalization_6')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'normalization_7')>,
 <KerasTensor: shape=(None, 4) dtype=float32 (created by layer 'category_encoding_2')>,
 <KerasTensor: shape=(None, 4) dtype=float32 (created by layer 'category_encoding_3')>,
 <KerasTensor: shape=(None, 4) dtype=float32 (created by layer 'category_encoding_4')>,
 <KerasTensor: shape=(None, 4) dtype=float32 (created by layer 'category_encoding_5')>,
 <KerasTensor: shape=(None, 5) dtype=float32 (created by layer 'category_encoding_6')>,
 <KerasTensor: shape=(None, 5) dtype=float32 (created by layer 'category_encoding_7')>,
 <KerasTensor: shape=(None, 5) dtype=float32 (created by layer 'category_encoding_8')>,
 <KerasTensor: shape=(None, 5) dtype=float32 (cr

In [45]:
# Join every preprocessed feature into one tensor, then add a small head:
# Dense(32, relu) -> Dropout(0.5) -> Dense(1).
all_features = tf.keras.layers.concatenate(encoded_features)
x = tf.keras.layers.Dense(32, activation="relu")(all_features)
x = tf.keras.layers.Dropout(0.5)(x)
# The final layer has no activation, so the model outputs a raw logit.
output = tf.keras.layers.Dense(1)(x)
model = tf.keras.Model(all_inputs, output)
# from_logits=True matches the activation-free output layer above.
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=["accuracy"])

In [63]:
# Draw the model graph. rankdir='LR' lays the graph out horizontally.
# NOTE: this needs the `pydot` package and a graphviz install; without them
# the call only reports an import failure (see the recorded output below).
tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")

('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/download/), ', 'for `pydotprint` to work.')


In [64]:
# Train for 10 epochs on train_ds, using val_ds as the validation data.
model.fit(train_ds, epochs=10, validation_data=val_ds)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f815f40fa00>

In [65]:
# Evaluate on the held-out test set. evaluate() returns the loss followed by
# the compiled metrics, here just accuracy.
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

Accuracy 0.7761194109916687


In [66]:
# Save the trained model to the relative path 'churn_classifier', then load
# it back to check that the saved artifact can be used for inference.
model.save('churn_classifier')
reloaded_model = tf.keras.models.load_model('churn_classifier')

INFO:tensorflow:Assets written to: churn_classifier/assets


In [67]:
# One hypothetical customer, keyed by the model's input names.
sample = {
    'SeniorCitizen': 0, 'tenure': 20, 'MonthlyCharges': 30, 'TotalCharges': 100,
    'gender': 'Female', 'Partner': 'Yes', 'Dependents': 'Yes',
    'PhoneService': 'Yes', 'MultipleLines': 'Yes', 'InternetService': 'DSL',
    'OnlineSecurity': 'Yes', 'OnlineBackup': 'Yes', 'DeviceProtection': 'Yes',
    'TechSupport': 'Yes', 'StreamingTV': 'Yes', 'StreamingMovies': 'Yes',
    'Contract': 'One year', 'PaperlessBilling': 'Yes',
    'PaymentMethod': 'Mailed check',
}

# Wrap each value in a one-element list so every feature is a batch of size 1.
input_dict = {name: tf.convert_to_tensor([value]) for name, value in sample.items()}
predictions = reloaded_model.predict(input_dict)
# The model was compiled with from_logits=True, so its output is a logit;
# the sigmoid turns it into the probability of target == 1.
prob = tf.nn.sigmoid(predictions[0])
print(predictions, prob)
# The message previously described a pet being adopted (left over from the
# tutorial this notebook was adapted from); the model predicts customer churn.
# NOTE(review): assumes target == 1 encodes Churn == 'Yes' - confirm against
# the cell that builds `target`.
print(
    "This particular customer had a %.1f percent probability "
    "of churning." % (100 * prob)
)

[[-2.7634501]] tf.Tensor([0.05933154], shape=(1,), dtype=float32)
This particular pet had a 5.9 percent probability of getting adopted.
