# Predict customer retention by deep learning

# 1. Classify structured data with feature columns

## Dataset

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import sklearn
from sklearn import model_selection,pipeline,compose,preprocessing
from tensorflow import feature_column
from tensorflow.keras import layers
# OOP python for feature engineering
from sklearn.base import BaseEstimator, TransformerMixin

In [3]:
# Telco customer-churn CSV, read straight from the rstudio/keras-customer-churn repository.
path = (
    'https://raw.githubusercontent.com/rstudio/keras-customer-churn/'
    'master/data/WA_Fn-UseC_-Telco-Customer-Churn.csv'
)
dataframe = pd.read_csv(filepath_or_buffer=path)
dataframe.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# (number of rows, number of columns) of the frame as loaded
dataframe.shape

(7043, 21)

In [5]:
# Summary statistics of the numeric columns.
# NOTE(review): TotalCharges does not appear in the output, which suggests it was
# read as a non-numeric column -- confirm with dataframe.dtypes.
dataframe.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


## Create target variable

In [6]:
# The label already exists as the 'Churn' column; expose it under the name
# 'target' and drop the unused customer identifier.
# Written as one non-mutating chain, with errors='ignore' on the drop, so the
# cell can be re-run without raising a KeyError for the already-removed column.
dataframe = (
    dataframe
    .rename(columns={'Churn': 'target'})
    .drop(columns=['customerID'], errors='ignore')
)
# NOTE(review): TotalCharges is absent from the describe() output, so it appears
# to be stored as text; convert it (e.g. with pd.to_numeric(..., errors='coerce'))
# before using it as a numeric feature.
dataframe.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,target
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [7]:
# Hold out 20% of the rows for testing, then 20% of the remainder for validation.
# A fixed random_state makes the split, and everything derived from it,
# reproducible across kernel restarts.
SPLIT_SEED = 42
train, test = model_selection.train_test_split(
    dataframe, test_size=0.2, random_state=SPLIT_SEED)
train, val = model_selection.train_test_split(
    train, test_size=0.2, random_state=SPLIT_SEED)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

4507 train examples
1127 validation examples
1409 test examples


Next, we will wrap the dataframes with tf.data. This will enable us to use feature columns as a bridge to map from the columns in the Pandas dataframe to features used to train the model. If we were working with a very large CSV file (so large that it does not fit into memory), we would use tf.data to read it from disk directly. That is not covered in this tutorial.

In [8]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  """Turn a Pandas DataFrame into a batched tf.data dataset.

  The 'target' column is split off as the label; the remaining columns become a
  dict of per-column feature tensors. The caller's DataFrame is not modified.

  Args:
    dataframe: DataFrame that contains a 'target' column.
    shuffle: when true, shuffle with a buffer as large as the DataFrame.
    batch_size: number of rows per batch.

  Returns:
    A tf.data.Dataset yielding (features_dict, labels) batches.
  """
  features = dataframe.copy()  # work on a copy so pop() leaves the input intact
  labels = features.pop('target')
  dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
  if not shuffle:
    return dataset.batch(batch_size)
  return dataset.shuffle(buffer_size=len(features)).batch(batch_size)

In [9]:
# A small batch size is used for demonstration purposes.
batch_size = 5
# Only the training split is shuffled; validation and test keep their row order.
train_ds = df_to_dataset(train, shuffle=True, batch_size=batch_size)
val_ds = df_to_dataset(val, batch_size=batch_size, shuffle=False)
test_ds = df_to_dataset(test, batch_size=batch_size, shuffle=False)

In [10]:
# Number of batches in the train, validation and test datasets (one per line).
# With 4507 training rows and batch_size=5 this gives 901 full batches plus a
# final batch of 2, i.e. 902 batches.
print(*(len(list(ds)) for ds in (train_ds, val_ds, test_ds)), sep='\n')

902
226
282


In [11]:
# Pull a single (features, labels) batch from the training dataset for inspection.
train_features, label_batch = next(iter(train_ds.take(1)))
print('Every feature:', list(train_features.keys()))
print('A batch of MonthlyCharges:', train_features['MonthlyCharges'])
print('A batch of targets:', label_batch)

Every feature: ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges']
A batch of MonthlyCharges: tf.Tensor([106.    79.75  64.15  96.55  80.3 ], shape=(5,), dtype=float64)
A batch of targets: tf.Tensor([b'Yes' b'No' b'No' b'Yes' b'Yes'], shape=(5,), dtype=string)


Demonstrate several types of feature columns

In [12]:
# We will use this batch to demonstrate several types of feature columns
example_batch = next(iter(train_ds))[0]
example_batch # contain only x train, not include y train

{'Contract': <tf.Tensor: shape=(5,), dtype=string, numpy=
 array([b'Two year', b'Two year', b'Two year', b'Month-to-month',
        b'Month-to-month'], dtype=object)>,
 'Dependents': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'Yes', b'No', b'Yes', b'Yes', b'No'], dtype=object)>,
 'DeviceProtection': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'Yes', b'Yes', b'Yes', b'Yes', b'Yes'], dtype=object)>,
 'InternetService': <tf.Tensor: shape=(5,), dtype=string, numpy=
 array([b'DSL', b'Fiber optic', b'DSL', b'Fiber optic', b'Fiber optic'],
       dtype=object)>,
 'MonthlyCharges': <tf.Tensor: shape=(5,), dtype=float64, numpy=array([ 90.15, 105.  ,  38.5 , 110.75,  86.2 ])>,
 'MultipleLines': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'Yes', b'Yes', b'No phone service', b'Yes', b'No'], dtype=object)>,
 'OnlineBackup': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'Yes', b'No', b'Yes', b'Yes', b'No'], dtype=object)>,
 'OnlineSecurity': <tf.Tensor: shape=(5,), dty

In [13]:
def demo(feature_column):
  """Print what a feature column produces for the global `example_batch`.

  The column is wrapped in a DenseFeatures layer, applied to the example
  batch, and the resulting array is printed.
  """
  dense_output = layers.DenseFeatures(feature_column)(example_batch)
  print(dense_output.numpy())

### Numeric columns
The output of a feature column becomes the input to the model (using the demo function defined above, we will be able to see exactly how each column from the dataframe is transformed). A [numeric column](https://www.tensorflow.org/api_docs/python/tf/feature_column/numeric_column) is the simplest type of column. It is used to represent real valued features. When using this column, your model will receive the column value from the dataframe unchanged.

In [14]:
# Raw MonthlyCharges values of the example batch, before any feature-column transformation
example_batch['MonthlyCharges']

<tf.Tensor: shape=(5,), dtype=float64, numpy=array([ 90.15, 105.  ,  38.5 , 110.75,  86.2 ])>

In [35]:
# Numeric feature column keyed on the 'MonthlyCharges' entry of each batch
MonthlyCharges = feature_column.numeric_column('MonthlyCharges') # feature column from tensorflow
MonthlyCharges

NumericColumn(key='MonthlyCharges', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)

In [36]:
demo(MonthlyCharges) # print the numeric column's output for every row of example_batch

[[ 90.15]
 [105.  ]
 [ 38.5 ]
 [110.75]
 [ 86.2 ]]


### Bucketized columns
Often, you don't want to feed a number directly into the model, but instead split its value into different categories based on numerical ranges. Consider raw data that represents a customer's monthly charges. Instead of representing the charge as a numeric column, we could split it into several buckets using a [bucketized column](https://www.tensorflow.org/api_docs/python/tf/feature_column/bucketized_column). Notice the one-hot values below describe which charge range each row matches.

In [17]:
# Raw MonthlyCharges values again, for comparison with the bucketized output below
example_batch['MonthlyCharges']

<tf.Tensor: shape=(5,), dtype=float64, numpy=array([ 90.15, 105.  ,  38.5 , 110.75,  86.2 ])>

In [37]:
# Bucketize MonthlyCharges at the given boundaries; the demo output below shows
# four one-hot buckets for these three boundaries.
# NOTE(review): describe() reports MonthlyCharges between 18.25 and 118.75, so the
# outer buckets (below 1 and from 200 upwards) can never be hit -- consider
# boundaries that lie inside the observed range.
MonthlyCharges = feature_column.numeric_column('MonthlyCharges')
MonthlyCharges_buckets = feature_column.bucketized_column(MonthlyCharges, boundaries=[1, 50, 200])
MonthlyCharges_buckets

BucketizedColumn(source_column=NumericColumn(key='MonthlyCharges', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(1, 50, 200))

In [38]:
# One-hot bucket membership for each row of example_batch
demo(MonthlyCharges_buckets)

[[0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]]


In [20]:
# Second bucketization of MonthlyCharges with different boundaries.
# The source column used to be `age`, a name that is not defined anywhere before
# this cell (NameError on a fresh kernel). The recorded output corresponds to
# MonthlyCharges split at 10 / 50 / 100, so that column is used explicitly.
# The variable keeps its historical name so any cell referring to it still works.
age_buckets = feature_column.bucketized_column(MonthlyCharges, boundaries=[10, 50, 100])
demo(age_buckets)

[[0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]]


### Categorical columns
In this dataset, PhoneService is represented as a string (e.g. 'Yes' or 'No'). We cannot feed strings directly to a model. Instead, we must first map them to numeric values. The categorical vocabulary columns provide a way to represent strings as a one-hot vector (much like you have seen above with the MonthlyCharges buckets). The vocabulary can be passed as a list using [categorical_column_with_vocabulary_list](https://www.tensorflow.org/api_docs/python/tf/feature_column/categorical_column_with_vocabulary_list), or loaded from a file using [categorical_column_with_vocabulary_file](https://www.tensorflow.org/api_docs/python/tf/feature_column/categorical_column_with_vocabulary_file).

In [21]:
# Raw PhoneService strings of the example batch
example_batch['PhoneService']

<tf.Tensor: shape=(5,), dtype=string, numpy=array([b'Yes', b'Yes', b'No', b'Yes', b'Yes'], dtype=object)>

In [39]:
# Vocabulary column whose vocabulary is the set of values present in PhoneService.
PhoneService_type = feature_column.categorical_column_with_vocabulary_list(
    key='PhoneService',
    vocabulary_list=dataframe['PhoneService'].unique(),
)

PhoneService_type

VocabularyListCategoricalColumn(key='PhoneService', vocabulary_list=('No', 'Yes'), dtype=tf.string, default_value=-1, num_oov_buckets=0)

In [40]:
# Wrap the vocabulary column in an indicator column to obtain a one-hot encoding,
# then print it for the example batch
PhoneService_type_one_hot = feature_column.indicator_column(PhoneService_type)
demo(PhoneService_type_one_hot)

[[0. 1.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]]


### Embedding columns
Suppose instead of having just a few possible strings, we have thousands (or more) values per category. For a number of reasons, as the number of categories grows large, it becomes infeasible to train a neural network using one-hot encodings. We can use an embedding column to overcome this limitation. Instead of representing the data as a one-hot vector of many dimensions, an [embedding column](https://www.tensorflow.org/api_docs/python/tf/feature_column/embedding_column) represents that data as a lower-dimensional, dense vector in which each cell can contain any number, not just 0 or 1. The size of the embedding (4, in the example below) is a parameter that must be tuned.

Key point: using an embedding column is best when a categorical column has many possible values. We are using one here for demonstration purposes, so you have a complete example you can modify for a different dataset in the future.

In [24]:
# Raw MultipleLines strings of the example batch
example_batch['MultipleLines']

<tf.Tensor: shape=(5,), dtype=string, numpy=array([b'Yes', b'Yes', b'No phone service', b'Yes', b'No'], dtype=object)>

In [41]:
# An embedding column takes a categorical column as its input; here that is a
# vocabulary column built from the values present in MultipleLines.
MultipleLines = feature_column.categorical_column_with_vocabulary_list(
    key='MultipleLines',
    vocabulary_list=dataframe['MultipleLines'].unique(),
)
# Each category is mapped to a dense vector of length 4.
MultipleLines_embedding = feature_column.embedding_column(
    categorical_column=MultipleLines,
    dimension=4,
)
demo(MultipleLines_embedding)

[[-0.05753563  0.80201674  0.11358701  0.21236438]
 [-0.05753563  0.80201674  0.11358701  0.21236438]
 [ 0.3575344  -0.8090959   0.23449668  0.19561546]
 [-0.05753563  0.80201674  0.11358701  0.21236438]
 [-0.6946381   0.5753186   0.11128629  0.04255931]]


### Hashed feature columns

Another way to represent a categorical column with a large number of values is to use a [categorical_column_with_hash_bucket](https://www.tensorflow.org/api_docs/python/tf/feature_column/categorical_column_with_hash_bucket). This feature column calculates a hash value of the input, then selects one of the `hash_bucket_size` buckets to encode a string. When using this column, you do not need to provide the vocabulary, and you can choose to make the number of hash_buckets significantly smaller than the number of actual categories to save space.

Key point: An important downside of this technique is that there may be collisions in which different strings are mapped to the same bucket. In practice, this can work well for some datasets regardless.

In [42]:
# Hash each MultipleLines string into one of 10 buckets (no vocabulary required),
# then display the bucket assignment one-hot encoded
MultipleLines_hashed = feature_column.categorical_column_with_hash_bucket(
      'MultipleLines', hash_bucket_size=10)
demo(feature_column.indicator_column(MultipleLines_hashed))

[[0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]]


### Crossed feature columns
Combining features into a single feature, better known as [feature crosses](https://developers.google.com/machine-learning/glossary/#feature_cross), enables a model to learn separate weights for each combination of features. Here, we will create a new feature that is the cross of the MonthlyCharges buckets and PhoneService. Note that `crossed_column` does not build the full table of all possible combinations (which could be very large). Instead, it is backed by a `hashed_column`, so you can choose how large the table is.

In [45]:
# First input of the cross below: the bucketized MonthlyCharges column
demo(MonthlyCharges_buckets)

[[0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]]


In [47]:
# Second input of the cross below: the PhoneService column, shown one-hot encoded
PhoneService_type_one_hot = feature_column.indicator_column(PhoneService_type)
demo(PhoneService_type_one_hot)

[[0. 1.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]]


In [43]:
# Cross the bucketized MonthlyCharges column with the PhoneService vocabulary column;
# each combination is hashed into one of 10 buckets and displayed one-hot encoded
crossed_feature = feature_column.crossed_column([MonthlyCharges_buckets, PhoneService_type], hash_bucket_size=10)
demo(feature_column.indicator_column(crossed_feature))

[[0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]]


## Choose which columns to use
We have seen how to use several types of feature columns. Now we will use them to train a model. The goal of this tutorial is to show you the complete code (e.g. mechanics) needed to work with feature columns. We have selected a few columns to train our model below arbitrarily.

Key point: If your aim is to build an accurate model, try a larger dataset of your own, and think carefully about which features are the most meaningful to include, and how they should be represented.

In [None]:
feature_columns = []

# numeric cols
# These are the columns describe() reports as numeric in the Telco dataframe.
# TotalCharges is left out because it is not parsed as a number in this frame.
for header in ['SeniorCitizen', 'tenure', 'MonthlyCharges']:
  feature_columns.append(feature_column.numeric_column(header))

In [None]:
# bucketized cols
# tenure spans 0 to 72 in this dataset; the boundaries are its quartiles
# (25% / 50% / 75%) as reported by describe(), giving four buckets.
tenure = feature_column.numeric_column('tenure')
tenure_buckets = feature_column.bucketized_column(tenure, boundaries=[9, 29, 55])
feature_columns.append(tenure_buckets)

In [None]:
# indicator_columns
# String-valued columns of the Telco dataframe, each one-hot encoded through a
# vocabulary built from the values present in the data.
indicator_column_names = ['gender', 'Partner', 'Dependents', 'PhoneService',
                          'InternetService', 'OnlineSecurity', 'OnlineBackup',
                          'DeviceProtection', 'TechSupport', 'StreamingTV',
                          'StreamingMovies', 'Contract', 'PaperlessBilling',
                          'PaymentMethod']
for col_name in indicator_column_names:
  # Vocabulary column: maps each distinct string to an integer id.
  categorical_column = feature_column.categorical_column_with_vocabulary_list(
      col_name, dataframe[col_name].unique())
  # Indicator column: turns that id into a one-hot vector the model can consume.
  indicator_column = feature_column.indicator_column(categorical_column)
  feature_columns.append(indicator_column)

In [None]:
# embedding columns
# MultipleLines is represented by a learned dense vector instead of a one-hot
# encoding. With so few categories this is for demonstration only; embeddings
# pay off when a column has many distinct values.
MultipleLines = feature_column.categorical_column_with_vocabulary_list(
      'MultipleLines', dataframe.MultipleLines.unique())
MultipleLines_embedding = feature_column.embedding_column(MultipleLines, dimension=4)
feature_columns.append(MultipleLines_embedding)

In [None]:
# crossed columns
# Cross the bucketized MonthlyCharges column with the PhoneService vocabulary
# column (both created in the demonstration cells earlier in the notebook) so the
# model can learn a separate weight for each charge-range / phone-service pair.
# crossed_column needs the categorical columns themselves, not their indicator
# wrappers; the cross is wrapped in an indicator column before it is added.
charges_phone_feature = feature_column.crossed_column(
    [MonthlyCharges_buckets, PhoneService_type], hash_bucket_size=100)
feature_columns.append(feature_column.indicator_column(charges_phone_feature))