In [1]:
import os
import tempfile
import math
import pandas as pd
import numpy as np

import seaborn as sns
sns.set(font_scale = 1.5)

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, recall_score, \
                            precision_score, make_scorer, confusion_matrix, roc_curve, precision_recall_curve

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [3]:
# Read the raw customer-churn table from its relative path and preview the
# first five rows.
customer_data = pd.read_csv(
    filepath_or_buffer='../data-sources/customer-churn/customer-churn.csv'
)
customer_data.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# Capitalise the three lower-case column names; the frame is modified in place.
customer_data.rename(
    columns={
        'customerID': 'CustomerID',
        'gender': 'Gender',
        'tenure': 'Tenure',
    },
    inplace=True,
)

In [5]:
def ColumnTransformer(cell):
    """Convert a single SeniorCitizen flag into a 'Yes'/'No' label.

    Parameters
    ----------
    cell : scalar
        One value of the column: the raw 0/1 flag, or an already converted
        'No'/'Yes' label.

    Returns
    -------
    str
        'No' when ``cell`` equals 0 or is already 'No'; 'Yes' for anything else.

    Notes
    -----
    Accepting 'No' makes the conversion idempotent: re-running the cell on an
    already converted column no longer turns every 'No' into 'Yes'.

    A later cell defines another function with this same name, which replaces
    this one in the kernel namespace once that cell has run.
    """
    if cell in (0, 'No'):
        return 'No'
    return 'Yes'

# Replace every SeniorCitizen value with the label ColumnTransformer returns for it.
customer_data['SeniorCitizen'] = customer_data['SeniorCitizen'].map(ColumnTransformer)

In [6]:
def ColumnTransformer(cell):
    """Abbreviate a single PaymentMethod value to a short code.

    Parameters
    ----------
    cell : scalar
        One value of the column: a full payment-method label, or one of the
        short codes this function produces.

    Returns
    -------
    str
        'ElCh' for 'Electronic check', 'MaCh' for 'Mailed check', 'BaTr-A' for
        'Bank transfer (automatic)', and 'CrCa-A' for every other value.
        A value that is already a short code is returned unchanged.

    Notes
    -----
    Accepting the short codes makes the conversion idempotent: re-running the
    cell on an already abbreviated column no longer collapses every value
    into 'CrCa-A'.

    This definition reuses the name of the helper defined in an earlier cell
    and replaces it in the kernel namespace.
    """
    abbreviations = (
        ('Electronic check', 'ElCh'),
        ('Mailed check', 'MaCh'),
        ('Bank transfer (automatic)', 'BaTr-A'),
    )
    for full_label, short_code in abbreviations:
        # Match the long label or the code itself so a second pass is a no-op.
        if cell == full_label or cell == short_code:
            return short_code
    # Everything else (including an existing 'CrCa-A') keeps the original fallback.
    return 'CrCa-A'
    
# Replace every PaymentMethod value with the short code ColumnTransformer returns for it.
customer_data['PaymentMethod'] = customer_data['PaymentMethod'].map(ColumnTransformer)

In [7]:
# Remove the rows whose Tenure is 0.
# NOTE(review): presumably these are the rows whose TotalCharges cannot be
# parsed as a number — confirm against the raw file.
customer_data.drop(
    index=customer_data.loc[customer_data['Tenure'] == 0].index,
    inplace=True,
)

# Remove the CustomerID column from the frame.
customer_data.drop(columns='CustomerID', inplace=True)

# Cast TotalCharges to float.
customer_data['TotalCharges'] = customer_data['TotalCharges'].astype(float)

In [12]:
# Print the frame summary (index range, per-column non-null counts and dtypes)
# to check the result of the cleaning steps above.
customer_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Gender            7032 non-null   object 
 1   SeniorCitizen     7032 non-null   object 
 2   Partner           7032 non-null   object 
 3   Dependents        7032 non-null   object 
 4   Tenure            7032 non-null   int64  
 5   PhoneService      7032 non-null   object 
 6   MultipleLines     7032 non-null   object 
 7   InternetService   7032 non-null   object 
 8   OnlineSecurity    7032 non-null   object 
 9   OnlineBackup      7032 non-null   object 
 10  DeviceProtection  7032 non-null   object 
 11  TechSupport       7032 non-null   object 
 12  StreamingTV       7032 non-null   object 
 13  StreamingMovies   7032 non-null   object 
 14  Contract          7032 non-null   object 
 15  PaperlessBilling  7032 non-null   object 
 16  PaymentMethod     7032 non-null   object 


In [11]:
# Hold out 30% of the rows for testing, stratified on the Churn column.
# random_state pins the shuffle so the split, and everything later derived
# from train_data, is identical after a kernel restart.
train_data, test_data = train_test_split(
    customer_data,
    test_size=0.30,
    stratify=customer_data['Churn'],
    random_state=42,
)

print('Train dataset shape: {}'.format(train_data.shape))
print('Test dataset shape: {}'.format(test_data.shape))

Train dataset shape: (4922, 20)
Test dataset shape: (2110, 20)


In [14]:
# Columns treated as numeric features.
numeric_feature_names = ['Tenure', 'MonthlyCharges', 'TotalCharges']

# Vocabulary for each categorical column: its distinct training-set values, sorted.
# The same name is used for the dict key and the column lookup, so the keys
# always match the DataFrame columns. The hand-written dict had the keys
# 'PhoneServices' and 'StreamTV', which are not column names, and those wrong
# names flowed into categorical_feature_names / feature_names.
categorical_features_with_vocabulary = {
    column_name: sorted(train_data[column_name].unique())
    for column_name in (
        'Gender',
        'SeniorCitizen',
        'Partner',
        'Dependents',
        'PhoneService',
        'MultipleLines',
        'InternetService',
        'OnlineSecurity',
        'OnlineBackup',
        'DeviceProtection',
        'TechSupport',
        'StreamingTV',
        'StreamingMovies',
        'Contract',
        'PaperlessBilling',
        'PaymentMethod',
    )
}

categorical_feature_names = list(categorical_features_with_vocabulary.keys())

# Numeric features first, then categorical ones.
feature_names = numeric_feature_names + categorical_feature_names

# One default per DataFrame column, in column order: [0.0] for numeric
# features and ['NA'] for every other column (including the target).
column_defaults = [[0.0] if feature_name in numeric_feature_names else ['NA'] for feature_name in train_data.columns]

target_name = 'Churn'

target_labels = ['Yes', 'No']

In [16]:
# NOTE(review): the `experimental.preprocessing` import path is deprecated in
# newer TensorFlow releases, where the layer is exposed as
# `tensorflow.keras.layers.StringLookup` — confirm against the installed version.
from tensorflow.keras.layers.experimental.preprocessing import StringLookup

# Lookup layer for the target strings, built from target_labels with no mask
# token and no out-of-vocabulary slots (presumably 'Yes' -> 0 and 'No' -> 1,
# following the list order — confirm).
target_label_lookup = StringLookup(
    num_oov_indices=0,
    mask_token=None,
    vocabulary=target_labels,
)