# Predict customer retention by deep learning

In [32]:
import numpy as np
import pandas as pd
import tensorflow as tf
import sklearn
from sklearn import model_selection,pipeline,compose,preprocessing
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
# OOP python for feature engineering
from sklearn.base import BaseEstimator, TransformerMixin

In [12]:
# Telco customer-churn table, read straight from the CSV hosted in the
# rstudio/keras-customer-churn GitHub repository (network access required).
path = "https://raw.githubusercontent.com/rstudio/keras-customer-churn/master/data/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(path)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [13]:
# Separate the label column from the feature columns.
target = "Churn"
X, y = df.drop(columns=target), df[target]
print(X.shape)
print(y.shape)

# First split: hold out 20% of all rows as the test set, keeping the
# class balance of the target (stratify) and a fixed seed for repeatability.
X_train_a, X_test, y_train_a, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2018,
    stratify=y,
)

# Second split: carve 20% of the remaining rows out as the validation set,
# which leaves roughly 64% train / 16% validation / 20% test overall.
X_train, X_val, y_train, y_val = model_selection.train_test_split(
    X_train_a,
    y_train_a,
    test_size=0.2,
    random_state=2018,
    stratify=y_train_a,
)

(7043, 20)
(7043,)


In [14]:
# Transformer that drops unwanted feature columns
class DropSomeColumns(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes the configured column(s) from a DataFrame.

    Parameters
    ----------
    feature_name : str or list of str
        Column label(s) handed to ``DataFrame.drop(columns=...)``.
    """

    def __init__(self, feature_name):
        self.feature_name = feature_name

    def fit(self, X: pd.DataFrame, y=None):
        """Do nothing and return ``self``; there is nothing to learn.

        ``y`` is optional so that ``fit(X)``, ``fit_transform(X)`` and
        ``Pipeline.fit(X)`` work: ``TransformerMixin.fit_transform`` only
        forwards ``y`` to ``fit`` when it is not None.
        """
        return self

    def transform(self, X: pd.DataFrame):
        """Return a new DataFrame without the configured column(s).

        ``DataFrame.drop`` already returns a new object, so the caller's
        frame is left untouched without an explicit copy.

        Raises
        ------
        KeyError
            If a configured column is not present in ``X``.
        """
        return X.drop(columns=self.feature_name)

In [15]:
# Turn blank strings in an object column into NaN and convert the column to float
class Convert_to_number(BaseEstimator, TransformerMixin):
    """Stateless transformer that casts text column(s) holding numbers to float.

    Parameters
    ----------
    feature_name : str or list of str
        Column label(s) to convert.
    """

    def __init__(self, feature_name):
        self.feature_name = feature_name

    def fit(self, X: pd.DataFrame, y=None):
        """Do nothing and return ``self``; there is nothing to learn.

        ``y`` is optional so that ``fit(X)``, ``fit_transform(X)`` and
        ``Pipeline.fit(X)`` work: ``TransformerMixin.fit_transform`` only
        forwards ``y`` to ``fit`` when it is not None.
        """
        return self

    def transform(self, X: pd.DataFrame):
        """Return a copy of ``X`` with the configured column(s) as float.

        Empty and whitespace-only strings (not just a single space) are
        treated as missing and become NaN before the cast.

        Raises
        ------
        ValueError
            If a remaining value cannot be parsed as a float.
        """
        X = X.copy()  # work on a copy so the caller's frame is not modified
        blank_to_nan = X[self.feature_name].replace(r'^\s*$', np.nan, regex=True)
        X[self.feature_name] = blank_to_nan.astype(float)
        return X

In [18]:
# Cleaning pipeline, applied in order:
#   1. drop the customerID column
#   2. convert the text column TotalCharges to float
pipe = pipeline.Pipeline(
    [
        ("DropSomeColumns", DropSomeColumns("customerID")),
        ("Convert_to_number", Convert_to_number("TotalCharges")),
    ]
)

In [19]:
# Fit the cleaning pipeline on the training split, then reuse the fitted
# pipeline for validation and test. Calling transform() on a pipeline that was
# never fitted is not supported by scikit-learn's estimator contract.
# y_train is passed along so that every step's fit(X, y) receives a target argument.
X_train = pipe.fit_transform(X_train, y_train)
X_val = pipe.transform(X_val)
X_test = pipe.transform(X_test)

In [20]:
# Names of the text (object-dtype) feature columns
categorical = X_train.select_dtypes(include='object').columns.tolist()
# Names of the numeric feature columns
numerical = X_train.select_dtypes(include='number').columns.tolist()

In [33]:
# Column-wise preprocessing: integer-encode the categorical columns and
# standardise the numerical ones. The transformed output lists the 'cat'
# columns first and the 'num' columns second, following the order below.
# Alternative for 'cat': sklearn.preprocessing.OneHotEncoder() (use the fully
# qualified name; the bare name `preprocessing` is rebound to the Keras module
# by the imports at the top of the notebook).
preprocessor = compose.ColumnTransformer(
    transformers=[
        ('cat', sklearn.preprocessing.OrdinalEncoder(), categorical),
        ('num', sklearn.preprocessing.StandardScaler(), numerical),
    ]
)

In [34]:
# Fit the encoder and the scaler on the training split only; the fitted
# preprocessor is what later cells use to transform every split.
preprocessor.fit(X_train)

ColumnTransformer(transformers=[('cat', OrdinalEncoder(),
                                 ['gender', 'Partner', 'Dependents',
                                  'PhoneService', 'MultipleLines',
                                  'InternetService', 'OnlineSecurity',
                                  'OnlineBackup', 'DeviceProtection',
                                  'TechSupport', 'StreamingTV',
                                  'StreamingMovies', 'Contract',
                                  'PaperlessBilling', 'PaymentMethod']),
                                ('num', StandardScaler(),
                                 ['SeniorCitizen', 'tenure', 'MonthlyCharges',
                                  'TotalCharges'])])

In [37]:
# Alternative: transform the data when the one-hot encoder is used instead
#cat_columns = preprocessor.named_transformers_['cat'].get_feature_names(categorical)
#columns = np.append(cat_columns, numerical)

# transform X_train, X_val, X_test
#X_train= pd.DataFrame(preprocessor.transform(X_train), columns=columns)
#X_val= pd.DataFrame(preprocessor.transform(X_val), columns=columns)
#X_test= pd.DataFrame(preprocessor.transform(X_test), columns=columns)

In [38]:
# Transform the splits with the fitted ordinal-encoder / scaler preprocessor.
# ColumnTransformer emits its output in transformer order (every categorical
# column first, then every numerical column), NOT in the input frame's column
# order, so the labels must be rebuilt in that same order. Reusing
# X_train.columns here would attach the wrong name to most columns.
columns = categorical + numerical
X_train = pd.DataFrame(preprocessor.transform(X_train), columns=columns)
X_val = pd.DataFrame(preprocessor.transform(X_val), columns=columns)
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=columns)

In [40]:
# Encode the target labels as integers; the mapping is learned from the
# training labels and then applied unchanged to every split.
le = sklearn.preprocessing.LabelEncoder()
le.fit(y_train)
y_train, y_val, y_test = (
    le.transform(y_train),
    le.transform(y_val),
    le.transform(y_test),
)