# Data Cleaning
This notebook will read in the Telco_customer_churn.xlsx file in the data
directory and perform the following operations:
1. Remove irrelevant features.
1. Update Yes/No columns to use 0/1.
1. Save the mean and standard deviation of each numeric column to data/mean_std.csv, then standardize those columns for training the model. The saved values allow new data points to be scaled the same way before predictions are made for them.
1. Create one-hot encodings for categorical values.
1. Save the cleaned dataset in data/cleaned_telco_data.csv.

In [26]:
import pandas as pd

In [27]:
# Directory (relative to this notebook) that holds the input and output files
data_dir = "../data"

# Read the raw churn spreadsheet into a DataFrame
ibm_churn_df = pd.read_excel(f"{data_dir}/Telco_customer_churn.xlsx")

# Show (rows, columns) of the loaded data as a quick sanity check
print(ibm_churn_df.shape)

(7043, 33)


In [28]:
# Columns that are either not relevant to the classification or redundant
# with other columns; they are removed before any further cleaning.
# NOTE(review): the other location columns are dropped here, but a
# 'Zip Code' column, if the sheet has one, would be kept as a feature --
# confirm that is intended.
features_to_drop = [
    'CustomerID', 'Count', 'Country', 'State', 'City', 'Lat Long',
    'Latitude', 'Longitude', 'Total Charges', 'Churn Label',
    'Churn Score', 'Churn Reason',
]

# Build the working frame in one chain: drop the unwanted columns, then
# rename 'Churn Value' to the shorter 'Churn' for ease of use.
churn_df = (
    ibm_churn_df
    .drop(columns=features_to_drop)
    .rename(columns={'Churn Value': 'Churn'})
)

In [29]:
# Columns whose values are the labels "Yes" / "No"; encoded as 1 / 0 below
yes_no_features = ['Senior Citizen', 'Dependents', 'Partner', 'Paperless Billing',
                   'Phone Service']

# Categorical columns with more than two labels; one-hot encoded later
multi_cat_features = ['Gender', 'Contract', 'Payment Method', 'Multiple Lines',
                      'Online Security', 'Online Backup', 'Device Protection', 'Tech Support',
                      'Internet Service', 'Streaming TV', 'Streaming Movies']

# Numeric columns that get standardized later
numeric_features = ['Monthly Charges', 'Tenure Months', 'CLTV']

# Change the "Yes" to a 1, and "No" to a 0.
# The labels are always read from the untouched ibm_churn_df rather than from
# churn_df, so re-running this cell gives the same result instead of mapping
# the already-encoded 0/1 values to NaN.
yes_no_mapping = {'Yes': 1, 'No': 0}
for col in yes_no_features:
  raw_labels = ibm_churn_df.loc[churn_df.index, col]

  # Series.map turns any label missing from the mapping into NaN without
  # warning, so fail loudly if a non-null label is not "Yes" or "No".
  unexpected = raw_labels[raw_labels.notna() & ~raw_labels.isin(list(yes_no_mapping))]
  if not unexpected.empty:
    raise ValueError(
        f"Column {col!r} contains values other than 'Yes'/'No': "
        f"{sorted(map(str, unexpected.unique()))}"
    )

  churn_df[col] = raw_labels.map(yes_no_mapping)

In [30]:
# Scale our numeric values, saving off the mean and standard deviation so that
# we can make predictions about new data
#
# The statistics are always taken from the untouched ibm_churn_df rather than
# from churn_df. Otherwise, re-running this cell would measure the
# already-scaled columns and overwrite mean_std.csv with mean ~0 / std ~1,
# losing the values needed to scale new data points.
raw_numeric_df = ibm_churn_df.loc[churn_df.index, numeric_features]

# One record per numeric column: its mean, its standard deviation (pandas'
# default for Series.std, i.e. the sample standard deviation), and its name.
mean_std_list = []
for f in numeric_features:
  mean_std_list.append({
      'mean': raw_numeric_df[f].mean(),
      'std': raw_numeric_df[f].std(),
      'name': f
  })

mean_std_df = pd.DataFrame(mean_std_list)
mean_std_df.to_csv(data_dir + "/mean_std.csv")

# Scale the numeric values using exactly the statistics that were saved.
# The arithmetic is vectorized over each column, so the mean and standard
# deviation are computed once per column rather than once per row.
for stats in mean_std_list:
  f = stats['name']
  churn_df[f] = (raw_numeric_df[f] - stats['mean']) / stats['std']

In [31]:
# Replace each multi-category column with integer (0/1) indicator columns,
# one per category label
churn_df = pd.get_dummies(data=churn_df, columns=multi_cat_features, dtype=int)

# Write the fully cleaned dataset next to the raw data
churn_df.to_csv(f"{data_dir}/cleaned_telco_data.csv")