# In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix


# Load the Telco customer churn data set (read once; the path is relative to
# the working directory).
df = pd.read_csv('input/telco_customer_churn.csv')

# Drop customerID, since it is not related to customer churn.
df.drop(columns=['customerID'], inplace=True)

# Replace white space inside the string cell values with underscores. This
# rewrites the values, not the column names; the one-hot encoded column names
# created further down are derived from these values, so this keeps them free
# of spaces for later plotting of the tree.
df.replace(' ', '_', regex=True, inplace=True)

# A column whose dtype does not match what it should contain is a hint that
# it holds missing or malformed entries, so start by looking at the dtypes.
df.dtypes

# List the distinct values of every object-typed column. Shorter column names
# get more tabs so that the value lists roughly line up in the output.
for col in df.select_dtypes(include='object'):
    tabs = (
        '\t\t\t' if len(col) < 7
        else '\t\t' if len(col) < 15
        else '\t'
    )
    print(f'{col}:{tabs}{df[col].unique()}')

# TotalCharges looks like it might contain missing data points.
df['TotalCharges'].unique()

# Converting TotalCharges directly raises a ValueError because some entries
# are not numeric. The error is caught and printed so that it is still shown,
# but the rest of the script keeps running instead of aborting here.
try:
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'])
except ValueError as err:
    print(f'Could not convert TotalCharges to numeric: {err}')

# The offending entries are '_' rather than a numeric value (presumably blank
# cells that the earlier white-space replacement turned into underscores).
# Build the row mask once and reuse it below.
blank_total_charges = df['TotalCharges'] == '_'

# Number of affected rows, followed by the rows themselves.
len(df.loc[blank_total_charges])

df.loc[blank_total_charges]

# We see that all of the rows with TotalCharges = '_' also have tenure == 0.
# This means that they are new customers who have not yet been charged for anything.
# For this reason, it is safe to rewrite the '_' values to 0.
df.loc[blank_total_charges, 'TotalCharges'] = 0.0

# Checking the TotalCharges column of tenure == 0 customers, we see that it has now been rewritten to 0.
df.loc[df['tenure'] == 0]

# With the '_' placeholders gone, the conversion to a numeric dtype succeeds.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'])
df.info()

# Encode every 'Yes'/'No' column as 1/0: 'Yes' becomes 1, anything else 0.
# df['Churn'].unique()
cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
for col in cols:
    # np.where already yields an integer array with one entry per row, so it
    # can be assigned straight back to the column.
    df[col] = np.where(df[col].eq('Yes'), 1, 0)

df.head()

# Same treatment for gender: 'Male' becomes 1, anything else 0.
df['gender'] = np.where(df['gender'].eq('Male'), 1, 0)

# Split the data into the independent variables X (everything except Churn)
# and the dependent variable y (Churn).
X = df.drop(columns='Churn').copy()
X.head()

y = df['Churn'].copy()
y.head()

# Next, handle the categorical columns in X. One-hot encoding is suitable for
# trees and is preferable to mapping categories to e.g. 1, 2, 3, 4, because it
# does not imply that 1 is closer to 2 than it is to 3 and 4.
X.info()

# The remaining object-typed columns that are not binary are one-hot encoded.
cols = [
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaymentMethod',
]
X_encoded = pd.get_dummies(data=X, columns=cols)
X_encoded.head()

# Check which labels the target contains.
y.unique()



