In [1]:
import pandas as pd
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import tree

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Get churn data from customers table
df = pd.read_csv('telco_churn_customers.csv')

In [3]:
df.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service_type_id,online_security,...,device_protection,tech_support,streaming_tv,streaming_movies,contract_type_id,paperless_billing,payment_type_id,monthly_charges,total_charges,churn
0,0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,1,No,...,No,Yes,Yes,No,2,Yes,2,65.6,593.3,No
1,0003-MKNFE,Male,0,No,No,9,Yes,Yes,1,No,...,No,No,No,Yes,1,No,2,59.9,542.4,No
2,0004-TLHLJ,Male,0,No,No,4,Yes,No,2,No,...,Yes,No,No,No,1,Yes,1,73.9,280.85,Yes
3,0011-IGKFF,Male,1,Yes,No,13,Yes,No,2,No,...,Yes,No,Yes,Yes,1,Yes,1,98.0,1237.85,Yes
4,0013-EXCHZ,Female,1,Yes,No,3,Yes,No,2,No,...,No,Yes,Yes,No,1,Yes,2,83.9,267.4,Yes


Write a function, peekatdata(dataframe), that takes a dataframe as input and computes and returns the following:

creates dataframe object head_df (df of the first 5 rows) and prints contents to screen

creates dataframe object tail_df (df of the last 5 rows) and prints contents to screen

creates tuple object shape_tuple (tuple of (nrows, ncols)) and prints tuple to screen

creates dataframe object describe_df (summary statistics of all numeric variables) and prints contents to screen.

prints to screen the information about a DataFrame including the index dtype and column dtypes, non-null values and memory usage.

In [4]:
def peekatdata(dataframe):
    head_df = df.head()
    tail_df = df.tail()
    head_tail = head_df.append(tail_df)
    print('Head and Tail\n\n', head_tail)
    print('--'*55)
    shape_tuple = df.shape
    print('Shape\n\n', shape_tuple)
    print('--'*20)
    describe_df = df.describe()
    print('Describe\n\n', describe_df)
    print('--'*20)
    print('Index\n\n', df.index)
    print('--'*20)
    print('Data Types\n\n', df.dtypes)
    print('--'*20)
    print('Null Value Count\n\n', df.isnull().sum())
    print('--'*55)
    print('Memory\n\n', df.memory_usage)

In [16]:
# peekatdata(df)

In [6]:
# Some of the total charges have empty space string ' ', replace with NaN
df.replace(' ', np.nan, inplace=True)

# Replace NaN with 0
df.fillna(0, inplace=True)

# Turn the column from type object to float
df["total_charges"] = df.total_charges.astype(float)

In [17]:
# Write a function, df_value_counts(dataframe), that takes a dataframe as input and computes and returns the values 
# by frequency for each column. The function should decide whether or not to bin the data for the value counts.

def df_value_counts(df):
    for col in df.columns: 
        n = df[col].unique().shape[0] 
        col_bins = min(n,10) 
        if df[col].dtype in ['int64','float64'] and n > 10:
            print('%s:' % col)
            print(df[col].value_counts(bins=col_bins, sort=False)) 
        else: 
            print(df[col].value_counts()) 
        print('\n')

# df_value_counts(df)

In [8]:
# Transform churn such that "yes" = 1 and "no" = 0

def transform_churn(df):
    df['churn'] = df['churn'].replace({'Yes': 1, 'No': 0})
    return df

transform_churn(df).head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service_type_id,online_security,...,device_protection,tech_support,streaming_tv,streaming_movies,contract_type_id,paperless_billing,payment_type_id,monthly_charges,total_charges,churn
0,0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,1,No,...,No,Yes,Yes,No,2,Yes,2,65.6,593.3,0
1,0003-MKNFE,Male,0,No,No,9,Yes,Yes,1,No,...,No,No,No,Yes,1,No,2,59.9,542.4,0
2,0004-TLHLJ,Male,0,No,No,4,Yes,No,2,No,...,Yes,No,No,No,1,Yes,1,73.9,280.85,1
3,0011-IGKFF,Male,1,Yes,No,13,Yes,No,2,No,...,Yes,No,Yes,Yes,1,Yes,1,98.0,1237.85,1
4,0013-EXCHZ,Female,1,Yes,No,3,Yes,No,2,No,...,No,Yes,Yes,No,1,Yes,2,83.9,267.4,1


In [9]:
# Compute a new feature, tenure_year, that is a result of translating tenure from months to years.

def tenure_year(df):
    tenure_year = (df.tenure / 12)
    df['tenure_year']= tenure_year.round(2) # creates new column for above
    return df

tenure_year(df).head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service_type_id,online_security,...,tech_support,streaming_tv,streaming_movies,contract_type_id,paperless_billing,payment_type_id,monthly_charges,total_charges,churn,tenure_year
0,0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,1,No,...,Yes,Yes,No,2,Yes,2,65.6,593.3,0,0.75
1,0003-MKNFE,Male,0,No,No,9,Yes,Yes,1,No,...,No,No,Yes,1,No,2,59.9,542.4,0,0.75
2,0004-TLHLJ,Male,0,No,No,4,Yes,No,2,No,...,No,No,No,1,Yes,1,73.9,280.85,1,0.33
3,0011-IGKFF,Male,1,Yes,No,13,Yes,No,2,No,...,No,Yes,Yes,1,Yes,1,98.0,1237.85,1,1.08
4,0013-EXCHZ,Female,1,Yes,No,3,Yes,No,2,No,...,Yes,Yes,No,1,Yes,2,83.9,267.4,1,0.25


In [10]:
# Figure out a way to capture the information contained in dependents and partner into a single variable of dtype int.
# Transform the data and place in a new column household_type_id.

def household_type_id(df):
    df["household_type_id"] = df["partner"].map(str) + df["dependents"]
    df['household_type_id'] = df['household_type_id'].replace({'YesYes': 3, 'NoNo': 0, 'YesNo': 2, 'NoYes': 1})
    return df

household_type_id(df).head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service_type_id,online_security,...,streaming_tv,streaming_movies,contract_type_id,paperless_billing,payment_type_id,monthly_charges,total_charges,churn,tenure_year,household_type_id
0,0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,1,No,...,Yes,No,2,Yes,2,65.6,593.3,0,0.75,3
1,0003-MKNFE,Male,0,No,No,9,Yes,Yes,1,No,...,No,Yes,1,No,2,59.9,542.4,0,0.75,0
2,0004-TLHLJ,Male,0,No,No,4,Yes,No,2,No,...,No,No,1,Yes,1,73.9,280.85,1,0.33,0
3,0011-IGKFF,Male,1,Yes,No,13,Yes,No,2,No,...,Yes,Yes,1,Yes,1,98.0,1237.85,1,1.08,2
4,0013-EXCHZ,Female,1,Yes,No,3,Yes,No,2,No,...,Yes,No,1,Yes,2,83.9,267.4,1,0.25,2


In [11]:
# Figure out a way to capture the information contained in streaming_tv and streaming_movies into a single
# variable of dtype int. Transform the data and place in a new column streaming_services.

def streaming_services(df):
    df["streaming_services"] = df["partner"].map(str) + df["dependents"]
    df['streaming_services'] = df['streaming_services'].replace({'YesYes': 3, 'NoNo': 0, 'YesNo': 2, 'NoYes': 1})
    return df

streaming_services(df).head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service_type_id,online_security,...,streaming_movies,contract_type_id,paperless_billing,payment_type_id,monthly_charges,total_charges,churn,tenure_year,household_type_id,streaming_services
0,0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,1,No,...,No,2,Yes,2,65.6,593.3,0,0.75,3,3
1,0003-MKNFE,Male,0,No,No,9,Yes,Yes,1,No,...,Yes,1,No,2,59.9,542.4,0,0.75,0,0
2,0004-TLHLJ,Male,0,No,No,4,Yes,No,2,No,...,No,1,Yes,1,73.9,280.85,1,0.33,0,0
3,0011-IGKFF,Male,1,Yes,No,13,Yes,No,2,No,...,Yes,1,Yes,1,98.0,1237.85,1,1.08,2,2
4,0013-EXCHZ,Female,1,Yes,No,3,Yes,No,2,No,...,No,1,Yes,2,83.9,267.4,1,0.25,2,2


In [12]:
# Figure out a way to capture the information contained in phone_service and multiple_lines into a single variable
# of dtype int. Write a function that will transform the data and place in a new column named phone_id.
def phone_info(df):
    df["phone_id"] = df["phone_service"].map(str) + df["multiple_lines"]
    df['phone_id'] = df['phone_id'].replace({'YesYes': 2, 'NoNo phone service': 0, 'YesNo': 1})
    return df

phone_info(df).head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service_type_id,online_security,...,contract_type_id,paperless_billing,payment_type_id,monthly_charges,total_charges,churn,tenure_year,household_type_id,streaming_services,phone_id
0,0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,1,No,...,2,Yes,2,65.6,593.3,0,0.75,3,3,1
1,0003-MKNFE,Male,0,No,No,9,Yes,Yes,1,No,...,1,No,2,59.9,542.4,0,0.75,0,0,2
2,0004-TLHLJ,Male,0,No,No,4,Yes,No,2,No,...,1,Yes,1,73.9,280.85,1,0.33,0,0,1
3,0011-IGKFF,Male,1,Yes,No,13,Yes,No,2,No,...,1,Yes,1,98.0,1237.85,1,1.08,2,2,1
4,0013-EXCHZ,Female,1,Yes,No,3,Yes,No,2,No,...,1,Yes,2,83.9,267.4,1,0.25,2,2,1


In [13]:
# Figure out a way to capture the information contained in online_security and online_backup into a single variable 
# of dtype int. Transform the data and place in a new column online_security_backup.
def online_security_info(df):
    df["online_security"].map(str) + df["online_backup"]
    df['online_security'] = df['online_security'].replace({'No internet serviceNo internet service': 0,
                                                           'NoNo': 1, 
                                                           'NoYes': 2,
                                                           'YesNo': 3,
                                                           'YesYes': 4})
    return df

online_security_info(df).head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service_type_id,online_security,...,contract_type_id,paperless_billing,payment_type_id,monthly_charges,total_charges,churn,tenure_year,household_type_id,streaming_services,phone_id
0,0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,1,No,...,2,Yes,2,65.6,593.3,0,0.75,3,3,1
1,0003-MKNFE,Male,0,No,No,9,Yes,Yes,1,No,...,1,No,2,59.9,542.4,0,0.75,0,0,2
2,0004-TLHLJ,Male,0,No,No,4,Yes,No,2,No,...,1,Yes,1,73.9,280.85,1,0.33,0,0,1
3,0011-IGKFF,Male,1,Yes,No,13,Yes,No,2,No,...,1,Yes,1,98.0,1237.85,1,1.08,2,2,1
4,0013-EXCHZ,Female,1,Yes,No,3,Yes,No,2,No,...,1,Yes,2,83.9,267.4,1,0.25,2,2,1


In [14]:
# Split the data into train (70%) & test (30%) samples.
X = df.drop(['churn'], axis=1)
y = df[['churn']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=123)

X_train.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service_type_id,online_security,...,streaming_movies,contract_type_id,paperless_billing,payment_type_id,monthly_charges,total_charges,tenure_year,household_type_id,streaming_services,phone_id
1479,2187-PKZAY,Male,0,No,No,12,Yes,No,2,No,...,Yes,1,Yes,3,79.95,1043.4,1.0,0,0,1
2377,3402-XRIUO,Female,1,Yes,No,22,Yes,Yes,1,Yes,...,No,1,Yes,2,63.55,1381.8,1.83,2,2,2
6613,9397-TZSHA,Female,0,No,No,69,Yes,Yes,3,No internet service,...,No internet service,3,No,4,24.6,1678.05,5.75,0,0,2
6468,9153-BTBVV,Female,0,Yes,No,71,Yes,Yes,3,No internet service,...,No internet service,3,No,3,25.0,1753.0,5.92,2,2,2
2668,3793-MMFUH,Female,1,No,No,13,Yes,Yes,2,No,...,Yes,1,Yes,1,95.05,1290.0,1.08,0,0,2


In [15]:
# Variable Encoding: encode the values in each non-numeric feature such that they are numeric.
