In [1]:
import pandas as pd
import numpy as np
from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

from telco_pipeline import get_data_from_sql, peekatdata, split, encode

### Acquisition

In [2]:
# 1. Use function to bring in data using sql query
df = get_data_from_sql()

### Data Prep

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 24 columns):
payment_type_id             7043 non-null int64
internet_service_type_id    7043 non-null int64
contract_type_id            7043 non-null int64
customer_id                 7043 non-null object
gender                      7043 non-null object
senior_citizen              7043 non-null int64
partner                     7043 non-null object
dependents                  7043 non-null object
tenure                      7043 non-null int64
phone_service               7043 non-null object
multiple_lines              7043 non-null object
online_security             7043 non-null object
online_backup               7043 non-null object
device_protection           7043 non-null object
tech_support                7043 non-null object
streaming_tv                7043 non-null object
streaming_movies            7043 non-null object
paperless_billing           7043 non-null object
monthly_charges 

In [4]:
# 2. Create a function that returns info on dataframe
peekatdata(df)

DataFrame Shape:

(7043, 24)

Info about:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 24 columns):
payment_type_id             7043 non-null int64
internet_service_type_id    7043 non-null int64
contract_type_id            7043 non-null int64
customer_id                 7043 non-null object
gender                      7043 non-null object
senior_citizen              7043 non-null int64
partner                     7043 non-null object
dependents                  7043 non-null object
tenure                      7043 non-null int64
phone_service               7043 non-null object
multiple_lines              7043 non-null object
online_security             7043 non-null object
online_backup               7043 non-null object
device_protection           7043 non-null object
tech_support                7043 non-null object
streaming_tv                7043 non-null object
streaming_movies            7043 non-null object
paperless_billing     

In [5]:
# 1. Create a function to decide whether to bin data for the value counts

def df_value_counts(df):
    """Return a one-column DataFrame of distinct-value counts per column.

    For every column of ``df`` the number of distinct values is counted,
    with missing values (NaN) counted as a value of their own. The result
    is indexed by the column names of ``df`` and has a single column
    labelled ``0``.
    """
    def _n_distinct(column):
        # value_counts yields one row per distinct value; count() tallies those rows
        return column.value_counts(dropna=False).count()

    distinct_per_column = df.apply(_n_distinct)
    return pd.DataFrame(distinct_per_column)

valcount_df = df_value_counts(df)
valcount_df

Unnamed: 0,0
payment_type_id,4
internet_service_type_id,3
contract_type_id,3
customer_id,7043
gender,2
senior_citizen,2
partner,2
dependents,2
tenure,73
phone_service,2


In [6]:
# 2. decide if rows or columns should be dropped

# total charges has 11 values that are blank space
df["total_charges"].value_counts(dropna=False)

20.2       11
           11
19.75       9
20.05       8
19.9        8
19.65       8
19.55       7
45.3        7
19.45       6
20.25       6
20.15       6
20.45       5
20.3        5
20.35       4
69.9        4
75.3        4
69.6        4
20.5        4
50.15       4
19.85       4
70.6        4
74.7        4
44          4
19.2        4
44.4        4
19.4        4
49.9        4
19.95       4
69.95       4
20.4        4
19.5        4
69.65       4
19.3        4
86.05       3
74.9        3
44.75       3
35.9        3
1284.2      3
220.45      3
70.15       3
383.65      3
74.35       3
50.6        3
20          3
24.4        3
74.6        3
79.55       3
20.9        3
84.5        3
70.45       3
80.55       3
70.3        3
85          3
69.1        3
19.1        3
85.5        3
20.55       3
50.45       3
24.8        3
45.1        3
305.55      3
69.25       3
2317.1      3
69.55       3
19.25       3
45.7        3
55.7        3
70.1        3
20.1        3
25.25       3
50.75       3
74.3  

In [7]:
# function that replaces the blank values with NaN and
# returns the percent of missing values in each column
def percent_missing(df):
    """Blank out empty strings in ``df`` and report missing-value percentages.

    Side effect: ``df`` is modified in place -- every cell that is empty or
    contains only whitespace is replaced with NaN.

    Returns a Series, indexed by column name, holding the percentage
    (0-100) of rows that are null in each column after that replacement.
    """
    blank_pattern = r'^\s*$'  # matches empty or whitespace-only strings
    df.replace(to_replace=blank_pattern, value=np.nan, regex=True, inplace=True)
    null_counts = df.isnull().sum()
    row_total = len(df)
    return null_counts / row_total * 100

In [8]:
percent_missing(df)

payment_type_id             0.000000
internet_service_type_id    0.000000
contract_type_id            0.000000
customer_id                 0.000000
gender                      0.000000
senior_citizen              0.000000
partner                     0.000000
dependents                  0.000000
tenure                      0.000000
phone_service               0.000000
multiple_lines              0.000000
online_security             0.000000
online_backup               0.000000
device_protection           0.000000
tech_support                0.000000
streaming_tv                0.000000
streaming_movies            0.000000
paperless_billing           0.000000
monthly_charges             0.000000
total_charges               0.156183
churn                       0.000000
contract_type               0.000000
internet_service_type       0.000000
payment_type                0.000000
dtype: float64

In [9]:
# total_charges has ~0.16% of its values missing (the blank strings that
# percent_missing replaced with NaN). That is few enough to drop those rows.
#
# subset= restricts the drop to the column the decision was actually made
# about, so a null in any other column is not silently removed and would
# still show up in the check below.
# .copy() gives an independent frame: later cells assign new columns to df,
# which on a dropna result can trigger pandas' SettingWithCopyWarning.

df = df.dropna(subset=["total_charges"]).copy()
df.isnull().sum()

payment_type_id             0
internet_service_type_id    0
contract_type_id            0
customer_id                 0
gender                      0
senior_citizen              0
partner                     0
dependents                  0
tenure                      0
phone_service               0
multiple_lines              0
online_security             0
online_backup               0
device_protection           0
tech_support                0
streaming_tv                0
streaming_movies            0
paperless_billing           0
monthly_charges             0
total_charges               0
churn                       0
contract_type               0
internet_service_type       0
payment_type                0
dtype: int64

In [10]:
# create lists to pass into a prepare function

# columns to be converted to float dtype
float_cols = ["total_charges"]

# columns to be converted to object dtype
obj_cols = ["payment_type_id", "internet_service_type_id", "contract_type_id", "senior_citizen", "churn"]

# columns to be encoded; none have been chosen yet. This is an empty list
# rather than [""] so that looping over it is a no-op instead of a lookup of
# a column named "".
encode_cols = []

# columns to be scaled -- step 11 of Data Prep calls for scaling both
# monthly_charges and total_charges
scale_cols = ["monthly_charges", "total_charges"]

In [11]:
# 3. Recode the churn target as integers: "Yes" -> 1, "No" -> 0
churn_codes = {"Yes": 1, "No": 0}
df["churn"] = df["churn"].map(churn_codes)
# any value missing from churn_codes would have become NaN; check for nulls
df.isnull().sum()

payment_type_id             0
internet_service_type_id    0
contract_type_id            0
customer_id                 0
gender                      0
senior_citizen              0
partner                     0
dependents                  0
tenure                      0
phone_service               0
multiple_lines              0
online_security             0
online_backup               0
device_protection           0
tech_support                0
streaming_tv                0
streaming_movies            0
paperless_billing           0
monthly_charges             0
total_charges               0
churn                       0
contract_type               0
internet_service_type       0
payment_type                0
dtype: int64

In [12]:
# 4. Compute new feature tenure_year translating tenure from months to years
# TODO: decide how to handle fractional years (currently kept as a float); needs tweaking

df["tenure_year"] = df["tenure"] / 12

In [13]:
# 5. Create a function that adds a new column phone_id that captures
# phone_service and multiple_lines into a single int variable

def combine_phone(df):
    """Add an integer ``phone_id`` column summarising a customer's phone service.

    The code is derived from ``multiple_lines``, whose expected values
    already single out customers without phone service, so one column is
    enough to capture both phone_service and multiple_lines:

        0 = "No phone service"
        1 = "No"   (phone service, single line)
        2 = "Yes"  (phone service, multiple lines)

    ``df`` is modified in place and is also returned so the call can be
    chained or reassigned.

    Raises:
        ValueError: if ``multiple_lines`` holds a value other than the three
            listed above (including missing values).
    """
    phone_codes = {"No phone service": 0, "No": 1, "Yes": 2}
    phone_id = df["multiple_lines"].map(phone_codes)

    # map() leaves NaN for anything not in phone_codes; report those values
    # explicitly instead of failing later inside astype(int).
    unmapped = phone_id.isnull()
    if unmapped.any():
        unexpected = sorted(df.loc[unmapped, "multiple_lines"].astype(str).unique())
        raise ValueError("unexpected multiple_lines values: {}".format(unexpected))

    df["phone_id"] = phone_id.astype(int)
    return df

In [14]:
# 6. Add household_type_id, an int column combining partner and dependents:
#    0 = none, 1 = partner only, 2 = dependents only, 3 = partner & dependents
# partner and dependents are themselves overwritten with their integer codes
# (partner: 0/1, dependents: 0/2), so their sum gives the id directly.

household_codes = [
    ("dependents", {"Yes": 2, "No": 0}),
    ("partner", {"Yes": 1, "No": 0}),
]
for column, codes in household_codes:
    df[column] = df[column].map(codes).astype(int)

df["household_type_id"] = df["dependents"] + df["partner"]

In [15]:
# 7. Add a new column streaming_services of dtype int that combines
# info from streaming_tv and streaming_movies:
#    0 = none, 1 = streaming tv only, 2 = streaming movies only, 3 = both
# streaming_tv and streaming_movies are themselves overwritten with their
# integer codes; "No internet service" is coded the same as "No".

streaming_codes = [
    ("streaming_tv", {"Yes": 1, "No": 0, "No internet service": 0}),
    ("streaming_movies", {"Yes": 2, "No": 0, "No internet service": 0}),
]
for column, codes in streaming_codes:
    df[column] = df[column].map(codes).astype(int)

df["streaming_services"] = df["streaming_movies"] + df["streaming_tv"]

In [16]:
# 8. Add a column online_security_backup of dtype int that combines
# online_security and online_backup:
#    0 = none, 1 = online security only, 2 = online backup only, 3 = both
# online_security and online_backup are themselves overwritten with their
# integer codes; "No internet service" is coded the same as "No".

online_codes = [
    ("online_security", {"Yes": 1, "No": 0, "No internet service": 0}),
    ("online_backup", {"Yes": 2, "No": 0, "No internet service": 0}),
]
for column, codes in online_codes:
    df[column] = df[column].map(codes).astype(int)

df["online_security_backup"] = df["online_security"] + df["online_backup"]

In [17]:
# 9. Split the data 70/30 train/test using split function

train, test = split(df=df, target="churn", train_prop=.70, seed=123)

train.head()

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type,tenure_year,household_type_id,streaming_services,online_security_backup
5968,4,3,1,8439-LTUGF,Male,0,0,0,10,Yes,...,20.0,198.6,0,Month-to-month,,Credit card (automatic),0.833333,0,0,0
3585,1,2,1,5339-PXDVH,Male,0,0,0,4,Yes,...,90.65,367.95,0,Month-to-month,Fiber optic,Electronic check,0.333333,0,1,1
6987,3,3,3,9068-FHQHD,Female,0,1,2,40,Yes,...,20.15,777.35,0,Two year,,Bank transfer (automatic),3.333333,3,0,0
5060,3,2,2,9475-NNDGC,Male,0,1,0,71,Yes,...,113.15,7953.25,0,One year,Fiber optic,Bank transfer (automatic),5.916667,1,3,2
3992,3,2,1,7268-IGMFD,Male,1,0,0,18,Yes,...,93.9,1743.9,0,Month-to-month,Fiber optic,Bank transfer (automatic),1.5,0,1,0


In [20]:
# 10. Encode each non-numeric feature such that they are numeric using
# encode function
# NOTE(review): only contract_type is passed to encode in this cell; no other
# non-numeric column is encoded here.
col_name = "contract_type"

# NOTE(review): encode's return value is discarded, and the saved train.head()
# output below still shows contract_type as text ("Month-to-month", ...).
# Presumably encode returns the encoded frames rather than modifying
# train/test in place -- confirm against telco_pipeline.encode and capture
# its result.
encode(train, test, col_name)
train.head()

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type,tenure_year,household_type_id,streaming_services,online_security_backup
5968,4,3,1,8439-LTUGF,Male,0,0,0,10,Yes,...,20.0,198.6,0,Month-to-month,,Credit card (automatic),0.833333,0,0,0
3585,1,2,1,5339-PXDVH,Male,0,0,0,4,Yes,...,90.65,367.95,0,Month-to-month,Fiber optic,Electronic check,0.333333,0,1,1
6987,3,3,3,9068-FHQHD,Female,0,1,2,40,Yes,...,20.15,777.35,0,Two year,,Bank transfer (automatic),3.333333,3,0,0
5060,3,2,2,9475-NNDGC,Male,0,1,0,71,Yes,...,113.15,7953.25,0,One year,Fiber optic,Bank transfer (automatic),5.916667,1,3,2
3992,3,2,1,7268-IGMFD,Male,1,0,0,18,Yes,...,93.9,1743.9,0,Month-to-month,Fiber optic,Bank transfer (automatic),1.5,0,1,0


In [None]:
# 11. Scale monthly_charges and total_charges


### Data Exploration

In [None]:
# 1. Could the month they signed up influence churn? 
# (Plot the rate of churn on a line chart where x is the tenure 
# and y is the rate of churn (customers churned/total customers)).



In [None]:
# 2. Are there features that indicate higher propensity to churn?


In [None]:
# 3. Is there a price threshold for specific services where the likelihood
# of churn increases? What services and at what price point?


In [None]:
# 4. Looking at churn rate for month-to-month vs. 1-year contract customers
# after their 12th month of service, is the rate of churn different?


In [None]:
# 5. Use a t-test to find out if the monthly charges of those who have
# churned are significantly higher than those who have not. Control for:
# (phone_id, internet_service_type_id, online_security_backup, device_protection, 
# tech_support, and contract_type_id)


In [None]:
# 6. Perform a correlation test, stating hypothesis and conclusion clearly
# that states if monthly charges can be explained by internet_service_type


In [None]:
# 7. 


In [None]:
# 8. Create visualizations exploring interactions of variables (independent
# with independent and independent with dependent). The goal is to identify
# features that are related to churn, identify integrity issues, understand
# how the data works.


In [None]:
# 9. 


In [None]:
# 10. 


### Modeling

In [None]:
# 1. Feature selection: can you remove any features that provide limited
# to no additional info?


In [None]:
# 2. Train (fit, transform, evaluate) multiple models and select the best
# performing model.


In [None]:
# 3. Compare eval metrics across all the models and select best performing


In [None]:
# 4. Test the final model (transform, evaluate) on your out-of-sample data.
# Summarize the performance, interpret results.
