# Explore Telco Dataset

In [1]:
import pandas as pd
import numpy as np

###  Load Telco Customer Churn Data
The dataset can be retrieved at https://www.kaggle.com/blastchar/telco-customer-churn/data

In [2]:
# Make sure the data is within the same folder as the Jupyter Notebook file
df = pd.read_csv("Data/WA_Fn-UseC_-Telco-Customer-Churn.csv",index_col=0)
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 7043 entries, 7590-VHVEG to 3186-AJIEK
Data columns (total 20 columns):
gender              7043 non-null object
SeniorCitizen       7043 non-null int64
Partner             7043 non-null object
Dependents          7043 non-null object
tenure              7043 non-null int64
PhoneService        7043 non-null object
MultipleLines       7043 non-null object
InternetService     7043 non-null object
OnlineSecurity      7043 non-null object
OnlineBackup        7043 non-null object
DeviceProtection    7043 non-null object
TechSupport         7043 non-null object
StreamingTV         7043 non-null object
StreamingMovies     7043 non-null object
Contract            7043 non-null object
PaperlessBilling    7043 non-null object
PaymentMethod       7043 non-null object
MonthlyCharges      7043 non-null float64
TotalCharges        7043 non-null object
Churn               7043 non-null object
dtypes: float64(1), int64(2), object(17)
memory usage: 1.1+ MB
N

Unnamed: 0_level_0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
customerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### Reflections  
Most of the columns are of type "Object" as they contain strings. They are *all* of categorical nature, and 12 of them are boolean categories (Yes/No). 

Three columns are of numerical nature `SeniorCitizen`(int64), `tenure`(int64) `MonthlyCharges`(float64). Surprisingly, `TotalCharges` is not a *float64*, but an Object. 

In [5]:
#There are no NAs. "count" returns 7043 for all features
df.count()

gender              7043
SeniorCitizen       7043
Partner             7043
Dependents          7043
tenure              7043
PhoneService        7043
MultipleLines       7043
InternetService     7043
OnlineSecurity      7043
OnlineBackup        7043
DeviceProtection    7043
TechSupport         7043
StreamingTV         7043
StreamingMovies     7043
Contract            7043
PaperlessBilling    7043
PaymentMethod       7043
MonthlyCharges      7043
TotalCharges        7043
Churn               7043
dtype: int64

In [12]:
# Now we want to understand which string attributes are binary, which are of a higher category order
attributesInfo = []

for col in df: 
    if df.dtypes[col]==np.object: # We disregard numerical attributes
        #print(col) 
        #print(df[col].unique(), len(df[col].unique()))
        attributesInfo.append({"columnName":""})
        
        

gender
['Female' 'Male'] 2
Partner
['Yes' 'No'] 2
Dependents
['No' 'Yes'] 2
PhoneService
['No' 'Yes'] 2
MultipleLines
['No phone service' 'No' 'Yes'] 3
InternetService
['DSL' 'Fiber optic' 'No'] 3
OnlineSecurity
['No' 'Yes' 'No internet service'] 3
OnlineBackup
['Yes' 'No' 'No internet service'] 3
DeviceProtection
['No' 'Yes' 'No internet service'] 3
TechSupport
['No' 'Yes' 'No internet service'] 3
StreamingTV
['No' 'Yes' 'No internet service'] 3
StreamingMovies
['No' 'Yes' 'No internet service'] 3
Contract
['Month-to-month' 'One year' 'Two year'] 3
PaperlessBilling
['Yes' 'No'] 2
PaymentMethod
['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)'] 4
TotalCharges
['29.85' '1889.5' '108.15' ... '346.45' '306.6' '6844.5'] 6531
Churn
['No' 'Yes'] 2


In [20]:
    
#print(df.head())

# boolify() takes a dataframe and a column as input, returns the same dataset with a new column, 
# where the input column is encoded with binary values.
def boolify(df, columns, inverse=False):
    db = df;
    for columnName in columns:
        one_hot = pd.get_dummies(df[columnName])
        one_hot = one_hot.drop('No',axis = 1)
        one_hot = one_hot.rename(columns={"Yes": "bool"+columnName}, errors="raise")
        print(one_hot.head())
        db = db.join(one_hot)
    return db

# We first turn binary columns into one hot encoded columns
db = boolify(df, ['Churn', 'PaperlessBilling'])
print(db.head())



            boolChurn
customerID           
7590-VHVEG          0
5575-GNVDE          0
3668-QPYBK          1
7795-CFOCW          0
9237-HQITU          1
            boolPaperlessBilling
customerID                      
7590-VHVEG                     1
5575-GNVDE                     0
3668-QPYBK                     1
7795-CFOCW                     0
9237-HQITU                     1
            gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
customerID                                                                  
7590-VHVEG  Female              0     Yes         No       1           No   
5575-GNVDE    Male              0      No         No      34          Yes   
3668-QPYBK    Male              0      No         No       2          Yes   
7795-CFOCW    Male              0      No         No      45           No   
9237-HQITU  Female              0      No         No       2          Yes   

               MultipleLines InternetService OnlineSecurity OnlineBackup  .