In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import pyarrow as pa
import pyarrow.parquet as pq
import time

# too see max columns
pd.set_option('display.max_columns',None)

In [2]:
churn_df = pd.read_parquet("../Data/churn.parquet")

In [3]:
churn_df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,4836028,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,2166565,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,5724784,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,2630373,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,1700858,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7042995,5918432,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7042996,5548851,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7042997,6243950,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7042998,3491389,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [4]:
churn_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043000 entries, 0 to 7042999
Data columns (total 21 columns):
 #   Column            Dtype  
---  ------            -----  
 0   customerID        int64  
 1   gender            object 
 2   SeniorCitizen     int64  
 3   Partner           object 
 4   Dependents        object 
 5   tenure            int64  
 6   PhoneService      object 
 7   MultipleLines     object 
 8   InternetService   object 
 9   OnlineSecurity    object 
 10  OnlineBackup      object 
 11  DeviceProtection  object 
 12  TechSupport       object 
 13  StreamingTV       object 
 14  StreamingMovies   object 
 15  Contract          object 
 16  PaperlessBilling  object 
 17  PaymentMethod     object 
 18  MonthlyCharges    float64
 19  TotalCharges      object 
 20  Churn             object 
dtypes: float64(1), int64(3), object(17)
memory usage: 1.1+ GB


In [5]:
churn_df.describe()

Unnamed: 0,customerID,SeniorCitizen,tenure,MonthlyCharges
count,7043000.0,7043000.0,7043000.0,7043000.0
mean,4999925.0,0.1621468,32.37115,64.76169
std,2886951.0,0.3685855,24.55774,30.08791
min,2.0,0.0,0.0,18.25
25%,2499645.0,0.0,9.0,35.5
50%,4998624.0,0.0,29.0,70.35
75%,7500639.0,0.0,55.0,89.85
max,9999997.0,1.0,72.0,118.75


## Data Cleaning

In [8]:
# replace this empty string with nan
churn_df['TotalCharges'] = churn_df['TotalCharges'].replace(" ",np.nan)
# Change TotalCharges column to float
churn_df['TotalCharges'] = churn_df['TotalCharges'].astype(float)

In [9]:
# find null values
churn_df.isnull().sum()

customerID              0
gender                  0
SeniorCitizen           0
Partner                 0
Dependents              0
tenure                  0
PhoneService            0
MultipleLines           0
InternetService         0
OnlineSecurity          0
OnlineBackup            0
DeviceProtection        0
TechSupport             0
StreamingTV             0
StreamingMovies         0
Contract                0
PaperlessBilling        0
PaymentMethod           0
MonthlyCharges          0
TotalCharges        11000
Churn                   0
dtype: int64

In [10]:
# replace null with mean
churn_df['TotalCharges'] = churn_df['TotalCharges'].fillna(churn_df['TotalCharges'].mean())

In [11]:
#numerical variables
n = list(churn_df.select_dtypes(include=['int64','float64']).keys())
print(n)

['customerID', 'SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']


In [12]:
#categorical variables
c = list(churn_df.select_dtypes(include='O').keys())
print(c)

['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn']


## Data Exploration

In [6]:
#make pairplots
sns.pairplot(churn_df, dropna = True)

<seaborn.axisgrid.PairGrid at 0x2e777b144c0>

In [None]:
# Plot Monthly Charges
sns.set(style='whitegrid', palette="deep", font_scale=1.1, rc={"figure.figsize": [10, 6]})
sns.distplot(churn_df['MonthlyCharges'], norm_hist=False, kde=False, bins=30, hist_kws={"alpha": 1}).set(xlabel='Total Charges', ylabel='Count');

In [7]:
# Plot Total Charges
sns.set(style='whitegrid', palette="deep", font_scale=1.1, rc={"figure.figsize": [10, 6]})
sns.distplot(churn_df['TotalCharges'], norm_hist=False, kde=False, bins=30, hist_kws={"alpha": 1}).set(xlabel='Total Charges', ylabel='Count');



ValueError: could not convert string to float: ''

In [None]:
# Tenure Charges
sns.set(style='whitegrid', palette="deep", font_scale=1.1, rc={"figure.figsize": [10, 6]})
sns.displot(churn_df['tenure'], norm_hist=False, kde=False, bins=30, hist_kws={"alpha": 1}).set(xlabel='Tenure', ylabel='Count');

In [None]:
# Visualize relationship between tenure and Monthly Charges
sns.jointplot(x=churn_df['MonthlyCharges'], y=churn_df['tenure']);

In [None]:
# Visualize relationship between tenure and total Charges
sns.jointplot(x=churn_df['TotalCharges'], y=churn_df['tenure']);

In [None]:
# Visualize relationship between monthly charges and total charges
sns.jointplot(x=churn_df['TotalCharges'], y=churn_df['MonthlyCharges']);

In [None]:
# Histogram of Numerical Features
churn_df.hist(bins=15, figsize=(15, 6), layout=(2, 4));

In [None]:
# Countplot Gender
sns.countplot(churn_df['gender']);

In [None]:
# Countplot Gender
sns.countplot(churn_df['SeniorCitizen']);

In [None]:
churn_df.columns

In [None]:
# Countplot Partner
sns.countplot(churn_df['Partner']);

In [None]:
# Subplot of Categorical Features
fig, ax = plt.subplots(2, 4, figsize=(20, 15))
for variable, subplot in zip(c, ax.flatten()):
    sns.countplot(churn_df[variable], ax=subplot)
    for label in subplot.get_xticklabels():
        label.set_rotation(90)

In [None]:
# Relationship between Monthly Charges and Categorial Variables
fig, ax = plt.subplots(3, 3, figsize=(15, 15))
for var, subplot in zip(c, ax.flatten()):
    sns.boxplot(x=var, y='MonthlyCharges', data=churn_df, ax=subplot)

In [None]:
# Relationship between Total Charges and Categorial Variables
fig, ax = plt.subplots(3, 3, figsize=(15, 15))
for var, subplot in zip(c, ax.flatten()):
    sns.boxplot(x=var, y='TotalCharges', data=churn_df, ax=subplot)

In [None]:
# Relationship between Tenure and Categorial Variables
fig, ax = plt.subplots(3, 3, figsize=(15, 15))
for var, subplot in zip(c, ax.flatten()):
    sns.boxplot(x=var, y='tenure', data=churn_df, ax=subplot)