In [0]:
# IMPORT GENERIC PACKAGES
import numpy as np # numerical calc package
import pandas as pd # holds data
import matplotlib.pyplot as plt # plotting library
import seaborn as sns # pretty plotting

# Apply seaborn's default styling and make 20x10 the default figure size for every plot below
sns.set(rc={'figure.figsize':(20,10)})

from sklearn.model_selection import train_test_split # split dataset

from sklearn.linear_model import LogisticRegression # linear model for classification

from sklearn.metrics import confusion_matrix

In [0]:
# Load the Telco churn dataset from the working directory and, in the same
# chain, coerce TotalCharges to a numeric dtype. Entries that cannot be
# parsed as numbers become NaN (errors='coerce').
data = pd.read_csv("telco.csv").assign(
    TotalCharges=lambda frame: pd.to_numeric(frame['TotalCharges'], errors='coerce')
)

FileNotFoundError: ignored

In [0]:
# Count the missing (NaN) entries in TotalCharges, i.e. the values that
# could not be converted to numbers when the column was coerced.
data['TotalCharges'].isna().sum()

In [0]:
# Preview the first five rows of the dataset
data.head()

### Composition

In [0]:
# In the whole dataset, how many customers churned vs. stayed?
# Churn is a category, so draw one horizontal bar per class (same style as the
# other composition plots). A line plot would connect unordered categories as
# if they formed a trend.
data.Churn.value_counts().plot(kind='barh')
# value_counts() is sorted descending; flip the axis so the largest class is on top
plt.gca().invert_yaxis()

The data is imbalanced
-- there are more than twice as many customers who did not churn as customers who churned.

In [0]:
# How many customers are subscribed to each type of internet service?
internet_counts = data['InternetService'].value_counts()
ax = internet_counts.plot.barh()
# counts come back sorted descending; flip so the most common service is on top
ax.invert_yaxis()

In [0]:
# How many customers use each payment method?
payment_counts = data['PaymentMethod'].value_counts()
ax = payment_counts.plot.barh()
# counts come back sorted descending; flip so the most common method is on top
ax.invert_yaxis()

### Relationship

In [0]:
# How is tenure related to churn?
# One box per Churn class showing the spread of tenure within that class.
sns.catplot(x='Churn', y='tenure', kind='box', height=10, data=data)

In [0]:
# How are monthly charges related to churn?
# One box per Churn class showing the spread of MonthlyCharges within that class.
sns.catplot(data=data,
            x='Churn',
            y='MonthlyCharges',
            kind='box',
            height=10)

### Compute Likelihood (when comparing a categorical feature to a categorical target)

In [0]:
# How likely are senior citizens to churn?
# Contingency table: rows = SeniorCitizen flag, columns = Churn class, cells = customer counts.
pd.crosstab(index=data['SeniorCitizen'], columns=data['Churn'])

In [0]:
# Probability of churning if not a senior citizen:
#   churned non-seniors / all non-seniors
# Computed from the data instead of hand-copying counts from the crosstab, so
# the figure cannot go stale or be mistyped. SeniorCitizen is a 0/1 flag and
# Churn holds 'Yes'/'No'; the mean of the boolean mask is the churned fraction.
float(data.loc[data.SeniorCitizen == 0, 'Churn'].eq('Yes').mean())

In [0]:
# Probability of churning if a senior citizen:
#   churned seniors / all seniors
# Computed from the data instead of hand-copying counts from the crosstab, so
# the figure cannot go stale or be mistyped. SeniorCitizen is a 0/1 flag and
# Churn holds 'Yes'/'No'; the mean of the boolean mask is the churned fraction.
float(data.loc[data.SeniorCitizen == 1, 'Churn'].eq('Yes').mean())

Senior citizens have a higher probability of churning (about 42%) than non-senior citizens (about 24%).

In [0]:
# How many rows and columns? Returns a (n_rows, n_columns) tuple.
data.shape

In [0]:
# How many senior citizens? Summing the column counts them
# (assumes SeniorCitizen is a 0/1 flag -- TODO confirm against data.dtypes).
data.SeniorCitizen.sum()

In [0]:
# How does churn differ across the types of internet service?
# Contingency table: rows = InternetService category, columns = Churn class, cells = customer counts.
pd.crosstab(index=data['InternetService'], columns=data['Churn'])

In [0]:
# Probability of churning for DSL customers: churned / (churned + stayed).
# Counts are presumably copied by hand from the InternetService crosstab output.
dsl_churned, dsl_stayed = 459, 1962
dsl_churned / (dsl_churned + dsl_stayed)

In [0]:
# Probability of churning for fiber-optic customers: churned / (churned + stayed).
# Counts are presumably copied by hand from the InternetService crosstab output.
fiber_churned, fiber_stayed = 1297, 1799
fiber_churned / (fiber_churned + fiber_stayed)

In [0]:
# Probability of churning for customers without internet service:
# churned / (stayed + churned).
# Counts are presumably copied by hand from the InternetService crosstab output.
no_internet_churned, no_internet_stayed = 113, 1413
no_internet_churned / (no_internet_stayed + no_internet_churned)

In [0]:
# How likely are customers with multiple lines to churn?
# Contingency table: rows = MultipleLines category, columns = Churn class, cells = customer counts.
pd.crosstab(index=data['MultipleLines'], columns=data['Churn'])

In [0]:
# Probability of churning for customers without multiple lines:
# churned / (churned + stayed).
# Counts are presumably copied by hand from the MultipleLines crosstab output.
single_line_churned, single_line_stayed = 849, 2541
single_line_churned / (single_line_churned + single_line_stayed)

In [0]:
# Probability of churning for the remaining MultipleLines category.
# NOTE(review): the original comment said "no internet service", but this cell
# sits between the "no multiple lines" and "with multiple lines" cells of the
# MultipleLines crosstab, so it is presumably the "no phone service" group --
# confirm against the crosstab output.
170/(170+512)

In [0]:
# Probability of churning for customers with multiple lines:
# churned / (churned + stayed).
# Counts are presumably copied by hand from the MultipleLines crosstab output.
multi_line_churned, multi_line_stayed = 850, 2121
multi_line_churned / (multi_line_churned + multi_line_stayed)

### Distribution

In [0]:
# What is the distribution of tenure?
# sns.distplot is deprecated; sns.histplot is its documented replacement for a
# plain histogram (no KDE curve is drawn by default, matching kde=False).
sns.histplot(data.tenure)

In [0]:
# Inspect the rows labelled 5 through 10; label-based .loc slicing includes both endpoints
data.loc[5:10]

In [0]:
# What is the distribution of monthly charges?
# sns.distplot is deprecated; sns.histplot is its documented replacement for a
# plain histogram (no KDE curve is drawn by default, matching kde=False).
sns.histplot(data.MonthlyCharges)

In [0]:
# Preview the first five values of the SeniorCitizen column
data["SeniorCitizen"].head()

In [0]:
# Dataset dimensions as a (n_rows, n_columns) tuple
data.shape

In [0]:
# Data type of each column -- shows which columns are numeric and which are text
data.dtypes

In [0]:
# List all column names (used to pick the modelling columns in the next cell)
data.columns

In [0]:
# Build the modelling frame: keep only the listed feature columns plus the
# Churn target.
columns = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
    'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
    'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
    'MonthlyCharges', 'TotalCharges', 'Churn',
]

new_data = data.loc[:, columns]

In [0]:
# Dummify new data: one-hot encode the text/categorical columns into
# <column>_<value> indicator columns (e.g. Churn_Yes / Churn_No);
# numeric columns are passed through unchanged.
dum_data = pd.get_dummies(new_data)

In [0]:
# Inspect the column names produced by the one-hot encoding
dum_data.columns

In [0]:
# Use len() to get the number of entries in a list or array --
# here, the number of columns after one-hot encoding
len(dum_data.columns)

In [0]:
# Correlation of every encoded column with Churn_Yes, sorted from the most
# positive to the most negative, drawn as horizontal bars.
corr_matrix = dum_data.corr()
churn_corr_sorted = corr_matrix.sort_values('Churn_Yes', ascending=False)['Churn_Yes']
churn_corr_sorted.plot(kind='barh')
# flip the axis so the most positive correlation is at the top
plt.gca().invert_yaxis()

In [0]:
# Correlation of each encoded column with Churn_Yes, most positive first
corr_with_churn = dum_data.corr().sort_values('Churn_Yes', ascending=False)['Churn_Yes']

# The target's own dummy columns are not features, so drop them
churn_correlations = corr_with_churn.drop(['Churn_Yes', 'Churn_No'])

# Keep only the columns whose absolute correlation with churn exceeds 0.2
is_strong = churn_correlations.abs() > 0.2
filtered_churn_correlations = churn_correlations[is_strong]


In [0]:
# Show the columns that passed the |correlation| > 0.2 filter and their correlation with Churn_Yes
filtered_churn_correlations

In [0]:
# The features to model on are the names of the strongly correlated columns
features = filtered_churn_correlations.index

# Feature matrix X: every row, restricted to those columns
X = dum_data.loc[:, features]

In [0]:
# Confirm which feature columns ended up in X
X.columns

In [0]:
# Preview the first five rows of the feature matrix
X.head()

In [0]:
# Set y: the target is the Churn_Yes indicator column (the positive class is "customer churned")
y = dum_data["Churn_Yes"]

In [0]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [0]:
# Create a logistic-regression classifier and fit it on the training split
# (fit() returns the fitted estimator itself).
log_reg = LogisticRegression().fit(X_train, y_train)

# Predicted class for every row of the held-out test set
y_pred = log_reg.predict(X_test)

# Accuracy on the held-out test set
acc = log_reg.score(X_test, y_test)

In [0]:
# Display the test-set accuracy computed by log_reg.score
acc

In [0]:
# Confusion matrix on the test set: rows are the actual classes, columns the
# predicted classes (scikit-learn convention), so for this binary target the
# layout is [[TN, FP], [FN, TP]].
confusion_matrix(y_test, y_pred)

In [0]:
# True Negative Rate: of the customers who did NOT churn, the fraction the
# model correctly predicted as not churning -> TN / (TN + FP).
# Computed from the confusion matrix instead of hand-copied counts, so it
# stays correct if the data, the split or the model changes.
# For a binary target, ravel() yields the cells in the order TN, FP, FN, TP.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
float(tn / (tn + fp))

In [0]:
# True Positive Rate: of the customers who DID churn, the fraction the model
# correctly predicted as churning -> TP / (TP + FN).
# Computed from the confusion matrix instead of hand-copied counts, so it
# stays correct if the data, the split or the model changes.
# For a binary target, ravel() yields the cells in the order TN, FP, FN, TP.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
float(tp / (tp + fn))

In [0]:
# Heatmap of the test-set confusion matrix (rows = actual, columns = predicted).
# fmt='d' prints the cell counts as plain integers; the default annotation
# format ('.2g') would show a count such as 938 as 9.4e+02.
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues')

The model is much better at identifying customers who are NOT going to churn (true negative rate of about 91%) than customers who are going to churn (true positive rate of about 56%).

Recommendations: 
- explore other data we can collect or incorporate more features (or different permutations of features) into the model.
- try different modelling techniques (to be discussed in class)