# Classification Project: What Causes Telco Churn?

## --- Pipeline phase 1: Planning: ---

### In this notebook we explore what might drive churn (attrition), using a snapshot of the customers of a telecommunications service. We investigate how groups of customers differ from one another and whether those differences have any bearing on their propensity to leave the company.
 
 ### Please reference data_dictionary.py for explicit details on features.

In [1]:
# setting up our environment: 

import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

from acquire import get_telco_data
from telco_prep import *
from matplotlib import cm
from matplotlib.ticker import FormatStrFormatter


from scipy.stats import ttest_ind as ttest
from scipy.stats import pearsonr

## --- Pipeline phase 2: Acquisition: ---

In [2]:
# Acquisition: get_telco_data (imported from acquire) pulls our dataframe
# using MySQL; the result is bound to `df` for the rest of the pipeline.
df = get_telco_data()

In [3]:
# Get info on the freshly acquired dataframe; the captured output below
# starts with its first rows.
# NOTE(review): peekatdata is not defined in this notebook — presumably it
# comes from `from telco_prep import *` (TODO confirm).
peekatdata(df)

  customer_id  gender  senior_citizen partner dependents  tenure  \
0  0003-MKNFE    Male               0      No         No       9   
1  0013-MHZWF  Female               0      No        Yes       9   
2  0015-UOCOJ  Female               1      No         No       7   
3  0023-HGHWL    Male               1      No         No       1   
4  0032-PGELS  Female               0     Yes        Yes       1   

  phone_service    multiple_lines  internet_service_type_id online_security  \
0           Yes               Yes                         1              No   
1           Yes                No                         1              No   
2           Yes                No                         1             Yes   
3            No  No phone service                         1              No   
4            No  No phone service                         1             Yes   

             ...             streaming_movies contract_type_id  \
0            ...                          Yes     

## --- Pipeline Phase 3: Preparation: ---

In [4]:
# Prepare the dataframe based on the parameters outlined in prepare.py and
# the curriculum instructions. `df` is rebound to the prepared result.
# NOTE(review): the warning captured below this cell shows the prep code
# calling the deprecated `convert_objects(convert_numeric=True)` on
# total_charges; the warning itself names pd.to_numeric as the replacement.
df = prep_telco_data(df)

For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  df['total_charges'] = df['total_charges'].convert_objects(convert_numeric=True)


In [5]:
# Get info on the prepared dataframe; the output below now lists derived
# columns such as tenure_year and phone_id.
peekatdata(df)

  customer_id  gender  senior_citizen partner dependents  tenure  \
0  0003-MKNFE    Male               0      No         No       9   
1  0013-MHZWF  Female               0      No        Yes       9   
2  0015-UOCOJ  Female               1      No         No       7   
3  0023-HGHWL    Male               1      No         No       1   
4  0032-PGELS  Female               0     Yes        Yes       1   

  phone_service    multiple_lines  internet_service_type_id online_security  \
0           Yes               Yes                         1              No   
1           Yes                No                         1              No   
2           Yes                No                         1             Yes   
3            No  No phone service                         1              No   
4            No  No phone service                         1             Yes   

      ...      tenure_year phone_id household_type_id streaming_services  \
0     ...                1        2     

In [6]:
# Drop our non-numeric columns, as described in the drop_cols function notes.
# `df` is rebound to the reduced frame, so re-running this cell operates on
# the already reduced data.
df = drop_cols(df)

In [7]:
# Numeric scaling: rescale monthly_charges and total_charges.
# The scaling parameters are learned from the training split only and then
# applied unchanged to the test split.

# Separate the target from the predictors, then hold out 30% of the rows.
y = df[['churn']]
X = df.drop(columns=['churn'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.70, random_state=123
)

# Concatenate predictors and target again so each split is a single frame.
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

# Fit the min-max scaler on train, then transform both splits with it.
charge_cols = ['monthly_charges', 'total_charges']
scaler = MinMaxScaler().fit(train[charge_cols])

train[charge_cols] = scaler.transform(train[charge_cols])
test[charge_cols] = scaler.transform(test[charge_cols])




In [8]:
# Inspect the training split after scaling; in the output below
# monthly_charges and total_charges appear as values between 0 and 1.
peekatdata(train)

      senior_citizen  tenure  contract_type_id  payment_type_id  \
1479               0      52                 2                1   
2377               0      59                 3                4   
6613               0      46                 3                3   
6468               0      55                 3                3   
2668               0      10                 1                1   

      monthly_charges  total_charges  tenure_year  phone_id  \
1479         0.502488       0.399729            5         1   
2377         0.716915       0.597190            5         2   
6613         0.019900       0.100571            4         1   
6468         0.074129       0.164418            5         2   
2668         0.613930       0.096746            1         1   

      household_type_id  streaming_services  online_security_backup  gender_e  \
1479                  3                   1                       2         1   
2377                  3                   3             

## --- Pipeline phase 4: Exploration: ---

In [None]:
# Collect the column names of the training frame as a plain list.
# Iterating a DataFrame yields its column labels, so list(train.columns)
# gives the same labels without a manual append loop.
cols = list(train.columns)
cols


In [None]:
# #1: could the month in which a customer signed up influence churn?
# Plot churn against tenure using the training split only, so the held-out
# test rows play no part in exploration. Tenure is not rescaled in `train`,
# so the x-axis stays in the original units.
ax = sns.lineplot(x='tenure', y='churn', data=train)
# NOTE(review): tenure is labelled in months based on the "month 12" wording
# used later in this notebook — confirm against data_dictionary.py.
ax.set(xlabel='Tenure (months)', ylabel='Churn rate', title='Churn rate by tenure');

#### This visualization shows a downward trend (a negative correlation) between tenure and attrition. There are minor spikes that appear to line up with the one-year and two-year marks rather than with a specific cohort; overall, the trend is roughly consistent.

In [None]:
# Are there features that indicate a higher propensity to churn, such as
# type of internet service, type of phone service, online security and
# backup, senior-citizen status, paying more than x% of customers with the
# same services, etc.?

In [None]:
# Features to compare against churn, one bar chart per feature.
features = ['senior_citizen','int_type_id','contract_type_id','payment_type_id', 'tenure_year', 'phone_id', 'household_type_id', 'streaming_services', 'online_security_backup','gender_e', 'device_protection_e', 'tech_support_e', 'paperless_billing_e']

# One stacked panel per feature. The row count follows the list length so the
# grid stays in sync when features are added or removed; squeeze=False keeps
# `axes` an array even for a single feature.
fig, axes = plt.subplots(nrows=len(features), ncols=1, figsize=(10, 80), squeeze=False)

# Overall churn rate in the training split, drawn as a reference line on
# every panel.
churn_rate = train.churn.mean()

for ax, feature in zip(axes.ravel(), features):
    # x and y are passed by keyword: current seaborn releases accept only
    # `data` positionally.
    sns.barplot(x=feature, y='churn', data=train, ax=ax, alpha=.5)
    ax.set_ylabel('Churn Rate')
    ax.axhline(churn_rate, ls='--', color='grey')

#### From these visualizations, we can see that certain features are associated with stronger churn trends. Off the cuff, we notice that seniors have a stronger tendency to churn, as do fiber customers and customers without tech support. Customers with paperless billing churn more on average, as do customers who pay by electronic check; these two features are closely linked.

In [None]:
# Is there a price threshold for specific services beyond which the
# likelihood of churn increases? If so, what is that threshold, and for
# which service(s)?

In [None]:
# Monthly charges by paperless-billing flag: one point per customer, coloured
# by churn, followed by a box plot with unfilled boxes and zero-width whiskers.
sns.swarmplot(
    data=train,
    x="paperless_billing_e",
    y="monthly_charges",
    hue="churn",
    palette="Set2",
)
ax = sns.boxplot(
    data=train,
    x="paperless_billing_e",
    y="monthly_charges",
    showcaps=True,
    showfliers=True,
    boxprops={'facecolor': 'None'},
    whiskerprops={'linewidth': 0},
)



In [None]:
# Monthly charges by tech-support flag: one point per customer, coloured by
# churn, followed by a box plot with unfilled boxes and zero-width whiskers.
sns.swarmplot(
    data=train,
    x="tech_support_e",
    y="monthly_charges",
    hue="churn",
    palette="Set2",
)
ax = sns.boxplot(
    data=train,
    x="tech_support_e",
    y="monthly_charges",
    showcaps=True,
    showfliers=True,
    boxprops={'facecolor': 'None'},
    whiskerprops={'linewidth': 0},
)

In [None]:
# Monthly charges by tenure year: one point per customer, coloured by churn,
# followed by a box plot with unfilled boxes and zero-width whiskers.
sns.swarmplot(
    data=train,
    x="tenure_year",
    y="monthly_charges",
    hue="churn",
    palette="Set2",
)
ax = sns.boxplot(
    data=train,
    x="tenure_year",
    y="monthly_charges",
    showcaps=True,
    showfliers=True,
    boxprops={'facecolor': 'None'},
    whiskerprops={'linewidth': 0},
)

In [None]:
# Monthly charges by internet type: one point per customer, coloured by
# churn, followed by a box plot with unfilled boxes and zero-width whiskers.
sns.swarmplot(
    data=train,
    x="int_type_id",
    y="monthly_charges",
    hue="churn",
    palette="Set2",
)
ax = sns.boxplot(
    data=train,
    x="int_type_id",
    y="monthly_charges",
    showcaps=True,
    showfliers=True,
    boxprops={'facecolor': 'None'},
    whiskerprops={'linewidth': 0},
)

In [None]:
# Do customers paying above the 60th percentile of monthly charges churn at a
# different rate than everyone else?
# The cutoff is computed once and reused for both groups.
charge_cutoff = train.monthly_charges.quantile(0.60)

# The lower group uses <= so customers sitting exactly on the cutoff are kept
# in the comparison instead of falling out of both groups.
higher_monthly_charges = train.loc[train.monthly_charges > charge_cutoff, ['churn']]
lower_monthly_charges = train.loc[train.monthly_charges <= charge_cutoff, ['churn']]

# Two-sample t-test on churn between the two charge groups.
ttest(higher_monthly_charges, lower_monthly_charges)

In [None]:
# Same split at the 70th percentile of monthly charges, this time keeping the
# charge column next to churn. No test is run on these frames in this cell.
charge_cutoff = train.monthly_charges.quantile(0.70)

# The lower group uses <= so customers exactly on the cutoff are not dropped
# from both groups.
higher_monthly_charges = train.loc[train.monthly_charges > charge_cutoff, ['monthly_charges', 'churn']]
lower_monthly_charges = train.loc[train.monthly_charges <= charge_cutoff, ['monthly_charges', 'churn']]

### #4: Is the churn rate for customers on a month-to-month plan after month 12 significantly different from that of customers on one-year contracts after month 12?


In [None]:
# Customers with at least 12 months of tenure, split by contract type.
# Both conditions are combined into one mask with &: filtering twice in a row
# would apply a full-length mask to an already filtered frame, which pandas
# has to reindex and warns about.
# NOTE(review): contract_type_id 1 is treated as month-to-month and 2 as a
# one-year contract, going by the variable names — confirm against
# data_dictionary.py.
tenured = train.tenure >= 12
year_old_monthlies = train[tenured & (train.contract_type_id == 1)]
year_old_contract = train[tenured & (train.contract_type_id == 2)]

# Two-sample t-test on churn between the two contract groups.
ttest(year_old_monthlies.churn, year_old_contract.churn)

In [None]:
# 5: are the mean monthly charges for those who have churned significantly
# different than for those who have not churned?

# Split the training rows by churn outcome; both frames are reused by later
# cells.
has_churned = train[train.churn == 1]
not_churned = train[train.churn == 0]

# One two-sample t-test answers the question; as the last expression in the
# cell, its result is what the notebook displays.
ttest(has_churned.monthly_charges, not_churned.monthly_charges)

In [None]:
# How much of monthly_charges can be explained by internet_service_type?
# (hint: correlation test). State your hypotheses and your conclusion clearly.

# First look: mean monthly charge for each internet type code in train.
cor_test = train.loc[:, ['monthly_charges', 'int_type_id']]
cor_test.groupby(by='int_type_id').mean()

### #6: How much of monthly_charges can be explained by internet_service_type? (hint: correlation test). State your hypotheses and your conclusion clearly.

In [None]:
# Counts of churn outcome per internet type code, with row and column totals,
# shaded so larger counts stand out.
churn_by_int_type = pd.crosstab(
    index=train['int_type_id'],
    columns=train['churn'],
    margins=True,
)
churn_by_int_type.style.background_gradient(cmap='YlOrRd')

In [None]:
# Pearson correlation between the internet type code and monthly charges.
# NOTE(review): int_type_id appears to be a category code, so the coefficient
# depends on how the codes happen to be ordered — interpret with care.
pearsonr(train['int_type_id'], train['monthly_charges'])


In [None]:
# One training subset per internet type code (0, 1, 2), bound in that order.
no_int, dsl_int, fbr_int = (
    train[train.int_type_id == type_code] for type_code in (0, 1, 2)
)

In [None]:
# Within each internet-type subset, test whether monthly charges differ
# between customers who churned and those who did not, and report the
# correlation between monthly charges and churn.
# The t-test compares the same variable (monthly_charges) across the two churn
# groups; testing the monthly_charges column against the churn column would
# only compare the means of two unrelated quantities.
for name, group in (('no_int', no_int), ('dsl_int', dsl_int), ('fbr_int', fbr_int)):
    churned_charges = group.loc[group.churn == 1, 'monthly_charges']
    retained_charges = group.loc[group.churn == 0, 'monthly_charges']
    # Each line is prefixed with the subset name so the results can be told apart.
    print(name, ttest(churned_charges, retained_charges))
    print(name, pearsonr(group.monthly_charges, group.churn))

In [None]:
# Features to compare between churned and retained customers.
features = ['senior_citizen','int_type_id','contract_type_id','payment_type_id', 'tenure_year', 'phone_id', 'household_type_id', 'streaming_services', 'online_security_backup','gender_e', 'device_protection_e', 'tech_support_e', 'paperless_billing_e']

# Test the feature column itself in each churn group. Selecting
# [[feature, 'monthly_charges']] and then reading .monthly_charges would ignore
# the feature and print the identical monthly-charges result on every pass.
# NOTE(review): these columns look like category codes, so a t-test on them is
# only a rough screen — confirm whether a test for categorical data is wanted.
for feature in features:
    print(feature, ttest(has_churned[feature], not_churned[feature]))
    

In [None]:
# Distribution of monthly charges per tenure year, split by churn.
# `df` is plotted rather than `train`, so the charges shown are not the
# min-max scaled values stored in train.
sns.violinplot(data=df, x='tenure_year', y='monthly_charges', hue='churn')

In [None]:
# Distribution of total charges per tenure year, split by churn.
# `df` is plotted rather than `train`, so the charges shown are not the
# min-max scaled values stored in train.
sns.violinplot(data=df, x='tenure_year', y='total_charges', hue='churn')