In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

In [2]:
import env 
import acquire
import prepare
import model
import split_scale

# MVP 
## Predictions made with Decision Tree

### Acquire
Bringing the data from the MySQL DB

In [3]:
# Pull the raw customer table through the project-local `acquire` helper
# (presumably a MySQL query, per the notebook narrative — confirm in acquire.py).
raw_df = acquire.get_telco_chunk()

Data is a table of customer info. 

Most variables are categorical, describing the customer's demographics and the services they have acquired.

The only continuous variables are tenure, monthly charges, and total charges.

In [4]:
# Summarise the raw frame: column names, non-null counts, and dtypes.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
customer_id                 7043 non-null object
gender                      7043 non-null object
senior_citizen              7043 non-null int64
partner                     7043 non-null object
dependents                  7043 non-null object
tenure                      7043 non-null int64
phone_service               7043 non-null object
multiple_lines              7043 non-null object
internet_service_type_id    7043 non-null int64
online_security             7043 non-null object
online_backup               7043 non-null object
device_protection           7043 non-null object
tech_support                7043 non-null object
streaming_tv                7043 non-null object
streaming_movies            7043 non-null object
contract_type_id            7043 non-null int64
paperless_billing           7043 non-null object
payment_type_id             7043 non-null int64
monthly_charges 

### Prepare
Converting the 'object' categories into Label-Encoded categories

In [6]:
# Build the cleaned frame with the project-local `prepare` helper.
# NOTE(review): prep_telco() is called with no arguments, so it does not receive
# `raw_df` from this notebook — presumably it acquires the data itself; confirm
# in prepare.py.
df = prepare.prep_telco()

The prepare module gets rid of nulls and makes the columns conform to pandas-friendly dtypes.

In [7]:
def encode_variable(column, df):
    """Label-encode one column of ``df`` in place.

    A fresh ``LabelEncoder`` is fitted on the column's current values and the
    column is then overwritten with the encoded values.

    Parameters
    ----------
    column : hashable
        Label of the column in ``df`` to encode.
    df : pandas.DataFrame
        Frame holding ``column``; it is modified in place.

    Returns
    -------
    None
        The fitted encoder is not kept, so the label-to-code mapping is not
        available to the caller afterwards.
    """
    # fit_transform is the single-call equivalent of fit() followed by
    # transform() on the same data.
    df[column] = LabelEncoder().fit_transform(df[column])

In [8]:
# Gather every column still stored with dtype ``object`` and label-encode each
# of them in place.
cat_cols = df.select_dtypes(include='object').columns
for col_name in cat_cols:
    encode_variable(col_name, df)

All categorical variables are now in a numerical encoding for each category

In [12]:
# Preview the first rows of the encoded frame.
df.head()

Unnamed: 0_level_0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service_type_id,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,contract_type_id,paperless_billing,payment_type_id,monthly_charges,total_charges,churn
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0002-ORFBO,0,0,1,1,9,1,0,1,0,2,0,2,2,0,2,1,2,65.6,593.3,0
0003-MKNFE,1,0,0,0,9,1,2,1,0,0,0,0,0,2,1,0,2,59.9,542.4,0
0004-TLHLJ,1,0,0,0,4,1,0,2,0,0,2,0,0,0,1,1,1,73.9,280.85,1
0011-IGKFF,1,1,1,0,13,1,0,2,0,2,2,0,2,2,1,1,1,98.0,1237.85,1
0013-EXCHZ,0,1,1,0,3,1,0,2,0,0,0,2,2,0,1,1,2,83.9,267.4,1


#### MVP will use all the features in X

In [14]:
# Separate the target from the predictors: `y` is the churn column and `X` is
# every remaining column.
y = df['churn']

X = df.drop(columns='churn')

In [15]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

In [16]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0_level_0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service_type_id,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,contract_type_id,paperless_billing,payment_type_id,monthly_charges,total_charges
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2187-PKZAY,1,0,0,0,12,1,0,2,0,0,0,0,0,2,1,1,3,79.95,1043.4
3402-XRIUO,0,1,1,0,22,1,2,1,2,2,0,2,0,0,1,1,2,63.55,1381.8
9397-TZSHA,0,0,0,0,69,1,2,3,1,1,1,1,1,1,3,0,4,24.6,1678.05
9153-BTBVV,0,0,1,0,71,1,2,3,1,1,1,1,1,1,3,0,3,25.0,1753.0
3793-MMFUH,0,1,0,0,13,1,2,2,0,0,0,0,2,2,1,1,1,95.05,1290.0


In [17]:
# Run the project-local decision-tree helper, passing 'gini' (presumably the
# split criterion) together with the training features and labels.
# NOTE(review): only the training split is passed in, so the returned
# predictions are presumably made on X_train (in-sample); X_test is not used
# here — confirm against model.py.
y_pred, y_pred_proba = model.do_the_decisionTree('gini', X_train, y_train)

The accuracy score is 0.7835699797160244. 


In [26]:
# Per-class precision / recall / F1 of the predictions against the training labels.
# print() is used because classification_report returns a plain multi-line string.
print(metrics.classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.91      0.86      3628
           1       0.63      0.44      0.52      1302

    accuracy                           0.78      4930
   macro avg       0.72      0.67      0.69      4930
weighted avg       0.77      0.78      0.77      4930



The accuracy on the training data is .784, which is only about .05 better than the .736 baseline (3628 of 4930) we would get by predicting that no customer churns.

This model is better at finding customers who will remain (recall 0.91) than those who will churn (recall 0.44). It would be more helpful
to have better results in finding the likely-to-churn customers.