In [50]:
import pandas as pd
import numpy as np
from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

import warnings
warnings.filterwarnings("ignore")

from telco_pipeline import get_data_from_sql, peekatdata, split, df_value_counts, percent_missing, clean_data

### Acquire data using sql query function

In [2]:
# Load the raw Telco customer data through the project helper imported
# from telco_pipeline.
df = get_data_from_sql()

In [3]:
# Print a first-look summary of the raw frame (shape and column info, as
# shown in the output below).
peekatdata(df)

DataFrame Shape:

(7043, 24)

Info about:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 24 columns):
payment_type_id             7043 non-null int64
internet_service_type_id    7043 non-null int64
contract_type_id            7043 non-null int64
customer_id                 7043 non-null object
gender                      7043 non-null object
senior_citizen              7043 non-null int64
partner                     7043 non-null object
dependents                  7043 non-null object
tenure                      7043 non-null int64
phone_service               7043 non-null object
multiple_lines              7043 non-null object
online_security             7043 non-null object
online_backup               7043 non-null object
device_protection           7043 non-null object
tech_support                7043 non-null object
streaming_tv                7043 non-null object
streaming_movies            7043 non-null object
paperless_billing     

### Deal with spaces in total_charges and drop customer_id

In [4]:
# Clean the raw frame with the project helper (blank total_charges handling
# and customer_id removal, per the heading above).
# NOTE(review): the raw frame has 7043 rows but the train/test partitions
# below total 7032 (4922 + 2110), so clean_data appears to drop 11 rows —
# presumably the blank total_charges records; confirm in telco_pipeline.
# NOTE(review): df is rebound in place, so re-running this cell feeds the
# already-cleaned frame back into clean_data — confirm that is safe.
df = clean_data(df)

- Confirm total_charges has no blank values and is dtype float.

In [5]:
# Frequency of each distinct total_charges value, most common first.
# A leftover blank string would show up as its own entry in this listing.
df["total_charges"].value_counts()

20.20      11
19.75       9
20.05       8
19.90       8
19.65       8
19.55       7
45.30       7
20.15       6
20.25       6
19.45       6
20.45       5
20.30       5
69.65       4
69.90       4
19.40       4
70.60       4
20.50       4
20.40       4
19.50       4
49.90       4
69.95       4
19.95       4
19.30       4
75.30       4
44.00       4
19.20       4
19.85       4
74.70       4
50.15       4
69.60       4
20.35       4
44.40       4
470.20      3
70.10       3
85.50       3
19.25       3
44.75       3
85.00       3
50.45       3
2317.10     3
69.10       3
35.90       3
50.60       3
20.55       3
69.25       3
75.35       3
45.10       3
74.60       3
20.10       3
74.30       3
1284.20     3
45.85       3
55.70       3
50.75       3
70.30       3
20.00       3
84.50       3
25.25       3
80.55       3
69.55       3
19.10       3
24.80       3
220.45      3
70.45       3
70.15       3
79.55       3
86.05       3
24.40       3
74.90       3
20.90       3
45.70       3
74.35 

### Confirm dtypes

In [6]:
# List every column's dtype after cleaning: total_charges should be float64
# and customer_id should no longer be present.
df.dtypes

payment_type_id               int64
internet_service_type_id      int64
contract_type_id              int64
gender                       object
senior_citizen                int64
partner                      object
dependents                   object
tenure                        int64
phone_service                object
multiple_lines               object
online_security              object
online_backup                object
device_protection            object
tech_support                 object
streaming_tv                 object
streaming_movies             object
paperless_billing            object
monthly_charges             float64
total_charges               float64
churn                        object
contract_type                object
internet_service_type        object
payment_type                 object
dtype: object

### Transform target value, churn, into binary values with dtype int.

In [7]:
# Encode the target only while it still holds the raw "Yes"/"No" labels.
# Re-running a bare .map() on the already-encoded 1/0 column would silently
# replace every value with NaN, because 1 and 0 are not keys of the mapping.
# The explicit int64 cast raises if any label failed to map, instead of
# letting NaN leak into the target.
if not pd.api.types.is_numeric_dtype(df["churn"]):
    df["churn"] = df["churn"].map({"Yes": 1, "No": 0}).astype("int64")

### Split the data into 70/30 train and test sets

In [9]:
# 70/30 split. Stratifying on churn keeps the class balance the same in both
# partitions, and the fixed seed makes the split reproducible.
train, test = train_test_split(
    df,
    train_size=.70,
    random_state=123,
    stratify=df["churn"],
)

In [10]:
# Sanity check: (rows, columns) of the training partition.
train.shape

(4922, 23)

In [11]:
# Sanity check: (rows, columns) of the test partition.
test.shape

(2110, 23)

### Create x and y variables to run through a baseline model for later comparison

In [12]:
# Targets for both partitions, kept as single-column DataFrames
# (double brackets) rather than Series.
y_train, y_test = train[["churn"]], test[["churn"]]

In [13]:
# Single source of truth for the baseline feature columns, so the train and
# test matrices are always built from exactly the same columns in the same
# order.
baseline_features = [
    "tenure",
    "monthly_charges",
    "total_charges",
    "internet_service_type_id",
    "contract_type_id",
    "senior_citizen",
    "payment_type_id",
]

x_train = train[baseline_features]

x_test = test[baseline_features]

In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

## Train Model

### Create the Decision Tree Object

In [15]:
# Shallow baseline tree: depth capped at 3, seeded for reproducibility.
clf = DecisionTreeClassifier(
    random_state=123,
    max_depth=3,
)

### Fit the model to the training data

In [16]:
# Sanity check: the feature matrix should have one column per baseline
# feature (seven).
x_train.shape

(4922, 7)

In [17]:
# Sanity check: the target has the same number of rows as x_train. It is a
# single-column DataFrame, hence the trailing 1.
y_train.shape

(4922, 1)

In [18]:
# Fit the depth-3 tree on the baseline features; fit() returns the estimator
# itself, so its repr is echoed as the cell output.
clf.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=123, splitter='best')

### Estimate Churn

In [20]:
# Predicted class labels (0 = stays, 1 = churns) for the training rows;
# preview the first five.
y_pred = clf.predict(x_train)
y_pred[:5]

array([0, 1, 0, 0, 0])

### Estimate the probability of churn

In [21]:
# Class probabilities for the training rows: one row per customer and one
# column per class, ordered as in clf.classes_ (here 0 then 1).
y_pred_proba = clf.predict_proba(x_train)
y_pred_proba

array([[0.79466667, 0.20533333],
       [0.32867133, 0.67132867],
       [0.98447894, 0.01552106],
       ...,
       [0.32867133, 0.67132867],
       [0.32867133, 0.67132867],
       [0.92972182, 0.07027818]])

## Evaluate Model

In [22]:
# Mean accuracy of the tree on the data it was trained on.
print(
    'Accuracy of Decision Tree classifier on training set: {:.2f}'.format(
        clf.score(x_train, y_train)
    )
)

Accuracy of Decision Tree classifier on training set: 0.78


## Create a confusion matrix

- True Positive: number of occurrences where y is true and y is predicted true.

- True Negative: number of occurrences where y is false and y is predicted false.

- False Positive: number of occurrences where y is false and y is predicted true.

- False Negative: number of occurrences where y is true and y is predicted false.


In [32]:
# Rows are the actual classes and columns the predicted classes
# (scikit-learn's confusion_matrix layout), both ordered 0 then 1.
cm = pd.DataFrame(data=confusion_matrix(y_true=y_train, y_pred=y_pred))
cm

Unnamed: 0,0,1
0,3379,235
1,828,480


### Findings After Running Decision Tree Model

- This confusion matrix tells me that my model is better at predicting True Negatives than True Positives.

- The model predicts at an overall accuracy rate of 78%

- However, the Recall rate for customers who churn is 37% which means that the model is not predicting the positives for churn very well. It is at 93% for predicting the customers who will not churn, so my model is out of balance. 

- I want to be able to better predict the customers who will churn, so we can create personas and more accurately target them with strategies that will keep them from churning.

### Create a classification report

- Precision: of all the observations predicted positive, the share that really are positive. The higher this number is, the fewer false alarms you raised; if this is a low score, you predicted a lot of positives where there were none. tp / (tp + fp)

- Recall: if this score is high, you didn’t miss a lot of positives. But as it gets lower, you are not predicting the positives that are actually there. tp / (tp + fn)

- f1-score: The balanced harmonic mean of Recall and Precision, giving both metrics equal weight. The higher the F-Measure is, the better.

- Support: the number of actual occurrences of each class in y_true.

In [40]:
# Per-class precision, recall, f1-score and support for the tree's
# training-set predictions.
print(
    classification_report(
        y_true=y_train,
        y_pred=y_pred,
    )
)

              precision    recall  f1-score   support

           0       0.80      0.93      0.86      3614
           1       0.67      0.37      0.47      1308

    accuracy                           0.78      4922
   macro avg       0.74      0.65      0.67      4922
weighted avg       0.77      0.78      0.76      4922



## Test Model

- My test data set is running at 79% accuracy, so even slightly higher than my train data at 78%.

- I'm happy with the overall accuracy of this model, but I want to try to select features that can better predict the customers who will churn.

- I may also see how a different algorithm works before moving on to exploration and feature selection and engineering.

In [41]:
# Mean accuracy of the tree on the held-out test partition.
print(
    'Accuracy of Decision Tree classifier on test set: {:.2f}'.format(
        clf.score(x_test, y_test)
    )
)

Accuracy of Decision Tree classifier on test set: 0.79


In [42]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

## Train Model 

### Create the Logistic Regression Object

In [47]:
# Baseline logistic regression with the SAGA solver, seeded for
# reproducibility.
# NOTE(review): scikit-learn documents that 'saga' only converges quickly on
# features of roughly the same scale. x_train is unscaled here and warnings
# are silenced globally at the top of the notebook, so a ConvergenceWarning
# would go unnoticed — consider scaling the features or raising max_iter.
logit = LogisticRegression(
    solver='saga',
    random_state=123,
)

### Fit the model to the training data


In [51]:
# LogisticRegression expects a 1-D target of shape (n_samples,). y_train is
# a single-column DataFrame, which scikit-learn would flatten internally
# while emitting a DataConversionWarning — a warning the notebook's global
# warnings filter hides. Flatten it explicitly so the fit is clean even with
# warnings enabled.
logit.fit(x_train, y_train.values.ravel())


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=123, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

### Print the coefficients and intercept of the model


In [52]:
# coef_ holds one weight per x_train column, in column order, for the
# positive (churn = 1) class; intercept_ is the bias term.
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-0.02543716  0.00504436 -0.00017219 -0.00187283 -0.00220918  0.00020861
  -0.00285611]]
Intercept: 
 [-0.00081423]


### Estimate whether or not a customer will churn


In [53]:
# Predicted class labels from the logistic regression on the training rows.
# NOTE: this rebinds y_pred, which held the decision tree's predictions
# earlier in the notebook; cells that use y_pred depend on execution order.
y_pred = logit.predict(x_train)

### Estimate the probability of a customer churning using the train data


In [55]:
# Class probabilities from the logistic regression on the training rows.
# NOTE: this rebinds y_pred_proba, which held the decision tree's
# probabilities earlier in the notebook.
y_pred_proba = logit.predict_proba(x_train)

### Evaluate Model

- Compute the accuracy


In [56]:
# Mean accuracy of the logistic regression on the data it was trained on.
print(
    'Accuracy of Logistic Regression classifier on training set: {:.2f}'.format(
        logit.score(x_train, y_train)
    )
)


Accuracy of Logistic Regression classifier on training set: 0.76


### Create a confusion matrix


In [57]:
# Rows are the actual classes and columns the predicted classes, both
# ordered 0 then 1.
print(
    confusion_matrix(
        y_true=y_train,
        y_pred=y_pred,
    )
)


[[3137  477]
 [ 683  625]]


### Compute Precision, Recall, F1-score, and Support


In [58]:
# Per-class precision, recall, f1-score and support for the logistic
# regression's training-set predictions.
print(
    classification_report(
        y_true=y_train,
        y_pred=y_pred,
    )
)


              precision    recall  f1-score   support

           0       0.82      0.87      0.84      3614
           1       0.57      0.48      0.52      1308

    accuracy                           0.76      4922
   macro avg       0.69      0.67      0.68      4922
weighted avg       0.75      0.76      0.76      4922



### Test Model

- Compute the accuracy of the model when run on the test data


In [59]:
# Mean accuracy of the logistic regression on the held-out test partition.
print(
    'Accuracy of Logistic Regression classifier on test set: {:.2f}'.format(
        logit.score(x_test, y_test)
    )
)


Accuracy of Logistic Regression classifier on test set: 0.77


### Findings After Training and Testing Decision Tree v. Logistic Regression Baseline Models

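- The Logistic Regression model predicts at an overall accuracy rate of 76% on the training set and 77% on the test set, compared with 78% and 79% for the Decision Tree model.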

- However, the Recall rate for customers who churn is 48% as opposed to the 37% Recall rate for the Decision Tree model. It is at 87% for predicting the customers who will not churn, as opposed to the Decision Tree model's 93% Recall rate for customers who will not churn.

- This LR model DOES better predict the customers who will churn than the Decision Tree model, but I still wa