In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

telcom = pd.read_csv('./model_data1.csv', index_col=0)

In [2]:
# Print the unique Churn values
print(set(telcom['Churn']))

# Calculate the ratio size of each churn group
telcom.groupby(['Churn']).size() / telcom.shape[0] * 100

# Import the function for splitting data to train and test
from sklearn.model_selection import train_test_split

# Split the data into train and test
train, test = train_test_split(telcom, test_size = .25)

{0, 1}


In [4]:
target = 'Churn'
custid = 'customerID'

In [5]:
# Store column names from `telcom` excluding target variable and customer ID
cols = [col for col in telcom.columns if col not in custid + target]

# Extract training features
train_X = train[cols]

# Extract training target
train_Y = train[target]

# Extract testing features
test_X = test[cols]

# Extract testing target
test_Y = test[target]

In [9]:
# Fit logistic regression on training data
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

logreg = LogisticRegression(max_iter=1000)
logreg.fit(train_X, train_Y)

# Predict churn labels on testing data
pred_test_Y = logreg.predict(test_X)

# Calculate accuracy score on testing data
test_accuracy = accuracy_score(test_Y, pred_test_Y)

# Print test accuracy score rounded to 4 decimals
print('Test accuracy:', round(test_accuracy, 4))

Test accuracy: 0.7976


In [10]:
# Initialize logistic regression instance 
logreg = LogisticRegression(penalty='l1', C=0.025, solver='liblinear')

# Fit the model on training data
logreg.fit(train_X, train_Y)

# Predict churn values on test data
pred_test_Y = logreg.predict(test_X)

# Print the accuracy score on test data
print('Test accuracy:', round(accuracy_score(test_Y, pred_test_Y), 4))

Test accuracy: 0.7915


In [12]:
C = [1, 0.5, 0.25, 0.1, 0.05, 0.025, 0.01, 0.005, 0.0025]
l1_metrics = np.array([[1.    , 0.    , 0.    ],
       [0.5   , 0.    , 0.    ],
       [0.25  , 0.    , 0.    ],
       [0.1   , 0.    , 0.    ],
       [0.05  , 0.    , 0.    ],
       [0.025 , 0.    , 0.    ],
       [0.01  , 0.    , 0.    ],
       [0.005 , 0.    , 0.    ],
       [0.0025, 0.    , 0.    ]])

In [13]:
# Run a for loop over the range of C list length
from sklearn.metrics import recall_score
for index in range(0, len(C)):
  # Initialize and fit Logistic Regression with the C candidate
  logreg = LogisticRegression(penalty='l1', C=C[index], solver='liblinear')
  logreg.fit(train_X, train_Y)
  # Predict churn on the testing data
  pred_test_Y = logreg.predict(test_X)
  # Create non-zero count and recall score columns
  l1_metrics[index,1] = np.count_nonzero(logreg.coef_)
  l1_metrics[index,2] = recall_score(test_Y, pred_test_Y)

# Name the columns and print the array as pandas DataFrame
col_names = ['C','Non-Zero Coeffs','Recall']
print(pd.DataFrame(l1_metrics, columns=col_names))

        C  Non-Zero Coeffs    Recall
0  1.0000             25.0  0.546479
1  0.5000             25.0  0.543662
2  0.2500             25.0  0.540845
3  0.1000             25.0  0.532394
4  0.0500             16.0  0.512676
5  0.0250             14.0  0.490141
6  0.0100              4.0  0.515493
7  0.0050              3.0  0.540845
8  0.0025              3.0  0.535211


You can now explore the C values and associated count of non-zero coefficients and the recall score to decide which parameter value to choose.

In [15]:
# Initialize decision tree classifier
from sklearn import tree
mytree = tree.DecisionTreeClassifier()

# Fit the decision tree on training data
mytree.fit(train_X, train_Y)

# Predict churn labels on testing data
pred_test_Y = mytree.predict(test_X)

# Calculate accuracy score on testing data
test_accuracy = accuracy_score(test_Y, pred_test_Y)

# Print test accuracy
print('Test accuracy:', round(test_accuracy, 4))

Test accuracy: 0.724


In [17]:
# Run a for loop over the range of depth list length
depth_list = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score
depth_tuning = np.array([[ 2.,  0.],
       [ 3.,  0.],
       [ 4.,  0.],
       [ 5.,  0.],
       [ 6.,  0.],
       [ 7.,  0.],
       [ 8.,  0.],
       [ 9.,  0.],
       [10.,  0.],
       [11.,  0.],
       [12.,  0.],
       [13.,  0.],
       [14.,  0.]])
for index in range(0, len(depth_list)):
  # Initialize and fit decision tree with the `max_depth` candidate
  mytree = DecisionTreeClassifier(max_depth=depth_list[index])
  mytree.fit(train_X, train_Y)
  # Predict churn on the testing data
  pred_test_Y = mytree.predict(test_X)
  # Calculate the recall score 
  depth_tuning[index,1] = recall_score(test_Y, pred_test_Y)

# Name the columns and print the array as pandas DataFrame
col_names = ['Max_Depth','Recall']
print(pd.DataFrame(depth_tuning, columns=col_names))

    Max_Depth    Recall
0         2.0  0.636620
1         3.0  0.340845
2         4.0  0.540845
3         5.0  0.529577
4         6.0  0.552113
5         7.0  0.512676
6         8.0  0.529577
7         9.0  0.557746
8        10.0  0.526761
9        11.0  0.571831
10       12.0  0.588732
11       13.0  0.577465
12       14.0  0.588732


                            Feature  Coefficient  Exp_Coefficient
    21                          tenure       -0.908            0.403
    4                 PhoneService_Yes       -0.820            0.440
    17               Contract_Two year       -0.595            0.551
    8                  TechSupport_Yes       -0.418            0.658
    16               Contract_One year       -0.414            0.661
    5               OnlineSecurity_Yes       -0.412            0.662
    6                 OnlineBackup_Yes       -0.143            0.867
    3                   Dependents_Yes       -0.039            0.961
    7             DeviceProtection_Yes       -0.017            0.983
    11            PaperlessBilling_Yes        0.071            1.074
    1                SeniorCitizen_Yes        0.098            1.103
    19  PaymentMethod_Electronic check        0.188            1.207
    22                  MonthlyCharges        0.901            2.463

In [18]:
# Combine feature names and coefficients into pandas DataFrame
feature_names = pd.DataFrame(train_X.columns, columns=['Feature'])
log_coef = pd.DataFrame(np.transpose(logreg.coef_), columns=['Coefficient'])
coefficients = pd.concat([feature_names, log_coef], axis = 1)

# Calculate exponent of the logistic regression coefficients
coefficients['Exp_Coefficient'] = np.exp(coefficients['Coefficient'])

# Remove coefficients that are equal to zero
coefficients = coefficients[coefficients['Coefficient']!=0]

# Print the values sorted by the exponent coefficient
print(coefficients.sort_values(by=['Exp_Coefficient']))

           Feature  Coefficient  Exp_Coefficient
25          tenure    -0.100964         0.903966
27    TotalCharges     0.000607         1.000607
26  MonthlyCharges     0.007560         1.007588


In [32]:
# !pip install graphviz

In [36]:
# Export graphviz object from the trained decision tree 
import graphviz
exported = tree.export_graphviz(decision_tree=mytree, 
			# Assign feature names
            out_file=None, feature_names=train_X.columns, 
			# Set precision to 1 and add class names
			precision=1, class_names=['Not churn','Churn'], filled = True)

# Call the Source function and pass the exported graphviz object
graph = graphviz.Source(exported)
# from IPython.display import display	
def display_image(p): 
    HtmlManager.getPlots().append(FigHtml([p]))
  
# Display the decision tree
# display_image("./decision_tree_rules.png")

# Display the decision tree
graph.show()

AttributeError: 'Source' object has no attribute 'show'

In [29]:
graph.render()

ExecutableNotFound: failed to execute WindowsPath('dot'), make sure the Graphviz executables are on your systems' PATH

In [26]:
# Export graphviz object from the trained decision tree 
import graphviz
exported = tree.export_graphviz(decision_tree=mytree, 
			# Assign feature names
            out_file=None, feature_names=train_X.columns, 
			# Set precision to 1 and add class names
			precision=1, class_names=['Not churn','Churn'], filled = True)

# Call the Source function and pass the exported graphviz object
graph = graphviz.Source(exported)

# Display the decision tree
graph.format = 'pdf'
graph.render('decision_tree')
graph.view() 


ExecutableNotFound: failed to execute WindowsPath('dot'), make sure the Graphviz executables are on your systems' PATH

The error message indicates that the Graphviz executable 'dot' cannot be found in the system path. You can fix this by adding the path to the Graphviz executables to the system environment variables or by specifying the path to 'dot' directly in your Python code. 

To add the path to the Graphviz executables to the system environment variables, follow these steps:

1. Open the Start menu and search for 'Environment Variables'.
2. Click on 'Edit the system environment variables'.
3. Click on the 'Environment Variables' button.
4. Under 'System Variables', scroll down until you find the 'Path' variable and click on 'Edit'.
5. Click on 'New' and add the path to the Graphviz executables (e.g. C:\Program Files (x86)\Graphviz2.38\bin).
6. Click on 'OK' to close all windows and save the changes.

To specify the path to 'dot' directly in your Python code, you can add the following line of code at the beginning of your script:

```
import os
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
```

Replace 'C:/Program Files (x86)/Graphviz2.38/bin/' with the actual path to the Graphviz executables. This will append the path to the system path so that 'dot' can be found when executing the script.