Data from https://www.kaggle.com/blastchar/telco-customer-churn

In [78]:
# Import the pandas library for data manipulation and analysis. 
# It's commonly used for handling datasets in Python.
import pandas as pd

# Import the numpy library, which is used for numerical computing.
# It provides support for large multi-dimensional arrays and matrices, along with mathematical functions.
import numpy as np

# Import the seaborn library for statistical data visualization.
# It provides a high-level interface for creating informative and attractive visualizations.
import seaborn as sns

# Import the pyplot module from matplotlib, which is a plotting library.
# It's commonly used for creating static, animated, and interactive visualizations in Python.
from matplotlib import pyplot as plt

# This line makes the plots show up inline (directly in the notebook or console output) 
# instead of in a separate window. It's necessary when working in Jupyter notebooks.
%matplotlib inline

In [79]:
# Load the dataset from a CSV file into a pandas DataFrame.
# 'WA_Fn-UseC_-Telco-Customer-Churn.csv' is the file that contains the data, and
# pd.read_csv() reads this file and loads it into a DataFrame called 'df'.
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [80]:
# Calculate the number of rows (observations) in the DataFrame 'df'.
len(df)

7043

## Initial data preparation

In [81]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [82]:
# Display the first 5 rows of the DataFrame 'df' to get a quick look at the data.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [83]:
# Transpose and display the first 5 rows of the DataFrame 'df' to view columns as rows.
df.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [84]:
# Display the data types of each column in the DataFrame 'df'.
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [85]:
# Convert the 'TotalCharges' column to a numeric data type.
# If any values cannot be converted, they will be replaced with NaN (Not a Number).
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Replace any NaN values in the 'TotalCharges' column with 0.
df['TotalCharges'] = df['TotalCharges'].fillna(0)

In [86]:
# Convert all column names to lowercase and replace spaces with underscores for consistency.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Create a list of column names that have an object data type (typically used for strings).
string_columns = list(df.dtypes[df.dtypes == 'object'].index)

# Iterate over each column name in the list of string columns.
for col in string_columns:
    # Convert all string values in the column to lowercase and replace spaces with underscores.
    df[col] = df[col].str.lower().str.replace(' ', '_')

In [87]:
# Convert the 'churn' column values from string ('yes'/'no') to binary integers (1/0).
df.churn = (df.churn == 'yes').astype(int)

In [88]:
df.head().T

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [89]:
# Import the train_test_split function from the sklearn.model_selection module.
from sklearn.model_selection import train_test_split

In [90]:
# Split the original DataFrame (df) into training and testing sets.
# 20% of the data will be used for testing, while 80% will be used for training.
# random_state=1 ensures that the split is reproducible.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [91]:
# Split the training set (df_train_full) into a smaller training set (df_train) and a validation set (df_val).
# 33% of the original training data will be allocated to the validation set, while 67% will remain in the training set.
# random_state=11 ensures that the split is reproducible.
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=11)

In [92]:
# Extract the target variable 'churn' from the training DataFrame and convert it to a NumPy array.
y_train = df_train.churn.values

# Extract the target variable 'churn' from the validation DataFrame and convert it to a NumPy array.
y_val = df_val.churn.values

In [93]:
# Delete the 'churn' column from the training DataFrame as it is no longer needed after extracting the target variable.
del df_train['churn']

# Delete the 'churn' column from the validation DataFrame as it is no longer needed after extracting the target variable.
del df_val['churn']

## Exploratory data analysis

In [94]:
# Check for missing values in each column of the df_train_full DataFrame and return the total count of missing values for each column.
df_train_full.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [95]:
# Count the occurrences of each unique value in the 'churn' column of the df_train_full DataFrame.
df_train_full.churn.value_counts()

churn
0    4113
1    1521
Name: count, dtype: int64

In [96]:
# Calculate the mean (average) of the 'churn' column in the df_train_full DataFrame.
global_mean = df_train_full.churn.mean()

# Round the calculated mean to three decimal places.
round(global_mean, 3)

0.27

In [97]:
# Define a list of categorical feature names from the dataset.
categorical = ['gender', 'seniorcitizen', 'partner', 'dependents',
               'phoneservice', 'multiplelines', 'internetservice',
               'onlinesecurity', 'onlinebackup', 'deviceprotection',
               'techsupport', 'streamingtv', 'streamingmovies',
               'contract', 'paperlessbilling', 'paymentmethod']

# Define a list of numerical feature names from the dataset.
numerical = ['tenure', 'monthlycharges', 'totalcharges']

In [98]:
# Calculate the number of unique values for each categorical feature in the df_train_full DataFrame
df_train_full[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

## Feature importance

In [99]:
# Calculate the mean churn rate for female customers in the df_train_full DataFrame
female_mean = df_train_full[df_train_full.gender == 'female'].churn.mean()
# Print the mean churn rate for female customers, rounded to 3 decimal places
print('gender == female:', round(female_mean, 3))

# Calculate the mean churn rate for male customers in the df_train_full DataFrame
male_mean = df_train_full[df_train_full.gender == 'male'].churn.mean()
# Print the mean churn rate for male customers, rounded to 3 decimal places
print('gender == male:  ', round(male_mean, 3))

gender == female: 0.277
gender == male:   0.263


In [100]:
# Calculate the ratio of female churn rate to the global mean churn rate
female_to_global_ratio = female_mean / global_mean

In [101]:
female_to_global_ratio

1.0253955354648652

In [102]:
# Calculate the ratio of male churn rate to the global mean churn rate
male_to_global_ratio = male_mean / global_mean

In [103]:
male_to_global_ratio

0.9749802969838747

In [104]:
# Calculate the mean churn rate for customers who have a partner
partner_yes = df_train_full[df_train_full.partner == 'yes'].churn.mean()
print('partner == yes:', round(partner_yes, 3))

# Calculate the mean churn rate for customers who do not have a partner
partner_no = df_train_full[df_train_full.partner == 'no'].churn.mean()
print('partner == no :', round(partner_no, 3))

partner == yes: 0.205
partner == no : 0.33


In [105]:
ratio = partner_yes / global_mean
# ratio = 0.25 / 0.30 = 0.8333

In [106]:
ratio

0.7594724924338315

In [107]:
ratio = partner_no / global_mean
# ratio = 0.40 / 0.30 = 1.3333

In [108]:
ratio

1.2216593879412643

In [109]:
# Group the training data by the 'gender' column and calculate the mean churn rate for each gender.
df_group = df_train_full.groupby(by='gender').churn.agg(['mean'])

# Calculate the difference in churn rates between each gender and the overall mean churn rate.
df_group['diff'] = df_group['mean'] - global_mean

# Calculate the risk ratio of churn for each gender compared to the overall mean churn rate.
df_group['risk'] = df_group['mean'] / global_mean

# Display the resulting DataFrame that contains mean churn rates, differences, and risk ratios by gender.
df_group

Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.276824,0.006856,1.025396
male,0.263214,-0.006755,0.97498


In [110]:
# Import the display function from the IPython.display module, 
# which allows for more versatile display of objects in Jupyter notebooks.
from IPython.display import display

In [111]:
# Calculate the mean of the 'churn' column in the df_train_full DataFrame, 
# which represents the proportion of customers who have churned (i.e., left the service).
global_mean = df_train_full.churn.mean()

# Display the calculated global mean value
global_mean

0.26996805111821087

In [112]:
# Iterate over each column in the categorical list
for col in categorical:
    # Group the DataFrame by the current categorical column and calculate the mean churn rate for each group
    df_group = df_train_full.groupby(by=col).churn.agg(['mean'])
    
    # Calculate the difference between the mean churn rate of each group and the global mean churn rate
    df_group['diff'] = df_group['mean'] - global_mean
    
    # Calculate the risk as the ratio of the group's mean churn rate to the global mean churn rate
    df_group['risk'] = df_group['mean'] / global_mean
    
    # Display the resulting DataFrame containing mean churn rates, differences, and risks for each group
    display(df_group)

Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.276824,0.006856,1.025396
male,0.263214,-0.006755,0.97498


Unnamed: 0_level_0,mean,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.24227,-0.027698,0.897403
1,0.413377,0.143409,1.531208


Unnamed: 0_level_0,mean,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.329809,0.059841,1.221659
yes,0.205033,-0.064935,0.759472


Unnamed: 0_level_0,mean,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.31376,0.043792,1.162212
yes,0.165666,-0.104302,0.613651


Unnamed: 0_level_0,mean,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.241316,-0.028652,0.89387
yes,0.273049,0.003081,1.011412


Unnamed: 0_level_0,mean,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.257407,-0.012561,0.953474
no_phone_service,0.241316,-0.028652,0.89387
yes,0.290742,0.020773,1.076948


Unnamed: 0_level_0,mean,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dsl,0.192347,-0.077621,0.712482
fiber_optic,0.425171,0.155203,1.574895
no,0.077805,-0.192163,0.288201


Unnamed: 0_level_0,mean,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.420921,0.150953,1.559152
no_internet_service,0.077805,-0.192163,0.288201
yes,0.153226,-0.116742,0.56757


Unnamed: 0_level_0,mean,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.404323,0.134355,1.497672
no_internet_service,0.077805,-0.192163,0.288201
yes,0.217232,-0.052736,0.80466


Unnamed: 0_level_0,mean,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.395875,0.125907,1.466379
no_internet_service,0.077805,-0.192163,0.288201
yes,0.230412,-0.039556,0.85348


Unnamed: 0_level_0,mean,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.418914,0.148946,1.551717
no_internet_service,0.077805,-0.192163,0.288201
yes,0.159926,-0.110042,0.59239


Unnamed: 0_level_0,mean,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.342832,0.072864,1.269897
no_internet_service,0.077805,-0.192163,0.288201
yes,0.302723,0.032755,1.121328


Unnamed: 0_level_0,mean,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.338906,0.068938,1.255358
no_internet_service,0.077805,-0.192163,0.288201
yes,0.307273,0.037305,1.138182


Unnamed: 0_level_0,mean,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
month-to-month,0.431701,0.161733,1.599082
one_year,0.120573,-0.149395,0.446621
two_year,0.028274,-0.241694,0.10473


Unnamed: 0_level_0,mean,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.172071,-0.097897,0.637375
yes,0.338151,0.068183,1.25256


Unnamed: 0_level_0,mean,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bank_transfer_(automatic),0.168171,-0.101797,0.622928
credit_card_(automatic),0.164339,-0.10563,0.608733
electronic_check,0.45589,0.185922,1.688682
mailed_check,0.19387,-0.076098,0.718121


In [113]:
from sklearn.metrics import mutual_info_score

In [114]:
def calculate_mi(series):
    # Calculate the mutual information score between the provided series and the 'churn' column in the df_train_full DataFrame.
    return mutual_info_score(series, df_train_full.churn)

# Apply the calculate_mi function to each column in the categorical DataFrame and store the results in df_mi.
df_mi = df_train_full[categorical].apply(calculate_mi)

# Sort the mutual information scores in descending order and convert the result into a DataFrame named 'MI'.
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')

# Display the top 5 mutual information scores.
display(df_mi.head())

# Display the bottom 5 mutual information scores.
display(df_mi.tail())

Unnamed: 0,MI
contract,0.09832
onlinesecurity,0.063085
techsupport,0.061032
internetservice,0.055868
onlinebackup,0.046923


Unnamed: 0,MI
partner,0.009968
seniorcitizen,0.00941
multiplelines,0.000857
phoneservice,0.000229
gender,0.000117


In [115]:
# Calculate the correlation of each numerical feature with the 'churn' column in the df_train_full DataFrame.
df_train_full[numerical].corrwith(df_train_full.churn).to_frame('correlation')

Unnamed: 0,correlation
tenure,-0.351885
monthlycharges,0.196805
totalcharges,-0.196353


In [116]:
# Group the df_train_full DataFrame by the 'churn' column 
# and calculate the mean of each numerical feature for each group.
df_train_full.groupby(by='churn')[numerical].mean()

Unnamed: 0_level_0,tenure,monthlycharges,totalcharges
churn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,37.531972,61.176477,2548.021627
1,18.070348,74.521203,1545.689415


## One-hot encoding

In [117]:
# Import the DictVectorizer class from the feature_extraction module of the sklearn library.
from sklearn.feature_extraction import DictVectorizer

In [118]:
# Create a dictionary representation of the training DataFrame's selected columns.
train_dict = df_train[categorical + numerical].to_dict(orient='records')

In [119]:
train_dict[0]

{'gender': 'male',
 'seniorcitizen': 0,
 'partner': 'yes',
 'dependents': 'no',
 'phoneservice': 'yes',
 'multiplelines': 'no',
 'internetservice': 'dsl',
 'onlinesecurity': 'yes',
 'onlinebackup': 'yes',
 'deviceprotection': 'yes',
 'techsupport': 'yes',
 'streamingtv': 'yes',
 'streamingmovies': 'yes',
 'contract': 'two_year',
 'paperlessbilling': 'yes',
 'paymentmethod': 'bank_transfer_(automatic)',
 'tenure': 71,
 'monthlycharges': 86.1,
 'totalcharges': 6045.9}

In [120]:
# Import the DictVectorizer class from scikit-learn's feature extraction module
dv = DictVectorizer(sparse=False)

# Fit the vectorizer to the training data (list of dictionaries) to learn the feature names and mappings
dv.fit(train_dict)

In [121]:
# Transform the list of dictionaries (train_dict) into a numerical feature matrix using the fitted DictVectorizer
X_train = dv.transform(train_dict)

In [122]:
X_train.shape

(3774, 45)

In [123]:
dv.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'dependents=no', 'dependents=yes',
       'deviceprotection=no', 'deviceprotection=no_internet_service',
       'deviceprotection=yes', 'gender=female', 'gender=male',
       'internetservice=dsl', 'internetservice=fiber_optic',
       'internetservice=no', 'monthlycharges', 'multiplelines=no',
       'multiplelines=no_phone_service', 'multiplelines=yes',
       'onlinebackup=no', 'onlinebackup=no_internet_service',
       'onlinebackup=yes', 'onlinesecurity=no',
       'onlinesecurity=no_internet_service', 'onlinesecurity=yes',
       'paperlessbilling=no', 'paperlessbilling=yes', 'partner=no',
       'partner=yes', 'paymentmethod=bank_transfer_(automatic)',
       'paymentmethod=credit_card_(automatic)',
       'paymentmethod=electronic_check', 'paymentmethod=mailed_check',
       'phoneservice=no', 'phoneservice=yes', 'seniorcitizen',
       'streamingmovies=no', 'streamingmovies=no_internet_service',

## Training logistic regression

In [124]:
# Import the LogisticRegression class from scikit-learn's linear_model module
# Logistic Regression is a type of model used for classification problems, especially when predicting binary outcomes (e.g., yes/no or 0/1).

from sklearn.linear_model import LogisticRegression

In [125]:
# Create an instance of the LogisticRegression model with specific parameters.
# 'solver' specifies the algorithm used for optimization, and 'liblinear' is good for small datasets.
# 'random_state' ensures reproducibility by setting the seed for random number generation.

model = LogisticRegression(solver='liblinear', random_state=1)

# Train the LogisticRegression model using the training data (X_train, y_train).
# The 'fit' method trains the model, learning the relationship between features (X_train) and labels (y_train).

model.fit(X_train, y_train)

In [126]:
# Convert the validation DataFrame (df_val) into a list of dictionaries.
# This includes both categorical and numerical columns, 
# making it easier to handle for feature transformation.
# Each row of df_val becomes a dictionary, with column names as keys and corresponding values.
val_dict = df_val[categorical + numerical].to_dict(orient='records')

# Transform the list of dictionaries (val_dict) into a feature matrix (X_val)
# using the DictVectorizer (dv) that was previously fitted on the training data.
# This step encodes categorical features as one-hot vectors,
# making them suitable for input into the logistic regression model.
X_val = dv.transform(val_dict)

In [127]:
# Use the trained logistic regression model (model) to predict the probabilities 
# of each class for the validation dataset (X_val).
# This method returns an array where each row corresponds to a sample in X_val
# and each column corresponds to the predicted probability of that sample belonging 
# to a particular class (in this case, the probability of not churn and churn).
probabilities = model.predict_proba(X_val)

In [128]:
probabilities

array([[0.76508893, 0.23491107],
       [0.7311339 , 0.2688661 ],
       [0.6805482 , 0.3194518 ],
       ...,
       [0.94274725, 0.05725275],
       [0.38476961, 0.61523039],
       [0.93872737, 0.06127263]])

In [129]:
# Use the trained logistic regression model (model) to predict the probabilities 
# of each class for the validation dataset (X_val).
# The predict_proba method returns an array of probabilities for each class.
# Here, we extract the probabilities of the positive class (class 1, e.g., "churn") 
# by selecting all rows (samples) and the second column (index 1).
y_pred = model.predict_proba(X_val)[:, 1]

In [130]:
y_pred

array([0.23491107, 0.2688661 , 0.3194518 , ..., 0.05725275, 0.61523039,
       0.06127263])

In [131]:
# Create a boolean array (churn) that indicates whether the predicted probability
# of churn (y_pred) for each sample in the validation set is greater than 0.5.
# This means that if the predicted probability is more than 0.5, we classify 
# the sample as likely to churn (True); otherwise, it is classified as not likely to churn (False).
churn = y_pred > 0.5

In [132]:
churn

array([False, False, False, ..., False,  True, False])

In [133]:
# Calculate the accuracy of the churn predictions by comparing the actual churn values
# (y_val) with the predicted churn values (churn). The mean of the boolean array 
# (y_val == churn) gives the proportion of correct predictions.
accuracy = (y_val == churn).mean()

In [134]:
accuracy

0.8016129032258065

## Model interpretation
* Look at the coefficients
* Train a smaller model with fewer features

In [135]:
a = [1, 2, 3, 4]
b = 'abcd'

In [136]:
dict(zip(a, b))

{1: 'a', 2: 'b', 3: 'c', 4: 'd'}

In [140]:
dict(zip(dv.get_feature_names_out(), model.coef_[0].round(3)))

{'contract=month-to-month': 0.563,
 'contract=one_year': -0.086,
 'contract=two_year': -0.599,
 'dependents=no': -0.03,
 'dependents=yes': -0.092,
 'deviceprotection=no': 0.1,
 'deviceprotection=no_internet_service': -0.116,
 'deviceprotection=yes': -0.106,
 'gender=female': -0.027,
 'gender=male': -0.095,
 'internetservice=dsl': -0.323,
 'internetservice=fiber_optic': 0.317,
 'internetservice=no': -0.116,
 'monthlycharges': 0.001,
 'multiplelines=no': -0.168,
 'multiplelines=no_phone_service': 0.127,
 'multiplelines=yes': -0.081,
 'onlinebackup=no': 0.136,
 'onlinebackup=no_internet_service': -0.116,
 'onlinebackup=yes': -0.142,
 'onlinesecurity=no': 0.258,
 'onlinesecurity=no_internet_service': -0.116,
 'onlinesecurity=yes': -0.264,
 'paperlessbilling=no': -0.213,
 'paperlessbilling=yes': 0.091,
 'partner=no': -0.048,
 'partner=yes': -0.074,
 'paymentmethod=bank_transfer_(automatic)': -0.027,
 'paymentmethod=credit_card_(automatic)': -0.136,
 'paymentmethod=electronic_check': 0.175,


In [64]:
# Retrieve the intercept (bias term) of the logistic regression model, which represents the log-odds 
# of the response variable (churn) when all the predictor variables are zero. 
# The intercept is a key parameter in the logistic regression equation.
intercept = model.intercept_[0]

In [65]:
intercept

-0.12198863589816404

In [67]:
# Create a dictionary mapping feature names to their corresponding coefficients 
# in the logistic regression model. Each feature name is paired with its rounded coefficient,
# allowing for an easy interpretation of the impact each feature has on the predicted outcome.
feature_coefficients = dict(zip(dv.get_feature_names_out(), model.coef_[0].round(3)))

In [68]:
feature_coefficients

{'contract=month-to-month': 0.563,
 'contract=one_year': -0.086,
 'contract=two_year': -0.599,
 'dependents=no': -0.03,
 'dependents=yes': -0.092,
 'deviceprotection=no': 0.1,
 'deviceprotection=no_internet_service': -0.116,
 'deviceprotection=yes': -0.106,
 'gender=female': -0.027,
 'gender=male': -0.095,
 'internetservice=dsl': -0.323,
 'internetservice=fiber_optic': 0.317,
 'internetservice=no': -0.116,
 'monthlycharges': 0.001,
 'multiplelines=no': -0.168,
 'multiplelines=no_phone_service': 0.127,
 'multiplelines=yes': -0.081,
 'onlinebackup=no': 0.136,
 'onlinebackup=no_internet_service': -0.116,
 'onlinebackup=yes': -0.142,
 'onlinesecurity=no': 0.258,
 'onlinesecurity=no_internet_service': -0.116,
 'onlinesecurity=yes': -0.264,
 'paperlessbilling=no': -0.213,
 'paperlessbilling=yes': 0.091,
 'partner=no': -0.048,
 'partner=yes': -0.074,
 'paymentmethod=bank_transfer_(automatic)': -0.027,
 'paymentmethod=credit_card_(automatic)': -0.136,
 'paymentmethod=electronic_check': 0.175,


In [70]:
# Define a subset of features that we want to use for training.
subset = ['contract', 'tenure', 'totalcharges']

# Convert the selected features from the training DataFrame into a list of dictionaries.
# Each dictionary represents a single record with the specified features.
train_dict_small = df_train[subset].to_dict(orient='records')

# Create an instance of DictVectorizer to convert the list of dictionaries into a matrix of feature vectors.
# Setting sparse=False ensures that the output is a dense array rather than a sparse matrix.
dv_small = DictVectorizer(sparse=False)

# Fit the DictVectorizer to the small training dataset to learn the feature representation.
dv_small.fit(train_dict_small)

# Transform the training data using the fitted DictVectorizer, converting the records into a matrix of feature vectors.
X_small_train = dv_small.transform(train_dict_small)

# Retrieve the names of the features generated by the DictVectorizer.
# This allows us to see the transformed feature names after vectorization.
#dv_small.get_feature_names()
feature_names = dv_small.get_feature_names_out()

In [71]:
feature_names

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'tenure', 'totalcharges'], dtype=object)

In [72]:
# Create an instance of the LogisticRegression model.
# 'solver' specifies the optimization algorithm to use, and 'random_state' ensures reproducibility of results.
model_small = LogisticRegression(solver='liblinear', random_state=1)

# Fit the logistic regression model to the training data.
# The model learns the relationship between the input features (X_small_train) and the target variable (y_train).
model_small.fit(X_small_train, y_train)

In [73]:
# Access the intercept term from the logistic regression model `model_small`.
# The intercept is a crucial parameter in the model that represents the log-odds 
# of the outcome when all the input features are equal to zero.
# Since this is a single-value output, we use [0] to extract the first (and only) value.
intercept_value = model_small.intercept_[0]

In [74]:
intercept_value

-0.577229912199359

In [75]:
# Create a dictionary that pairs each feature name with its corresponding coefficient
# from the logistic regression model `model_small`. The coefficients indicate the effect
# of each feature on the log-odds of the target variable. 
# We use `dv_small.get_feature_names()` to get the names of the features 
# and `model_small.coef_[0].round(3)` to get the coefficients rounded to three decimal places.

feature_coefficients = dict(zip(dv_small.get_feature_names(), model_small.coef_[0].round(3)))

AttributeError: 'DictVectorizer' object has no attribute 'get_feature_names'

In [56]:
val_dict_small = df_val[subset].to_dict(orient='records')
X_small_val = dv_small.transform(val_dict_small)

In [57]:
y_pred_small = model_small.predict_proba(X_small_val)[:, 1]

## Using the model

In [58]:
customer = {
    'customerid': '8879-zkjof',
    'gender': 'female',
    'seniorcitizen': 0,
    'partner': 'no',
    'dependents': 'no',
    'tenure': 41,
    'phoneservice': 'yes',
    'multiplelines': 'no',
    'internetservice': 'dsl',
    'onlinesecurity': 'yes',
    'onlinebackup': 'no',
    'deviceprotection': 'yes',
    'techsupport': 'yes',
    'streamingtv': 'yes',
    'streamingmovies': 'yes',
    'contract': 'one_year',
    'paperlessbilling': 'yes',
    'paymentmethod': 'bank_transfer_(automatic)',
    'monthlycharges': 79.85,
    'totalcharges': 3320.75,
}

In [59]:
X_test = dv.transform([customer])
model.predict_proba(X_test)[0, 1]

0.07332577315357781

In [60]:
print(list(X_test[0]))

[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 79.85, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 41.0, 3320.75]


In [61]:
customer = {
    'gender': 'female',
    'seniorcitizen': 1,
    'partner': 'no',
    'dependents': 'no',
    'phoneservice': 'yes',
    'multiplelines': 'yes',
    'internetservice': 'fiber_optic',
    'onlinesecurity': 'no',
    'onlinebackup': 'no',
    'deviceprotection': 'no',
    'techsupport': 'no',
    'streamingtv': 'yes',
    'streamingmovies': 'no',
    'contract': 'month-to-month',
    'paperlessbilling': 'yes',
    'paymentmethod': 'electronic_check',
    'tenure': 1,
    'monthlycharges': 85.7,
    'totalcharges': 85.7
}

In [62]:
X_test = dv.transform([customer])
model.predict_proba(X_test)[0, 1]

0.8321638622459152