# Importing the necessary Libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the Dataset

In [2]:
# Read the Telco customer-churn CSV into a DataFrame
dataset_path = 'data/TelcoCustomerChurn.csv'
data = pd.read_csv(dataset_path)

# Preview the first rows to get a feel for the columns and their values
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# Print a summary of the DataFrame: number of rows, column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


# Data Preprocessing

In [5]:
# 'TotalCharges' was loaded as an object (string) column instead of a numeric one,
# so it is parsed into numbers here.
# errors='coerce' turns any value that cannot be parsed into NaN instead of raising;
# those NaNs are dealt with in the missing-value step further down.
total_charges_numeric = pd.to_numeric(data['TotalCharges'], errors='coerce')
data['TotalCharges'] = total_charges_numeric

In [6]:
# Encode the 'Churn' target as integers: 1 for 'Yes', 0 for anything else.
# The numeric-dtype guard keeps this cell safe to re-run: without it, a second run would
# compare the already-encoded 0/1 values against 'Yes' and silently reset every label to 0.
if not pd.api.types.is_numeric_dtype(data['Churn']):
    data['Churn'] = data['Churn'].eq('Yes').astype('int64')

In [7]:
# Print the DataFrame summary again to check the dtypes after the conversions above
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


# Mean Imputation

In [8]:
# Count the missing values in every column
missing_values = data.isnull().sum()
print("Missing Values:\n", missing_values)

# Mean-impute the missing 'TotalCharges' entries.
# The result is assigned back to the column instead of calling fillna(..., inplace=True)
# on data['TotalCharges']: that form is chained assignment, which newer pandas versions
# warn about and which does not modify `data` once Copy-on-Write is enabled.
data['TotalCharges'] = data['TotalCharges'].fillna(data['TotalCharges'].mean())

Missing Values:
 customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64


In [9]:

# Re-count the nulls per column to verify the imputation; every count should now be zero
missing_values_after_handling = data.isna().sum()
print("\nMissing Values after Handling:\n", missing_values_after_handling)


Missing Values after Handling:
 customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


# Feature Encoding

In [10]:
# The model needs numeric inputs, so each categorical feature is expanded into
# indicator columns (one-hot encoding).

# Categorical columns to expand
categorical_columns = [
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]

# drop_first=True omits the first level of every feature, so a k-level feature
# yields k-1 indicator columns
data_encoded = pd.get_dummies(
    data,
    columns=categorical_columns,
    drop_first=True,
)

# Preview the encoded frame
data_encoded.head()

Unnamed: 0,customerID,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,7590-VHVEG,0,1,29.85,29.85,0,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0
1,5575-GNVDE,0,34,56.95,1889.5,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,1
2,3668-QPYBK,0,2,53.85,108.15,1,1,0,0,1,...,0,0,0,0,0,0,1,0,0,1
3,7795-CFOCW,0,45,42.3,1840.75,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,9237-HQITU,0,2,70.7,151.65,1,0,0,0,1,...,0,0,0,0,0,0,1,0,1,0


# Feature Scaling

In [11]:
# Standardise the continuous features so they share a comparable scale.
# StandardScaler is imported in the import cell at the top of the notebook rather than
# here, so all dependencies are visible in one place.
#
# NOTE(review): the scaler is fitted on the full dataset, before the train/test split
# further down, so statistics of the test rows leak into the training features.
# To remove the leak, split first and fit the scaler on the training rows only.
numerical_columns = ['tenure', 'MonthlyCharges', 'TotalCharges']
scaler = StandardScaler()
data_encoded[numerical_columns] = scaler.fit_transform(data_encoded[numerical_columns])

# Preview the frame after scaling
data_encoded.head()

Unnamed: 0,customerID,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,7590-VHVEG,0,-1.277445,-1.160323,-0.994971,0,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0
1,5575-GNVDE,0,0.066327,-0.259629,-0.173876,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,1
2,3668-QPYBK,0,-1.236724,-0.36266,-0.960399,1,1,0,0,1,...,0,0,0,0,0,0,1,0,0,1
3,7795-CFOCW,0,0.514251,-0.746535,-0.1954,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,9237-HQITU,0,-1.236724,0.197365,-0.941193,1,0,0,0,1,...,0,0,0,0,0,0,1,0,1,0


# Data Splitting

In [12]:
# Separate the churn target (y) from the predictors (X)
y = data_encoded['Churn']
X = data_encoded.drop(columns='Churn')

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the shapes of the four resulting pieces
(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


((5634, 31), (1409, 31), (5634,), (1409,))

In [18]:
# List the column labels of the encoded DataFrame under a short heading
print("Columns in the DataFrame:", data_encoded.columns, sep="\n")


Columns in the DataFrame:
Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'Churn',
       'gender_Male', 'Partner_Yes', 'Dependents_Yes', 'PhoneService_Yes',
       'MultipleLines_No phone service', 'MultipleLines_Yes',
       'InternetService_Fiber optic', 'InternetService_No',
       'OnlineSecurity_No internet service', 'OnlineSecurity_Yes',
       'OnlineBackup_No internet service', 'OnlineBackup_Yes',
       'DeviceProtection_No internet service', 'DeviceProtection_Yes',
       'TechSupport_No internet service', 'TechSupport_Yes',
       'StreamingTV_No internet service', 'StreamingTV_Yes',
       'StreamingMovies_No internet service', 'StreamingMovies_Yes',
       'Contract_One year', 'Contract_Two year', 'PaperlessBilling_Yes',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check'],
      dtype='object')


In [21]:
# Inspect the dtype of every training column to spot anything that is not numeric
train_column_dtypes = X_train.dtypes
print(train_column_dtypes)


customerID                                object
SeniorCitizen                              int64
tenure                                   float64
MonthlyCharges                           float64
TotalCharges                             float64
gender_Male                                uint8
Partner_Yes                                uint8
Dependents_Yes                             uint8
PhoneService_Yes                           uint8
MultipleLines_No phone service             uint8
MultipleLines_Yes                          uint8
InternetService_Fiber optic                uint8
InternetService_No                         uint8
OnlineSecurity_No internet service         uint8
OnlineSecurity_Yes                         uint8
OnlineBackup_No internet service           uint8
OnlineBackup_Yes                           uint8
DeviceProtection_No internet service       uint8
DeviceProtection_Yes                       uint8
TechSupport_No internet service            uint8
TechSupport_Yes     

In [22]:
# Drop 'customerID' from the training features: it is an object (string) column,
# so it cannot be passed to the classifier as a numeric feature.
# errors='ignore' makes the cell safe to re-run; a plain drop raises KeyError
# the second time because the column is already gone.
X_train = X_train.drop(columns='customerID', errors='ignore')

# Model Training and Evaluation

In [23]:
# Fit a Random Forest on the training split; the fixed seed keeps the fitted model repeatable
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X=X_train, y=y_train)

In [24]:
# Predict churn for the held-out test set.
# The test features must have the same columns the model was trained on, so 'customerID'
# is removed here as well. errors='ignore' keeps the cell re-runnable: a plain drop raises
# KeyError on a second run because the column is already gone.
X_test = X_test.drop(columns='customerID', errors='ignore')
y_pred = rf_classifier.predict(X_test)

In [25]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_report = classification_report(y_test, y_pred)

In [26]:
# Print the evaluation metrics
print("Accuracy:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", classification_report)

Accuracy: 0.7927608232789212

Confusion Matrix:
 [[943  93]
 [199 174]]

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.91      0.87      1036
           1       0.65      0.47      0.54       373

    accuracy                           0.79      1409
   macro avg       0.74      0.69      0.70      1409
weighted avg       0.78      0.79      0.78      1409



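The model classifies about 79.28% of the test-set customers correctly. For context, always predicting "no churn" would already score about 73.5% (1036 of the 1409 test customers did not churn), so accuracy alone overstates how well churners are identified.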

The classification report shows the following:

Precision: The proportion of predictions for a class that are actually correct. For the churn class (1) it's about 65%, and for the non-churn class (0) it's about 83%.

Recall: The proportion of the actual members of a class that the model correctly identifies. For the churn class (1) it's about 47%, and for the non-churn class (0) it's about 91%.

F1-score: The harmonic mean of precision and recall. For the churn class (1) it's about 54%, and for the non-churn class (0) it's about 87%.

# Feature importances 
I need to analyze the feature importances from the trained Random Forest model to understand which factors are most influential in predicting churn.

In [28]:

# Importance score the fitted forest assigns to each training column
feature_importances = rf_classifier.feature_importances_

# Pair every score with its column name and rank from most to least important
feature_importance_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Show the ranked table
print("Feature Importances:\n", feature_importance_df)

Feature Importances:
                                   Feature  Importance
3                            TotalCharges    0.190070
1                                  tenure    0.175139
2                          MonthlyCharges    0.172152
10            InternetService_Fiber optic    0.036002
28         PaymentMethod_Electronic check    0.035156
25                      Contract_Two year    0.030504
13                     OnlineSecurity_Yes    0.029633
4                             gender_Male    0.027534
26                   PaperlessBilling_Yes    0.025265
5                             Partner_Yes    0.024054
19                        TechSupport_Yes    0.023666
24                      Contract_One year    0.021958
15                       OnlineBackup_Yes    0.021849
6                          Dependents_Yes    0.020858
0                           SeniorCitizen    0.020023
9                       MultipleLines_Yes    0.019609
17                   DeviceProtection_Yes    0.019215
23    

# Here are the impacts:
1. Total charges, tenure, and monthly charges are by far the most important features for predicting churn (importance of roughly 0.17–0.19 each).
2. The type of internet service also plays a role, particularly fiber optic service.
3. The payment method, especially electronic check, has notable importance.
4. Gender, partner, tech support, online security, and other services also play a role, though to a lesser extent.

# Resolutions.
 
# Sprint can focus on improving the areas of high feature importance to reduce churn
1. Improving the quality or pricing of fiber optic internet services, or optimizing payment processes for electronic checks.

2. Tailoring marketing strategies: Because tenure and contract length matter, Sprint could offer specific promotions or benefits to long-term customers.

3. Addressing concerns: Given the impact of tech support and online security, Sprint can enhance these services to increase customer satisfaction and reduce churn.