In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [3]:
#Load the data
data = pd.read_csv('TelcocustomerChurn.csv')

#inspect the data
print(data.head())
print(data.info())

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

In [6]:
# Check for missing values
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')  # Convert 'TotalCharges' to numeric
data = data.dropna()  # Drop rows with missing value

In [None]:
# Convert categorical columns to numeric using LabelEncoder
label_cols = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService',
              'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
              'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn']

le = LabelEncoder()
for col in label_cols:
    data[col] = le.fit_transform(data[col])

In [9]:
# Prepare the features (X) and target (y)
X = data.drop(['customerID', 'Churn'], axis=1)  # Features
y = data['Churn']  # Target (1 = churn, 0 = no churn)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build a RandomForestClassifier model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

In [10]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Random Forest Classifier Results")
print("Accuracy: ", accuracy)
print("Confusion Matrix: \n", conf_matrix)
print("Classification Report: \n", class_report)

Random Forest Classifier Results
Accuracy:  0.7924662402274343
Confusion Matrix: 
 [[932 101]
 [191 183]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.83      0.90      0.86      1033
           1       0.64      0.49      0.56       374

    accuracy                           0.79      1407
   macro avg       0.74      0.70      0.71      1407
weighted avg       0.78      0.79      0.78      1407



In [12]:
from sklearn.linear_model import LogisticRegression

# Build a Logistic Regression model
log_reg_model = LogisticRegression(max_iter=1000)
log_reg_model.fit(X_train, y_train)

# Make predictions using Logistic Regression
y_pred_log = log_reg_model.predict(X_test)

# Evaluate the Logistic Regression model
accuracy_log = accuracy_score(y_test, y_pred_log)
conf_matrix_log = confusion_matrix(y_test, y_pred_log)
class_report_log = classification_report(y_test, y_pred_log)

print("\nLogistic Regression Results")
print("Accuracy: ", accuracy_log)
print("Confusion Matrix: \n", conf_matrix_log)
print("Classification Report: \n", class_report_log)


Logistic Regression Results
Accuracy:  0.7874911158493249
Confusion Matrix: 
 [[923 110]
 [189 185]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.83      0.89      0.86      1033
           1       0.63      0.49      0.55       374

    accuracy                           0.79      1407
   macro avg       0.73      0.69      0.71      1407
weighted avg       0.78      0.79      0.78      1407



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Let us get the data of customers who are likely to churn


In [13]:
# Predict churn for the customers in the test set
y_pred = rf_model.predict(X_test)

# Create a DataFrame to store the test data, the true churn values, and the predicted churn values
test_results = X_test.copy()  # Copy the test features
test_results['True Churn'] = y_test.values  # Add the true churn values (from the test set)
test_results['Predicted Churn'] = y_pred  # Add the predicted churn values

# Add the customer IDs back for easier identification
test_results['CustomerID'] = data.loc[X_test.index, 'customerID']

# Filter the DataFrame to only include customers predicted to churn
churned_customers = test_results[test_results['Predicted Churn'] == 1]

# Show the data of customers likely to churn
print(churned_customers[['CustomerID', 'Predicted Churn']])


      CustomerID  Predicted Churn
6125  0871-URUWO                1
3223  3082-VQXNH                1
3469  9402-ORRAH                1
1976  9497-QCMMS                1
1081  1751-NCDLI                1
...          ...              ...
5025  7326-RIGQZ                1
2869  2672-HUYVI                1
1628  6267-DCFFZ                1
5570  5982-FPVQN                1
5250  3338-CVVEH                1

[284 rows x 2 columns]


In [14]:
#export the data to excel

import pandas as pd

# Assuming 'churned_customers' DataFrame contains the data you want to export
churned_customers.to_excel('churned_customers.xlsx', index=False)


    Results: We tested the model with new customer data that it hadn't seen before, and here's what we found:
        Accuracy: Our model correctly predicted whether a customer would stay or leave about 79% of the time.
        Churn Prediction: For customers that the model predicted would leave, it was correct about 64% of the time. For those predicted to stay, it was correct about 83% of the time.

    The confusion matrix shows how many times the model predicted correctly or incorrectly for both stayers and leavers:
        932 times, it correctly predicted customers would stay.
        183 times, it correctly predicted customers would leave.
        It also made some mistakes, like predicting customers would leave when they actually stayed.

In summary, we used customer information and a smart model to predict whether they will stay or leave. The model is fairly accurate, helping businesses identify which customers might churn so they can take action to keep them!