In [115]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split # train test

## Import the metrics we'll be using
from sklearn import metrics
## Import Logistic Regression from sklearn
from sklearn.linear_model import LogisticRegression

In [91]:
# Read in the CSV
df = pd.read_csv('https://raw.githubusercontent.com/gdiwa23/Swanalytics/refs/heads/main/1%20Project%20Data%20-%20Telco_Churn.csv')

In [92]:
# Change the column to a float and then fill the nulls with 0 as the customer hasn't paid us yet
df['Total Charges'] = pd.to_numeric(df['Total Charges'], errors='coerce')# casts to float
df['Total Charges'] = df['Total Charges'].fillna(0)
df['Total Charges'].isnull().sum() # check = 0 nulls

np.int64(0)

In [93]:
# Set CustomerID as the index
df.set_index('CustomerID', inplace=True)

In [94]:
# Remove the columns we will not be using for the model
df = df.drop(columns=["Count","City","Country","State","Lat Long","Churn Label","Churn Reason"])

In [95]:
# Get the customers who have not left us
df_nochurn = df[df['Churn Value'] ==  0]
df_nochurn = df_nochurn.drop(columns=["Churn Value"])

In [96]:
# Get the feature columns for the model
feature_cols = df.columns.drop('Churn Value')

In [97]:
# Create the features, target an also the data we want to predict on outside of the train test split
X = df[feature_cols].copy()
y = df['Churn Value']
df_nochurn = df_nochurn[feature_cols]

In [98]:
# Train test split our data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.2,
                                                    random_state = 1)


In [99]:
# Feature engineering the data
def clean_data(df):
  df_clean = df.copy() # df is X


  # OHE the service column into Fibre Optic or DSL - 0 in both indicates no internet service
  service_dummies = pd.get_dummies(df_clean['Internet Service'], prefix='Service', drop_first=True, dtype=int)
  df_clean = pd.concat([df_clean, service_dummies], axis=1)
  df_clean = df.drop(columns=['Internet Service'])

  df_clean = pd.get_dummies(df_clean, columns = ['Contract'], drop_first = True, prefix = 'Contract', dtype = int)
  df_clean = pd.get_dummies(df_clean, columns=['Payment Method'], drop_first=True, prefix='Payment_Method', dtype=int)


  # Label Encoding

  df_clean['Gender'] = df_clean['Gender'].map({'Female': 0, 'Male':1, 0:0, 1:1})
  df_clean['Senior Citizen'] = df_clean['Senior Citizen'].map({'No': 0, 'Yes':1, 0:0, 1:1})
  df_clean['Partner'] = df_clean['Partner'].map({'No': 0, 'Yes':1, 0:0, 1:1})
  df_clean['Dependents'] =  df_clean['Dependents'].map({'No': 0, 'Yes':1, 0:0, 1:1})
  df_clean['Phone Service'] = df_clean['Phone Service'].map({'No': 0, 'Yes':1, 0:0, 1:1})
  df_clean['Multiple Lines'] = df_clean['Multiple Lines'].map({'No': 0, 'Yes':1,'No phone service':0, 0:0, 1:1})

  df_clean['Online Security'] = df_clean['Online Security'].map({'No':0, 'Yes':1, 'No internet service':0, 0:0, 1:1})
  df_clean['Online Backup'] = df_clean['Online Backup'].map({'No':0, 'Yes':1, 'No internet service':0, 0:0, 1:1})
  df_clean['Device Protection'] = df_clean['Device Protection'].map({'No':0, 'Yes':1, 'No internet service':0, 0:0, 1:1})
  df_clean['Tech Support'] = df_clean['Tech Support'].map({'No':0, 'Yes':1, 'No internet service':0, 0:0, 1:1})


  #TV - No internet service has been combined with No
  df_clean['Streaming TV'] = df['Streaming TV'].replace(['No internet service', 'No'], 'No')
  df_clean['Streaming TV'] = df_clean['Streaming TV'].map({'No': 0, 'Yes':1, 0:0, 1:1})
  # Movies
  df_clean['Streaming Movies'] = df['Streaming Movies'].replace(['No internet service', 'No'], 'No')
  df_clean['Streaming Movies'] = df_clean['Streaming Movies'].map({'No': 0, 'Yes':1, 0:0, 1:1})
  # Paperless Billing
  df_clean['Paperless Billing'] = df_clean['Paperless Billing'].map({'No': 0, 'Yes':1, 0:0, 1:1})




  return df_clean

In [100]:
# Clean our data
X_train_fe = clean_data(X_train)
df_nochurn_fe = clean_data(df_nochurn)
X_train_lr = clean_data(X_train)
X_test_lr = clean_data(X_test)

In [101]:
# Initialise the model
lr = LogisticRegression(max_iter=10000, random_state=10)

In [102]:
# Fit the model to our data
lr.fit(X_train_lr,y_train )

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,10
,solver,'lbfgs'
,max_iter,10000


In [103]:
def get_results(actual, predicted):
    print("The confusion matrix for your predictions is:")
    print(metrics.confusion_matrix(actual, predicted), "\n")
    print(f'The accuracy of your model is: {metrics.accuracy_score(actual, predicted)}')
    print(f'The recall of your model is: {metrics.recall_score(actual, predicted)}')
    print(f'The precision of your model is: {metrics.precision_score(actual, predicted)}')
    print(f'The F1-score of your model is: {metrics.f1_score(actual, predicted)}')

## Predicting churn of train data

In [105]:
# Predict the target of of training data
X_train_lr[['P No Churn','P Churn']] = lr.predict_proba(X_train_lr)

In [106]:
# If the prediction is above 0.5 (50%) then the customer is predicted to churn
X_train_lr['y_pred'] = np.where(X_train_lr['P Churn']>.5, 1, 0)

In [107]:
# Print our metrics for the model
get_results(y_train,X_train_lr['y_pred'])

The confusion matrix for your predictions is:
[[3678  437]
 [ 627  892]] 

The accuracy of your model is: 0.8111466098686546
The recall of your model is: 0.587228439763002
The precision of your model is: 0.6711813393528969
The F1-score of your model is: 0.6264044943820225


## Predicting the churn of test data

In [113]:
# Predict the target of of test data
X_test_lr[['P No Churn','P Churn']] = lr.predict_proba(X_test_lr)
# If the prediction is above 0.5 (50%) then the customer is predicted to churn
X_test_lr['y_pred'] = np.where(X_test_lr['P Churn']>.5, 1, 0)
# Print our metrics for the model
get_results(y_test,X_test_lr['y_pred'])


The confusion matrix for your predictions is:
[[946 113]
 [151 199]] 

The accuracy of your model is: 0.8126330731014905
The recall of your model is: 0.5685714285714286
The precision of your model is: 0.6378205128205128
The F1-score of your model is: 0.6012084592145015


## Predicting the churn of customers

In [109]:
# Predict the target of of training data
df_nochurn_fe[['P No Churn','P Churn']] = lr.predict_proba(df_nochurn_fe)
# If the prediction is above 0.5 (50%) then the customer is predicted to churn
df_nochurn_fe['y_pred'] = np.where(df_nochurn_fe['P Churn']>.5, 1, 0)
# Select only the probability they churn column
df_nochurn_fe = df_nochurn_fe[["P Churn"]]
# Order the data by most likely to churn
df_nochurn_fe = df_nochurn_fe.sort_values(by=['P Churn'] , ascending=False)

In [116]:
# Top 10 most likely to churn
df_nochurn_fe.head(10)

Unnamed: 0_level_0,P Churn
CustomerID,Unnamed: 1_level_1
3489-HHPFY,0.843143
7398-SKNQZ,0.838316
6630-UJZMY,0.837326
5150-ITWWB,0.834166
4927-WWOOZ,0.830246
2545-EBUPK,0.827164
7668-XCFYV,0.822475
8622-ZLFKO,0.822297
0187-QSXOE,0.821161
3452-GWUIN,0.818412


In [114]:
# Write the top 500 in order to a csv
#df_nochurn_fe.head(500).to_csv('top500V1.csv', index=True)