# Customer Churn Prediction

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline   

In [4]:

churnData = pd.read_csv('./files_for_lab/Customer-Churn.csv')

# Checking the data types of all columns
print(churnData.dtypes)

# Converting 'TotalCharges' to numeric, forcing errors to NaN
churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'], errors='coerce')

# Checking for null values
print(churnData.isnull().sum())

# Replacing null values in 'TotalCharges' with the median
churnData['TotalCharges'].fillna(churnData['TotalCharges'].median(), inplace=True)
    

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64


In [5]:

# Selecting the relevant features and target variable
features = churnData[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']]
target = churnData['Churn'].apply(lambda x: 1 if x == 'Yes' else 0)  # Convert target to binary

# Scaling the features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Splitting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)

# Fitting a logistic regression model on the training data
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Checking the accuracy on the test data
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

# model evaluation
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
    

Accuracy: 0.8070
[[959  77]
 [195 178]]
              precision    recall  f1-score   support

           0       0.83      0.93      0.88      1036
           1       0.70      0.48      0.57       373

    accuracy                           0.81      1409
   macro avg       0.76      0.70      0.72      1409
weighted avg       0.80      0.81      0.79      1409



In [6]:

# Checking for class imbalance
print(target.value_counts(normalize=True))
    

Churn
0    0.73463
1    0.26537
Name: proportion, dtype: float64


In [7]:

# Applying SMOTE for upsampling
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(features_scaled, target)

# Splitting the resampled data
X_train_resampled, X_test_resampled, y_train_resampled, y_test_resampled = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Fitting a logistic regression model on the resampled training data
model_resampled = LogisticRegression(random_state=42)
model_resampled.fit(X_train_resampled, y_train_resampled)

# Checking the accuracy on the resampled test data
y_pred_resampled = model_resampled.predict(X_test_resampled)
accuracy_resampled = accuracy_score(y_test_resampled, y_pred_resampled)
print(f'Resampled Accuracy: {accuracy_resampled:.4f}')

# model evaluation
print(confusion_matrix(y_test_resampled, y_pred_resampled))
print(classification_report(y_test_resampled, y_pred_resampled))
    

Resampled Accuracy: 0.7522
[[759 262]
 [251 798]]
              precision    recall  f1-score   support

           0       0.75      0.74      0.75      1021
           1       0.75      0.76      0.76      1049

    accuracy                           0.75      2070
   macro avg       0.75      0.75      0.75      2070
weighted avg       0.75      0.75      0.75      2070

