## Predicting Customer Churn
### Aim: Identify customers who are susceptible to churn, in order to enhance retention strategies and boost business growth.

In [1]:
# Import libraries
from ast import literal_eval

import pandas as pd 
import numpy as np 

# Import visualization libraries 
import matplotlib.pyplot as plt 
import seaborn as sns 

# Import ML Libraries 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm # Visualize progress

# Import warnings to ignore warnings
import warnings
warnings.filterwarnings("ignore")


In [3]:
# Read the customer dataset from the Excel workbook, then preview the first five rows
df = pd.read_excel(io='../Customer_Churn_Prediction/Dataset.xlsx')
df.head(n=5)

Unnamed: 0,CustomerID,Name,Age,Gender,Location,Email,Phone,Address,Segment,PurchaseHistory,...,ServiceInteractions,PaymentHistory,WebsiteUsage,ClickstreamData,EngagementMetrics,Feedback,MarketingCommunication,NPS,ChurnLabel,Timestamp
0,1001,Mark Barrett,31,Male,Andrewfort,allison74@example.net,3192528777,"61234 Shelley Heights Suite 467\nCohentown, GU...",Segment B,"[{'Product': 'Frozen Cocktail Mixes', 'Frequen...",...,"[{'Type': 'Call', 'Date': '2019-09-26'}, {'Typ...","[{'Method': 'Credit Card', 'Late_Payments': 5}...","{'PageViews': 49, 'TimeSpent(minutes)': 15}","[{'Action': 'Add to Cart', 'Page': 'register',...","{'Logins': 19, 'Frequency': 'Weekly'}","{'Rating': 1, 'Comment': 'I move baby go small...","[{'Email_Sent': '2019-10-17', 'Email_Opened': ...",3,1,2020-01-27 01:36:49
1,1002,Jeremy Welch,66,Female,Millerhaven,fmiller@example.com,231-587-1818x8651,"4959 Jennifer Junction\nNew Angelaport, TN 87397",Segment C,"[{'Product': 'Watercraft Polishes', 'Frequency...",...,"[{'Type': 'Call', 'Date': '2020-01-05'}, {'Typ...","[{'Method': 'Credit Card', 'Late_Payments': 3}...","{'PageViews': 100, 'TimeSpent(minutes)': 9}","[{'Action': 'Add to Cart', 'Page': 'homepage',...","{'Logins': 9, 'Frequency': 'Weekly'}","{'Rating': 2, 'Comment': 'Wish what bag cut li...","[{'Email_Sent': '2021-08-02', 'Email_Opened': ...",6,0,2019-01-06 18:30:03
2,1003,Brandon Patel,36,Female,Lozanostad,jasonbrown@example.org,(270)633-9095,"38701 Amanda Brook Apt. 076\nKimshire, NJ 62516",Segment B,"[{'Product': 'Vehicle Waxes, Polishes & Protec...",...,"[{'Type': 'Email', 'Date': '2019-10-09'}, {'Ty...","[{'Method': 'Credit Card', 'Late_Payments': 1}...","{'PageViews': 1, 'TimeSpent(minutes)': 97}","[{'Action': 'Search', 'Page': 'terms', 'Timest...","{'Logins': 19, 'Frequency': 'Monthly'}","{'Rating': 4, 'Comment': 'Some Democrat guess ...","[{'Email_Sent': '2021-08-29', 'Email_Opened': ...",3,0,2019-04-30 04:25:10
3,1004,Tina Martin,62,Female,South Dustin,matthew62@example.net,050.081.8706x11982,"67324 Ashley Coves\nSouth John, RI 29650",Segment C,"[{'Product': 'Mouthwash', 'Frequency': 5, 'Val...",...,"[{'Type': 'Call', 'Date': '2020-08-28'}, {'Typ...","[{'Method': 'Credit Card', 'Late_Payments': 36...","{'PageViews': 25, 'TimeSpent(minutes)': 31}","[{'Action': 'Click', 'Page': 'privacy', 'Times...","{'Logins': 4, 'Frequency': 'Daily'}","{'Rating': 1, 'Comment': 'Yard feel never miss...","[{'Email_Sent': '2021-02-03', 'Email_Opened': ...",1,1,2020-03-03 17:33:28
4,1005,Christopher Rodriguez,68,Female,West James,shannonstrickland@example.org,+1-701-854-4915x724,"01169 Miller Mission\nWest Anthonyburgh, WY 47359",Segment C,"[{'Product': 'Ice Cream Novelties', 'Frequency...",...,"[{'Type': 'Call', 'Date': '2019-04-10'}, {'Typ...","[{'Method': 'Credit Card', 'Late_Payments': 0}...","{'PageViews': 77, 'TimeSpent(minutes)': 51}","[{'Action': 'Click', 'Page': 'privacy', 'Times...","{'Logins': 12, 'Frequency': 'Weekly'}","{'Rating': 3, 'Comment': 'Ten determine unit i...","[{'Email_Sent': '2022-03-11', 'Email_Opened': ...",3,0,2019-04-05 22:42:22


### Data Exploration
- Understanding the structure, summary statistics, and quality of the dataset.
- Visualizing the data to gain insights.
- Checking for missing values and duplicate rows.

In [5]:
# Count the null entries in each column (one total per column)
missing_values = df.isnull().sum(axis=0)
missing_values

CustomerID                0
Name                      0
Age                       0
Gender                    0
Location                  0
Email                     0
Phone                     0
Address                   0
Segment                   0
PurchaseHistory           0
SubscriptionDetails       0
ServiceInteractions       0
PaymentHistory            0
WebsiteUsage              0
ClickstreamData           0
EngagementMetrics         0
Feedback                  0
MarketingCommunication    0
NPS                       0
ChurnLabel                0
Timestamp                 0
dtype: int64

In [6]:
# Count the rows that exactly repeat an earlier row
duplicates = df.duplicated(keep='first').sum()
duplicates

0

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numerical columns
stats_overview = df.describe(percentiles=[0.25, 0.5, 0.75])
stats_overview

Unnamed: 0,CustomerID,Age,NPS,ChurnLabel
count,12483.0,12483.0,12483.0,12483.0
mean,7242.0,43.930065,2.973884,0.505808
std,3603.67604,15.341521,2.644623,0.499986
min,1001.0,18.0,0.0,0.0
25%,4121.5,31.0,1.0,0.0
50%,7242.0,44.0,2.0,1.0
75%,10362.5,57.0,4.0,1.0
max,13483.0,70.0,9.0,1.0


In [None]:
# Set up a 3x2 grid of panels for the exploratory distribution plots
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15, 10))

# Only columns that exist in `df` are plotted here. The previous version of
# this cell referenced 'Income', 'AccountBalance', 'MaritalStatus' and
# 'Education', which are not columns of this dataset, so it raised before
# drawing anything.

# Target variable. `order` pins the bar positions so the tick labels below
# always line up with the 0/1 codes (0 is taken to mean "no churn", as the
# original labels assumed).
sns.countplot(data=df, x='ChurnLabel', order=[0, 1], ax=axes[0, 0])
axes[0, 0].set_title('Distribution of ChurnLabel')
axes[0, 0].set_xticklabels(['No Churn', 'Churn'])
axes[0, 0].set_xlabel('ChurnLabel')
axes[0, 0].set_ylabel('Count')

# Numerical features
sns.histplot(data=df, x='Age', ax=axes[0, 1], kde=True)
axes[0, 1].set_title('Distribution of Age')
axes[0, 1].set_xlabel('Age')
axes[0, 1].set_ylabel('Count')

# NPS takes whole-number values (0-9 in the statistical overview), so give
# each score its own bar instead of letting the binning merge neighbours.
sns.histplot(data=df, x='NPS', ax=axes[1, 0], discrete=True)
axes[1, 0].set_title('Distribution of NPS')
axes[1, 0].set_xlabel('NPS')
axes[1, 0].set_ylabel('Count')

# Categorical features
sns.countplot(data=df, x='Gender', ax=axes[1, 1])
axes[1, 1].set_title('Distribution of Gender')
axes[1, 1].set_xlabel('Gender')
axes[1, 1].set_ylabel('Count')

sns.countplot(data=df, x='Segment', ax=axes[2, 0])
axes[2, 0].set_title('Distribution of Segment')
axes[2, 0].set_xlabel('Segment')
axes[2, 0].set_ylabel('Count')

# Target split within each customer segment
sns.countplot(data=df, x='Segment', hue='ChurnLabel', ax=axes[2, 1])
axes[2, 1].set_title('ChurnLabel by Segment')
axes[2, 1].set_xlabel('Segment')
axes[2, 1].set_ylabel('Count')

# Adjust layout