In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import  LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression

# Understanding and Definition of the Problem

Customer churn is the rate at which customers stop doing business with a company. Customers may decide to stop using a company's product or service for several reasons, including cost, dissatisfaction with the service or product selection, or poor customer service. For telco businesses, it is essential both to attract new customers and to curb customer churn, because losing customers is expensive: a high churn rate adversely affects profits, impedes growth, and has a considerable impact on market share. Predicting whether a particular customer will leave or stay is the main challenge, and identifying the key drivers of churn is a problem in its own right.

The key challenge is to predict whether an individual customer will churn. In this project I will use supervised machine learning to predict whether a telco customer will churn, and I will tune the model's hyperparameters to improve its performance.

# ABOUT THE DATASET

In IBM Cognos Analytics 11.1.3, the data module named Telco Customer Churn in the Base Samples was enhanced to provide a wider narrative. The Telco customer churn data contains information about a fictional telco company that provided home phone and Internet services to 7043 customers in California in Q3. It indicates which customers have left, stayed, or signed up for their service. Multiple important demographics are included for each customer, as well as a Satisfaction Score, Churn Score, and Customer Lifetime Value (CLTV) index.
The Telco customer churn data module is composed of 5 uploaded files:

    Telco_customer_churn_demographics.xlsx
    Telco_customer_churn_location.xlsx
    Telco_customer_churn_population.xlsx
    Telco_customer_churn_services.xlsx
    Telco_customer_churn_status.xlsx

The variables in each data set are described below

Demographics

CustomerID: A unique ID that identifies each customer.

Count: A value used in reporting/dashboarding to sum up the number of customers in a filtered set.

Gender: The customer’s gender: Male, Female

Age: The customer’s current age, in years, at the time the fiscal quarter ended.

Senior Citizen: Indicates if the customer is 65 or older: Yes, No

Married: Indicates if the customer is married: Yes, No

Dependents: Indicates if the customer lives with any dependents: Yes, No. Dependents could be children, parents, grandparents, etc.

Number of Dependents: Indicates the number of dependents that live with the customer.

In [None]:
# Load the demographics workbook and preview its first rows.
demographics = pd.read_excel("dataset/Telco_customer_churn_demographics.xlsx")
demographics.head()

Location

CustomerID: A unique ID that identifies each customer.

Count: A value used in reporting/dashboarding to sum up the number of customers in a filtered set.

Country: The country of the customer’s primary residence.

State: The state of the customer’s primary residence.

City: The city of the customer’s primary residence.

Zip Code: The zip code of the customer’s primary residence.

Lat Long: The combined latitude and longitude of the customer’s primary residence.

Latitude: The latitude of the customer’s primary residence.

Longitude: The longitude of the customer’s primary residence.

In [None]:
# Load the location workbook and preview its first rows.
location = pd.read_excel("dataset/Telco_customer_churn_location.xlsx")
location.head()

Population

ID: A unique ID that identifies each row.

Zip Code: The zip code of the customer’s primary residence.

Population: A current population estimate for the entire Zip Code area.

In [None]:
# Load the population workbook and preview its first rows.
population = pd.read_excel("dataset/Telco_customer_churn_population.xlsx")
population.head()

Services

CustomerID: A unique ID that identifies each customer.

Count: A value used in reporting/dashboarding to sum up the number of customers in a filtered set.

Quarter: The fiscal quarter that the data has been derived from (e.g. Q3).

Referred a Friend: Indicates if the customer has ever referred a friend or family member to this company: Yes, No

Number of Referrals: Indicates the number of referrals to date that the customer has made.

Tenure in Months: Indicates the total amount of months that the customer has been with the company by the end of the quarter specified above.

Offer: Identifies the last marketing offer that the customer accepted, if applicable. Values include None, Offer A, Offer B, Offer C, Offer D, and Offer E.

Phone Service: Indicates if the customer subscribes to home phone service with the company: Yes, No

Avg Monthly Long Distance Charges: Indicates the customer’s average long distance charges, calculated to the end of the quarter specified above.

Multiple Lines: Indicates if the customer subscribes to multiple telephone lines with the company: Yes, No

Internet Service: Indicates if the customer subscribes to Internet service with the company: No, DSL, Fiber Optic, Cable.

Avg Monthly GB Download: Indicates the customer’s average download volume in gigabytes, calculated to the end of the quarter specified above.

Online Security: Indicates if the customer subscribes to an additional online security service provided by the company: Yes, No

Online Backup: Indicates if the customer subscribes to an additional online backup service provided by the company: Yes, No

Device Protection Plan: Indicates if the customer subscribes to an additional device protection plan for their Internet equipment provided by the company: Yes, No

Premium Tech Support: Indicates if the customer subscribes to an additional technical support plan from the company with reduced wait times: Yes, No

Streaming TV: Indicates if the customer uses their Internet service to stream television programing from a third party provider: Yes, No. The company does not charge an additional fee for this service.

Streaming Movies: Indicates if the customer uses their Internet service to stream movies from a third party provider: Yes, No. The company does not charge an additional fee for this service.

Streaming Music: Indicates if the customer uses their Internet service to stream music from a third party provider: Yes, No. The company does not charge an additional fee for this service.

Unlimited Data: Indicates if the customer has paid an additional monthly fee to have unlimited data downloads/uploads: Yes, No

Contract: Indicates the customer’s current contract type: Month-to-Month, One Year, Two Year.

Paperless Billing: Indicates if the customer has chosen paperless billing: Yes, No

Payment Method: Indicates how the customer pays their bill: Bank Withdrawal, Credit Card, Mailed Check

Monthly Charge: Indicates the customer’s current total monthly charge for all their services from the company.

Total Charges: Indicates the customer’s total charges, calculated to the end of the quarter specified above.

Total Refunds: Indicates the customer’s total refunds, calculated to the end of the quarter specified above.

Total Extra Data Charges: Indicates the customer’s total charges for extra data downloads above those specified in their plan, by the end of the quarter specified above.

Total Long Distance Charges: Indicates the customer’s total charges for long distance above those specified in their plan, by the end of the quarter specified above.

In [None]:
# Load the services workbook and preview its first rows.
services = pd.read_excel("dataset/Telco_customer_churn_services.xlsx")
services.head()

Status

CustomerID: A unique ID that identifies each customer.

Count: A value used in reporting/dashboarding to sum up the number of customers in a filtered set.

Quarter: The fiscal quarter that the data has been derived from (e.g. Q3).

Satisfaction Score: A customer’s overall satisfaction rating of the company from 1 (Very Unsatisfied) to 5 (Very Satisfied).

Satisfaction Score Label: Indicates the text version of the score (1-5) as a text string.

Customer Status: Indicates the status of the customer at the end of the quarter: Churned, Stayed, or Joined

Churn Label: Yes = the customer left the company this quarter. No = the customer remained with the company. Directly related to Churn Value.

Churn Value: 1 = the customer left the company this quarter. 0 = the customer remained with the company. Directly related to Churn Label.

Churn Score: A value from 0-100 that is calculated using the predictive tool IBM SPSS Modeler. The model incorporates multiple factors known to cause churn. The higher the score, the more likely the customer will churn.

Churn Score Category: A calculation that assigns a Churn Score to one of the following categories: 0-10, 11-20, 21-30, 31-40, 41-50, 51-60, 61-70, 71-80, 81-90, and 91-100

CLTV: Customer Lifetime Value. A predicted CLTV is calculated using corporate formulas and existing data. The higher the value, the more valuable the customer. High value customers should be monitored for churn.

CLTV Category: A calculation that assigns a CLTV value to one of the following categories: 2000-2500, 2501-3000, 3001-3500, 3501-4000, 4001-4500, 4501-5000, 5001-5500, 5501-6000, 6001-6500, and 6501-7000.

Churn Category: A high-level category for the customer’s reason for churning: Attitude, Competitor, Dissatisfaction, Other, Price. When they leave the company, all customers are asked about their reasons for leaving. Directly related to Churn Reason.

Churn Reason: A customer’s specific reason for leaving the company. Directly related to Churn Category.

In [None]:
# Load the status workbook and preview its first rows.
status = pd.read_excel("dataset/Telco_customer_churn_status.xlsx")
status.head()

# Understanding the Demographic data

Distribution of Demographic Data and Insight

In [None]:
# Set the aesthetic style of the plots
sns.set_style("whitegrid")

# Numeric demographic variables to chart. The commentary that follows this cell
# interprets both of them, so both are plotted.
demographic_numeric_variables = ['Age', 'Number of Dependents']

# One row per variable; columns: histogram, KDE, CDF, box-and-whisker.
# squeeze=False keeps axs two-dimensional whatever the number of variables.
fig, axs = plt.subplots(len(demographic_numeric_variables), 4,
                        figsize=(20, 10), squeeze=False)

for i, variable in enumerate(demographic_numeric_variables):
    # Histogram
    sns.histplot(demographics[variable], kde=False, ax=axs[i, 0])
    axs[i, 0].set_title(f'Histogram of {variable}')

    # Kernel Density Estimate
    sns.kdeplot(demographics[variable], ax=axs[i, 1])
    axs[i, 1].set_title(f'Kernel Density Estimate of {variable}')

    # Cumulative Distribution Function
    sns.kdeplot(demographics[variable], cumulative=True, ax=axs[i, 2])
    axs[i, 2].set_title(f'Cumulative Distribution Function of {variable}')

    # Box-and-Whisker Plot
    sns.boxplot(y=demographics[variable], ax=axs[i, 3])
    axs[i, 3].set_title(f'Box-and-Whisker Plot of {variable}')

plt.tight_layout()
plt.show()


The histogram and kernel density estimate (KDE) for age show a broad distribution, suggesting a diverse customer base in terms of age. There's no single dominant age group, indicating that the services offered by Telco appeal to a wide age range of customers.
The cumulative distribution function (CDF) further confirms this diversity, with a steady increase across the age spectrum, showing that the customer base is not skewed towards any specific age group.
The box-and-whisker plot reveals the median age lies in the middle age range, with some outliers indicating that there are customers who are significantly older than the median, which supports the diversity in customer age.
Number of Dependents
The histogram and KDE for the number of dependents suggest that a significant portion of the customer base has few to no dependents. This could indicate that the services are popular among individuals and small families.
The CDF shows that a large proportion of customers have less than two dependents, reinforcing the idea that the majority of customers are either single individuals or small families.
The box-and-whisker plot for the number of dependents indicates that the median number of dependents is low, with a few outliers having a higher number of dependents. This suggests that while the service appeals to customers with various family sizes, those with larger families are less common.

Categorical Variables

In [None]:
# Categorical demographic variables to chart, one count plot per variable
categorical_variables = ['Gender', 'Under 30',
                         'Senior Citizen', 'Married', 'Dependents']

# Two plots per row; round up so an odd number of variables still fits
num_rows = (len(categorical_variables) + 1) // 2
fig, axs = plt.subplots(num_rows, 2, figsize=(14, 5 * num_rows), squeeze=False)
axes = axs.flatten()

for ax, variable in zip(axes, categorical_variables):
    sns.countplot(x=variable, data=demographics, ax=ax)
    ax.set_title(f'Distribution of {variable}')

# Remove whichever axes were left unused instead of relying on a fixed index,
# then tighten the layout around the axes that remain
for unused_ax in axes[len(categorical_variables):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

Gender
The count plot for gender shows a relatively balanced distribution between male and female customers, indicating that the service offerings cater equally well to both genders.
Under 30 and Senior Citizen
The distribution of customers under 30 and those classified as senior citizens provides insights into the age-related demographics of the customer base. The service appeals to a broad age range, including both younger individuals and senior citizens, suggesting diverse usage and needs across different age groups.
Married
The count plot for marital status reveals a mix of married and unmarried customers, indicating variability in the customer base's social and family structures. This diversity could influence the types of services and plans that are popular among different segments of customers.
Dependents
Similar to the numeric analysis of dependents, the categorical count plot shows that a sizable portion of the customer base does not have dependents, which could correlate with the number of single individuals or couples without children using Telco services.

# Understanding Location

Top 10 cities by Count

In [None]:
# Customers per city, most frequent first; only the ten largest are charted
city_counts = location['City'].value_counts()
top_cities = city_counts.head(10)

plt.figure(figsize=(12, 8))
sns.barplot(y=top_cities.index, x=top_cities.values)
plt.title('Top 10 Cities by Customer Count')
plt.xlabel('Number of Customers')
plt.ylabel('City')
plt.show()

Customer distribution by zip code (Top 10)

In [None]:
# Customers per zip code, most frequent first; only the ten largest are charted
zip_code_counts = location['Zip Code'].value_counts()
top_zip_codes = zip_code_counts.head(10)

# Convert the zip codes to strings so they are used as category labels
zip_labels = top_zip_codes.index.astype(str)

plt.figure(figsize=(12, 8))
sns.barplot(y=zip_labels, x=top_zip_codes.values)
plt.title('Top 10 Zip Codes by Customer Count')
plt.xlabel('Number of Customers')
plt.ylabel('Zip Code')
plt.show()

# Understanding Service
Selecting some important variables for analysis

Tenure in Months and  Monthly Charge

In [None]:
numerical_variables_selected = ['Tenure in Months', 'Monthly Charge']

# One row of four axes per variable: box plot, histogram, KDE, CDF
fig, axs = plt.subplots(len(numerical_variables_selected), 4, figsize=(20, 10))

for row_axes, variable in zip(axs, numerical_variables_selected):
    box_ax, hist_ax, kde_ax, cdf_ax = row_axes
    series = services[variable]

    sns.boxplot(x=series, ax=box_ax)
    box_ax.set_title(f'Box-and-Whisker Plot of {variable}')

    sns.histplot(series, kde=False, ax=hist_ax)
    hist_ax.set_title(f'Histogram of {variable}')

    sns.kdeplot(series, ax=kde_ax)
    kde_ax.set_title(f'Kernel Density Estimate of {variable}')

    # cumulative=True turns the density estimate into a smoothed CDF
    sns.kdeplot(series, cumulative=True, ax=cdf_ax)
    cdf_ax.set_title(f'Cumulative Distribution Function of {variable}')

plt.tight_layout()
plt.show()

The histograms and kernel density estimates (KDE) for "Tenure in Months" and "Monthly Charge" provide insights into two key aspects of customer service engagement and billing:

Tenure in Months: The distribution of tenure among customers indicates the spread of customer loyalty and how long customers have been with the service. The histogram and KDE suggest a mix of newly acquired customers and those with longer tenure, indicating a diverse customer base in terms of service duration.

Monthly Charge: These visualizations show the distribution of monthly charges that customers incur. The spread of monthly charges highlights the variety of service plans customers are subscribed to, with some variability in how much customers are paying per month

The Cumulative Distribution Function (CDF) and Box-and-Whisker Plots for "Tenure in Months" and "Monthly Charge" reveal additional insights into the dataset:

Tenure in Months:

The CDF shows a gradual increase, indicating a steady accumulation of customers over time. The curve suggests a significant proportion of customers have relatively short tenure, with a steady increase in tenure length across the customer base.
The Box-and-Whisker Plot highlights the median tenure, along with the interquartile range, showing a wide spread in customer tenure. The presence of outliers suggests some customers have been with the service for a very long time compared to the majority.
Monthly Charge:

The CDF for monthly charges demonstrates that a large portion of customers pay lower monthly fees, with a gradual increase towards higher charges. This indicates a variety of service plans and options that cater to different customer needs and budgets.
The Box-and-Whisker Plot reveals the median monthly charge and the range of charges customers incur. The spread and outliers indicate variability in how much customers are willing to pay for services, reflecting the diverse offering of service plans.
These analyses provide a comprehensive view of customer tenure and billing, reflecting the diversity in customer engagement and service preferences within the Telco customer base. Understanding these patterns is crucial for tailoring customer service, retention strategies, and pricing models.

Categorical variables
Selecting key categorical variables for demonstration

In [None]:
categorical_variables = ['Offer', 'Contract', 'Payment Method']

# One horizontal count plot per variable, stacked in a single column
fig, axs = plt.subplots(len(categorical_variables), 1, figsize=(8, 12))

for ax, variable in zip(axs, categorical_variables):
    sns.countplot(y=services[variable], ax=ax)
    ax.set_title(f'Count Plot of {variable}')

plt.tight_layout()
plt.show()

The count plots for "Offer", "Contract", and "Payment Method" reveal significant insights into the preferences and behaviors of Telco's customer base regarding service offers, contract types, and payment methods:

Offer: The distribution shows which promotional offers are more popular among customers. This insight can help Telco understand which offers are most attractive and potentially why certain offers may be leading to higher customer acquisition or retention.

Contract: This plot illustrates the variety of contract lengths customers are enrolled in, from month-to-month to longer-term contracts. The distribution across different contract types can inform strategies around customer loyalty and retention, highlighting the balance between flexibility and commitment in customer preferences.

Payment Method: The distribution of payment methods used by customers indicates preferences for transaction methods, whether through bank withdrawals, credit cards, or other means. Understanding payment method preferences can help in tailoring billing processes and improving customer satisfaction.

# Understanding the Status of the Customers

The visualizations for "Satisfaction Score" and "CLTV" (Customer Lifetime Value) provide valuable insights into these numerical variables

In [None]:
variables_to_plot = ['Satisfaction Score', 'CLTV']

# One row of four axes per variable: box plot, histogram, KDE, CDF
fig, axs = plt.subplots(len(variables_to_plot), 4, figsize=(24, 12))

for row_axes, variable in zip(axs, variables_to_plot):
    box_ax, hist_ax, kde_ax, cdf_ax = row_axes
    series = status[variable]

    sns.boxplot(x=series, ax=box_ax)
    box_ax.set_title(f'Box-and-Whisker Plot of {variable}')

    sns.histplot(series, kde=False, ax=hist_ax)
    hist_ax.set_title(f'Histogram of {variable}')

    sns.kdeplot(series, ax=kde_ax)
    kde_ax.set_title(f'Kernel Density Estimate of {variable}')

    # cumulative=True turns the density estimate into a smoothed CDF
    sns.kdeplot(series, cumulative=True, ax=cdf_ax)
    cdf_ax.set_title(f'Cumulative Distribution Function of {variable}')

plt.tight_layout()
plt.show()

Box-and-Whisker Plot: Shows the distribution, median, quartiles, and any potential outliers for both variables. For "Satisfaction Score", this plot can highlight the range of satisfaction among customers, while for "CLTV", it indicates the spread of customer lifetime values, identifying high-value customers.

Histogram: Reveals the frequency distribution of values. The histogram for "Satisfaction Score" might show common satisfaction levels among customers, while the "CLTV" histogram indicates the distribution of customer lifetime values, helping identify common value ranges.

Kernel Density Estimate (KDE): Provides a smooth, continuous curve representing the distribution of each variable. The KDE helps visualize the density of satisfaction scores and customer lifetime values, identifying peaks where values are more concentrated.

Cumulative Distribution Function (CDF): Shows the cumulative probability for each variable, offering insight into the proportion of customers below certain satisfaction scores or CLTV thresholds. This is particularly useful for understanding how satisfaction or value accumulates across the customer base.

Categorical variables

In [None]:
categorical_variables = ['Customer Status', 'Churn Reason', 'Churn Category']

for variable in categorical_variables:
    # Most frequent category first. value_counts() leaves out missing values,
    # so len(order) is the number of bars that will actually be drawn.
    order = status[variable].value_counts().index

    # Half an inch per bar, with a floor so variables with only a few
    # categories still get a figure tall enough for title and axis labels
    fig_height = max(4, len(order) * 0.5)

    plt.figure(figsize=(10, fig_height))
    sns.countplot(y=status[variable], order=order)
    plt.title(f'Count Plot of {variable}')
    plt.xlabel('Number of Customers')
    plt.ylabel(variable)
    plt.tight_layout()
    plt.show()

The count plots for "Customer Status", "Churn Reason", and "Churn Category" offer insights into customer churn dynamics within the dataset:

Customer Status: This plot shows the distribution of customer statuses, such as "Churned", "Stayed", or any other categories present. It provides a clear view of the churn rate relative to the retention rate within the customer base, which is crucial for understanding overall customer satisfaction and loyalty.

Churn Reason: The distribution of reasons for churn is particularly insightful, revealing the most common factors leading to customer churn. This visualization helps identify areas where improvements could potentially reduce churn rates, such as service issues, pricing concerns, or competitive offers.

These insights are valuable for developing targeted strategies to improve customer satisfaction, retention, and overall service quality. By understanding the primary reasons for churn, Telco can tailor its customer service, pricing, and marketing strategies to address these concerns and enhance customer loyalty.

Understanding the distribution of churn categories can help Telco focus its efforts on the most pressing issues leading to customer churn. Whether it's enhancing service quality, adjusting pricing strategies, or responding to competitive pressures, insights from this analysis are valuable for formulating targeted interventions aimed at reducing churn and improving customer loyalty.

# Relationship among the variables

To start with the relationship among the variables, all the five datasets are merged into one data set.

Based on domain knowledge, products, services, and charges are the main contributors to churn in telco. I will analyse the relationship between churn and these variables.

The demographics, location, services, and status tables each have a variable Count, which is the count of occurrences of each Customer ID.
These columns would create duplicates in the merged dataframe, so they are dropped before the merge.

To proceed with the analysis, I will remove some unnecessary variables based on domain knowledge.
The variables are:
Lat Long -- Location identifier; Zip Code is enough to identify the location
Latitude -- Location identifier; Zip Code is enough to identify the location
Longitude -- Location identifier; Zip Code is enough to identify the location
Customer ID -- Unique identifier of each customer; has no impact on churn
Customer Status -- Identifier of churn status; this is already captured by Churn Label
Churn Value -- Identifier of churn status; this is already captured by Churn Label
ID -- Identifier for the rows of the population table; has no impact on churn

In [None]:
# "Count" appears in each of these four tables and would produce duplicate
# columns in the merged dataframe, so it is removed from every table first.
# errors="ignore" keeps this cell re-runnable once the column is already gone.
for frame in (demographics, location, services, status):
    frame.drop(columns="Count", inplace=True, errors="ignore")

In [None]:
# Join the four customer-level tables on "Customer ID" in turn, then attach the
# population table through the shared "Zip Code" column.
df = (
    demographics
    .merge(location, on="Customer ID")
    .merge(services, on="Customer ID")
    .merge(status, on="Customer ID")
    .merge(population, on="Zip Code")
)

In [None]:
# Columns removed before the relationship analysis
drop_list = [
    'Lat Long',
    'Latitude',
    'Longitude',
    'Customer ID',
    'Customer Status',
    'Churn Value',
    'ID',
]
df = df.drop(columns=drop_list)

Satisfaction Score and Churn

In [None]:
# Satisfaction score distribution for each churn label, with quartile lines
fig, ax = plt.subplots(figsize=(8, 6))
sns.violinplot(data=status, x='Churn Label', y='Satisfaction Score',
               palette='Pastel1', inner='quartile', ax=ax)
ax.set_title('Satisfaction Scores Density by Churn Label')
ax.set_xlabel('Churn Label')
ax.set_ylabel('Satisfaction Score')
fig.tight_layout()
plt.show()

The violin plot further elucidates the density and distribution of "Satisfaction Scores" by "Churn Label"
There's a noticeable difference in the average satisfaction scores between customers who churn and those who do not, suggesting that lower satisfaction is associated with higher churn rates.
The violin plot's density aspect highlights where satisfaction scores are most concentrated, offering clues to the satisfaction levels that are most common among churned versus not churned customers.

Offers and Churn

In [None]:
# List of variables to analyze
variables = ['Offer', 'Gender', 'Under 30', 'Senior Citizen', 'Multiple Lines',
                        'Contract', 'Unlimited Data', 'Online Backup', 'Device Protection Plan',
                        'Premium Tech Support', 'Streaming TV']

# Analysis: Count and Churn Rate by Category for each variable
analysis_results = {}

for var in variables:
    # One row per category of `var`, one column per Churn Label value
    count_by_category = (
        df.groupby([var, 'Churn Label']).size().unstack(fill_value=0)
    )

    # Share of the customers in each category whose Churn Label is 'Yes'
    churn_rate_by_category = count_by_category['Yes'] / (
        count_by_category['Yes'] + count_by_category['No']
    )

    analysis_results[var] = {
        'Count by Category': count_by_category,
        'Churn Rate by Category': churn_rate_by_category
    }


# Print the offer tables explicitly: a bare expression in the middle of a cell
# is evaluated but not displayed.
offer_variable = 'Offer'
offer_results = analysis_results[offer_variable]
print(offer_results['Count by Category'])
print(offer_results['Churn Rate by Category'])

plt.figure(figsize=(10, 6))
churn_rate_by_offer = offer_results['Churn Rate by Category']
sns.barplot(x=churn_rate_by_offer.index, y=churn_rate_by_offer.values)
plt.title('Churn Rate by Offer Type')
plt.xlabel('Offer Type')
plt.ylabel('Churn Rate')
plt.xticks(rotation=45)
plt.show()

The bar plot visualizes the churn rate by offer type, clearly illustrating how the churn rate varies significantly across different offers. Offers E and D have notably higher churn rates compared to others, indicating a strong relationship between the type of offer a customer receives and their likelihood of churning. Offers A and B are associated with the lowest churn rates, suggesting these offers might be more effective in retaining customers.

This visualization provides a clear depiction of how different offers impact customer churn, offering valuable insights into which offers may need to be reevaluated or modified to reduce churn rates.

Churn in relation to selected variables:
variables = ['Offer', 'Gender', 'Under 30', 'Senior Citizen', 'Multiple Lines',
                        'Contract', 'Unlimited Data', 'Online Backup', 'Device Protection Plan',
                        'Premium Tech Support', 'Streaming TV']

In [None]:

# Two plots per row; round up so an odd number of variables still fits
num_rows = (len(variables) + 1) // 2

# squeeze=False always yields a 2-D array of axes, even for a single row,
# so axs[row, col] indexing needs no special case
fig, axs = plt.subplots(num_rows, 2, figsize=(12, 6*num_rows), squeeze=False)

# Plot count plots for each variable
for i, var in enumerate(variables):
    row, col = divmod(i, 2)
    sns.countplot(x=var, hue='Churn Label',
                  data=df, ax=axs[row, col], palette='Set1')
    axs[row, col].set_title(f'Churn Label across {var}')
    axs[row, col].tick_params(axis='x', rotation=45)

# Remove the trailing blank subplot left over when the variable count is odd
for unused_ax in axs.flatten()[len(variables):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

The series of count plots shows how many customers churned or stayed in each category of the variables listed, from which the relative churn rates can be read:

Gender: The churn rate appears to be somewhat balanced between different genders, suggesting that gender alone may not be a strong predictor of churn.
Under 30: There is a distinction in churn rates based on whether customers are under 30, indicating age might influence churn behavior.
Senior Citizen: Senior citizen status shows a noticeable difference in churn rates, potentially indicating different needs or satisfaction levels within this group.
Multiple Lines: Customers with and without multiple lines have varied churn rates, which could reflect on service satisfaction or usage patterns.
Contract: There's a significant variance in churn rates based on the type of contract, with longer contracts possibly indicating lower churn rates.
Unlimited Data: The presence of unlimited data options appears to influence churn rates, which could be a critical factor in customer retention.
Online Backup, Device Protection Plan, Premium Tech Support, Streaming TV: Each of these services shows different churn rates among their categories, suggesting that the value or satisfaction customers derive from these services can impact their likelihood to churn.
These visualizations provide a comprehensive overview of how different factors relate to customer churn, highlighting areas where targeted strategies could potentially improve retention. Each variable's churn rate variation suggests specific customer segments or service features that are more closely associated with higher churn rates.

HeatMap for numerical values

In [None]:
# Pairwise Pearson correlations between the numeric columns of df
numerical_df = df.select_dtypes(include='number')
correlations = numerical_df.corr(method="pearson")

plt.figure(figsize=(10, 6))
sns.heatmap(correlations)
plt.title('Heatmap of Numerical Values')
plt.show()

In [None]:
# Keep only the minimum and maximum rows of the descriptive statistics
summary = df.describe()
numerical_stats = summary.loc[['min', 'max']]

print(numerical_stats)

Initial data inspection

In [None]:
df.head()

In [None]:
df.info()

Descriptive statistics

In [None]:
df.describe()

# Data Cleaning

checking data types

In [None]:
# Print the dtype of every column, one per line
for column_name, dtype in zip(df.columns, df.dtypes):
    print(f"{column_name}: {dtype}")

Check which variables are of type object

In [None]:

# Print only the columns whose dtype is object
object_dtypes = df.dtypes[df.dtypes == 'object']
for column_name, dtype in object_dtypes.items():
    print(f"{column_name}: {dtype}")

Zip Code is a label or identifier rather than a numerical value that you would perform mathematical operations on. Hence, convert Zip Code to object.

In [None]:
# Store zip codes as strings so they are handled as labels, not numbers.
df['Zip Code'] = df['Zip Code'].astype(str)

Check for missing values

In [None]:
missing_values = df.isnull().any()
print(missing_values)

In [None]:
missing_count = df.isnull().sum()
missing_count

In [None]:
# Keep only the columns that have at least one missing entry, filtering on the
# per-column counts themselves rather than on the boolean any() flags.
columns_with_missing_value = missing_count[missing_count > 0]
columns_with_missing_value

Four variables have missing values: Offer, Internet Type, Churn Category, and Churn Reason.
Missing values for Internet Type and Offer will be replaced with the value Unknown.

Further analysis checks whether any customers who have churned have a null value for Churn Category or Churn Reason.

In [None]:
# Number of churned customers that have no Churn Category recorded
churned = df['Churn Label'] == 'Yes'
category_missing = df['Churn Category'].isnull()
count_missing_churn_category = len(df[churned & category_missing])
count_missing_churn_category

In [None]:
# Number of churned customers that have no Churn Reason recorded
churned = df['Churn Label'] == 'Yes'
reason_missing = df['Churn Reason'].isnull()
count_missing_churn_reason = len(df[churned & reason_missing])
count_missing_churn_reason

The missing values for Churn Category and Churn Reason belong to customers who have not churned, which is expected.
These missing values will be replaced with a "not churned" placeholder ('NOT Churn' for Churn Category and 'Not Churn' for Churn Reason).

In [None]:
# Replace missing values in column 'Internet Type' with Unknown
df['Internet Type'] = df['Internet Type'].fillna('Unknown')
# Replace missing values in column 'Offer' with Unknown
df['Offer'] = df['Offer'].fillna('Unknown')
# Replace missing values in column 'Churn Category' with NOT Churn
df['Churn Category'] = df['Churn Category'].fillna('NOT Churn')

# Replace missing values in column 'Churn Reason' with NOT Churn
df['Churn Reason'] = df['Churn Reason'].fillna('Not Churn')

check for duplicates


In [None]:
duplicates = df.duplicated()
duplicates

In [None]:
any_duplicate = df.duplicated().values.any()
any_duplicate

Removing duplicates

In [None]:

# Remove fully duplicated rows from df in place.
df.drop_duplicates(inplace=True)

checking if there are any column that has the same value

In [None]:
def column_has_same_values(column):
    """Return True when *column* contains exactly one distinct value.

    The count comes from ``column.nunique()``, so an empty column (zero
    distinct values) yields False.
    """
    distinct_count = column.nunique()
    return distinct_count == 1


# Collect the names of columns that hold a single distinct value, then list them
constant_column_names = [
    name for name, values in df.items() if column_has_same_values(values)
]
for name in constant_column_names:
    print(f"{name}")

The columns Country, State, Quarter_x and Quarter_y each contain a single value, so they are dropped.

In [None]:
# Single-valued columns carry no information for the model
constant_columns_to_drop = ["Country", "State", "Quarter_x", "Quarter_y"]
df = df.drop(columns=constant_columns_to_drop)

# Outlier Detection and Treatment:
List of numerical columns to check for outliers

In [None]:
# Numeric columns inspected (and later treated) for outliers
numerical_columns = [
    'Age',
    'Number of Dependents',
    'Number of Referrals',
    'Tenure in Months',
    'Avg Monthly Long Distance Charges',
    'Avg Monthly GB Download',
    'Monthly Charge',
    'Total Charges',
    'Total Refunds',
    'Total Extra Data Charges',
    'Total Long Distance Charges',
    'Total Revenue',
    'Satisfaction Score',
    'Churn Score',
    'CLTV',
    'Population',
]

Detecting Outliers

In [None]:
def visualize_outliers(df, columns):
    """Draw one horizontal boxplot per column, stacked in a single figure.

    Parameters:
        df: DataFrame holding the data to plot.
        columns: sequence of column names; one subplot row is created for each.
    """
    n_rows = len(columns)
    plt.figure(figsize=(15, 10))

    position = 0
    for column in columns:
        position += 1  # subplot positions are 1-based
        plt.subplot(n_rows, 1, position)
        sns.boxplot(x=df[column])
        plt.title(f'Boxplot of {column}')

    plt.tight_layout()
    plt.show()


# Boxplots for every numeric column listed in numerical_columns
visualize_outliers(df=df, columns=numerical_columns)

From the above, 8 numeric variables have outliers

In [None]:
def detect_and_treat_outliers(df, column, iqr_multiplier=1.5):
    """Flag IQR outliers in one column and cap them at the IQR fences.

    Parameters:
        df: DataFrame containing ``column``. It is modified in place
            (the column is overwritten with the capped values).
        column: name of the numeric column to process.
        iqr_multiplier: how many IQRs beyond Q1/Q3 the fences sit.
            Defaults to 1.5, the value previously hard-coded.

    Returns:
        (df, outliers): the same DataFrame after capping, and a DataFrame
        of the rows whose original value lay outside the fences.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - iqr_multiplier * iqr
    upper_bound = q3 + iqr_multiplier * iqr

    # Capture the offending rows BEFORE capping, otherwise nothing is flagged
    is_outlier = (df[column] < lower_bound) | (df[column] > upper_bound)
    outliers = df[is_outlier]

    # Treat outliers by capping them at the fences
    df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)

    return df, outliers


# Cap outliers column by column, remembering which rows were flagged
outliers_info = {}
for numeric_column in numerical_columns:
    df, flagged_rows = detect_and_treat_outliers(df, numeric_column)
    outliers_info[numeric_column] = flagged_rows

# Report how many rows were flagged in each column
for numeric_column in outliers_info:
    flagged_rows = outliers_info[numeric_column]
    print(f'Column: {numeric_column}, Number of Outliers: {len(flagged_rows)}')

Treatment of Outliers
Number of Dependents, however, has a significant number of outliers. Dropping these values would significantly reduce the number of records.
I will use a log transformation to take care of the outliers.

In [None]:
# Columns compressed with log(1 + x) to reduce the influence of large values
log_transform_columns = [
    'Number of Dependents',
    'Number of Referrals',
    'Avg Monthly GB Download',
    'Total Refunds',
    'Total Extra Data Charges',
    'Total Long Distance Charges',
    'Total Revenue',
    'Population',
]

for skewed_column in log_transform_columns:
    df[skewed_column] = np.log1p(df[skewed_column])

# Feature Engineering

Encoding categorical features
To effectively handle categorical variables through encoding techniques, it's essential to understand the nature of each categorical variable in your dataset. Categorical variables can be broadly classified into two types: nominal (where there is no inherent order among the categories) and ordinal (where the categories have a natural order). The choice of encoding technique depends on the type of categorical variable

Nominal Variables without a Large Number of Categories:

Variables: Gender, Under 30, Senior Citizen, Married, Dependents, Referred a Friend, Phone Service, Multiple Lines, Internet Service, Internet Type, Online Security, Online Backup, Device Protection Plan, Premium Tech Support, Streaming TV, Streaming Movies, Streaming Music, Unlimited Data, Paperless Billing, Payment Method, Customer Status, Churn Label, Churn Category.


For the high-cardinality variables ('City', 'Zip Code', 'Churn Reason'), a straightforward approach could be to use label encoding. This approach will not capture the geographical proximity but will allow us to transform these features into a model-readable format.

'Churn Reason': This variable has many unique values, each providing specific reasons for churn. A label encoding can be applied here as well,

To avoid creating new columns, I will use label encoding for all the categorical features.

In [None]:
# One encoder instance, re-fitted for every categorical column
label_encoder = LabelEncoder()

# Columns stored with the 'object' dtype are treated as categorical
object_columns = [col for col in df.columns if df[col].dtype == 'object']

# Replace each categorical column with its integer label codes
for col in object_columns:
    df[col] = label_encoder.fit_transform(df[col])

Selection of important features that contribute to the model’s performance.
Given the nature of this dataset, selecting important features that contribute to the model’s performance involves considering both the diversity of feature types and the potential multicollinearity among them. The following methods were used

Domain Knowledge: Incorporate domain knowledge to evaluate whether features logically contribute to customer churn.
The aim is to predict whether a customer will churn or not. At the point of determining whether a customer will churn, Churn Category and Churn Reason will not be available; hence these two features need to be dropped.

In [None]:
# These columns are only known after a customer has churned
post_churn_columns = ['Churn Category', 'Churn Reason']
df = df.drop(columns=post_churn_columns)

Random Forest for Feature Importance: A Random Forest classifier was used to assess the importance of each feature in predicting the target variable (Churn Label). The feature importances were plotted, and features with importance greater than 0.01 were considered important.



In [None]:
# Target is 'Churn Label'; every remaining column is a feature
target_column = 'Churn Label'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:

# 70/30 train/test split with a fixed seed for reproducibility
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:

# Fit a 100-tree Random Forest on the training rows and read its feature importances
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf = rf.fit(X_train, y_train)  # fit() returns the estimator itself
feature_importances = rf.feature_importances_

In [None]:
# Horizontal bar chart: one bar per feature, length = Random Forest importance
plt.figure(figsize=(12, 8))
sns.barplot(x=feature_importances, y=X.columns)
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance (Random Forest)')
plt.show()

# Keep the features whose importance exceeds the 0.01 cut-off
above_threshold = feature_importances > 0.01
important_features = X.columns[above_threshold]

In [None]:
# Features whose Random Forest importance exceeds the 0.01 cut-off
above_threshold = feature_importances > 0.01
important_features = X.columns[above_threshold]
important_features

Performing Dimensionality Reduction using Principal Component Analysis (PCA)

In [None]:


# Hold out the test rows BEFORE fitting any transformer, so the scaler
# statistics and the principal axes are learned from training rows only
# (fitting them on all rows leaks test-set information into the model).
# test_size and random_state match the earlier split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Standardize the features before applying PCA
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Apply PCA
pca = PCA(n_components=0.98)  # Retain 98% of the variance
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Projections of the full dataset, kept under their original names
X_scaled = scaler.transform(X)
X_pca = pca.transform(X_scaled)

# Train a model using the PCA-transformed data
model_pca = LogisticRegression()
model_pca.fit(X_train_pca, y_train)
y_pred_pca = model_pca.predict(X_test_pca)

# Evaluate the model
accuracy_pca = accuracy_score(y_test, y_pred_pca)
print(f'Accuracy (PCA): {accuracy_pca:.2f}')
print('\nClassification Report (PCA):\n',
      classification_report(y_test, y_pred_pca))

In [None]:


# (Re)fit the PCA object on the scaled feature matrix
pca.fit(X_scaled)

# Loadings table: rows are the original features, columns are PC1..PCk,
# each entry being that feature's coefficient in the component
component_labels = [f'PC{i+1}' for i in range(pca.n_components_)]
loadings = pd.DataFrame(pca.components_.T,
                        columns=component_labels,
                        index=X.columns)

# Show the first few rows of the loadings table
print(loadings.head())

# Features with the largest absolute loading on the first component
pc1_abs_loadings = loadings['PC1'].abs()
important_features_pc1 = pc1_abs_loadings.sort_values(ascending=False).head()
print('\nImportant features for PC1:\n', important_features_pc1)

Creation of new features that improve the model’s predictive power

4 new features were created 


1. Interaction Features: Create new features that are the product of two or more existing features.

In [None]:
# Interaction feature: tenure multiplied by the average monthly long-distance charge.
# NOTE(review): this overwrites the existing 'Total Long Distance Charges' column
# rather than adding a new one - confirm that is intended.
tenure_months = df['Tenure in Months']
avg_long_distance = df['Avg Monthly Long Distance Charges']
df['Total Long Distance Charges'] = tenure_months * avg_long_distance

2. Ratio Features: Create features that are the ratio of two existing features.

In [None]:
# Ratio feature: referrals per month of tenure.
# NOTE(review): a zero tenure would give inf/NaN here - confirm tenure is never 0.
referral_count = df['Number of Referrals']
tenure_months = df['Tenure in Months']
df['Referral Rate'] = referral_count / tenure_months

3. Aggregated Features: create features based on categorical features.

In [None]:
# Mean 'Avg Monthly GB Download' within each 'Internet Type' group.
# NOTE(review): the new column is named "Avg Charges ..." but is built from
# GB download, not from a charge column - confirm the intended source.
mean_download_by_type = df.groupby('Internet Type')['Avg Monthly GB Download'].mean()
avg_charges_by_internet_type = mean_download_by_type.to_dict()

# Attach the group mean to every row via its 'Internet Type' value
internet_type = df['Internet Type']
df['Avg Charges by Internet Type'] = internet_type.map(avg_charges_by_internet_type)

4. Binning: Convert continuous features into categorical features by binning them into intervals

In [None]:
# Bin edges are right-inclusive: (0, 30], (30, 60], (60, 90]
age_bin_edges = [0, 30, 60, 90]
age_bin_labels = ['Young', 'Middle-Aged', 'Senior']
df['Age Group'] = pd.cut(df['Age'], bins=age_bin_edges, labels=age_bin_labels)

encoding the new Age Group feature

In [None]:
# 'Age Group' is an ordinal category (Young < Middle-Aged < Senior).
# LabelEncoder sorts class labels alphabetically, which would scramble that
# order, so use the categorical's own integer codes instead: they follow the
# label order given to pd.cut (Young=0, Middle-Aged=1, Senior=2).
# Assumes 'Age Group' is still the categorical column produced by pd.cut;
# ages that fell outside the bins receive the code -1.
df['Age Group'] = df['Age Group'].cat.codes

In [None]:
# Persist the engineered dataset (without the index column) for the modelling step
output_path = 'df_for_model.csv'
df.to_csv(output_path, index=False)

Scaling and normalization of numerical features
Age, Tenure in Months, Monthly Charge, Total Charges, Churn Score, CLTV, Population:

These variables display a wide range of values and could benefit from Standardization (Z-score scaling), as it centers the data around the mean with a unit standard deviation, making it suitable for algorithms that assume data is centered around 0.
Number of Dependents, Number of Referrals, Avg Monthly Long Distance Charges, Avg Monthly GB Download, Total Refunds, Total Extra Data Charges, Total Long Distance Charges, Total Revenue, Satisfaction Score, Churn Value:

Many of these variables exhibit skewed distributions or have a significant proportion of zero values. For variables with a non-normal distribution, Normalization (Min-Max Scaling) could be more appropriate to rescale the data within a specific range (0 to 1), especially if the model benefits from bounded values.
For "Total Refunds" and "Total Extra Data Charges", which are constants (zeros in this case), scaling or normalization is not applicable as these features do not vary.

In [None]:
# Dataframe before scaling: to be used later.
# Take a real copy: plain assignment would only create a second name for the
# same object, so the in-place scaling of `df` would also change this frame.
df_nonscaled = df.copy()

In [None]:

# Columns rescaled to zero mean / unit variance
columns_to_standardize = [
    'Age', 'Tenure in Months', 'Monthly Charge',
    'Total Charges', 'Churn Score', 'CLTV', 'Population',
]
# Columns rescaled to the [0, 1] range
columns_to_normalize = [
    'Number of Referrals', 'Avg Monthly Long Distance Charges',
    'Avg Monthly GB Download', 'Total Long Distance Charges',
    'Total Revenue', 'Satisfaction Score',
]

# One scaler per strategy.
# NOTE(review): both are fit on the whole DataFrame, i.e. before the
# train/test split performed in the modelling cell.
scaler_standard = StandardScaler()
scaler_min_max = MinMaxScaler()

# Z-score standardization
standardized_values = scaler_standard.fit_transform(df[columns_to_standardize])
df[columns_to_standardize] = standardized_values

# Min-max normalization
normalized_values = scaler_min_max.fit_transform(df[columns_to_normalize])
df[columns_to_normalize] = normalized_values

# Summary statistics of the rescaled columns, to verify the changes
rescaled_columns = columns_to_standardize + columns_to_normalize
df[rescaled_columns].describe()

# Model Selection and Development

Selection of an appropriate supervised learning algorithm that suits the problem.
Selection of an Appropriate Supervised Learning Algorithm
For churn prediction, which is a binary classification problem (churn or not churn), some commonly used supervised learning algorithms are:

Logistic Regression
Decision Trees
Random Forest
Gradient Boosting Machines (e.g., XGBoost, LightGBM)
Support Vector Machines (SVM)
Neural Networks
The choice of algorithm depends on the size and nature of your dataset, the interpretability of the model, and the computational resources available. 

The chosen algorithm is Random Forest.
Random Forest is an ensemble method that can handle both numerical and categorical data well. It is robust to outliers and can capture complex interactions between features. 
It can handle a mix of feature types well. Additionally, it often perform well out-of-the-box with default parameters, which can be a good starting point before tuning for better performance.

Development and training of the model using the training set.

In [None]:

# Work on the rows without any NaN; target is 'Churn Label', the rest are features.
# NOTE(review): every remaining column is used as a feature - check that none
# of them is a direct proxy for the churn outcome.
df_clean = df.dropna()
y = df_clean['Churn Label']
X = df_clean.drop(columns=['Churn Label'])

# 70/30 train/test split with a fixed seed
train_test_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = train_test_parts

# Baseline Random Forest with default hyperparameters
rf_model = RandomForestClassifier(random_state=42)
rf_model = rf_model.fit(X_train, y_train)  # fit() returns the estimator itself

# Predict the held-out rows and score the predictions
y_pred = rf_model.predict(X_test)
classification_rep = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

accuracy


In [None]:
# Per-class precision / recall / F1 report of the baseline Random Forest on the test set
print(classification_rep)

Improving the model performance

Tuning of hyperparameters to optimize the model’s performance.

In [None]:


# Candidate values explored for each Random Forest hyperparameter
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search over the grid: 5-fold cross-validation, all cores,
# verbose progress output
base_forest = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(base_forest,
                           param_grid,
                           cv=5,
                           n_jobs=-1,
                           verbose=2)

# Run the search on the training rows
grid_search.fit(X_train, y_train)

# Winning hyperparameter combination and its mean cross-validated score
best_score = grid_search.best_score_
best_params = grid_search.best_params_

best_params, best_score

In [None]:
# Create a Random Forest classifier with the hyperparameters selected by the
# grid search. Reading them from grid_search.best_params_ keeps this model in
# sync with the search result instead of relying on hand-copied values.
rf_model_tuned = RandomForestClassifier(**grid_search.best_params_,
                                        random_state=42)
rf_model_tuned.fit(X_train, y_train)

# Score the tuned model on the held-out rows
y_pred_tuned = rf_model_tuned.predict(X_test)
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
classification_rep_tuned = classification_report(y_test, y_pred_tuned)


In [None]:
# Test-set accuracy of the tuned Random Forest
print(accuracy_tuned)

In [None]:
# Per-class precision / recall / F1 report of the tuned Random Forest
print(classification_rep_tuned)

Training the model using only the suggested important features and measuring the performance

In [None]:
# Feature subset suggested by the Random Forest importance analysis
important_features = ['Age', 'Dependents', 'Zip Code', 'Number of Referrals',
 'Tenure in Months', 'Internet Type', 'Avg Monthly GB Download',
 'Contract', 'Monthly Charge', 'Total Charges',
 'Total Long Distance Charges', 'Total Revenue', 'Satisfaction Score',
 'Churn Score', 'CLTV']

target_feature = ['Churn Label']
X = df[important_features]
# Select the target as a 1-D Series. Indexing with the list itself returns a
# one-column DataFrame, which scikit-learn has to flatten (with a
# DataConversionWarning) before fitting.
y = df[target_feature[0]]

# Split the data into training and test sets.
# NOTE: this rebinds X, y, X_train, X_test, y_train and y_test, so cells run
# afterwards see the reduced feature set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Initialize and train the Random Forest classifier
rf_model_important_feactures = RandomForestClassifier(random_state=42)
rf_model_important_feactures.fit(X_train, y_train)

# Make predictions on the test set
y_pred_important_features = rf_model_important_feactures.predict(X_test)

# Score the reduced-feature model
accuracy_important_feactures = accuracy_score(y_test, y_pred_important_features)
classification_rep_important_feactures = classification_report(y_test, y_pred_important_features)



In [None]:
# Test-set accuracy of the Random Forest trained on the important-feature subset
print(accuracy_important_feactures)

In [None]:
# Per-class precision / recall / F1 report of the important-feature model
print(classification_rep_important_feactures)

# Evaluating the model


Model performance before tuning

In [None]:
# Accuracy, precision, recall and F1 of the baseline model's predictions (y_pred)
baseline_metric_functions = (accuracy_score, precision_score, recall_score, f1_score)
(accuracy_before_tune,
 precision_before_tuned,
 recall_before_tuned,
 f1_score_before_tuned) = [metric(y_test, y_pred)
                           for metric in baseline_metric_functions]

print(accuracy_before_tune, precision_before_tuned,
      recall_before_tuned, f1_score_before_tuned)

Model performance after tuning

In [None]:
# Accuracy, precision, recall and F1 of the tuned model's predictions (y_pred_tuned)
tuned_metric_functions = (accuracy_score, precision_score, recall_score, f1_score)
(accuracy_tuned,
 precision_tuned,
 recall_tuned,
 f1_score_tuned) = [metric(y_test, y_pred_tuned)
                    for metric in tuned_metric_functions]

(accuracy_tuned, precision_tuned, recall_tuned, f1_score_tuned)

Perform 5-fold cross-validation

In [None]:
# Accuracy of rf_model's configuration on each of 5 cross-validation folds
# of the current X / y
cv_scores = cross_val_score(estimator=rf_model, X=X, y=y,
                            scoring='accuracy', cv=5)

# Mean and spread of the fold accuracies
cv_mean, cv_std = cv_scores.mean(), cv_scores.std()

(cv_mean, cv_std)

These results indicate that the model performs consistently well across different folds, with a high average accuracy and low variability. This suggests that the model is robust and generalizes well to unseen data.

Confusion matrix

In [None]:

# confusion_matrix is not among the sklearn.metrics names imported in the
# notebook's opening import cell; importing it here keeps this cell runnable.
from sklearn.metrics import confusion_matrix

# Calculate the confusion matrix of the tuned model's test-set predictions
# (rows: actual labels, columns: predicted labels)
conf_matrix = confusion_matrix(y_test, y_pred_tuned)

conf_matrix

In [None]:


# confusion_matrix is not among the sklearn.metrics names imported in the
# notebook's opening import cell; importing it here keeps this cell runnable.
from sklearn.metrics import confusion_matrix

# Calculate the confusion matrix of the tuned model's test-set predictions
cm = confusion_matrix(y_test, y_pred_tuned)

# Plot the confusion matrix using seaborn: integer counts annotated in each cell
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('Actual Labels')
plt.xlabel('Predicted Labels')

# Display the plot
plt.show()