In [2]:
# Import Required Libraries
# Import the necessary libraries for data manipulation, visualization, and modeling.

import pandas as pd
import numpy as np
import statistics as st
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,f1_score,classification_report, ConfusionMatrixDisplay

# Silence every warning category for the rest of the session
warnings.filterwarnings(action="ignore")

# Seaborn defaults, with a wide 15x6 figure for all plots
figure_defaults = {"figure.figsize": (15, 6)}
sns.set(rc=figure_defaults)

# Never truncate the column list when a dataframe is rendered
pd.options.display.max_columns = None

## Initial Data Exploration

In [3]:
# Read the raw churn CSV into the working dataframe
raw_csv_path = 'data-source/Telco-Customer-Churn.csv'
data = pd.read_csv(raw_csv_path)

In [4]:
# List every column label (heading and Index repr on separate lines)
print("Column names in the dataset:", data.columns, sep="\n")

Column names in the dataset:
Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')


In [5]:
# Preview the first five rows (head() defaults to n=5)
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [6]:
# Report the (rows, columns) tuple of the dataset
print("Shape of the dataset:", data.shape, sep="\n")

Shape of the dataset:
(7043, 21)


In [7]:
# Show the dtype pandas inferred for every column
print("Data types of each column:", data.dtypes, sep="\n")

Data types of each column:
customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object


In [8]:
# Display information about the dataset.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
print("Information about the dataset:")
data.info()


Information about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBi

In [9]:
# Count the missing entries in each column
null_counts = data.isna().sum()
print("Null values in the dataset:", null_counts, sep="\n")


Null values in the dataset:
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [10]:
# Count the rows that exactly repeat an earlier row
duplicate_count = data.duplicated().sum()
print("Duplicate values in the dataset:", duplicate_count, sep="\n")

Duplicate values in the dataset:
0


In [11]:
# Summary statistics for the columns describe() covers by default
summary_stats = data.describe()
print("Summary statistics of the dataset:", summary_stats, sep="\n")


Summary statistics of the dataset:
       SeniorCitizen       tenure  MonthlyCharges
count    7043.000000  7043.000000     7043.000000
mean        0.162147    32.371149       64.761692
std         0.368612    24.559481       30.090047
min         0.000000     0.000000       18.250000
25%         0.000000     9.000000       35.500000
50%         0.000000    29.000000       70.350000
75%         0.000000    55.000000       89.850000
max         1.000000    72.000000      118.750000


In [12]:
# Partition the column labels by dtype: object columns are treated as
# categorical, everything else as numerical
categorical_features, numerical_features = (
    data.select_dtypes(include="object").columns,
    data.select_dtypes(exclude="object").columns,
)


In [13]:
# Show which columns were classified as categorical
print("Categorical features:", categorical_features, sep="\n")

Categorical features:
Index(['customerID', 'gender', 'Partner', 'Dependents', 'PhoneService',
       'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
       'Contract', 'PaperlessBilling', 'PaymentMethod', 'TotalCharges',
       'Churn'],
      dtype='object')


In [14]:
# Show which columns were classified as numerical
print("Numerical features:", numerical_features, sep="\n")


Numerical features:
Index(['SeniorCitizen', 'tenure', 'MonthlyCharges'], dtype='object')


In [15]:
# List the distinct values found in every categorical column
print("Unique values for each categorical feature:")
for column_name, column_values in data[categorical_features].items():
    print(f"{column_name}: {column_values.unique()}")

Unique values for each categorical feature:
customerID: ['7590-VHVEG' '5575-GNVDE' '3668-QPYBK' ... '4801-JZAZL' '8361-LTMKD'
 '3186-AJIEK']
gender: ['Female' 'Male']
Partner: ['Yes' 'No']
Dependents: ['No' 'Yes']
PhoneService: ['No' 'Yes']
MultipleLines: ['No phone service' 'No' 'Yes']
InternetService: ['DSL' 'Fiber optic' 'No']
OnlineSecurity: ['No' 'Yes' 'No internet service']
OnlineBackup: ['Yes' 'No' 'No internet service']
DeviceProtection: ['No' 'Yes' 'No internet service']
TechSupport: ['No' 'Yes' 'No internet service']
StreamingTV: ['No' 'Yes' 'No internet service']
StreamingMovies: ['No' 'Yes' 'No internet service']
Contract: ['Month-to-month' 'One year' 'Two year']
PaperlessBilling: ['Yes' 'No']
PaymentMethod: ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
TotalCharges: ['29.85' '1889.5' '108.15' ... '346.45' '306.6' '6844.5']
Churn: ['No' 'Yes']


## Data Cleaning

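No null or duplicate values were detected above, so this section mainly corrects data types. Note that `TotalCharges` is stored as text; converting it to numeric can produce NaN values, which are filled further below.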

In [16]:
# Encode the two-valued columns as 1/0.
# All Yes/No columns share one lookup table; gender has its own labels.
yes_no_codes = {"Yes": 1, "No": 0}
yes_no_columns = ["Churn", "PaperlessBilling", "PhoneService", "Dependents", "Partner"]

for column in yes_no_columns:
    data[column] = data[column].map(yes_no_codes)

data["gender"] = data["gender"].map({"Male": 1, "Female": 0})

In [17]:
# Preview the first five rows after the binary encoding (head() defaults to n=5)
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,0,0,1,0,1,0,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,1,Electronic check,29.85,29.85,0
1,5575-GNVDE,1,0,0,0,34,1,No,DSL,Yes,No,Yes,No,No,No,One year,0,Mailed check,56.95,1889.5,0
2,3668-QPYBK,1,0,0,0,2,1,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,1,Mailed check,53.85,108.15,1
3,7795-CFOCW,1,0,0,0,45,0,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,0,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,0,0,0,0,2,1,No,Fiber optic,No,No,No,No,No,No,Month-to-month,1,Electronic check,70.7,151.65,1


In [18]:
# Replace the labels of the remaining low-cardinality columns with integer codes.
# The six internet add-on columns use the same three labels, so they share one table.
internet_addon_codes = {
    "No internet service": 0,
    "No": 1,
    "Yes": 2,
}
internet_addon_columns = (
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
)

# Column name -> {label: integer code}
label_codes = {
    "PaymentMethod": {
        "Electronic check": 0,
        "Mailed check": 1,
        "Bank transfer (automatic)": 2,
        "Credit card (automatic)": 3,
    },
    "MultipleLines": {"No phone service": 0, "No": 1, "Yes": 2},
    "InternetService": {"DSL": 0, "Fiber optic": 1, "No": 2},
    **{column: internet_addon_codes for column in internet_addon_columns},
    "Contract": {"Month-to-month": 0, "One year": 1, "Two year": 2},
}

# Each column is mapped independently, so the iteration order does not matter
for column, codes in label_codes.items():
    data[column] = data[column].map(codes)


In [19]:
# Replace 'customerID' with the numeric part that precedes the '-'
# (the displayed rows show e.g. '7590-VHVEG' becoming 7590).
# - astype(str) keeps the cell re-runnable once the column is already numeric.
# - Taking element 0 of the split avoids the temporary 'ID1'/'ID2' columns and
#   the in-place drop; the two-column assignment raised whenever the split did
#   not yield exactly two columns.
# NOTE(review): the part after the dash is discarded, so the resulting values
# may no longer be unique per customer — confirm this column is not used as a
# key downstream.
id_prefix = data['customerID'].astype(str).str.split('-').str[0]
data['customerID'] = pd.to_numeric(id_prefix, errors='coerce')

# Coerce every column to numeric; any value that cannot be parsed becomes NaN
data = data.apply(pd.to_numeric, errors='coerce')

# The label is printed next to the dtypes it describes (after the coercion)
print("Data types of each column:")
print(data.dtypes)
data.head(2)

Data types of each column:
customerID            int64
gender                int64
SeniorCitizen         int64
Partner               int64
Dependents            int64
tenure                int64
PhoneService          int64
MultipleLines         int64
InternetService       int64
OnlineSecurity        int64
OnlineBackup          int64
DeviceProtection      int64
TechSupport           int64
StreamingTV           int64
StreamingMovies       int64
Contract              int64
PaperlessBilling      int64
PaymentMethod         int64
MonthlyCharges      float64
TotalCharges        float64
Churn                 int64
dtype: object


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590,0,0,1,0,1,0,0,0,1,2,1,1,1,1,0,1,0,29.85,29.85,0
1,5575,1,0,0,0,34,1,1,0,2,1,2,1,1,1,1,0,1,56.95,1889.5,0


In [20]:
# Repeat the exploratory summaries on the encoded dataframe.

# Display the first few rows of the dataset
print("First few rows of the dataset:")
display(data.head())

# Display the shape of the dataset
print("Shape of the dataset:")
print(data.shape)

# Display the column names
print("Column names in the dataset:")
print(data.columns)

# Display the data types of each column
print("Data types of each column:")
print(data.dtypes)

# Display information about the dataset.
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line.
print("Information about the dataset:")
data.info()

# Check for null values in the dataset
print("Null values in the dataset:")
print(data.isnull().sum())

# Check for duplicate values in the dataset
print("Duplicate values in the dataset:")
print(data.duplicated().sum())

# Drop duplicate rows by reassignment rather than in place
# (the index is not reset, matching the previous behaviour)
data = data.drop_duplicates()

# Display summary statistics of the dataset
print("Summary statistics of the dataset:")
print(data.describe())



First few rows of the dataset:


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590,0,0,1,0,1,0,0,0,1,2,1,1,1,1,0,1,0,29.85,29.85,0
1,5575,1,0,0,0,34,1,1,0,2,1,2,1,1,1,1,0,1,56.95,1889.5,0
2,3668,1,0,0,0,2,1,1,0,2,2,1,1,1,1,0,1,1,53.85,108.15,1
3,7795,1,0,0,0,45,0,0,0,2,1,2,2,1,1,1,0,2,42.3,1840.75,0
4,9237,0,0,0,0,2,1,1,1,1,1,1,1,1,1,0,1,0,70.7,151.65,1


Shape of the dataset:
(7043, 21)
Column names in the dataset:
Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')
Data types of each column:
customerID            int64
gender                int64
SeniorCitizen         int64
Partner               int64
Dependents            int64
tenure                int64
PhoneService          int64
MultipleLines         int64
InternetService       int64
OnlineSecurity        int64
OnlineBackup          int64
DeviceProtection      int64
TechSupport           int64
StreamingTV           int64
StreamingMovies       int64
Contract              int64
PaperlessBilling      int64
PaymentMethod         int64
MonthlyCharges      float6

In [21]:
# Impute missing 'TotalCharges' entries with the column average
total_charges_mean = data['TotalCharges'].mean()
data['TotalCharges'] = data['TotalCharges'].fillna(total_charges_mean)

# Re-check the per-column null counts and the dtypes after imputation
print("Null values in the dataset:", data.isnull().sum(), sep="\n")
print("Data types of each column:", data.dtypes, sep="\n")


Null values in the dataset:
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64
Data types of each column:
customerID            int64
gender                int64
SeniorCitizen         int64
Partner               int64
Dependents            int64
tenure                int64
PhoneService          int64
MultipleLines         int64
InternetService       int64
OnlineSecurity        int64
OnlineBackup          int64
DeviceProtection      int64
TechSupport           int64
StreamingTV           int64
StreamingMovies       int64
Contract              int64
PaperlessBilling      

In [22]:
# Recompute the dtype-based split on the cleaned dataframe:
# object columns are treated as categorical, everything else as numerical
categorical_features, numerical_features = (
    data.select_dtypes(include="object").columns,
    data.select_dtypes(exclude="object").columns,
)

# Report both groups
print("Categorical features:", categorical_features, sep="\n")
print("Numerical features:", numerical_features, sep="\n")

# List the distinct values of each categorical column
print("Unique values for each categorical feature:")
for column_name in categorical_features:
    print(f"{column_name}: {data[column_name].unique()}")

Categorical features:
Index([], dtype='object')
Numerical features:
Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')
Unique values for each categorical feature:


In [23]:
# Write the cleaned dataframe to disk without the index column
cleaned_csv_path = 'data-source/cleaned_data.csv'
data.to_csv(cleaned_csv_path, index=False)


In [24]:
# Show the dtype of every column in the exported dataframe
print("Data types of each column:", data.dtypes, sep="\n")


Data types of each column:
customerID            int64
gender                int64
SeniorCitizen         int64
Partner               int64
Dependents            int64
tenure                int64
PhoneService          int64
MultipleLines         int64
InternetService       int64
OnlineSecurity        int64
OnlineBackup          int64
DeviceProtection      int64
TechSupport           int64
StreamingTV           int64
StreamingMovies       int64
Contract              int64
PaperlessBilling      int64
PaymentMethod         int64
MonthlyCharges      float64
TotalCharges        float64
Churn                 int64
dtype: object
