In [22]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [23]:
# Read the raw customer-churn CSV into a DataFrame.
file_path = 'customer_churn_data.csv'
data = pd.read_csv(file_path)

# Report the dataset dimensions (rows first, then columns).
n_rows, n_cols = data.shape
print(f"The dataset has {n_rows} Rows and {n_cols} columns")

# Preview the first five records; the trailing bare expression is what the
# notebook renders as the cell output.
data_head = data.head()
data_head

The dataset has 1000 Rows and 10 columns


Unnamed: 0,CustomerID,Age,Gender,Tenure,MonthlyCharges,ContractType,InternetService,TotalCharges,TechSupport,Churn
0,1,49,Male,4,88.35,Month-to-Month,Fiber Optic,353.4,Yes,Yes
1,2,43,Male,0,36.67,Month-to-Month,Fiber Optic,0.0,Yes,Yes
2,3,51,Female,2,63.79,Month-to-Month,Fiber Optic,127.58,No,Yes
3,4,60,Female,8,102.34,One-Year,DSL,818.72,Yes,Yes
4,5,42,Male,32,69.01,Month-to-Month,,2208.32,No,Yes


In [24]:
# Count the null entries in every column.
missing_values = data.isna().sum()

# Express each count as a percentage of the total number of rows.
missing_percentage = missing_values.div(len(data)).mul(100)

# Assemble both measures into one table, worst-affected columns first.
missing_summary = (
    missing_values.to_frame(name="Missing Values")
    .assign(Percentage=missing_percentage)
    .sort_values(by="Missing Values", ascending=False)
)

missing_summary


Unnamed: 0,Missing Values,Percentage
InternetService,297,29.7
CustomerID,0,0.0
Age,0,0.0
Gender,0,0.0
MonthlyCharges,0,0.0
Tenure,0,0.0
ContractType,0,0.0
TotalCharges,0,0.0
TechSupport,0,0.0
Churn,0,0.0


In [25]:
# Handle missing values using imputation.
#
# NOTE(review): InternetService is the only column with gaps (~30% of rows).
# Recent pandas versions parse the literal string "None" as NaN in read_csv by
# default, so these rows may be customers without internet service rather than
# unknown values -- confirm against the raw CSV before relying on mode
# imputation.

# Fill the categorical column with its mode (most frequent value). The result
# is assigned back rather than using inplace=True, so the originally loaded
# frame object is left untouched and the step reads as a plain transformation.
internet_service_mode = data['InternetService'].mode()[0]
data = data.fillna({'InternetService': internet_service_mode})

# Post-condition: the imputed column must no longer contain nulls.
assert data['InternetService'].notna().all(), "InternetService still has missing values"

# Verify if missing values have been handled (per-column null counts).
missing_values_after = data.isnull().sum()

missing_values_after


CustomerID         0
Age                0
Gender             0
Tenure             0
MonthlyCharges     0
ContractType       0
InternetService    0
TotalCharges       0
TechSupport        0
Churn              0
dtype: int64

In [26]:
# Flag rows that exactly repeat an earlier row, then display only those rows.
# An empty result means the dataset contains no fully duplicated records.
duplicate_mask = data.duplicated()
data.loc[duplicate_mask]


Unnamed: 0,CustomerID,Age,Gender,Tenure,MonthlyCharges,ContractType,InternetService,TotalCharges,TechSupport,Churn


In [27]:
# Turn the `Churn` target into integer codes.
# LabelEncoder assigns codes in sorted class order, hence No = 0 and Yes = 1.
label_encoder = LabelEncoder().fit(data['Churn'])
data['Churn'] = label_encoder.transform(data['Churn'])

# Preview the frame again to confirm the target is now numeric.
data_head = data.head()
data_head

Unnamed: 0,CustomerID,Age,Gender,Tenure,MonthlyCharges,ContractType,InternetService,TotalCharges,TechSupport,Churn
0,1,49,Male,4,88.35,Month-to-Month,Fiber Optic,353.4,Yes,1
1,2,43,Male,0,36.67,Month-to-Month,Fiber Optic,0.0,Yes,1
2,3,51,Female,2,63.79,Month-to-Month,Fiber Optic,127.58,No,1
3,4,60,Female,8,102.34,One-Year,DSL,818.72,Yes,1
4,5,42,Male,32,69.01,Month-to-Month,Fiber Optic,2208.32,No,1


In [28]:

# Nominal (unordered) categorical features to expand into indicator columns.
nominal_cols = ['Gender', 'ContractType', 'InternetService', 'TechSupport']

# One-hot encode them; drop_first=True omits the first level of each feature
# so the remaining indicator columns are not perfectly collinear.
data = pd.get_dummies(data, columns=nominal_cols, drop_first=True)

# Show the resulting column index together with a preview of the encoded frame.
encoded_columns = data.columns
data_head_encoded = data.head()

encoded_columns, data_head_encoded


(Index(['CustomerID', 'Age', 'Tenure', 'MonthlyCharges', 'TotalCharges',
        'Churn', 'Gender_Male', 'ContractType_One-Year',
        'ContractType_Two-Year', 'InternetService_Fiber Optic',
        'TechSupport_Yes'],
       dtype='object'),
    CustomerID  Age  Tenure  MonthlyCharges  TotalCharges  Churn  Gender_Male  \
 0           1   49       4           88.35        353.40      1         True   
 1           2   43       0           36.67          0.00      1         True   
 2           3   51       2           63.79        127.58      1        False   
 3           4   60       8          102.34        818.72      1        False   
 4           5   42      32           69.01       2208.32      1         True   
 
    ContractType_One-Year  ContractType_Two-Year  InternetService_Fiber Optic  \
 0                  False                  False                         True   
 1                  False                  False                         True   
 2                  Fal

In [29]:
# Select numerical columns for scaling
numerical_cols = ['Age', 'Tenure', 'MonthlyCharges', 'TotalCharges']

# Initialize the StandardScaler (centers each column to zero mean and scales
# it to unit variance). Note: this is standardization, not min-max scaling.
scaler = StandardScaler()

# Apply standardization (z-score scaling), overwriting the original columns.
# NOTE(review): the scaler is fit on the full dataset; if a train/test split
# is made downstream, fit on the training portion only to avoid leakage.
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

# Verify the transformation: every mean should be ~0. describe() reports the
# sample standard deviation (ddof=1) whereas StandardScaler divides by the
# population standard deviation (ddof=0), so std shows as slightly above 1
# rather than exactly 1.
data[numerical_cols].describe()


Unnamed: 0,Age,Tenure,MonthlyCharges,TotalCharges
count,1000.0,1000.0,1000.0,1000.0
mean,5.1514350000000004e-17,5.861978000000001e-17,-4.725109e-16,-3.907985e-17
std,1.0005,1.0005,1.0005,1.0005
min,-3.336519,-1.00476,-1.72734,-0.8939477
25%,-0.6815183,-0.739973,-0.8573718,-0.6741995
50%,0.03328963,-0.3163142,-0.01289105,-0.3383224
75%,0.6459821,0.3721313,0.8448197,0.3156084
max,3.913676,5.456037,1.773155,7.009614


In [31]:
# Persist the preprocessed frame as CSV; index=False keeps the row index out
# of the written file.
output_path = "cleaned_customer_churn_dataset.csv"
data.to_csv(path_or_buf=output_path, index=False)

# Confirm where the file was written.
print(f"Dataset successfully saved to {output_path}")


Dataset successfully saved to cleaned_customer_churn_dataset.csv
