In [191]:
import pandas as pd
import numpy as np

df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv.xls')

#Check Column type and missing values
print("==================================================================================================================")
print(df.head())
print("==================================================================================================================")
print(df.info())
print("==================================================================================================================")
print(df.describe())
print("==================================================================================================================")

df = df.replace(r'^\s*$', np.nan, regex=True)
print(df.isnull().sum())



   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

In [192]:
#Handle TotalCharges for missing values
#Fill all the missing with just 0 to make it easier
df['TotalCharges'].fillna(0, inplace=True)

print("\n=================================================================================================================")
print("Check for no more missing values:")
print(df.isnull().sum())


Check for no more missing values:
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(0, inplace=True)


In [193]:
#Replace all "No internet service" and "No phone service" with "No"
columns_To_Replace = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                      'TechSupport', 'StreamingTV', 'StreamingMovies']

for col in columns_To_Replace:
    df[col] = df[col].replace('No internet service', 'No')

df['MultipleLines'] = df['MultipleLines'].replace('No phone service', 'No')

print("Cleaned data:")
print(df[columns_To_Replace].head())

Cleaned data:
  OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV  \
0             No          Yes               No          No          No   
1            Yes           No              Yes          No          No   
2            Yes          Yes               No          No          No   
3            Yes           No              Yes         Yes          No   
4             No           No               No          No          No   

  StreamingMovies  
0              No  
1              No  
2              No  
3              No  
4              No  


In [194]:
#Convert Yes/No to binary 
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
for col in binary_cols:
    df[col] = df[col].map({'Yes': 1, 'No': 0})

#Gender to Male:1 and Female: 0
df['gender'] = df['gender'].map({'Male': 1, 'Female': 0})

print("Cleaned data:")
print(df[binary_cols + ['gender']].head())

Cleaned data:
   Partner  Dependents  PhoneService  PaperlessBilling  Churn  gender
0        1           0             0                 1      0       0
1        0           0             1                 0      0       1
2        0           0             1                 1      1       1
3        0           0             0                 0      0       1
4        0           0             1                 1      1       0


In [195]:
# Encode multi-category variables
categorical_cols = ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
                   'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
                   'Contract', 'PaymentMethod']

df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

print(f"Shape after encoding: {df.shape}")
print(f"\nNew columns created: {df.columns.tolist()}")

Shape after encoding: (7043, 25)

New columns created: ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'PaperlessBilling', 'MonthlyCharges', 'TotalCharges', 'Churn', 'MultipleLines_Yes', 'InternetService_Fiber optic', 'InternetService_No', 'OnlineSecurity_Yes', 'OnlineBackup_Yes', 'DeviceProtection_Yes', 'TechSupport_Yes', 'StreamingTV_Yes', 'StreamingMovies_Yes', 'Contract_One year', 'Contract_Two year', 'PaymentMethod_Credit card (automatic)', 'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check']


In [None]:
#Dont need customerID (not useful for prediction)
df = df.drop('customerID', axis=1)

print(f"Final shape: {df.shape}")
print("\n==================================================================================================================")
print("First few rows of preprocessed data:")
print(df.head())

Final shape: (7043, 24)

First few rows of preprocessed data:
   gender  SeniorCitizen  Partner  Dependents  tenure  PhoneService  \
0       0              0        1           0       1             0   
1       1              0        0           0      34             1   
2       1              0        0           0       2             1   
3       1              0        0           0      45             0   
4       0              0        0           0       2             1   

   PaperlessBilling  MonthlyCharges TotalCharges  Churn  ...  \
0                 1           29.85        29.85      0  ...   
1                 0           56.95       1889.5      0  ...   
2                 1           53.85       108.15      1  ...   
3                 0           42.30      1840.75      0  ...   
4                 1           70.70       151.65      1  ...   

   OnlineBackup_Yes  DeviceProtection_Yes  TechSupport_Yes  StreamingTV_Yes  \
0              True                 False      

In [197]:
#Normalize numerical feature 
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
num_col = ['tenure', 'MonthlyCharges', 'TotalCharges']

df[num_col] = scaler.fit_transform(df[num_col])

print("Scaled numerical:")
print(df[num_col].describe())

Scaled numerical:
             tenure  MonthlyCharges  TotalCharges
count  7.043000e+03    7.043000e+03  7.043000e+03
mean  -2.421273e-17   -6.406285e-17 -3.783239e-17
std    1.000071e+00    1.000071e+00  1.000071e+00
min   -1.318165e+00   -1.545860e+00 -1.005780e+00
25%   -9.516817e-01   -9.725399e-01 -8.299464e-01
50%   -1.372744e-01    1.857327e-01 -3.905282e-01
75%    9.214551e-01    8.338335e-01  6.648034e-01
max    1.613701e+00    1.794352e+00  2.825806e+00


In [198]:
#View preprocessed dataset info for final check 
print("\n==================================================================================================================")
print("FINAL PREPROCESSED DATASET:")
print("==================================================================================================================")
print(f"Shape: {df.shape}")
print(f"\nData types:\n{df.dtypes.value_counts()}")
print(f"\nMissing values: {df.isnull().sum().sum()}")
print("\n=====================================================================================================================")
print(df.head())


FINAL PREPROCESSED DATASET:
Shape: (7043, 24)

Data types:
bool       14
int64       7
float64     3
Name: count, dtype: int64

Missing values: 0

   gender  SeniorCitizen  Partner  Dependents    tenure  PhoneService  \
0       0              0        1           0 -1.277445             0   
1       1              0        0           0  0.066327             1   
2       1              0        0           0 -1.236724             1   
3       1              0        0           0  0.514251             0   
4       0              0        0           0 -1.236724             1   

   PaperlessBilling  MonthlyCharges  TotalCharges  Churn  ...  \
0                 1       -1.160323     -0.992611      0  ...   
1                 0       -0.259629     -0.172165      0  ...   
2                 1       -0.362660     -0.958066      1  ...   
3                 0       -0.746535     -0.193672      0  ...   
4                 1        0.197365     -0.938874      1  ...   

   OnlineBackup_Yes  D

In [199]:
# Save preprocessed data to CSV
df.to_csv('preprocessed_telco_churn.csv', index=False)
print("Preprocessed data saved to 'preprocessed_telco_churn.csv'")

Preprocessed data saved to 'preprocessed_telco_churn.csv'

