In [2]:
import pandas as pd

In [3]:
# Path of the CSV file to load into `df`.
clean_file_path = r"C:\Users\comat\GitProjects\customer-churn-ai\data\cleaned_telco_churn.csv"

# Read the CSV and report the outcome as a printed message instead of a traceback.
# NOTE(review): on either failure branch `df` is never assigned, so any later
# cell that uses `df` will fail with a NameError.
try:
    df = pd.read_csv(clean_file_path)
    print("Data Loaded Successfully! Sensational!")
except FileNotFoundError:
    not_found_message = f"Error: File not found at {clean_file_path}. Please check the path and try again."
    print(not_found_message)
except Exception as load_error:
    print(f"An error occurred: {load_error}")

Data Loaded Successfully! Sensational!


***Step 1:*** Identify Categorical Features  
- We have already explored variables such as Contract, PaymentMethod, InternetService, OnlineSecurity, TechSupport, etc.
- Now we can collect the categorical columns automatically:

In [4]:
# Collect the names of all object-dtype columns in one step.
# Note: customerID shows up in this list as well; it gets dropped afterwards.
cat_cols = list(df.select_dtypes(include='object').columns)
print(cat_cols)


['customerID', 'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn']


***Step 2:*** Drop Non-Informative Columns

In [5]:
# Remove the customerID column from the frame.
df = df.drop(columns=['customerID'])

***Step 3:*** Clean Yes/No Binary Columns  
Let’s convert Yes/No columns to binary 1/0:

In [7]:

# Columns holding 'Yes'/'No' flags
binary_cols = [
    'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling',
    'Churn', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
]

# Swap 'Yes' -> 1 and 'No' -> 0, then let pandas infer better dtypes for the result.
yes_no_to_int = {'Yes': 1, 'No': 0}
encoded_flags = df[binary_cols].replace(yes_no_to_int)
df[binary_cols] = encoded_flags.infer_objects(copy=False)
# Some of these columns may still contain "No internet service" — that is handled in the next step.


***Step 4:*** Replace "No internet service" or "No phone service"  
These are equivalent to "No", so we’ll convert them:

In [8]:
# Service columns that can still hold the string 'No internet service'.
cols_to_clean = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                 'TechSupport', 'StreamingTV', 'StreamingMovies']

# Map 'No internet service' to 0 for all of these columns at once, then cast
# explicitly to int64. pandas has deprecated the implicit object -> int
# downcast in `replace`; without the explicit cast these columns may stay
# object dtype and be one-hot encoded by the later `pd.get_dummies` call.
# The cast also fails loudly if any other non-numeric value is still present.
df[cols_to_clean] = (
    df[cols_to_clean]
    .replace({'No internet service': 0})
    .astype('int64')
)

Similarly for MultipleLines:

In [9]:
# Fold 'No phone service' into the plain 'No' category for MultipleLines.
df['MultipleLines'] = df['MultipleLines'].replace('No phone service', 'No')

***Step 5:*** One-Hot Encode Remaining Categoricals  
Use pd.get_dummies() on remaining object columns:

In [None]:
# One-hot encode the categorical columns that are still left in the frame.
#
# Why drop_first=True: a feature with n categories would normally become n
# indicator columns, but those columns are redundant with each other (for a
# Yes/No feature, knowing one indicator fixes the other). Dropping the
# indicator for the first category leaves n-1 columns per feature. No
# information is lost, because the dropped category is exactly the case where
# all of that feature's remaining indicators are 0. It also avoids
# multicollinearity (highly correlated independent variables), which can hurt
# model training and interpretation.
df = pd.get_dummies(data=df, drop_first=True)

Let’s now inspect:

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 24 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   SeniorCitizen                          7043 non-null   int64  
 1   Partner                                7043 non-null   int64  
 2   Dependents                             7043 non-null   int64  
 3   tenure                                 7043 non-null   int64  
 4   PhoneService                           7043 non-null   int64  
 5   OnlineSecurity                         7043 non-null   int64  
 6   OnlineBackup                           7043 non-null   int64  
 7   DeviceProtection                       7043 non-null   int64  
 8   TechSupport                            7043 non-null   int64  
 9   StreamingTV                            7043 non-null   int64  
 10  StreamingMovies                        7043 non-null   int64  
 11  Pape

In [15]:
# Leave the preview as the cell's last expression so the notebook renders it
# as a table rather than as wrapped plain text from print().
df.head()

   SeniorCitizen  Partner  Dependents  tenure  PhoneService  OnlineSecurity  \
0              0        1           0       1             0               0   
1              0        0           0      34             1               1   
2              0        0           0       2             1               1   
3              0        0           0      45             0               1   
4              0        0           0       2             1               0   

   OnlineBackup  DeviceProtection  TechSupport  StreamingTV  ...  Churn  \
0             1                 0            0            0  ...      0   
1             0                 1            0            0  ...      0   
2             1                 0            0            0  ...      1   
3             0                 1            1            0  ...      0   
4             0                 0            0            0  ...      1   

   gender_Male  MultipleLines_Yes  InternetService_Fiber optic  \
0       