In [None]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
from sklearn.preprocessing import OneHotEncoder

# Set up the path to the raw data file (best-effort)
PROJECT_ROOT = str(Path.cwd())
RAW_DATA_PATH = str(Path(PROJECT_ROOT) / 'data' / 'raw' / 'WA_Fn-UseC_-Telco-Customer-Churn.csv')

print(f"Project Root: {PROJECT_ROOT}")
print(f"Raw Data Path: {RAW_DATA_PATH}")


In [5]:
# Load the dataset from the path defined above (robust lookup)
from pathlib import Path
filename = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'
# If RAW_DATA_PATH exists and the file is there, use it; otherwise search upwards and across the workspace
candidate = Path(RAW_DATA_PATH) if 'RAW_DATA_PATH' in globals() else None
if candidate is None or not candidate.exists():
    # search parent directories for data/raw/<filename> up to 6 levels
    p = Path.cwd()
    found = None
    for _ in range(7):
        test = p / 'data' / 'raw' / filename
        if test.exists():
            found = test
            break
        p = p.parent
    # fallback: global rglob search in cwd (may be slower)
    if found is None:
        results = list(Path.cwd().rglob(filename))
        found = results[0] if results else None
    if found is None:
        raise FileNotFoundError(f"Could not find {filename}. Tried RAW_DATA_PATH={RAW_DATA_PATH if 'RAW_DATA_PATH' in globals() else 'N/A'} and searched workspace.")
    RAW_DATA_PATH = str(found)
else:
    RAW_DATA_PATH = str(candidate)
print(f"Using data file: {RAW_DATA_PATH}")
df = pd.read_csv(RAW_DATA_PATH)

# Display basic information about the dataframe
print("Dataframe Shape:", df.shape)
df.info()


Using data file: d:\AI\MLops\MLOPS_churn_prediction_project\data\raw\WA_Fn-UseC_-Telco-Customer-Churn.csv
Dataframe Shape: (7043, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  St

In [6]:
# The 'TotalCharges' column has empty strings for new customers.
# We convert it to a numeric type. 'errors='coerce'' will turn any 
# non-numeric values (like empty strings) into NaN (Not a Number).
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Now, let's see how many missing values we created
print(f"Number of missing values in TotalCharges: {df['TotalCharges'].isnull().sum()}")

# It's safe to drop these 11 rows as they represent customers with no tenure.
df.dropna(inplace=True)

print(f"Shape of dataframe after dropping NaNs: {df.shape}")



Number of missing values in TotalCharges: 11
Shape of dataframe after dropping NaNs: (7032, 21)


In [7]:
# Feature 1: tenure_in_years
df['tenure_in_years'] = df['tenure'] / 12

# Feature 2: has_multiple_services
# We define which columns indicate a service
service_cols = [
    'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies'
]

# We count how many active services a customer has.
# We treat "No phone service" and "No internet service" as not having that specific add-on.
df['num_services'] = df[service_cols].apply(lambda row: (row != 'No').sum(), axis=1)

# Create the binary flag: 1 if more than one service, 0 otherwise.
df['has_multiple_services'] = (df['num_services'] > 1).astype(int)

# Let's inspect our new features
print(df[['tenure', 'tenure_in_years', 'num_services', 'has_multiple_services']].head())

   tenure  tenure_in_years  num_services  has_multiple_services
0       1         0.083333             3                      1
1      34         2.833333             4                      1
2       2         0.166667             4                      1
3      45         3.750000             5                      1
4       2         0.166667             2                      1


In [8]:
# Separate features (X) from the target (y) and the ID column
X = df.drop(['customerID', 'Churn'], axis=1)
y = df['Churn']

# Identify which columns are categorical and which are numerical
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['number']).columns.tolist()

print("Categorical Columns to Encode:", categorical_cols)
print("Numerical Columns:", numerical_cols)

Categorical Columns to Encode: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
Numerical Columns: ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'tenure_in_years', 'num_services', 'has_multiple_services']


In [9]:
# Initialize the OneHotEncoder
# handle_unknown='ignore' will prevent errors if new categories appear in future data
# sparse_output=False ensures we get a dense numpy array, which is easier to work with
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Fit and transform the categorical data
encoded_data = encoder.fit_transform(X[categorical_cols])

# Create a DataFrame with the new encoded columns
# get_feature_names_out helps us create meaningful column names
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_cols))

print("Shape of original categorical data:", X[categorical_cols].shape)
print("Shape of encoded data:", encoded_df.shape)
encoded_df.head()

Shape of original categorical data: (7032, 15)
Shape of encoded data: (7032, 41)


Unnamed: 0,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,MultipleLines_No phone service,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [10]:
# Combine the numerical features with the new one-hot encoded features
# We use reset_index(drop=True) to ensure the indices align for concatenation
X_processed = pd.concat([X[numerical_cols].reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)

# Add the target variable back to the DataFrame
final_df = pd.concat([X_processed, y.reset_index(drop=True)], axis=1)

print("Final processed dataframe shape:", final_df.shape)
final_df.head()

Final processed dataframe shape: (7032, 49)


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,tenure_in_years,num_services,has_multiple_services,gender_Female,gender_Male,Partner_No,...,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn
0,0,1,29.85,29.85,0.083333,3,1,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,No
1,0,34,56.95,1889.5,2.833333,4,1,0.0,1.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,No
2,0,2,53.85,108.15,0.166667,4,1,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,Yes
3,0,45,42.3,1840.75,3.75,5,1,0.0,1.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,No
4,0,2,70.7,151.65,0.166667,2,1,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,Yes
