# Load libraries and dataset
This cell imports pandas and loads the raw CSV into a DataFrame named `df`. We use a relative path to the `data/raw` folder.

In [None]:
# Import pandas for data manipulation
import pandas as pd

# Read the raw CSV file into a DataFrame called df
# Path is relative to the notebook location
df = pd.read_csv("../data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Display first few rows to verify the load
df.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


# Drop identifier column
We remove `customerID` because it's a unique identifier and not useful for modeling or EDA.

In [None]:
# Drop the customerID column in-place to avoid keeping an unused identifier
df.drop(columns=['customerID'], inplace=True)


# Encode target variable
Convert the `Churn` column from 'Yes'/'No' to binary 1/0 so it's ready for modeling.

In [None]:
# Map Churn 'Yes'->1, 'No'->0
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
# Quick check to see class distribution after mapping
df['Churn'].value_counts()


Churn
0    5174
1    1869
Name: count, dtype: int64

# Identify categorical and numerical columns
We separate columns by dtype: object => categorical, everything else => numerical. This helps later for encoding and scaling.

In [None]:
# Select columns with object dtype as categorical columns
categorical_cols = df.select_dtypes(include='object').columns
# Select non-object columns as numerical columns
numerical_cols = df.select_dtypes(exclude='object').columns

# Return both lists so we can inspect them in the notebook output
categorical_cols, numerical_cols


(Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
        'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
        'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
        'PaperlessBilling', 'PaymentMethod', 'TotalCharges'],
       dtype='object'),
 Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'Churn'], dtype='object'))

# One-hot encode categorical features
We use `pd.get_dummies` with `drop_first=True` to avoid multicollinearity from dummy variables.

In [None]:
# Create dummy variables for categorical columns; drop the first level of each category
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
# Show the encoded DataFrame head to confirm encoding
df_encoded.head()


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,...,TotalCharges_995.35,TotalCharges_996.45,TotalCharges_996.85,TotalCharges_996.95,TotalCharges_997.65,TotalCharges_997.75,TotalCharges_998.1,TotalCharges_999.45,TotalCharges_999.8,TotalCharges_999.9
0,0,1,29.85,0,False,True,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
1,0,34,56.95,0,True,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
2,0,2,53.85,1,True,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,0,45,42.3,0,True,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
4,0,2,70.7,1,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False


# Prepare train/test split
We separate features `X` and target `y`, then split with stratification to preserve class distribution.

In [None]:
# Import helper to split data into train and test sets
from sklearn.model_selection import train_test_split

# Features (X) and target (y)
X = df_encoded.drop('Churn', axis=1)
y = df_encoded['Churn']

# Split while preserving the proportion of churn classes using stratify=y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Display shapes of train and test feature sets to confirm split
X_train.shape, X_test.shape


((5634, 6559), (1409, 6559))

# Notebook note
All code cells now include inline comments and each logical step has a preceding markdown explanation. If you want a different comment style (docstrings, more detail, or shorter notes), tell me which style to use going forward.

In [None]:
import os

os.makedirs("../data/processed", exist_ok=True)
# Save the training and testing sets to CSV files without index


In [11]:
X_train.to_csv("../data/processed/X_train.csv", index=False)
X_test.to_csv("../data/processed/X_test.csv", index=False)
y_train.to_csv("../data/processed/y_train.csv", index=False)
y_test.to_csv("../data/processed/y_test.csv", index=False)
# Data Preprocessing for Customer Churn Prediction
