# Data Preparation

This notebook includes data preparation steps adapted from `HOS01Ab - Data Preparation.docx`. It will:

- Import necessary libraries
- Load `Customer-Churn.csv`
- Inspect the DataFrame
- Handle missing values
- Encode categorical variables
- Scale numerical features
- Build a ColumnTransformer for preprocessing
- Split the data into training and test sets

Run each cell in order.

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Notebook-wide pandas display options: show every column instead of eliding
# the middle ones, and allow printed output to be up to 120 characters wide.
pd.options.display.max_columns = None
pd.options.display.width = 120

In [2]:
# Load the dataset (path is relative to the notebook's working directory)
csv_path = 'Customer-Churn.csv'
df = pd.read_csv(csv_path)

# Quick look.
# Only the LAST expression of a cell is rendered automatically, so a bare
# `df.head()` in the middle of the cell is silently discarded. Passing it to
# display() renders the preview explicitly.
print('Shape:', df.shape)
display(df.head())

# Info and summary.
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would only append a stray "None" line to the output.
print('\nInfo:')
df.info()

print('\nDescribe:')
print(df.describe(include='all'))

Shape: (7043, 20)

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod    

In [3]:
# Count the missing entries in each column once and reuse the result below.
null_counts_by_column = df.isnull().sum()
print(null_counts_by_column)

# Grand total of missing cells across the whole frame. A small total makes
# dropping rows reasonable; a large one would call for imputation instead.
missing_counts = null_counts_by_column.sum()
print('Total missing values:', missing_counts)

# Remove every row that contains at least one missing value (revisit this
# choice if inspection shows that too much data would be lost).
df = df.dropna()
print('Shape after dropna:', df.shape)

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64
Total missing values: 11
Shape after dropna: (7032, 20)


In [5]:
# Separate features/target (adjust target column name if different).
# Falls back to the last column when no 'Churn' column is present.
target_col = 'Churn' if 'Churn' in df.columns else df.columns[-1]

X = df.drop(columns=[target_col])
y = df[target_col]

# Identify categorical and numerical columns by dtype
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

print('Numerical columns:', num_cols)
print('Categorical columns:', cat_cols)

# Train/test split on the RAW features, before any preprocessing is fitted.
# Fitting the scaler/encoder on the full dataset would let test-set statistics
# (means, variances, category lists) leak into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build preprocessing pipelines
numeric_transformer = StandardScaler()

# drop='first' removes one dummy column per feature; handle_unknown='ignore'
# lets transform() accept categories that were absent from the training rows.
categorical_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ],
    remainder='drop'  # drop columns not listed in num_cols / cat_cols
)

# Learn scaling parameters and category lists from the training rows only,
# then apply that same fitted transformation to the held-out rows.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Full dataset pushed through the train-fitted preprocessor. Kept under its
# original name for inspection and for any later cell that refers to X_pre.
X_pre = preprocessor.transform(X)
print('Preprocessed shape:', X_pre.shape)

print('Train shape:', X_train.shape)
print('Test shape:', X_test.shape)

Numerical columns: ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
Categorical columns: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
Preprocessed shape: (7032, 30)
Train shape: (5625, 30)
Test shape: (1407, 30)
