In [None]:
# Data Cleanup

In [1]:
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import StandardScaler

# Read the raw Telco churn CSV into the working DataFrame
raw_csv_path = "../data/raw/telco_churn.csv"
df = pd.read_csv(raw_csv_path)

# Step 1: Drop Non-Predictive Columns

In [2]:
# Drop the customerID column, which this notebook treats as non-predictive.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell safe to re-run: a second execution no longer raises KeyError once the
# column is already gone.
df = df.drop(columns=['customerID'], errors='ignore')

# Step 2: Clean 'TotalCharges' values

In [3]:
# Entries that cannot be parsed as numbers become NaN so they can be counted
# and then removed.
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
missing_count = total_charges.isna().sum()
print(f"Missing values in TotalCharges: {missing_count}")

# Build the cleaned frame by reassignment rather than inplace=True so the cell
# yields a fresh object and gives the same result when re-run. assign() on an
# existing column keeps its position, so column order is unchanged.
df = df.assign(TotalCharges=total_charges).dropna(subset=['TotalCharges'])

Missing values in TotalCharges: 11


# Step 3: Encode Target Variable (Churn)

In [4]:
# Encode the target column: 'Yes' -> 1, 'No' -> 0.
churn_codes = {'Yes': 1, 'No': 0}

# Skip the mapping when the column already holds only 0/1. Re-running the
# original one-liner on encoded data would turn every value into NaN.
if not df['Churn'].isin(list(churn_codes.values())).all():
    encoded_churn = df['Churn'].map(churn_codes)
    # Series.map yields NaN for any label missing from the dictionary; fail
    # loudly instead of silently carrying NaN targets into the saved dataset.
    if encoded_churn.isna().any():
        unexpected = df.loc[encoded_churn.isna(), 'Churn'].unique().tolist()
        raise ValueError(f"Unexpected Churn labels: {unexpected}")
    df['Churn'] = encoded_churn

# Step 4: One-Hot Encode Categorical Features

In [6]:
# Collect every remaining object-dtype column and expand each one into
# indicator columns, leaving out the first level of every feature.
categorical_cols = list(df.select_dtypes(include='object').columns)
df = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)

# Step 5: Scale Numerical Features

In [7]:
# Standardise the three continuous features with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fit on every row of df, so its mean/variance
# include rows that may later land in a hold-out split — confirm this is
# acceptable, or fit on the training portion only.
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
scaler = StandardScaler().fit(df[numerical_cols])
df[numerical_cols] = scaler.transform(df[numerical_cols])

# Step 6: Save Cleaned Data

In [8]:
# Write the processed frame to disk.
output_path = Path("../data/processed/clean_telco.csv")

# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing (no-op when it is already there).
output_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(output_path, index=False)
# as_posix() keeps the reported path in forward-slash form on every platform.
print(f"Saved cleaned dataset to {output_path.as_posix()}")

Saved cleaned dataset to ../data/processed/clean_telco.csv
