### Data Cleanup

In [18]:
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import StandardScaler

# Step 1: Load raw dataset

In [19]:
# The relative path is resolved against the kernel's current working directory.
raw_csv_path = "../data/raw/telco_churn.csv"
df = pd.read_csv(raw_csv_path)

# Step 2: Drop Non-Predictive Columns

In [20]:
# Remove the customerID column, which this notebook treats as non-predictive.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell
# safe to re-run: once the column is gone, a second run is a no-op rather
# than a KeyError.
df = df.drop(columns=['customerID'], errors='ignore')

# Step 3: Convert 'TotalCharges' to numeric (some rows are blank strings)

In [21]:
# errors='coerce' converts any value that cannot be parsed as a number
# (e.g. a blank string) into NaN instead of raising.
total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges_numeric

# Step 4: Drop rows with missing TotalCharges (usually 11 rows)

In [22]:
# Discard every row whose TotalCharges is NaN; the frame is modified in place,
# so `df` keeps the same object identity and only loses the affected rows.
df.dropna(subset=['TotalCharges'], inplace=True)

# Step 5: Encode target variable

In [23]:
# Encode the target as integers: 'Yes' -> 1, 'No' -> 0.
# A plain dict-based .map() turns every value that is not a key into NaN, so
# re-running the cell (when Churn already holds 0/1) would silently wipe the
# target. Values that are not labels are therefore passed through unchanged
# and validated explicitly afterwards.
churn_codes = {'Yes': 1, 'No': 0}
churn_encoded = df['Churn'].map(lambda label: churn_codes.get(label, label))

# Anything other than 0/1 at this point (unknown label, NaN, ...) is a data
# problem that should surface here rather than as NaN targets downstream.
unexpected_labels = set(churn_encoded.unique()) - {0, 1}
if unexpected_labels:
    raise ValueError(
        f"Unexpected values in 'Churn': {sorted(unexpected_labels, key=str)}"
    )

df['Churn'] = churn_encoded.astype('int64')

# Step 6: One-hot encode categorical features (drop the first level to avoid the dummy variable trap)

In [24]:
# Every column that still has object dtype is treated as categorical.
categorical_cols = list(df.select_dtypes(include='object').columns)

# drop_first=True removes the first level's indicator for each encoded
# column, leaving k-1 indicator columns for a feature with k levels.
df = pd.get_dummies(df, drop_first=True, columns=categorical_cols)

# Step 7: Scale only INPUT features (not the target!)

In [25]:
# Only these two columns are standardised; 'MonthlyCharges' is deliberately
# left out and must NOT be scaled.
numerical_cols = ['tenure', 'TotalCharges']

# NOTE(review): the scaler is fitted on every row currently in df. If a
# train/test split happens downstream, consider fitting on training rows only.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numerical_cols])
df[numerical_cols] = scaled_values

# Step 8: Save cleaned dataset to file

In [26]:
# Write the processed frame to disk without the index column.
# The path is kept as a plain string so the confirmation message is printed
# exactly as written here, regardless of platform path separators.
output_path = "../data/processed/clean_telco.csv"

# to_csv does not create missing directories; make sure the target folder
# exists so a fresh checkout does not fail with OSError.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

df.to_csv(output_path, index=False)
print(f"Cleaned dataset saved to {output_path}")

Cleaned dataset saved to ../data/processed/clean_telco.csv
