# 1. Data Cleaning
This notebook handles data loading and cleaning for the Customer Churn Capstone Project.

In [None]:

import pandas as pd
import os

# Read the raw dataset from disk into `df`, then preview its first five rows
df = pd.read_csv(filepath_or_buffer="data/raw_data.csv")
df.head(n=5)


In [None]:

# Normalise the header row in three steps:
#   1. trim surrounding whitespace
#   2. lower-case every name
#   3. replace each space with an underscore
trimmed_names = df.columns.str.strip()
lowered_names = trimmed_names.str.lower()
df.columns = lowered_names.str.replace(" ", "_")
df.columns


In [None]:

# Drop rows that repeat an earlier row across all columns (the first
# occurrence is kept), then show the resulting (rows, columns) shape
df = df.drop_duplicates(subset=None, keep="first")
df.shape


In [None]:

# Handle missing values
# Numeric columns: fill gaps with each column's median.
# NOTE(review): if 'churn' is still a numeric 0/1 flag at this point it is
# imputed here as well, and a median of 0.5 would not be a valid flag value
# - confirm this is intended.
num_cols = df.select_dtypes(include="number").columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Object (text) columns: fill gaps with the most frequent value.
cat_cols = df.select_dtypes(include="object").columns
for col in cat_cols:
    modes = df[col].mode()
    # mode() ignores NaN by default, so a column with no observed values
    # yields an empty result; skip it instead of failing on modes[0].
    if modes.empty:
        continue
    df[col] = df[col].fillna(modes.iloc[0])


In [None]:

# Fix churn column (numeric 0/1 flag -> 'no'/'yes' labels).
# A plain dict passed to .map() turns every value that is not a key into
# NaN, which silently wipes the column when this cell is re-run on labels
# that were already converted. Looking values up with a fallback keeps
# anything outside the mapping unchanged, so the cell is safe to re-run
# and unexpected values stay visible in the counts below.
churn_labels = {0: 'no', 1: 'yes'}
df['churn'] = df['churn'].map(lambda value: churn_labels.get(value, value))
df['churn'].value_counts()


In [None]:

# Write the cleaned frame to disk without the index column.
# The output folder is created first if it does not exist yet.
os.makedirs(name="data", exist_ok=True)
df.to_csv(path_or_buf="data/cleaned_data.csv", index=False)

print("cleaned_data.csv saved successfully")
