In [3]:
# ==========================================
# ðŸ“Š Data Cleaning and Preparation for Customer Churn Analysis
# ==========================================

# STEP 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# ==========================================
# STEP 2: Load Dataset
# ==========================================
df = pd.read_csv("File.csv")

# Preview data
print(df.head())
print(df.info())
print(df.shape)

# ==========================================
# STEP 3: Handle Missing Values
# ==========================================

# Check missing values
print("\nMissing values:\n", df.isnull().sum())

# Example: Fill numeric columns with mean, categorical with mode
for col in df.columns:
    if df[col].dtype == 'object':
        df[col].fillna(df[col].mode()[0], inplace=True)
    else:
        df[col].fillna(df[col].mean(), inplace=True)

# ==========================================
# STEP 4: Remove Duplicates
# ==========================================
print("\nDuplicate rows before:", df.duplicated().sum())
df.drop_duplicates(inplace=True)
print("Duplicate rows after:", df.duplicated().sum())

# ==========================================
# STEP 5: Handle Inconsistent Data
# ==========================================
# Example: Standardize text columns (like Yes/No, Male/Female)
df['Churn'] = df['Churn'].str.strip().str.lower().replace({'yes': 1, 'no': 0})
df['Gender'] = df['Gender'].str.title()

# ==========================================
# STEP 6: Convert Data Types
# ==========================================
# Example: Convert TotalCharges from object to numeric
if 'TotalCharges' in df.columns:
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
    df['TotalCharges'].fillna(df['TotalCharges'].mean(), inplace=True)

# ==========================================
# STEP 7: Identify and Handle Outliers
# ==========================================
# Using boxplot to visualize numeric columns
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns

for col in numeric_cols:
    plt.figure()
    sns.boxplot(x=df[col])
    plt.title(f"Boxplot of {col}")
    plt.show()

# Optional: Handle outliers using IQR method
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df[col] = np.where(df[col] < lower, lower, df[col])
    df[col] = np.where(df[col] > upper, upper, df[col])

# ==========================================
# STEP 8: Feature Engineering
# ==========================================
# Example features
if 'MonthlyCharges' in df.columns and 'tenure' in df.columns:
    df['AvgChargesPerMonth'] = df['TotalCharges'] / (df['tenure'] + 1)

if 'SeniorCitizen' in df.columns:
    df['IsSenior'] = df['SeniorCitizen'].apply(lambda x: 1 if x == 1 else 0)

# ==========================================
# STEP 9: Normalize or Scale Data
# ==========================================
scaler = StandardScaler()
num_cols = df.select_dtypes(include=['float64', 'int64']).columns

df[num_cols] = scaler.fit_transform(df[num_cols])

# ==========================================
# STEP 10: Split into Train and Test Sets
# ==========================================
if 'Churn' in df.columns:
    X = df.drop('Churn', axis=1)
    y = df['Churn']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print("\nTraining set:", X_train.shape)
    print("Testing set:", X_test.shape)

# ==========================================
# STEP 11: Export Cleaned Dataset
# ==========================================
df.to_csv("Cleaned_Telecom_Customer_Churn.csv", index=False)
print("\nâœ… Cleaned dataset saved successfully as 'Cleaned_Telecom_Customer_Churn.csv'")


   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
  df['Churn'] = df['Churn'].str.strip().str.lower().replace({'yes': 1, 'no': 0})


KeyError: 'Gender'