# Telco Customer Churn — Exploratory Data Analysis
**Dataset:** WA_Fn-UseC_-Telco-Customer-Churn.csv  
**Target variable:** `Churn` (Yes / No)  
**Shape:** 7 043 rows × 21 columns

In [None]:
# ──────────────────────────────────────────────
# 1.  Imports & global settings
# ──────────────────────────────────────────────
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import warnings
# Silence every warning category so cell outputs stay compact.
# NOTE(review): this also hides pandas/seaborn deprecation notices.
warnings.simplefilter('ignore')

# Plot defaults shared by all figures below
sns.set_theme(palette='muted', style='whitegrid')
plt.rcParams['figure.figsize'] = (10, 5)
plt.rcParams['axes.titlesize'] = 14

# Read the raw CSV from the working directory; the trailing expression
# displays (rows, columns) as the cell output.
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.shape

---
## 2. Initial Inspection

In [None]:
# Preview the first rows of the freshly loaded frame
df.head()

In [None]:
# Column dtypes and non-null counts (TotalCharges is fixed up in section 3)
df.info()

In [None]:
# Summary statistics for every column, numeric and non-numeric alike
df.describe(include='all')

---
## 3. Data Cleaning

In [None]:
# 3a. Drop identifier column  –  customerID is unique per row, no predictive value.
#     errors='ignore' keeps the cell safe to re-run once the column is gone.
df = df.drop(columns=['customerID'], errors='ignore')

# 3b. Fix TotalCharges  –  stored as object; 11 rows contain blank strings
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
print('NaNs in TotalCharges after coercion:', df['TotalCharges'].isna().sum())

# Fill the 11 missing values with the median (robust to skew).
# Assign the result back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form is deprecated and does not
# modify df when pandas Copy-on-Write is active.
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

# 3c. Encode target  –  Yes → 1, No → 0.
#     Only map while the column still holds the raw labels; mapping an
#     already-encoded 0/1 column would turn every value into NaN on re-run.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

print('\nRemaining nulls:\n', df.isnull().sum())
print('\nChurn distribution after cleaning:\n', df['Churn'].value_counts())

---
## 4. Target Distribution  (Class Balance Check)

In [None]:
# Class counts ordered 0 (no churn) then 1 (churn), shared by both panels
counts = df['Churn'].value_counts().sort_index()
class_colors = ['#4C72B0', '#DD8452']

fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: absolute counts, each bar annotated with its value
bar_ax.bar(['No Churn (0)', 'Churn (1)'], counts.values,
           color=class_colors, edgecolor='black', width=0.45)
for pos, n_customers in enumerate(counts.values):
    bar_ax.text(pos, n_customers + 40, str(n_customers),
                ha='center', fontweight='bold', fontsize=13)
bar_ax.set_ylabel('Count')
bar_ax.set_title('Churn Class Distribution')

# Right panel: the same counts as shares of the whole
pie_ax.pie(counts.values, labels=['No Churn', 'Churn'],
           autopct='%1.1f%%', colors=class_colors,
           startangle=90, textprops={'fontsize': 12})
pie_ax.set_title('Churn Ratio')

plt.tight_layout()
plt.show()

# Churn is 0/1 encoded, so its mean is the churn fraction
churn_pct = df["Churn"].mean() * 100
print(f'Churn rate: {churn_pct:.2f} %  →  moderately imbalanced dataset')

---
## 5. Numerical Features — Distributions & Churn Split

In [None]:
num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# squeeze=False keeps `axes` two-dimensional even if num_cols has one entry
fig, axes = plt.subplots(len(num_cols), 2, squeeze=False,
                         figsize=(14, 4 * len(num_cols)))

for i, col in enumerate(num_cols):
    col_mean = df[col].mean()
    col_median = df[col].median()

    # One set of bin edges per feature, computed on the full column and
    # reused for every histogram in this row. Letting each churn subset pick
    # its own 40 bins would give different bin widths per class and make the
    # overlaid bar heights incomparable.
    bin_edges = np.histogram_bin_edges(df[col].dropna(), bins=40)

    # --- histogram (overall) ---
    axes[i, 0].hist(df[col], bins=bin_edges, color='#4C72B0', edgecolor='black', alpha=0.8)
    axes[i, 0].axvline(col_mean, color='red', ls='--', lw=1.5, label=f'Mean {col_mean:.1f}')
    axes[i, 0].axvline(col_median, color='orange', ls='--', lw=1.5, label=f'Median {col_median:.1f}')
    axes[i, 0].legend(fontsize=9)
    axes[i, 0].set_title(f'{col} — Overall Distribution')
    axes[i, 0].set_xlabel(col)

    # --- overlaid histograms by churn class (raw counts, shared bins) ---
    for label, color in [(0, '#4C72B0'), (1, '#DD8452')]:
        subset = df.loc[df['Churn'] == label, col]
        axes[i, 1].hist(subset, bins=bin_edges, alpha=0.45, color=color,
                        label='No Churn' if label == 0 else 'Churn', edgecolor='black')
    axes[i, 1].legend()
    axes[i, 1].set_title(f'{col} — By Churn Status')
    axes[i, 1].set_xlabel(col)

plt.tight_layout()
plt.show()

In [None]:
# One box-plot panel per numeric feature, split by churn class, to compare
# spread and spot outliers
churn_palette = {0: '#4C72B0', 1: '#DD8452'}

fig, axes = plt.subplots(1, len(num_cols), figsize=(14, 4))
for ax, col in zip(axes, num_cols):
    sns.boxplot(data=df, x='Churn', y=col, palette=churn_palette, ax=ax)
    # Replace the 0/1 tick labels with readable class names
    ax.set_xticklabels(['No Churn', 'Churn'])
    ax.set_title(col)

plt.suptitle('Numerical Features — Outlier & Churn Comparison', y=1.02, fontsize=14)
plt.tight_layout()
plt.show()

---
## 6. Categorical Features — Churn Rate Analysis

In [None]:
cat_cols = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaperlessBilling', 'PaymentMethod',
]

# Grid layout: fixed 4 columns, as many rows as needed (ceiling division)
n_cols_grid = 4
n_rows_grid = -(-len(cat_cols) // n_cols_grid)

fig, axes = plt.subplots(n_rows_grid, n_cols_grid,
                         figsize=(6 * n_cols_grid, 4.5 * n_rows_grid))
axes = axes.flatten()

# One stacked bar chart per feature: within each category, the share of
# customers that stayed vs. churned (rows sum to 100 %)
for ax, col in zip(axes, cat_cols):
    ct = pd.crosstab(df[col], df['Churn'], normalize='index').mul(100)
    ct.plot(kind='bar', stacked=True, ax=ax,
            color=['#4C72B0', '#DD8452'], edgecolor='black', width=0.6)
    ax.set_title(col, fontsize=11)
    ax.set_ylabel('% of category')
    ax.set_xlabel('')
    ax.tick_params(axis='x', rotation=30)
    ax.legend(title='Churn', labels=['No', 'Yes'], fontsize=8)

# Hide any grid cells left over after the last feature
for spare_ax in axes[len(cat_cols):]:
    spare_ax.set_visible(False)

plt.suptitle('Churn Rate by Categorical Feature', fontsize=16, y=1.01)
plt.tight_layout()
plt.show()

---
## 7. Churn-Rate Summary Table  (sorted)

In [None]:
# Long-format table: one row per (feature, category) pair holding the churn
# rate in percent. Churn is 0/1, so the group mean is the churn fraction.
rows = [
    {'Feature': col, 'Category': str(cat), 'Churn Rate (%)': round(rate, 2)}
    for col in cat_cols
    for cat, rate in (df.groupby(col)['Churn'].mean() * 100).items()
]

# Highest-churn categories first, with a clean 0..n-1 index
summary_unsorted = pd.DataFrame(rows)
churn_summary = summary_unsorted.sort_values('Churn Rate (%)', ascending=False)
churn_summary = churn_summary.reset_index(drop=True)
churn_summary

---
## 8. Correlation Heatmap  (all features encoded)

In [None]:
# Work on a copy so the original df keeps its string categories.
# Every object-dtype column is replaced by integer codes from LabelEncoder.
# NOTE(review): the codes impose an arbitrary order on nominal features, so
# correlations for multi-level columns are indicative only.
df_enc = df.copy()
le = LabelEncoder()
object_cols = df_enc.select_dtypes(include='object').columns
for name in object_cols:
    df_enc[name] = le.fit_transform(df_enc[name])

corr = df_enc.corr()

# Mask the upper triangle (diagonal included) so each pair appears once
mask = np.triu(np.ones(corr.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(14, 11))
sns.heatmap(
    corr,
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    linewidths=0.5,
    square=True,
    cbar_kws={'shrink': 0.8},
    ax=ax,
)
ax.set_title('Feature Correlation Heatmap (Label-Encoded)', pad=15)
plt.tight_layout()
plt.show()

---
## 9. Top Features Correlated with Churn

In [None]:
# Correlation of every feature with Churn (Churn's self-correlation removed),
# ranked by magnitude; the sign is discarded for this view
signed_corr = corr['Churn'].drop('Churn')
churn_corr = signed_corr.abs().sort_values(ascending=False)

# Colour each bar by its strength relative to the strongest feature
bar_colors = plt.cm.RdYlGn_r(churn_corr.values / churn_corr.max())

fig, ax = plt.subplots(figsize=(9, 5))
bars = ax.barh(churn_corr.index, churn_corr.values,
               color=bar_colors, edgecolor='black')
ax.set_xlabel('|Correlation with Churn|')
ax.set_title('Absolute Correlation of Each Feature with Churn')
ax.invert_yaxis()  # strongest feature at the top

# Print the value just past the end of each bar, vertically centred
for bar, val in zip(bars, churn_corr.values):
    y_mid = bar.get_y() + bar.get_height()/2
    ax.text(val + 0.005, y_mid, f'{val:.3f}', va='center', fontsize=9)

plt.tight_layout()
plt.show()

---
## 10. Pairplot — Key Numerical Features coloured by Churn

In [None]:
# Numeric features to show in the grid. Churn is only used for colouring.
pair_vars = ['tenure', 'MonthlyCharges', 'TotalCharges']

pair_df = df[pair_vars + ['Churn']].copy()
pair_df['Churn_label'] = pair_df['Churn'].map({0: 'No Churn', 1: 'Churn'})

# Restrict the grid with vars=: without it pairplot treats the numeric 0/1
# Churn column as a fourth variable, adding a row/column of two-valued
# scatter plots and a diagonal KDE with zero variance inside each hue group.
g = sns.pairplot(pair_df, vars=pair_vars, hue='Churn_label', diag_kind='kde',
                 palette={'No Churn': '#4C72B0', 'Churn': '#DD8452'},
                 plot_kws={'alpha': 0.5, 's': 15})
g.fig.suptitle('Pairplot of Numerical Features by Churn', y=1.02, fontsize=14)
plt.show()

---
## 11. Key EDA Findings

| # | Observation |
|---|-------------|
| 1 | **Class imbalance** – ~26.5 % of customers churned; models should use stratified splits and consider class-weight or SMOTE. |
| 2 | **Tenure** – Churners cluster heavily in the first ~20 months. Long-tenure customers rarely churn. |
| 3 | **MonthlyCharges** – Higher charges correlate with churn; the churner distribution is right-shifted. |
| 4 | **TotalCharges** – Low values (new customers) dominate the churn group; 11 blank entries were median-imputed. |
| 5 | **Contract type** – Month-to-month contracts have the highest churn rate (~42 %). Two-year contracts churn < 5 %. |
| 6 | **Internet & add-ons** – Fiber-optic subscribers and those without Online Security / Tech Support churn more. |
| 7 | **Payment** – Electronic-check users churn most; automatic methods (bank transfer, credit card) are more stable. |
| 8 | **Gender / PhoneService** – Nearly neutral w.r.t. churn, low predictive signal. |
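| 9 | **Strongest correlations with Churn** – Contract, tenure, OnlineSecurity, TechSupport, InternetService (after encoding). Label encoding assigns integer codes in alphabetical order, so correlations for nominal multi-level features (e.g. InternetService, PaymentMethod) are indicative only and should be confirmed with the per-category churn rates. |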
| 10 | **TotalCharges vs tenure** – Very high positive correlation (~0.83); potential multicollinearity to watch during modelling. |