# Exploratory Data Analysis - Churn Prediction

This notebook provides a quick overview of the churn prediction dataset and key patterns.


In [None]:
# Import libraries
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.dataset import load_data

# Configure plotting
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

%matplotlib inline


## 1. Load Data


In [None]:
# Read the raw churn CSV through the project loader imported from src.dataset.
df = load_data("../data/raw/synthetic_churn.csv")

# Structural overview: dimensions, then per-column dtypes, then a heading for
# the preview table that follows.
print(
    f"Dataset shape: {df.shape}",
    f"\nColumn types:\n{df.dtypes}",
    "\nFirst few rows:",
    sep="\n",
)

# Leave the preview as the cell's last expression so it renders as a table.
df.head()


## 2. Target Distribution


In [None]:
# Churn distribution
# Count each class explicitly by label, in a fixed [0, 1] order.
# value_counts() sorts by frequency, so pairing its positional `.values` with
# the hard-coded ['Not Churned', 'Churned'] labels would swap the labels (and
# colours) whenever churners are the majority class. Counting per label also
# yields 0 instead of a KeyError when one class is absent.
# NOTE(review): assumes `churned` is encoded as 0/1 (or False/True) — confirm
# against the loader.
class_labels = [0, 1]
class_names = ['Not Churned', 'Churned']
class_colors = ['green', 'red']

target = df['churned']
churn_counts = pd.Series(
    {label: int((target == label).sum()) for label in class_labels}
)

# Same denominator value_counts(normalize=True) uses: rows with a non-null target.
n_labelled = int(target.notna().sum())
churn_pct = churn_counts / n_labelled if n_labelled else churn_counts.astype(float)

print("Churn Distribution:")
print(f"  Not Churned (0): {churn_counts[0]} ({churn_pct[0]:.1%})")
print(f"  Churned (1): {churn_counts[1]} ({churn_pct[1]:.1%})")

# Visualization: absolute counts on the left, class shares on the right.
fig, ax = plt.subplots(1, 2, figsize=(12, 4))

ax[0].bar(class_names, churn_counts.values, color=class_colors, alpha=0.7)
ax[0].set_ylabel('Count')
ax[0].set_title('Churn Distribution (Count)')
ax[0].grid(axis='y', alpha=0.3)

ax[1].pie(churn_counts.values, labels=class_names, autopct='%1.1f%%',
          colors=class_colors, startangle=90)
ax[1].set_title('Churn Distribution (Percentage)')

plt.tight_layout()
plt.show()


## 3. Numerical Features


In [None]:
# Numeric columns summarised here; the list is reused by later cells
# (distribution plots and the correlation matrix).
numerical_cols = [
    'tenure_days',
    'tickets_last_30d',
    'avg_handle_time',
    'sentiment_avg',
    'escalations_90d',
]

print("Numerical Feature Statistics:")
# Last expression of the cell, so the describe() table is rendered.
df.loc[:, numerical_cols].describe()


In [None]:
# Distribution plots by churn status
# Size the grid from the number of features instead of hard-coding 2x3 and
# deleting "the last" axis, which only works for exactly five columns.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(numerical_cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(n_grid_rows, n_grid_cols,
                         figsize=(15, 4 * n_grid_rows), squeeze=False)
axes = axes.flatten()

# Split by class once rather than re-filtering the frame on every iteration.
not_churned_rows = df[df['churned'] == 0]
churned_rows = df[df['churned'] == 1]

for panel, col in zip(axes, numerical_cols):
    # Use one set of bin edges for both classes. Binning each class separately
    # gives bars of different widths/positions, so the overlay is not comparable.
    bin_edges = np.histogram_bin_edges(df[col].dropna(), bins=30)

    not_churned_rows[col].hist(ax=panel, bins=bin_edges, alpha=0.5,
                               label='Not Churned', color='green', edgecolor='black')
    churned_rows[col].hist(ax=panel, bins=bin_edges, alpha=0.5,
                           label='Churned', color='red', edgecolor='black')
    panel.set_xlabel(col)
    panel.set_ylabel('Frequency')
    panel.legend()
    panel.grid(alpha=0.3)

# Remove every subplot that did not receive a feature.
for unused_panel in axes[len(numerical_cols):]:
    fig.delaxes(unused_panel)

fig.suptitle('Numerical Feature Distributions by Churn Status', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()


## 4. Categorical Features


In [None]:
# Churn rate by categorical features
categorical_cols = ['channel', 'plan_tier', 'first_contact_resolution']

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# One panel per feature: bar height is the mean of `churned` within each
# category, and each bar is annotated with the number of rows behind it.
for panel, col in zip(axes, categorical_cols):
    churn_by_cat = df.groupby(col)['churned'].agg(['mean', 'count'])
    rates = churn_by_cat['mean']
    group_sizes = churn_by_cat['count']

    rates.plot.bar(ax=panel, color='steelblue', alpha=0.7, edgecolor='black')
    panel.set_ylabel('Churn Rate')
    panel.set_xlabel(col)
    panel.set_title(f'Churn Rate by {col}')
    panel.grid(axis='y', alpha=0.3)
    panel.set_xticklabels(panel.get_xticklabels(), rotation=45, ha='right')

    # Bars sit at integer x positions 0..k-1, so the enumerate index is the x coordinate.
    for bar_pos, (rate, n_rows) in enumerate(zip(rates, group_sizes)):
        panel.text(bar_pos, rate, f'n={n_rows}', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()


## 5. Correlation Analysis


In [None]:
# Pairwise correlations between the numeric features and the target.
correlation_features = [*numerical_cols, 'churned']
corr_matrix = df[correlation_features].corr()

# Annotated heatmap, colour scale centred on zero.
heatmap_style = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, ax=heatmap_ax, **heatmap_style)
heatmap_ax.set_title('Feature Correlation Matrix', fontsize=14)
plt.tight_layout()
plt.show()

# List each feature's signed correlation with the target, strongest first.
print("\nCorrelations with Churn (sorted by absolute value):")
churn_corr = corr_matrix['churned'].drop('churned').abs().sort_values(ascending=False)
for feature in churn_corr.index:
    # churn_corr holds magnitudes (used only for ordering); print the signed value.
    signed_corr = corr_matrix.loc[feature, 'churned']
    print(f"  {feature:25s}: {signed_corr:+.3f}")


## 6. Key Insights

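Hypotheses to verify against the plots and correlations above (replace each with the observed finding once the notebook has been run):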

1. **Class Balance**: Check if the churn rate is balanced or skewed
2. **Tenure**: Longer tenure typically correlates with lower churn
3. **Support Volume**: Higher ticket counts often signal dissatisfaction
4. **Sentiment**: Negative sentiment is a strong churn indicator
5. **Escalations**: Strong predictor of churn risk
6. **Channel & Plan**: Different churn rates across segments

**Next Steps:**
- Feature engineering (if needed)
- Handle class imbalance with `class_weight='balanced'`
- Train baseline model and evaluate
- Consider ensemble methods (XGBoost) for better performance


## 7. Missing Values Check


In [None]:
# Check for missing values
# Per-column null count and its share of all rows, in percent.
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100

missing_df = pd.DataFrame({
    'Missing Count': missing,
    'Percentage': missing_pct
})

# Only columns that actually contain nulls are worth reporting.
columns_with_missing = missing_df[missing_df['Missing Count'] > 0]

print("Missing Values Summary:")
if columns_with_missing.empty:
    # Report the clean case on its own instead of printing an empty DataFrame
    # repr first. The check mark was previously stored as mojibake.
    print("✓ No missing values found in the dataset!")
else:
    print(columns_with_missing)
