# Exploratory Data Analysis (EDA)
## 1. Init

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Every figure in this notebook is saved under this folder; create it up front
# (exist_ok=True makes re-running the cell harmless).
figure_dir = '../visualizations/eda'
os.makedirs(figure_dir, exist_ok=True)

# Load the processed customer feature table, then show its dimensions and
# the first few rows as a quick sanity check.
features_csv = '../data/processed/customer_features.csv'
df = pd.read_csv(features_csv)
print(df.shape)
df.head()

## 2. Target Distribution

In [None]:
# Bar chart of row counts per Churn value, drawn on an explicit figure/axes
# pair rather than pyplot's implicit "current figure".
churn_fig, churn_ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Churn', ax=churn_ax)
churn_ax.set_title("Churn Distribution")

# Persist the chart before showing it inline.
churn_fig.savefig('../visualizations/eda/churn_distribution.png')
plt.show()

## 3. RFM Analysis

In [None]:
# One panel per RFM feature: (column, panel title, log-scaled y-axis?)
rfm_panels = [
    ('Recency', "Recency vs Churn", False),
    ('Frequency', "Frequency vs Churn (Log Scale)", True),
    ('TotalSpent', "TotalSpent vs Churn (Log Scale)", True),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Draw each feature's distribution split by Churn value on its own axis.
for panel_ax, (feature, panel_title, use_log) in zip(axes, rfm_panels):
    sns.boxplot(data=df, x='Churn', y=feature, ax=panel_ax)
    panel_ax.set_title(panel_title)
    if use_log:
        # Switch to a log y-axis after plotting, as the panel title advertises.
        panel_ax.set_yscale('log')

fig.savefig('../visualizations/eda/rfm_vs_churn.png')
plt.show()

## 4. Correlation Heatmap

In [None]:
# Pairwise correlations are only defined for numeric columns, so restrict
# the frame to numeric dtypes before computing the matrix.
numeric_df = df.select_dtypes(include=[np.number])
corr = numeric_df.corr()

# Render the matrix as an un-annotated heatmap on an explicit figure/axes pair.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, annot=False, cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title("Feature Correlation Heatmap")

heatmap_fig.savefig('../visualizations/eda/correlation_heatmap.png')
plt.show()

## 5. Statistical Tests (T-Test)

In [None]:
from scipy import stats

# Split the customers into the two groups being compared:
# rows with Churn == 1 versus rows with Churn == 0.
churned = df[df['Churn'] == 1]
active = df[df['Churn'] == 0]

# For each feature, run a two-sample t-test with equal_var=False (Welch's
# test, which does not assume equal variances) and flag p < 0.05.
print("Feature Significance (T-Test):")
for col in ['Recency', 'Frequency', 'TotalSpent', 'AvgDaysBetweenPurchases', 'CustomerLifetimeDays']:
    # ttest_ind propagates NaN by default: one missing value would turn the
    # p-value into NaN, and `nan < 0.05` is False, so the feature would be
    # silently reported as "Not Significant". Drop missing values per group.
    churned_vals = churned[col].dropna()
    active_vals = active[col].dropna()

    # The test needs at least two observations per group to estimate variance.
    if len(churned_vals) < 2 or len(active_vals) < 2:
        print(f"{col}: p-value=nan (Insufficient data)")
        continue

    t_stat, p_val = stats.ttest_ind(churned_vals, active_vals, equal_var=False)

    if np.isnan(p_val):
        # Still undefined (e.g. both groups are constant): do not mislabel it.
        sig = "Undetermined"
    elif p_val < 0.05:
        sig = "Significant"
    else:
        sig = "Not Significant"
    print(f"{col}: p-value={p_val:.5f} ({sig})")