# Stage 08 — Exploratory Data Analysis (EDA)

Goals:
- Summarize & visualize distributions and relationships
- Interpret trends/anomalies to inform modeling
- Document insights and assumptions clearly

## 0. Setup & Imports

In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew, kurtosis

# Notebook-wide defaults: fixed RNG seed, wide-frame display, plot theme.
np.random.seed(8)  # fixed seed for the synthetic data generated below
pd.set_option('display.max_columns', 100)  # show up to 100 columns when displaying frames
sns.set(style='whitegrid', context='talk')  # seaborn theme used by every figure

# Helper display
def title(msg):
    """Print *msg* as a banner: a blank line, then msg framed by '=' rules."""
    rule = '=' * len(msg)
    print(f'\n{rule}\n{msg}\n{rule}')

## 1. Synthetic Data Generator (Reproducible)
No external CSVs required. This mimics a small financial-behavior dataset with both numeric and categorical variables and a time component.

In [ ]:
# Re-seed here (same seed as the setup cell) so this cell is reproducible on
# its own: re-running it, or running it after other random draws, still
# yields exactly the same dataset.
np.random.seed(8)

n = 180  # ~6 months of daily data
dates = pd.date_range('2021-01-01', periods=n, freq='D')

# NOTE: the order of the random draws below determines the generated values;
# keep it stable so results stay comparable between runs.
regions = np.random.choice(['North','South','East','West'], size=n, p=[0.25,0.25,0.30,0.20])
age = np.random.normal(40, 8, size=n).clip(22, 70)
income = np.random.lognormal(mean=10.6, sigma=0.3, size=n)  # lognormal skew
# Poisson counts plus a rare (~5% of rows) additive spike of 8 transactions
transactions = np.random.poisson(lam=3, size=n) + (np.random.rand(n) < 0.05).astype(int)*8
# Spend grows with income (rate shifts slightly with age) and with transactions
base_spend = (income * (0.0015 + 0.00002*(age-40)) + transactions*20)
noise = np.random.normal(0, 50, size=n)
spend = (base_spend + noise).clip(0)  # spend cannot be negative

# Assemble
df = pd.DataFrame({
    'date': dates,
    'region': regions,
    'age': age.round(1),
    'income': income.round(2),
    'transactions': transactions,
    'spend': spend.round(2)
})

# Inject missingness and outliers
df.loc[np.random.choice(df.index, size=6, replace=False), 'income'] = np.nan
df.loc[np.random.choice(df.index, size=4, replace=False), 'spend'] = np.nan
# Two rows are pushed 15 above the current maximum -> extreme outliers
df.loc[np.random.choice(df.index, size=2, replace=False), 'transactions'] = df['transactions'].max() + 15

df.head()

## 2. First Look: Structure & Sanity Checks

In [ ]:
# Structure: dtypes, non-null counts, memory usage
title('df.info()')
df.info()

# Peek at both ends of the frame
title('Head & Tail')
for preview in (df.head(3), df.tail(3)):
    display(preview)

# Per-column NaN counts (last expression, so the notebook renders it)
title('Missingness counts')
df.isna().sum()

In [ ]:
title('Descriptive statistics')
stat_cols = ['age', 'income', 'transactions', 'spend']
desc = df[stat_cols].describe().T

# Shape statistics are computed on the non-missing values of each column.
observed = {col: df[col].dropna() for col in desc.index}
desc['skew'] = [skew(values) for values in observed.values()]
desc['kurtosis'] = [kurtosis(values) for values in observed.values()]
desc

### Notes
- `income` is lognormal (positively skewed): might consider log-transform later.
- `transactions` shows rare spikes: check whether these outliers are genuine behavior or data-quality issues.
- Missing values exist in `income` and `spend`: decide imputation strategy before modeling.

## 3. Univariate Visuals: Distributions & Outliers

In [ ]:
# 2x2 grid: income and spend histograms, transactions boxplot, region counts.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
(ax_income, ax_tx), (ax_spend, ax_region) = axes

sns.histplot(df['income'], kde=True, ax=ax_income)
ax_income.set_title('Income Distribution')

sns.boxplot(x=df['transactions'], ax=ax_tx)
ax_tx.set_title('Transactions (Outliers)')

sns.histplot(df['spend'], kde=True, ax=ax_spend)
ax_spend.set_title('Spend Distribution')

sns.countplot(x=df['region'], ax=ax_region)
ax_region.set_title('Region Frequency')

plt.tight_layout()
plt.show()

## 4. Bivariate Visuals: Relationships

In [ ]:
# Side-by-side scatterplots of spend against income (colored by region) and age.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
ax_income, ax_age = axes

sns.scatterplot(data=df, x='income', y='spend', hue='region', ax=ax_income)
ax_income.set_title('Income vs Spend')

sns.scatterplot(data=df, x='age', y='spend', ax=ax_age)
ax_age.set_title('Age vs Spend')

plt.tight_layout()
plt.show()

## 5. Time Series Glance

In [ ]:
# Aggregate to weekly totals and draw each series in its own panel.
# NOTE: the frame keeps the name `daily`, but after resample('W') it holds weekly sums.
indexed_by_date = df.set_index('date')
daily = indexed_by_date[['transactions', 'spend']].resample('W').sum()

panel_titles = ['Weekly Transactions', 'Weekly Spend']
daily.plot(subplots=True, figsize=(10, 6), title=panel_titles)
plt.tight_layout()
plt.show()

## 6. Correlation Matrix (Optional but Recommended)

In [ ]:
# Pairwise correlations of the numeric features, shown as an annotated heatmap.
corr_cols = ['age', 'income', 'transactions', 'spend']
corr = df[corr_cols].corr(numeric_only=True)

plt.figure(figsize=(6, 4))
# Color scale pinned to [-1, 1] so it spans the full correlation range
sns.heatmap(corr, vmin=-1, vmax=1, cmap='vlag', annot=True, fmt='.2f')
plt.title('Correlation Matrix')
plt.show()
corr

## 7. Reusable Helper: `eda_summary(df)`

In [ ]:
def eda_summary(df: pd.DataFrame, numeric_cols=None) -> dict:
    """Return a dict with quick profiling stats and basic missingness.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to profile.
    numeric_cols : iterable of column labels, optional
        Columns to include in the numeric profile. Defaults to every column
        with a numeric dtype.

    Returns
    -------
    dict
        ``'shape'``: ``df.shape``.
        ``'dtypes'``: mapping column -> dtype.
        ``'missing'``: mapping column -> number of missing values.
        ``'numeric_profile'``: ``describe()`` transposed (one row per numeric
        column) with extra ``skew`` and ``kurtosis`` columns computed by scipy
        on the non-missing values. Empty when there are no numeric columns.
    """
    if numeric_cols is None:
        numeric_cols = df.select_dtypes(include=[np.number]).columns
    # Normalize to a list so an Index or tuple of labels selects columns too.
    numeric_cols = list(numeric_cols)

    out = {
        'shape': df.shape,
        'dtypes': df.dtypes.to_dict(),
        'missing': df.isna().sum().to_dict(),
    }

    if not numeric_cols:
        # DataFrame.describe() raises ValueError on a frame without columns,
        # so hand back an empty profile with the usual stat columns instead.
        stat_names = ['count', 'mean', 'std', 'min', '25%', '50%', '75%',
                      'max', 'skew', 'kurtosis']
        out['numeric_profile'] = pd.DataFrame(columns=stat_names, dtype=float)
        return out

    profile = df[numeric_cols].describe().T
    # Drop missing values per column before computing shape statistics.
    observed = [df[c].dropna() for c in profile.index]
    profile['skew'] = [skew(values) for values in observed]
    profile['kurtosis'] = [kurtosis(values) for values in observed]
    out['numeric_profile'] = profile
    return out

# Profile the full frame, then show its shape and the first three dtype entries.
summary = eda_summary(df)
first_dtypes = list(summary['dtypes'].items())[:3]
summary['shape'], first_dtypes

## 8. “So What?”: Insights & Assumptions
- **Skew**: `income` and `spend` are right-skewed → consider log transforms for linear models.
- **Outliers**: transactions show spikes → verify data quality; maybe winsorize.
- **Relationships**: `income` and `spend` are positively related; check for nonlinearity (it might motivate interaction terms later).
- **Missingness**: imputation strategy required before modeling.

**Next step**: translate these into feature hypotheses (ratios, lags, logs) in the Feature Engineering stage.

## 9. (Optional) Save Notebook Naming Pattern
Use: `notebooks/eda_<team>.ipynb`. (Skip actual disk I/O in this demo.)