# Task 2: Exploratory Data Analysis — Financial Inclusion in Ethiopia

**Objective:** Analyze patterns and factors influencing financial inclusion.

1. Dataset overview & temporal coverage
2. Access analysis (account ownership trajectory, growth rates, gender)
3. Usage analysis (digital payments, mobile money)
4. Infrastructure and enablers
5. Event timeline and overlay on trends
6. Correlation analysis
7. Key insights & data quality assessment

In [None]:
import sys
from pathlib import Path
# Locate the project root so that `src` (imported below) is importable.
# Walk upward from the working directory to the nearest folder containing a
# `src/` directory. A substring test on the full path would misfire when an
# ancestor directory merely contains "notebooks" in its name, and would pick
# the wrong parent when the notebook runs from a nested subfolder.
_cwd = Path.cwd()
ROOT = next(
    (candidate for candidate in (_cwd, *_cwd.parents) if (candidate / "src").is_dir()),
    # Fallback when no `src/` is found: step out of a "notebooks" folder only.
    _cwd.parent if _cwd.name == "notebooks" else _cwd,
)
sys.path.insert(0, str(ROOT))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from src.data import load_unified_data, load_reference_codes, load_guide, enrich_unified_data
from src.analysis import get_access_series, get_usage_series, get_events_timeline

# Show up to 20 columns when pandas displays a DataFrame.
pd.set_option("display.max_columns", 20)
# Apply seaborn's "whitegrid" theme to the plots drawn in this notebook.
sns.set_theme(style="whitegrid")

In [None]:
# Load the unified dataset and run the project's enrichment step on it.
df = enrich_unified_data(load_unified_data())

# One independent working copy per record type.
obs, events, impact = (
    df.loc[df["record_type"] == kind].copy()
    for kind in ("observation", "event", "impact_link")
)

# Parse observation dates and derive the calendar year used for grouping.
obs = obs.assign(
    observation_date=lambda frame: pd.to_datetime(frame["observation_date"]),
    year=lambda frame: frame["observation_date"].dt.year,
)
events = events.assign(
    observation_date=lambda frame: pd.to_datetime(frame["observation_date"])
)

---
## 1. Dataset Overview

### 1.1 Summarize by record_type, pillar, source_type

In [None]:
# (heading, frame, column) triples: one value-count table per entry,
# separated by a blank line.
summaries = (
    ("Record types:", df, "record_type"),
    ("Observations by pillar:", obs, "pillar"),
    ("Observations by source_type:", obs, "source_type"),
)
for position, (heading, frame, column) in enumerate(summaries):
    if position:
        print()
    print(heading)
    print(frame[column].value_counts().to_string())

### 1.2 Temporal coverage: which years have data for which indicators?

In [None]:
# Indicator x year matrix of mean values; only presence/absence is plotted.
ind_year = pd.pivot_table(
    obs,
    index="indicator_code",
    columns="year",
    values="value_numeric",
    aggfunc="mean",
)
# 1 where at least one numeric value exists for that indicator/year, else 0.
ind_year_bin = ind_year.notna().astype(int)

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(ind_year_bin, cmap="Blues", cbar_kws={"label": "Has data (1)"}, ax=ax)
ax.set_title("Temporal coverage: indicators x year")
fig.tight_layout()
plt.show()

### 1.3 Data quality: distribution of confidence levels

In [None]:
# Count observations per confidence label (most frequent first).
conf = obs["confidence"].value_counts()

fig, ax = plt.subplots(figsize=(6, 4))
bar_colors = ["#2ecc71", "#f39c12", "#e74c3c", "#95a5a6"]
conf.plot(kind="bar", color=bar_colors, ax=ax)
ax.set_title("Observation confidence distribution")
ax.set_ylabel("Count")
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

print(conf.to_string())

### 1.4 Gaps: indicators with sparse coverage

In [None]:
# Per indicator: number of records and number of distinct years covered,
# ordered so the most sparsely observed indicators come first.
by_indicator = obs.groupby("indicator_code")
n_obs_per_ind = pd.DataFrame(
    {
        "n_obs": by_indicator["record_id"].count(),
        "years": by_indicator["year"].nunique(),
    }
).sort_values("n_obs")

print("Indicators by number of observations (sparse at top):")
print(n_obs_per_ind.to_string())

---
## 2. Access Analysis

### 2.1 Account ownership trajectory (2011–2024)

In [None]:
# ACC_OWNERSHIP observations with gender == "all", in chronological order.
is_ownership = obs["indicator_code"].eq("ACC_OWNERSHIP")
is_all_genders = obs["gender"].eq("all")
acc_nat = obs.loc[is_ownership & is_all_genders].sort_values("observation_date")

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(
    acc_nat["observation_date"],
    acc_nat["value_numeric"],
    marker="o",
    linestyle="-",
    color="#2980b9",
    linewidth=2,
    markersize=10,
)
ax.set(
    xlabel="Year",
    ylabel="Account ownership (%)",
    title="Ethiopia: Account ownership trajectory (2011–2024)",
    ylim=(0, 100),
)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

### 2.2 Growth rates between survey years

In [None]:
# One row per year (earliest observation kept), with the change against the
# previous retained row in percentage points and in percent.
acc_ts = (
    acc_nat.sort_values("observation_date")
    .drop_duplicates(subset=["year"], keep="first")
    .assign(
        prev=lambda frame: frame["value_numeric"].shift(1),
        growth_pp=lambda frame: frame["value_numeric"] - frame["prev"],
        growth_pct=lambda frame: frame["growth_pp"] / frame["prev"] * 100,
    )
)

print("Account ownership: level and growth between survey years")
report_columns = ["year", "value_numeric", "growth_pp", "growth_pct"]
print(acc_ts[report_columns].to_string(index=False))

# Bar chart of the percentage-point change; the first year has no predecessor.
fig, ax = plt.subplots(figsize=(8, 4))
year_labels = acc_ts["year"].astype(str)
ax.bar(year_labels, acc_ts["growth_pp"], color="#27ae60", alpha=0.8)
ax.axhline(0, color="black", linewidth=0.5)
ax.set(
    xlabel="Year (period end)",
    ylabel="Growth (percentage points)",
    title="Account ownership: change vs previous survey",
)
fig.tight_layout()
plt.show()

### 2.3 Gender gap (male vs female ownership)

In [None]:
# Male vs female account ownership per year, plus the gap in percentage points.
acc_gender = obs[(obs["indicator_code"] == "ACC_OWNERSHIP") & (obs["gender"].isin(["male", "female"]))]

# The gap needs usable values for BOTH genders. A plain row-count check can
# pass with rows for a single gender only, and the pivot would then lack the
# "male" or "female" column and raise KeyError.
usable = acc_gender.dropna(subset=["year", "value_numeric"])
if {"male", "female"} <= set(usable["gender"]):
    pivot = acc_gender.pivot_table(index="year", columns="gender", values="value_numeric")
    pivot["gap_pp"] = pivot["male"] - pivot["female"]
    print("Account ownership by gender and gap (pp):")
    print(pivot.to_string())

    # Side-by-side bars: male shifted left, female shifted right of each tick.
    fig, ax = plt.subplots(figsize=(8, 4))
    x = pivot.index.astype(str)
    positions = np.arange(len(x))
    ax.bar(positions - 0.2, pivot["male"], width=0.4, label="Male", color="#3498db")
    ax.bar(positions + 0.2, pivot["female"], width=0.4, label="Female", color="#e74c3c")
    ax.set_xticks(positions)
    ax.set_xticklabels(x)
    ax.set_ylabel("%")
    ax.set_title("Account ownership by gender")
    ax.legend()
    plt.tight_layout()
    plt.show()
else:
    print("Insufficient gender-disaggregated data for ACC_OWNERSHIP.")

# Separately reported gender-gap indicator, listed chronologically so the
# "over time" table reads in date order.
gen_gap = obs[obs["indicator_code"] == "GEN_GAP_ACC"].sort_values("observation_date")
if len(gen_gap) > 0:
    print("Gender gap (GEN_GAP_ACC) over time:")
    print(gen_gap[["observation_date", "value_numeric"]].to_string(index=False))

### 2.4 Urban vs rural
*(Dataset has location="national" only; no urban/rural disaggregation in observations.)*

In [None]:
# Distinct non-missing location labels found in the observations.
location_values = obs["location"].dropna().unique().tolist()
print("Location values in observations:", location_values)

### 2.5 2021–2024 slowdown
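Account ownership grew by only 3 pp (46% → 49%) between 2021 and 2024 despite massive mobile money expansion. Possible explanations: (1) survey timing relative to operator rollout; (2) the difference between "registered" accounts (operator data) and accounts "used in the past 12 months" (survey data); (3) double-counting of people who hold both a bank and a mobile money account; and (4) structural lags among rural residents and women.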

---
## 3. Usage (Digital Payments) Analysis

### 3.1 Mobile money account penetration (2014–2024)

In [None]:
# ACC_MM_ACCOUNT observations in chronological order.
mm_acc = obs.loc[obs["indicator_code"].eq("ACC_MM_ACCOUNT")].sort_values("observation_date")

if mm_acc.empty:
    print("No ACC_MM_ACCOUNT data.")
else:
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(
        mm_acc["observation_date"],
        mm_acc["value_numeric"],
        marker="s",
        linestyle="-",
        color="#8e44ad",
        linewidth=2,
    )
    ax.set(
        xlabel="Year",
        ylabel="Mobile money account (%)",
        title="Mobile money account penetration",
        ylim=(0, 15),
    )
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()
    table_columns = ["observation_date", "value_numeric", "unit"]
    print(mm_acc[table_columns].to_string(index=False))

### 3.2 Digital payment adoption

In [None]:
# USG_DIGITAL_PAY observations in chronological order.
dig = obs.loc[obs["indicator_code"].eq("USG_DIGITAL_PAY")].sort_values("observation_date")

if not dig.empty:
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(
        dig["observation_date"],
        dig["value_numeric"],
        marker="o",
        linestyle="-",
        color="#16a085",
        linewidth=2,
    )
    ax.set(
        xlabel="Year",
        ylabel="%",
        title="Digital payment adoption (made or received digital payment, past 12 months)",
        ylim=(0, 50),
    )
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()

# Interpretation note, printed whether or not the chart was drawn.
print("Registered vs active: Telebirr/M-Pesa user counts vs Findex MM account % — different denominators; operator data = registered, Findex = usage in past 12 months.")

---
## 4. Infrastructure and Enablers

In [None]:
# Indicator codes treated as infrastructure / enabler series.
infra_codes = ["ACC_4G_COV", "ACC_MOBILE_PEN", "ACC_ATM_DENSITY", "ACC_BRANCH_DENSITY", "ACC_FAYDA"]
# Keep only those indicators and drop rows without a numeric value.
infra = obs.loc[obs["indicator_code"].isin(infra_codes)].dropna(subset=["value_numeric"])

if not infra.empty:
    fig, ax = plt.subplots(figsize=(10, 5))
    # One line per indicator, in order of first appearance in the data.
    for code, sub in infra.groupby("indicator_code", sort=False):
        sub = sub.sort_values("observation_date")
        ax.plot(
            sub["observation_date"],
            sub["value_numeric"],
            marker="o",
            linestyle="-",
            label=code,
            alpha=0.8,
        )
    ax.set(
        xlabel="Date",
        ylabel="Value",
        title="Infrastructure & enablers (4G, mobile pen, Fayda; ATM/branch if present)",
    )
    ax.legend(bbox_to_anchor=(1.02, 1))
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()

print("Leading indicators: 4G coverage and mobile penetration enable account opening and usage; Fayda supports KYC and inclusion.")

---
## 5. Event Timeline and Overlay

In [None]:
# Events table from the project helper; one scatter point per event row.
ev = get_events_timeline(df)
n_events = len(ev)

# Figure height grows with the number of events (minimum 6).
fig, ax = plt.subplots(figsize=(12, max(6, n_events * 0.4)))
y_pos = np.arange(n_events)
dates = pd.to_datetime(ev["observation_date"])
# Tick label: first 55 characters of the event name plus its category.
tick_labels = ev["indicator"].str[:55] + " (" + ev["category"].astype(str) + ")"

ax.scatter(dates, y_pos, s=80, color="steelblue", zorder=2)
ax.set_yticks(y_pos)
ax.set_yticklabels(tick_labels, fontsize=9)
ax.set(
    xlabel="Date",
    title="Cataloged events timeline",
    xlim=(pd.Timestamp("2020-01-01"), pd.Timestamp("2026-01-01")),
    ylim=(-0.5, n_events - 0.5),
)
ax.grid(True, axis="x", alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Account ownership line with one dashed vertical marker per cataloged event.
fig, ax = plt.subplots(figsize=(12, 6))
acc_nat = obs[(obs["indicator_code"] == "ACC_OWNERSHIP") & (obs["gender"] == "all")].sort_values("observation_date")
ax.plot(acc_nat["observation_date"], acc_nat["value_numeric"], "o-", label="Account ownership", color="#2980b9", linewidth=2)
ax.set_ylim(0, 100)

max_label_chars = 25
# Parse event dates explicitly (as the timeline cell does) so the markers
# share the datetime x-axis of the ownership series.
event_dates = pd.to_datetime(ev["observation_date"])
for when, name in zip(event_dates, ev["indicator"]):
    if pd.isna(when):
        # An event without a date cannot be positioned on the time axis.
        continue
    ax.axvline(when, color="red", alpha=0.4, linestyle="--")
    label = "" if pd.isna(name) else str(name)
    # Add the ellipsis only when the label was actually shortened.
    if len(label) > max_label_chars:
        label = label[:max_label_chars] + "…"
    ax.text(when, 92, label, rotation=90, fontsize=8)

ax.set_xlabel("Year")
ax.set_ylabel("Account ownership (%)")
ax.set_title("Account ownership with events overlay")
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

---
## 6. Correlation Analysis

In [None]:
# One row per year (mean across dates in that year) for cleaner correlation
obs_wide = obs.pivot_table(index="year", columns="indicator_code", values="value_numeric", aggfunc="mean")
numeric = obs_wide.select_dtypes(include=[np.number])
# Computed unconditionally so `corr` exists for the following cell even when
# fewer than two indicators are available.
# NOTE: correlations are pairwise over shared years; a pair of indicators that
# overlaps in only two years yields a trivial +/-1 coefficient.
corr = numeric.corr()
if numeric.shape[1] >= 2:
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr, annot=True, fmt=".2f", cmap="RdBu_r", center=0, square=True)
    # The matrix is built from yearly means, so the title says so.
    plt.title("Correlation between indicators (yearly means)")
    plt.tight_layout()
    plt.show()
    # Guarded like the USG_DIGITAL_PAY lookup in the next cell: the indicator
    # may have no numeric yearly values, in which case the column is absent.
    if "ACC_OWNERSHIP" in corr.columns:
        acc_corr = corr["ACC_OWNERSHIP"].drop("ACC_OWNERSHIP", errors="ignore").sort_values(ascending=False, key=lambda x: x.abs())
        print("Factors most correlated with ACC_OWNERSHIP:")
        print(acc_corr.head(10).to_string())
    else:
        print("ACC_OWNERSHIP has no numeric yearly values; skipping its correlation ranking.")

In [None]:
# Rank the other indicators by absolute correlation with digital payment usage.
usage_code = "USG_DIGITAL_PAY"
if usage_code in numeric.columns:
    usage_column = corr[usage_code].drop(usage_code, errors="ignore")
    use_corr = usage_column.sort_values(ascending=False, key=abs)
    print("Factors most correlated with USG_DIGITAL_PAY:")
    print(use_corr.head(10).to_string())

# Event -> indicator links recorded in the dataset.
impact_columns = [
    "parent_id",
    "related_indicator",
    "impact_direction",
    "impact_magnitude",
    "impact_estimate",
    "lag_months",
]
print("\nImpact_link records (event -> indicator):")
print(impact[impact_columns].to_string(index=False))

---
## 7. Key Insights & Data Quality Assessment

See **`eda_insights_and_quality.md`** in the project root for:
- At least 5 key insights with supporting evidence
- Data quality assessment and limitations

**Market nuances (Guide Sheet D):** P2P transfers dominate, including for commerce; mobile-money-only users are rare (~0.5%); bank accounts are accessible; and formal credit use is low. These points matter when interpreting Findex figures against operator data.