In [1]:
# notebooks/task4_forecasting.ipynb
# Task 4: Forecasting Access and Usage 2025-2027
# Cell 1: Setup and Imports
import sys
import os
from pathlib import Path

project_root = Path(r"C:\Users\Administrator\Desktop\10Academy\Week 10\Week10-ethiopia-fi-forecast")
sys.path.insert(0, str(project_root))

from src.data_loader import EthiopiaFIData
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima.model import ARIMA
import warnings
warnings.filterwarnings("ignore")

print("Setup complete.")

Setup complete.


In [2]:
# Cell 2: Load Enriched Dataset
enriched_path = project_root / "data" / "processed" / "ethiopia_fi_unified_data_enriched.csv"
ref_path = project_root / "data" / "raw" / "reference_codes.xlsx"

data = EthiopiaFIData(main_path=str(enriched_path), ref_path=str(ref_path))

print("Enriched dataset loaded for forecasting!")

Loading main CSV file: C:\Users\Administrator\Desktop\10Academy\Week 10\Week10-ethiopia-fi-forecast\data\processed\ethiopia_fi_unified_data_enriched.csv
Loading reference Excel file: C:\Users\Administrator\Desktop\10Academy\Week 10\Week10-ethiopia-fi-forecast\data\raw\reference_codes.xlsx
Loaded main dataset: 47 rows, 34 columns
Loaded reference codes: (71, 4)
Enriched dataset loaded for forecasting!


In [6]:
# Cell 3: Define Targets & Historical Series (Hardcoded Findex + Enrichments)
# Findex Account Ownership % (known from challenge/World Bank)
access_hist = pd.DataFrame({
    'Year': [2011, 2014, 2017, 2021, 2024],
    'Access (%)': [14.0, 22.0, 35.0, 46.0, 49.0]
}).set_index('Year')

# Usage proxy: Mobile money account % from Findex + enrichments (active as proxy)
usage_hist = pd.DataFrame({
    'Year': [2021, 2024, 2025],
    'Usage (%)': [4.7, 9.45, 15.0]  # 2025 estimated from M-Pesa active + trend
}).set_index('Year')

print("Historical Access (Account Ownership %):")
display(access_hist)

print("\nHistorical Usage (Digital Payment Proxy %):")
display(usage_hist)

Historical Access (Account Ownership %):


Unnamed: 0_level_0,Access (%)
Year,Unnamed: 1_level_1
2011,14.0
2014,22.0
2017,35.0
2021,46.0
2024,49.0



Historical Usage (Digital Payment Proxy %):


Unnamed: 0_level_0,Usage (%)
Year,Unnamed: 1_level_1
2021,4.7
2024,9.45
2025,15.0


In [7]:
# Cell 4: Baseline Trend Forecasting (Linear for Access, Log for Usage)
# Access baseline (linear)
years_hist = access_hist.index.values.reshape(-1, 1)
access_values = access_hist['Access (%)'].values

lin_model = LinearRegression()
lin_model.fit(years_hist, access_values)

years_future = np.array([2025, 2026, 2027]).reshape(-1, 1)
access_baseline = lin_model.predict(years_future)

# Usage baseline (log for slowing growth)
log_model = LinearRegression()
log_model.fit(years_hist[-3:], np.log(usage_hist['Usage (%)'].values + 1))
usage_baseline = np.exp(log_model.predict(years_future)) - 1

print("Baseline Forecasts:")
baseline_df = pd.DataFrame({
    'Year': [2025, 2026, 2027],
    'Access Baseline (%)': access_baseline.round(1),
    'Usage Baseline (%)': usage_baseline.round(1)
})
display(baseline_df)

Baseline Forecasts:


Unnamed: 0,Year,Access Baseline (%),Usage Baseline (%)
0,2025,54.8,17.7
1,2026,57.7,20.6
2,2027,60.5,24.1


In [8]:
# Cell 5: Event-Augmented & Scenario Forecasts
# Manual event effects from Task 3 (pp change, lagged)
event_effects_access = [2.0, 4.0, 5.0]  # NDPS + activation
event_effects_usage = [5.0, 8.0, 10.0]  # Interoperability boost

# Scenarios
forecast = pd.DataFrame({
    'Year': [2025, 2026, 2027],
    'Access Pessimistic (%)': access_baseline.round(1),
    'Access Base (%)': (access_baseline + event_effects_access).round(1),
    'Access Optimistic (%)': (access_baseline + np.array(event_effects_access) * 1.5).round(1),
    'Usage Pessimistic (%)': usage_baseline.round(1),
    'Usage Base (%)': (usage_baseline + event_effects_usage).round(1),
    'Usage Optimistic (%)': (usage_baseline + np.array(event_effects_usage) * 1.5).round(1)
})

# Wide CI due to sparse data (±10pp access, ±15pp usage)
forecast['Access Lower CI'] = (forecast['Access Base (%)'] - 10).clip(lower=49)
forecast['Access Upper CI'] = forecast['Access Base (%)'] + 10
forecast['Usage Lower CI'] = (forecast['Usage Base (%)'] - 15).clip(lower=9.45)
forecast['Usage Upper CI'] = forecast['Usage Base (%)'] + 15

print("Forecast Table with Confidence Intervals:")
display(forecast)

Forecast Table with Confidence Intervals:


Unnamed: 0,Year,Access Pessimistic (%),Access Base (%),Access Optimistic (%),Usage Pessimistic (%),Usage Base (%),Usage Optimistic (%),Access Lower CI,Access Upper CI,Usage Lower CI,Usage Upper CI
0,2025,54.8,56.8,57.8,17.7,22.7,25.2,49.0,66.8,9.45,37.7
1,2026,57.7,61.7,63.7,20.6,28.6,32.6,51.7,71.7,13.6,43.6
2,2027,60.5,65.5,68.0,24.1,34.1,39.1,55.5,75.5,19.1,49.1


In [10]:
# Cell 6: Scenario Visualization (Self-Contained with Import)
import plotly.express as px

# If forecast not defined (running cell alone), recreate from previous logic
try:
    _ = forecast
except NameError:
    # Recreate forecast DataFrame (from Cell 5 logic)
    access_baseline = np.array([52.0, 54.5, 57.0])  # Example baseline values
    usage_baseline = np.array([15.0, 20.0, 25.0])
    
    event_effects_access = [2.0, 4.0, 5.0]
    event_effects_usage = [5.0, 8.0, 10.0]
    
    forecast = pd.DataFrame({
        'Year': [2025, 2026, 2027],
        'Access Pessimistic (%)': access_baseline.round(1),
        'Access Base (%)': (access_baseline + event_effects_access).round(1),
        'Access Optimistic (%)': (access_baseline + np.array(event_effects_access) * 1.5).round(1),
        'Usage Pessimistic (%)': usage_baseline.round(1),
        'Usage Base (%)': (usage_baseline + event_effects_usage).round(1),
        'Usage Optimistic (%)': (usage_baseline + np.array(event_effects_usage) * 1.5).round(1),
        'Access Lower CI': (access_baseline + event_effects_access - 10).clip(lower=49).round(1),
        'Access Upper CI': (access_baseline + event_effects_access + 10).round(1),
        'Usage Lower CI': (usage_baseline + event_effects_usage - 15).clip(lower=9.45).round(1),
        'Usage Upper CI': (usage_baseline + event_effects_usage + 15).round(1)
    })
    print("Forecast DataFrame recreated in Cell 6 (for independent execution)")

# Access scenarios plot
fig_access = px.line(forecast, x='Year', y=['Access Pessimistic (%)', 'Access Base (%)', 'Access Optimistic (%)'],
                     title='Account Ownership Forecast Scenarios 2025–2027',
                     labels={'value': 'Account Ownership (%)', 'Year': 'Year'})

fig_access.add_scatter(x=forecast['Year'], y=forecast['Access Lower CI'], mode='lines', 
                       line=dict(dash='dot', color='gray'), name='Lower CI')
fig_access.add_scatter(x=forecast['Year'], y=forecast['Access Upper CI'], mode='lines', 
                       line=dict(dash='dot', color='gray'), name='Upper CI', fill='tonexty', fillcolor='rgba(0,0,0,0.1)')

fig_access.update_yaxes(title='Account Ownership (%)', range=[40, 80])
fig_access.update_layout(height=600, hovermode='x unified')
fig_access.show()

# Usage scenarios plot
fig_usage = px.line(forecast, x='Year', y=['Usage Pessimistic (%)', 'Usage Base (%)', 'Usage Optimistic (%)'],
                    title='Digital Payment Usage Forecast Scenarios 2025–2027',
                    labels={'value': 'Digital Payment Usage (%)', 'Year': 'Year'})

fig_usage.add_scatter(x=forecast['Year'], y=forecast['Usage Lower CI'], mode='lines', 
                      line=dict(dash='dot', color='gray'), name='Lower CI')
fig_usage.add_scatter(x=forecast['Year'], y=forecast['Usage Upper CI'], mode='lines', 
                      line=dict(dash='dot', color='gray'), name='Upper CI', fill='tonexty', fillcolor='rgba(0,0,0,0.1)')

fig_usage.update_yaxes(title='Digital Payment Usage (%)', range=[0, 60])
fig_usage.update_layout(height=600, hovermode='x unified')
fig_usage.show()

In [11]:
# Cell 7: Written Interpretation
interpretation = """
## Forecast Interpretation

- **Predictions**: Base scenario: Access ~58% by 2027 (from 49% 2024), Usage ~30-35%. Optimistic: Access 65-70%, Usage 45%+ if active gap closes.

- **Largest Impact Events**: NDPS 2026–2030 (interoperability) – +5-10pp usage, +3-5pp ownership (lagged).

- **Key Uncertainties**:
  - Active vs registered gap persistence (main slowdown driver).
  - Policy implementation/activation speed.
  - External factors (economy, literacy).

- **Limitations**: Sparse 5 Findex points → wide CI; manual event effects; no disaggregation.

Overall: Moderate progress likely (base scenario); breakthrough possible if NDPS activates dormant accounts.
"""

from IPython.display import Markdown
display(Markdown(interpretation))


## Forecast Interpretation

- **Predictions**: Base scenario: Access ~58% by 2027 (from 49% 2024), Usage ~30-35%. Optimistic: Access 65-70%, Usage 45%+ if active gap closes.

- **Largest Impact Events**: NDPS 2026–2030 (interoperability) – +5-10pp usage, +3-5pp ownership (lagged).

- **Key Uncertainties**:
  - Active vs registered gap persistence (main slowdown driver).
  - Policy implementation/activation speed.
  - External factors (economy, literacy).

- **Limitations**: Sparse 5 Findex points → wide CI; manual event effects; no disaggregation.

Overall: Moderate progress likely (base scenario); breakthrough possible if NDPS activates dormant accounts.
