# FIXED Notebook (v2): Youth Unemployment & Civil Crises in SSA
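This notebook **runs even if the UCDP data is missing**: when `data/raw/ucdp_country_year.csv` is not found, it generates synthetic (random) conflict data as a placeholder, so results from such a run only test the pipeline and carry no substantive meaning.
Once you add the real UCDP CSV at that path (with `country` or `country_name`, `year`, and `battle_deaths` columns), the notebook automatically uses it instead.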

In [None]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import requests
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

# ---------- SETUP ----------
# Project layout, relative to the working directory:
#   data/raw        raw inputs (the optional UCDP CSV is read from here)
#   data/processed  tables written by this notebook
#   outputs         figures written by this notebook
BASE = Path('.')
DATA = BASE / 'data'
RAW, PROC = DATA / 'raw', DATA / 'processed'
OUT = BASE / 'outputs'

# Create every working directory up front; existing ones are left untouched.
for p in (RAW, PROC, OUT):
    p.mkdir(parents=True, exist_ok=True)

def fetch_wdi(indicator, start=2000, end=2023, country='SSA', timeout=30):
    """Download one World Bank WDI indicator as a tidy DataFrame.

    Parameters
    ----------
    indicator : str
        WDI indicator code, e.g. ``'SL.UEM.1524.ZS'``.
    start, end : int
        First and last year of the requested date range.
    country : str
        Country or aggregate code placed in the URL path. The default
        ``'SSA'`` keeps the request this notebook has always made.
        NOTE(review): 'SSA' looks like a regional *aggregate* code, in which
        case the response holds one aggregate series rather than one series
        per country - confirm this is what the analysis intends.
    timeout : float
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        Columns ``country``, ``iso3``, ``year`` (int) and ``value``.
        Empty (but with these columns) when the API reports no observations.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the JSON payload does not have the expected two-element shape
        (the API reports request errors this way).
    """
    url = (
        f"https://api.worldbank.org/v2/country/{country}/indicator/{indicator}"
        f"?date={start}:{end}&format=json&per_page=20000"
    )
    # Without a timeout a stalled connection would block the notebook forever.
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    payload = r.json()
    # A successful answer is [metadata, observations]; anything else is an
    # API-level error message and should not surface as an IndexError.
    if not isinstance(payload, list) or len(payload) < 2:
        raise ValueError(
            f"Unexpected World Bank API response for {indicator!r}: {payload!r}"
        )

    columns = ['country', 'iso3', 'year', 'value']
    records = payload[1]
    if not records:
        # No observations: return an empty frame with the usual schema so the
        # downstream merges still work.
        return pd.DataFrame(columns=columns).astype({'year': int})

    df = pd.json_normalize(records)
    # 'country.id' carries the World Bank 2-letter code; the 3-letter ISO code
    # lives in 'countryiso3code', which is what the 'iso3' column promises.
    df = df[['country.value', 'countryiso3code', 'date', 'value']].rename(
        columns={
            'country.value': 'country',
            'countryiso3code': 'iso3',
            'date': 'year',
        }
    )
    df['year'] = df['year'].astype(int)
    return df

from functools import reduce

# Fetch WDI variables: one frame per indicator, with the generic 'value'
# column renamed to the variable name used in the rest of the notebook.
youth = fetch_wdi('SL.UEM.1524.ZS').rename(columns={'value': 'youth_unemp'})
gdp = fetch_wdi('NY.GDP.PCAP.KD').rename(columns={'value': 'gdp_pc'})
pop = fetch_wdi('SP.POP.TOTL').rename(columns={'value': 'pop'})
urb = fetch_wdi('SP.URB.TOTL.IN.ZS').rename(columns={'value': 'pct_urban'})

dfs = [youth, gdp, pop, urb]

# Fold the frames into one table. The outer join keeps a row whenever at least
# one indicator has an entry for that country, code and year.
merge_keys = ['country', 'iso3', 'year']
wdi = reduce(
    lambda left, right: left.merge(right, on=merge_keys, how='outer'),
    dfs,
)

wdi.to_csv(PROC / 'wdi_ssa_2000_2023.csv', index=False)
print('Saved WDI data.')

## Load UCDP data (or generate synthetic conflict data)

In [None]:
# Optional real conflict data; when the file is absent a synthetic stand-in is
# generated so the rest of the notebook still runs.
ucdp_path = RAW / 'ucdp_country_year.csv'

if ucdp_path.exists():
    print('UCDP file found. Using real conflict data...')
    ucdp = pd.read_csv(ucdp_path)
    # Accept either naming of the country column.
    if 'country_name' in ucdp.columns:
        ucdp.rename(columns={'country_name':'country'}, inplace=True)
    if 'battle_deaths' not in ucdp.columns:
        raise ValueError('UCDP file must have a battle_deaths column.')
    # The merge below joins on country and year; fail here with a clear
    # message instead of an opaque KeyError later.
    missing_keys = [c for c in ('country', 'year') if c not in ucdp.columns]
    if missing_keys:
        raise ValueError(
            f'UCDP file is missing merge key column(s): {missing_keys}'
        )
else:
    print('⚠️ UCDP file not found — generating synthetic conflict data.')
    # One synthetic row per WDI row, with Poisson-distributed death counts.
    # The fixed seed makes the placeholder data reproducible between runs.
    ucdp = wdi[['country','year']].copy()
    np.random.seed(42)
    ucdp['battle_deaths'] = np.random.poisson(lam=5, size=len(ucdp))

# Left join: every WDI row is kept; rows without a UCDP match get NaN deaths.
# NOTE(review): if the UCDP file has several rows per country-year, this join
# repeats the matching WDI rows - confirm the file is unique on those keys.
merged = pd.merge(wdi, ucdp[['country','year','battle_deaths']],
                  on=['country','year'], how='left')

# NaN > 0 evaluates to False, so country-years with no UCDP record are coded
# as 0 (no conflict) here.
merged['conflict_any'] = (merged['battle_deaths'] > 0).astype(int)
merged = merged.sort_values(['iso3','year']).reset_index(drop=True)

print('Merged dataset created. Shape:', merged.shape)
merged.head()

## Fixed-effects regression (statsmodels only)

In [None]:
# Create FE dummies (country and year), dropping the first level of each so
# the dummies are not collinear with the constant.
# dtype=float: recent pandas versions emit boolean dummies by default, which
# turn the mixed float/bool design matrix into an object array that
# statsmodels refuses to fit.
df = pd.get_dummies(merged, columns=['iso3','year'], drop_first=True, dtype=float)

# Outcome: binary conflict indicator (so this is a linear probability model).
y = df['conflict_any']
Xvars = ['youth_unemp','gdp_pc','pop','pct_urban'] + \
        [c for c in df.columns if c.startswith(('iso3_', 'year_'))]

X = sm.add_constant(df[Xvars])
# missing='drop': the outer-merged WDI table can contain NaNs in the
# regressors, and OLS does no missing-value handling unless asked to.
model = sm.OLS(y, X, missing='drop')
res = model.fit()
print(res.summary())

## Plot WDI trend figure

In [None]:
# Line chart of youth unemployment by year; rows sharing a year are
# aggregated with the mean by seaborn.
fig, ax = plt.subplots(figsize=(9, 5))
sns.lineplot(
    data=merged,
    x='year',
    y='youth_unemp',
    estimator='mean',
    ax=ax,
)
ax.set_title('Average Youth Unemployment in SSA (2000–2023)')
fig.tight_layout()

# Save the figure into the outputs folder, then show the path as cell output.
outpath = OUT / 'trend_youth_unemployment.png'
fig.savefig(outpath)
outpath