# 04 — Sample Construction and Dataset Filtering

This notebook constructs the analytical sample used in the thesis.

Steps:
1. Restrict time period (2000–2021)
2. Identify oil-dependent countries
3. Apply data completeness criteria
4. Remove countries with extreme GDP-growth volatility (MAD filter)
5. Ensure Namibia inclusion

Output:
- final_analysis_dataset.csv
- country_sample_list.csv

In [None]:
# -------------------------------------------------------------------
# SETUP: mount Google Drive so project files are reachable from Colab
# -------------------------------------------------------------------

from google.colab import drive

# Mount point inside the Colab VM; the project paths are built under it.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Root of the thesis project on the mounted Drive (trailing slash required,
# because the sub-paths are built by plain string concatenation).
BASE_PATH = "/content/drive/MyDrive/thesis_project/"

# Input folder (cleaned panel data) and output folder (analysis-ready files).
PROCESSED_DATA = f"{BASE_PATH}data/processed/"
OUTPUT_DATA = f"{BASE_PATH}data/outputs/"

In [None]:
import os

import numpy as np
import pandas as pd

## Load Master Panel Dataset

In [None]:
# Read the master country-year panel from the processed-data folder.
master_path = PROCESSED_DATA + "master_dataset.csv"
master = pd.read_csv(master_path)

# Preview the first rows to confirm the columns loaded as expected.
master.head()

Unnamed: 0,Country Name,Country Code,Year,Oil_Rents_GDP,log_oil_rents,GDP_growth,GDP_per_capita,log_gdp_per_capita,Inflation,Inflation_w,Gov_Debt_GDP,log_debt,Control of Corruption,Government Effectiveness,Rule of Law
0,Aruba,ABW,1985,,,,,,4.032258,4.032258,,,,,
1,Aruba,ABW,1986,0.0,0.0,,16150.651073,9.689778,1.073966,1.073966,,,,,
2,Aruba,ABW,1987,0.0,0.0,16.078431,18992.068378,9.851829,3.643045,3.643045,,,,,
3,Aruba,ABW,1988,0.0,0.0,18.648649,22468.50712,10.019914,3.121868,3.121868,,,,,
4,Aruba,ABW,1989,0.0,0.0,12.129841,24730.396448,10.115829,3.991628,3.991628,,,,,


## Dataset Overview

In [None]:
# Size of the raw panel: distinct countries, distinct years, total rows.
n_countries = master["Country Code"].nunique()
n_years = master["Year"].nunique()
n_obs = len(master)

print("Countries:", n_countries)
print("Years:", n_years)
print("Observations:", n_obs)

Countries: 224
Years: 65
Observations: 12263


## Restrict Analysis Period (2000–2021)

In [None]:
# Keep only observations from 2000 through 2021 (both endpoints included).
in_period = master["Year"].between(2000, 2021)
master = master.loc[in_period].copy()

# Summary of the remaining years as a quick check on the window.
master["Year"].describe()

Unnamed: 0,Year
count,4948.0
mean,2010.605699
std,6.31313
min,2000.0
25%,2005.0
50%,2011.0
75%,2016.0
max,2021.0


## Identify Oil-Dependent Countries
Countries whose oil rents average at least 3% of GDP over the 2000–2021 analysis period.

In [None]:
# Average oil rents (% of GDP) per country over the restricted period.
oil_mean = (
    master.groupby("Country Code")["Oil_Rents_GDP"]
    .mean()
    .rename("mean_oil_rents")
    .reset_index()
)

# Countries classified as oil-dependent: average oil rents of at least 3.
is_oil_dependent = oil_mean["mean_oil_rents"] >= 3
oil_countries = oil_mean.loc[is_oil_dependent, "Country Code"]

# Start the analytical sample from all rows of those countries.
sample = master.loc[master["Country Code"].isin(oil_countries)].copy()

n_oil_countries = sample["Country Code"].nunique()
print("Countries after oil filter:", n_oil_countries)

Countries after oil filter: 40


## Data Completeness Filter
Countries must have ≥75% data availability, measured as the share of non-missing country-year values averaged across the core variables.

In [None]:
# Variables whose availability decides whether a country stays in the sample.
core_vars = [
    "Oil_Rents_GDP", "GDP_growth", "GDP_per_capita", "Inflation",
    "Gov_Debt_GDP", "Control of Corruption", "Government Effectiveness",
    "Rule of Law",
]


def _observed_share(country_rows):
    """Average, over the columns, of each column's non-missing share."""
    return country_rows.notna().mean().mean()


# One completeness ratio per country, computed over the core variables.
completeness = (
    sample.groupby("Country Code")[core_vars]
    .apply(_observed_share)
    .reset_index(name="completeness_ratio")
)

# Keep countries whose ratio is at least 0.75.
meets_threshold = completeness["completeness_ratio"] >= 0.75
valid_countries = completeness.loc[meets_threshold, "Country Code"]

sample = sample.loc[sample["Country Code"].isin(valid_countries)]

n_complete_countries = sample["Country Code"].nunique()
print("Countries after completeness filter:", n_complete_countries)

Countries after completeness filter: 37


## Remove Extreme Growth Volatility
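Median Absolute Deviation (MAD) filter: a country is removed when the MAD of its GDP growth exceeds three times the median MAD across countries in the sample.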

In [None]:
def mad(x):
    """Return the median absolute deviation (MAD) of ``x``, ignoring NaNs.

    Parameters
    ----------
    x : array-like of float
        Values to summarise, e.g. one country's ``GDP_growth`` series.

    Returns
    -------
    float
        Median of ``|x - median(x)|`` over the non-missing values, or NaN
        when ``x`` contains no non-missing values.
    """
    values = np.asarray(x, dtype=float)
    # np.median propagates NaN, so a single missing year would turn the whole
    # result into NaN; drop missing observations before computing the MAD.
    values = values[~np.isnan(values)]
    if values.size == 0:
        return np.nan
    return np.median(np.abs(values - np.median(values)))

# MAD of GDP growth for every country still in the sample.
growth_mad = (
    sample.groupby("Country Code")["GDP_growth"]
    .apply(mad)
    .rename("growth_mad")
    .reset_index()
)

# Benchmark volatility: the median of the country-level MADs.
median_mad = growth_mad["growth_mad"].median()

# Keep countries whose MAD is at most three times the benchmark. A country
# whose MAD is NaN fails this comparison and is therefore dropped as well.
is_stable = growth_mad["growth_mad"] <= 3 * median_mad
stable_countries = growth_mad.loc[is_stable, "Country Code"]

sample = sample.loc[sample["Country Code"].isin(stable_countries)]

## Ensure Namibia Inclusion
Namibia is retained regardless of filtering outcomes.

In [None]:
# All Namibia rows from the period-restricted master panel.
is_namibia = master["Country Code"].eq("NAM")
namibia_data = master.loc[is_namibia]

# Append them to the sample; drop_duplicates removes fully identical rows,
# so rows already present in the sample are not doubled.
combined = pd.concat([sample, namibia_data])
sample = combined.drop_duplicates()

# Confirm Namibia is present in the sample.
sample.loc[sample["Country Code"].eq("NAM")].head()

Unnamed: 0,Country Name,Country Code,Year,Oil_Rents_GDP,log_oil_rents,GDP_growth,GDP_per_capita,log_gdp_per_capita,Inflation,Inflation_w,Gov_Debt_GDP,log_debt,Control of Corruption,Government Effectiveness,Rule of Law
7791,Namibia,NAM,2000,0.0,0.0,3.492183,3083.596579,8.034176,,,,,0.55599,0.469873,0.445161
7792,Namibia,NAM,2001,0.0,0.0,1.177949,3055.404448,8.024994,,,,,,,
7793,Namibia,NAM,2002,0.0,0.0,4.788661,3145.945663,8.054188,,,,,0.100783,0.139743,0.358799
7794,Namibia,NAM,2003,0.0,0.0,4.239794,3232.305897,8.08126,7.136153,7.136153,,,0.064888,0.168386,0.329183
7795,Namibia,NAM,2004,0.0,0.0,12.269548,3582.895033,8.184205,4.136632,4.136632,,,-0.01085,0.104889,0.018302


## Final Sample Overview

In [None]:
# Number of distinct countries in the final analytical sample.
n_final_countries = sample["Country Code"].nunique()
print("Final countries:", n_final_countries)

Final countries: 36


## Create Country Sample List

In [None]:
# One row per (code, name) pair in the sample, ordered alphabetically by name.
country_columns = ["Country Code", "Country Name"]
final_countries = (
    sample.loc[:, country_columns]
    .drop_duplicates()
    .sort_values(by="Country Name")
)

final_countries

Unnamed: 0,Country Code,Country Name
3172,DZA,Algeria
125,AGO,Angola
651,AZE,Azerbaijan
1080,BHR,Bahrain
1410,BOL,Bolivia
1595,BRN,Brunei Darussalam
2235,CMR,Cameroon
10731,TCD,Chad
2448,COL,Colombia
2365,COG,"Congo, Rep."


## Save Analytical Dataset

In [None]:
# Order the panel by country, then year, and replace the index with 0..n-1.
sort_keys = ["Country Code", "Year"]
sample = sample.sort_values(by=sort_keys).reset_index(drop=True)

In [None]:
# Create the output folder if it is missing: DataFrame.to_csv does not create
# parent directories and fails when the target directory does not exist.
os.makedirs(OUTPUT_DATA, exist_ok=True)

# Full analytical panel; floats are written with 10 decimal places.
sample.to_csv(
    OUTPUT_DATA + "final_analysis_dataset.csv",
    index=False,
    float_format="%.10f"
)

# Code/name list of the countries in the final sample.
final_countries.to_csv(
    OUTPUT_DATA + "country_sample_list.csv",
    index=False
)

## Pipeline Completion

In [None]:
# Signal that the notebook ran through to the end.
completion_message = "✅ Final analytical dataset created."
print(completion_message)

✅ Final analytical dataset created.
