In [1]:
import pandas as pd
import numpy as np

# File path relative to your notebook
gdp_file_path = "../data/API_NY/API_NY.GDP.PCAP.CD_DS2_en_csv_v2_134819.csv"

# Year to extract, and the name its column carries after renaming. Defined once
# so the column selection, the rename and the printed summary cannot drift apart.
gdp_year = "2022"
gdp_value_col = f"GDP_per_capita_{gdp_year}"

# World Bank files usually require skipping the first 4 rows of metadata.
# The untouched frame is kept under its own name (gdp_raw) so it stays
# available for inspection after the cleaned table has been built.
gdp_raw = pd.read_csv(gdp_file_path, skiprows=4)

# Build the cleaned table as a single chain in which every step returns a new
# frame (no inplace=True), so re-running the cell always starts from gdp_raw:
#   1. keep only 'Country Code' (3-character codes, the join key) and the
#      chosen year column;
#   2. rename them to 'CNT' (to match PISA) and an explicit GDP column name;
#   3. convert the GDP column to numeric, coercing unparseable values to NaN;
#   4. drop rows whose GDP is NaN after the conversion (row labels are kept,
#      the index is not reset);
#   5. take an explicit copy so gdp_data never aliases gdp_raw.
gdp_data = (
    gdp_raw[["Country Code", gdp_year]]
    .rename(columns={"Country Code": "CNT", gdp_year: gdp_value_col})
    .assign(
        **{
            gdp_value_col: lambda frame: pd.to_numeric(
                frame[gdp_value_col], errors="coerce"
            )
        }
    )
    .dropna(subset=[gdp_value_col])
    .copy()
)

# CNT is the merge key: a repeated code would silently duplicate rows in any
# later join, so fail loudly here instead.
assert gdp_data["CNT"].is_unique, "Duplicate country codes found in GDP data"

# NOTE(review): codes such as AFE and AFW are World Bank regional aggregates,
# not countries, so the count printed below covers more than individual
# countries/territories. Presumably they drop out when merging on CNT with
# PISA data -- confirm against the merge cell.
print("---")
print(f"Cleaned GDP per Capita Data ({gdp_year}):")
print(gdp_data.head())
print(f"\nTotal entities (countries/territories) with {gdp_year} GDP data: {len(gdp_data)}")

---
Cleaned GDP per Capita Data (2022):
   CNT  GDP_per_capita_2022
0  ABW         30559.533535
1  AFE          1628.318944
2  AFG           357.261153
3  AFW          1796.668633
4  AGO          2929.694455

Total entities (countries/territories) with 2022 GDP data: 256
