In [17]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import numpy as np
import pandas as pd
from pandas_datareader import wb


In [18]:

# Map World Bank indicator codes to the readable column names used by the
# rest of the notebook.
indicators = {
    'NY.GDP.PCAP.CD': 'GDP_per_capita',
    'NY.GDP.MKTP.KD.ZG': 'GDP_growth',
    'FP.CPI.TOTL.ZG': 'Inflation',
    'GC.DOD.TOTL.GD.ZS': 'Debt_to_GDP',
    'BN.CAB.XOKA.GD.ZS': 'Current_account_balance',
    'SL.UEM.TOTL.ZS': 'Unemployment',
    'PV.EST': 'Political_stability'
}

# Fetch from World Bank and persist the result so later cells can read it
# back from disk.
try:
    # country='all' also returns aggregate rows (the saved file contains
    # entries such as "High income"); they are filtered out later by the
    # country-name matching against Moody's.
    wb_data = wb.download(
        indicator=list(indicators.keys()),
        country='all',
        start=2000,
        end=2023
    ).reset_index()

    # Rename indicator codes to readable names
    wb_data = wb_data.rename(columns=indicators)

    # One row per (country, year), one column per indicator
    wb_data = wb_data.pivot_table(
        index=['country', 'year'],
        values=list(indicators.values()),
        aggfunc='first'
    ).reset_index()

    # Save to disk
    wb_data.to_csv('../data/raw/world_bank_indicators.csv', index=False)
    print("Saved World Bank indicators to '../data/raw/world_bank_indicators.csv'")

except Exception as e:
    print(f"Error: {e}")
    # Do not swallow the failure: the following cells read the CSV written
    # above, so continuing would silently use a stale file (or fail later
    # with a less helpful error if the file does not exist).
    raise


Saved World Bank indicators to '../data/raw/world_bank_indicators.csv'


In [19]:
### Load Moody’s Sovereign Ratings
moodys_df = pd.read_csv('../data/raw/20241115 Moody\'s Investors Service Sovereign.csv')
print("Moody’s Sovereign Ratings:")
display(moodys_df.head())
print("Shape:", moodys_df.shape)

# Expose the obligor under the shared 'country' label and attach the
# normalised name used as the join key across all three sources.
# NOTE(review): clean_name is not defined in any visible cell — confirm it is
# defined or imported before this cell runs on a fresh kernel.
moodys_df = (
    moodys_df
    .rename(columns={'obligor_name': 'country'})
    .assign(country_cleaned=lambda frame: frame['country'].apply(clean_name))
)

### Load and reshape BoC-BoE Sovereign Default Data
# Read the sheet, align the country column name with the other sources and
# drop the 'World' aggregate row(s).
default_df = (
    pd.read_excel(
        '../data/raw/BoC-BoE-Database-2024.xlsx',
        sheet_name='Data_2024.csv',
        skiprows=64,
    )
    .rename(columns={'DEBT_COUNTRY': 'country'})
    .loc[lambda frame: frame['country'] != 'World']
)

# Wide -> long: every column other than the identifiers becomes an
# (indicator, value) pair.
id_cols = ['country', 'DEBT_YEAR']
value_cols = list(default_df.columns.drop(id_cols))

default_long = default_df.melt(
    id_vars=id_cols,
    value_vars=value_cols,
    var_name='indicator',
    value_name='value',
)
default_long = default_long.assign(
    country_cleaned=default_long['country'].apply(clean_name)
)

print("\nBoC-BoE Sovereign Default Data (Long Format):")
display(default_long.head())
print("Shape:", default_long.shape)

### Load and clean World Bank Indicators
wb_df = (
    pd.read_csv('../data/raw/world_bank_indicators.csv')
    .assign(country_cleaned=lambda frame: frame['country'].apply(clean_name))
)

print("\nWorld Bank Indicators:")
display(wb_df.head())
print("Shape:", wb_df.shape)




Moody’s Sovereign Ratings:


  moodys_df = pd.read_csv('../data/raw/20241115 Moody\'s Investors Service Sovereign.csv')


Unnamed: 0,rating_agency_name,file_creating_date,sec_category,issuer_name,legal_entity_identifier,object_type_rated,instrument_name,CUSIP_number,coupon_date,maturity_date,...,issuer_identifier_schema,instrument_identifier,instrument_identifier_schema,central_index_key,obligor_identifier,obligor_identifier_schema,obligor_identifier_other,obligor_sec_category,obligor_industry_group,obligor_name
0,Moody's Investors Service,2024-11-15,,,549300MCOOY1V7P2PG30,,,,,,...,,,,,2296,NRSRO,,Sovereign,,Eutelsat SA
1,Moody's Investors Service,2024-11-15,,,549300MCOOY1V7P2PG30,,,,,,...,,,,,2296,NRSRO,,Sovereign,,Eutelsat SA
2,Moody's Investors Service,2024-11-15,,,549300MCOOY1V7P2PG30,,,,,,...,,,,,2296,NRSRO,,Sovereign,,Eutelsat SA
3,Moody's Investors Service,2024-11-15,,,549300MCOOY1V7P2PG30,,,,,,...,,,,,2296,NRSRO,,Sovereign,,Eutelsat SA
4,Moody's Investors Service,2024-11-15,,,549300MCOOY1V7P2PG30,,,,,,...,,,,,2296,NRSRO,,Sovereign,,Eutelsat SA


Shape: (138107, 32)

BoC-BoE Sovereign Default Data (Long Format):


Unnamed: 0,country,DEBT_YEAR,indicator,value,country_cleaned
0,Afghanistan,1960,k,AFG_1960,afghanistan
1,Afghanistan,1961,k,AFG_1961,afghanistan
2,Afghanistan,1962,k,AFG_1962,afghanistan
3,Afghanistan,1963,k,AFG_1963,afghanistan
4,Afghanistan,1964,k,AFG_1964,afghanistan


Shape: (499328, 5)

World Bank Indicators:


Unnamed: 0,country,year,Current_account_balance,Debt_to_GDP,GDP_growth,GDP_per_capita,Inflation,Political_stability,Unemployment,country_cleaned
0,Afghanistan,2000,,,,174.930991,,-2.438969,7.935,afghanistan
1,Afghanistan,2001,,,-9.431974,138.706822,,,7.953,afghanistan
2,Afghanistan,2002,,,28.600001,178.954088,,-2.035034,7.93,afghanistan
3,Afghanistan,2003,,,8.832278,198.871116,,-2.198372,7.88,afghanistan
4,Afghanistan,2004,,,1.414118,221.763654,,-2.295682,7.899,afghanistan


Shape: (6280, 10)


In [20]:
# Preview ten fully populated Moody's rows (fixed seed for a stable sample)
print("\nSample: Moody's Sovereign Ratings")
rating_preview = moodys_df.loc[:, ['country_cleaned', 'rating', 'rating_action_date']].dropna()
display(rating_preview.sample(n=10, random_state=42))

# Preview the BoC–BoE long table, restricted to a handful of indicators
print("\nSample: BoC–BoE Defaults (selected indicators)")
sample_indicators = ['DEBT_TOTAL_2023', 'DEBT_IMF_2024', 'DEBT_PARIS_CLUB_2024']
sample_defaults = default_long.loc[default_long['indicator'].isin(sample_indicators)]
display(sample_defaults.sample(n=10, random_state=42))

# Preview World Bank rows that have both growth and debt figures
print("\nSample: World Bank Indicators")
wb_preview = wb_df.dropna(subset=['GDP_growth', 'Debt_to_GDP'])
display(wb_preview.sample(n=10, random_state=42))



Sample: Moody's Sovereign Ratings


Unnamed: 0,country_cleaned,rating,rating_action_date
96938,farm credit banks,Aaa,2012-07-13
1644,brazil,Baa2,2015-02-10
120148,japan,Aa3,2012-06-15
33293,home loan banks,Aaa,2022-07-14
104896,farm credit banks,Aaa,2013-04-02
57998,home loan mortgage corp,WR,2020-09-30
33041,home loan banks,Aaa,2022-06-13
7077,home loan banks,WR,2017-10-06
52914,home loan banks,Aaa,2012-06-15
5211,home loan banks,WR,2018-12-11



Sample: BoC–BoE Defaults (selected indicators)


Unnamed: 0,country,DEBT_YEAR,indicator,value,country_cleaned
28670,Philippines,2022,DEBT_TOTAL_2023,,philippines
25416,Hungary,1968,DEBT_TOTAL_2023,51.59,hungary
29503,Sint Maarten,2023,DEBT_TOTAL_2023,,sint maarten
27235,Mauritius,1995,DEBT_TOTAL_2023,0.85,mauritius
84756,Yemen,1980,DEBT_PARIS_CLUB_2024,,yemen
75133,Bangladesh,2021,DEBT_PARIS_CLUB_2024,,bangladesh
77682,Ethiopia,2010,DEBT_PARIS_CLUB_2024,164.0,ethiopia
83044,Sri Lanka,1996,DEBT_PARIS_CLUB_2024,,sri lanka
25955,Kazakhstan,1995,DEBT_TOTAL_2023,161.72,kazakhstan
83894,Tunisia,2014,DEBT_PARIS_CLUB_2024,,tunisia



Sample: World Bank Indicators


Unnamed: 0,country,year,Current_account_balance,Debt_to_GDP,GDP_growth,GDP_per_capita,Inflation,Political_stability,Unemployment,country_cleaned
5190,South Africa,2022,-0.461509,76.237238,1.91148,6523.410978,7.039873,-0.681386,33.268,south africa
3537,Malawi,2016,-12.713401,37.886208,2.5,450.506106,21.711113,-0.116128,5.007,malawi
4855,San Marino,2017,-0.391283,54.593445,0.258265,45192.027977,1.014133,1.39289,,san marino
2375,High income,2002,,63.48669,1.638696,22907.428467,2.116284,,7.194273,high income
2153,Greece,2020,-6.486642,249.366027,-9.196231,17886.733165,-1.247984,0.115182,15.899,greece
5978,United States,2010,-2.870641,84.964374,2.695193,48642.631209,1.640043,0.438404,9.633,united states
2233,Guatemala,2004,-4.938649,22.144457,3.137917,1840.911849,7.578622,-0.808035,2.971,guatemala
887,Canada,2001,2.13237,57.458356,1.875098,23822.096211,2.52512,,7.219,canada
2133,Greece,2000,-7.80648,123.892648,4.137827,11638.20107,3.151182,0.808356,11.345,greece
368,Azerbaijan,2010,28.426831,6.385576,4.788833,5843.533768,5.726872,-0.238547,5.63,azerbaijan


In [None]:
# Keep only long-term issuer ratings
lt_issuer_df = moodys_df[
    moodys_df['rating_type_term'].str.contains("LT Issuer Rating", case=False, na=False)
].copy()

# Parse the action date; rows whose date cannot be parsed are dropped
lt_issuer_df['rating_action_date'] = pd.to_datetime(lt_issuer_df['rating_action_date'], errors='coerce')
lt_issuer_df = lt_issuer_df.dropna(subset=['rating_action_date'])

# Newest action first within each country
lt_issuer_df = lt_issuer_df.sort_values(['country_cleaned', 'rating_action_date'], ascending=[True, False])

# Keep the most recent *rated* row per country as a whole row.
# groupby(...).first() was used here before, but it returns the first
# non-null value of each column independently, so a rating taken from an
# older row could end up paired with the date of a newer, unrated row.
# Dropping unrated rows and de-duplicating keeps rating and date consistent.
# Countries without any non-null rating are no longer emitted (they carried
# a missing rating before and cannot be encoded as a target anyway).
latest_moodys = (
    lt_issuer_df
    .dropna(subset=['country_cleaned', 'rating'])
    .drop_duplicates(subset='country_cleaned', keep='first')
    .reset_index(drop=True)
)

# Keep only necessary columns (copy so later cells can add columns safely)
clean_moodys = latest_moodys[['country_cleaned', 'rating', 'rating_action_date']].copy()

# Preview
print("Cleaned Moody’s Sovereign Ratings:")
display(clean_moodys.sample(10, random_state=42))
print("Shape:", clean_moodys.shape)



✅ Cleaned Moody’s Sovereign Ratings:


Unnamed: 0,country_cleaned,rating,rating_action_date
137,slovakia,A2,2012-06-15
30,canada mortgage and housing corporation,Aaa,2012-06-15
119,oman,Ba1,2023-12-07
29,canada,Aaa,2012-06-15
142,spain,Baa1,2018-04-13
161,united kingdom,Aa3,2020-10-16
164,uzbekistan,Ba3,2023-01-20
51,eutelsat sa,WR,2023-06-14
105,mongolia,B3,2018-01-18
60,fondo de reestructuracion ordenada bancaria,WR,2019-02-22


Shape: (168, 3)


In [None]:
# Option 1: keep Moody's obligors whose cleaned name appears in either the
# BoC–BoE defaults data or the World Bank indicators (union of both sets).
valid_countries = set(default_long['country_cleaned']) | set(wb_df['country_cleaned'])
is_known_country = clean_moodys['country_cleaned'].isin(valid_countries)
sovereign_only = clean_moodys.loc[is_known_country].copy()

print("Final Sovereign-Only Moody's Ratings:")
display(sovereign_only.sample(n=10, random_state=42))
print("Shape:", sovereign_only.shape)


✅ Final Sovereign-Only Moody's Ratings:


Unnamed: 0,country_cleaned,rating,rating_action_date
160,united arab emirates,Aa2,2012-06-15
83,kenya,B3,2023-05-12
131,rwanda,B2,2016-08-12
23,botswana,A3,2021-04-23
56,fiji,B1,2021-04-20
77,israel,A1,2012-06-15
16,belarus,C,2023-06-02
138,slovenia,A3,2020-10-02
126,poland,A2,2012-06-15
127,portugal,A3,2023-11-17


Shape: (134, 3)


In [25]:
# Derive the calendar year of each rating action
sovereign_only = sovereign_only.assign(
    rating_year=sovereign_only['rating_action_date'].dt.year
)

# Preview a few rows with the new column
print("Moody's Ratings with Year:")
preview_columns = ['country_cleaned', 'rating', 'rating_action_date', 'rating_year']
display(sovereign_only[preview_columns].sample(n=5, random_state=42))


Moody's Ratings with Year:


Unnamed: 0,country_cleaned,rating,rating_action_date,rating_year
160,united arab emirates,Aa2,2012-06-15,2012
83,kenya,B3,2023-05-12,2023
131,rwanda,B2,2016-08-12,2016
23,botswana,A3,2021-04-23,2021
56,fiji,B1,2021-04-20,2021


In [26]:
# Year coverage of Moody's ratings (from the action date)
moodys_years = sovereign_only['rating_action_date'].dt.year
min_year_moodys, max_year_moodys = moodys_years.min(), moodys_years.max()

# Year coverage of the BoC–BoE defaults data
default_years = default_long['DEBT_YEAR']
min_year_defaults, max_year_defaults = default_years.min(), default_years.max()

# Year coverage of the World Bank indicators
wb_years = wb_df['year']
min_year_wb, max_year_wb = wb_years.min(), wb_years.max()

# Report the three ranges side by side
print("Year Ranges:")
print(f"Moody's Ratings:       {min_year_moodys} to {max_year_moodys}")
print(f"BoC–BoE Defaults:      {min_year_defaults} to {max_year_defaults}")
print(f"World Bank Indicators: {min_year_wb} to {max_year_wb}")


Year Ranges:
Moody's Ratings:       2012 to 2023
BoC–BoE Defaults:      1960 to 2023
World Bank Indicators: 2000 to 2023


In [27]:
# Restrict both auxiliary datasets to 2012 onwards (2012 is the earliest
# Moody's rating year reported by the year-range check).
default_long_filtered = default_long.loc[default_long['DEBT_YEAR'].ge(2012)].copy()
wb_df_filtered = wb_df.loc[wb_df['year'].ge(2012)].copy()

# Report the resulting sizes
print("Filtered Shapes:")
print("Moody’s:", sovereign_only.shape)
print("Defaults:", default_long_filtered.shape)
print("World Bank:", wb_df_filtered.shape)


Filtered Shapes:
Moody’s: (134, 4)
Defaults: (93624, 5)
World Bank: (3150, 10)


In [29]:
# (Re)compute the normalised country key on both filtered frames
for frame in (default_long_filtered, wb_df_filtered):
    frame['country_cleaned'] = frame['country'].apply(clean_name)

# Countries to keep: exactly those that have a Moody's rating
valid_countries = {name for name in sovereign_only['country_cleaned']}


In [30]:
# Keep only BoC–BoE rows for countries rated by Moody's
defaults_in_moodys = default_long_filtered['country_cleaned'].isin(valid_countries)
default_long_filtered = default_long_filtered.loc[defaults_in_moodys].copy()

# Keep only World Bank rows for countries rated by Moody's
wb_in_moodys = wb_df_filtered['country_cleaned'].isin(valid_countries)
wb_df_filtered = wb_df_filtered.loc[wb_in_moodys].copy()

# Report the resulting sizes
print("After Filtering to Moody's Countries:")
print("BoC–BoE Defaults:", default_long_filtered.shape)
print("World Bank:", wb_df_filtered.shape)


After Filtering to Moody's Countries:
BoC–BoE Defaults: (54144, 5)
World Bank: (1535, 10)


In [31]:
# Names present in each (unfiltered) auxiliary source but absent from the
# Moody's country set — useful for spotting name mismatches.
defaults_missing = set(default_long['country_cleaned']).difference(valid_countries)
wb_missing = set(wb_df['country_cleaned']).difference(valid_countries)

print("Countries in defaults but not in Moody’s:", defaults_missing)
print("Countries in WB but not in Moody’s:", wb_missing)


Countries in defaults but not in Moody’s: {'curaao', 'democratic  congo kinshasa', 'burundi', 'eswatini swaziland', 'liberia', 'marshall islands', 'slovak republic', 'iran', 'sierra leone', 'nauru', 'madagascar', 'central african republic', 'palau', 'bhutan', 'afghanistan', 'cook islands', 'grenada', 'comoros', 'north macedonia', 'the gambia', 'djibouti', 'aruba', 'nepal', 'guineabissau', 'ussrrussia', 'anguila', 'dominica', 'equatorial guinea', 'micronesia', 'cabo verde', 'haiti', 'sint maarten', 'kosovo', 'myanmar', 'sudan', 'guyana', 'libya', 'korea north', 'west bank  gaza', 'algeria', 'samoa', 'somalia', 'netherlands antilles', 'yemen', 'syria', 'czechoslovakia', 'tuvalu', 'tonga', 'turkmenistan', 'bosnia  herzegovina', 'so tom and prncipe', 'yugoslavia', 'st kitts  nevis', 'seychelles', 'burkina faso', 'lesotho', 'chad', 'mauritania', 'south sudan', 'turkey', 'eritrea', 'zimbabwe', 'congo brazzaville', 'puerto rico', 'malawi', 'st lucia', 'vanuatu', 'guinea', 'antigua and barbuda

In [None]:
# Indicators to carry over from the long defaults table (customize as needed)
selected_indicators = ['DEBT_TOTAL_2023', 'DEBT_IMF_2024', 'DEBT_PARIS_CLUB_2024']
is_selected = default_long_filtered['indicator'].isin(selected_indicators)
filtered_default = default_long_filtered.loc[is_selected].copy()

# Long -> wide: one row per country, one column per indicator; 'first' picks
# the first available value for each (country, indicator) pair.
default_wide = pd.pivot_table(
    filtered_default,
    index='country_cleaned',
    columns='indicator',
    values='value',
    aggfunc='first',
).reset_index()

# Inspect result
print("Pivoted BoC–BoE Default Data:")
display(default_wide.sample(n=10, random_state=42))
print("Shape:", default_wide.shape)


Pivoted BoC–BoE Default Data:


indicator,country_cleaned,DEBT_IMF_2024,DEBT_PARIS_CLUB_2024,DEBT_TOTAL_2023
30,guatemala,,,0.0
0,albania,,,1.23
22,egypt,,,6300.0
31,honduras,,0.39,26.87
18,cuba,,10830.0,19177.79
28,ghana,,3454.0,214.88
10,botswana,,,0.0
70,togo,,0.1,41.03
4,bangladesh,,,63.37
12,bulgaria,,,0.95


Shape: (79, 4)


In [36]:
# BoC–BoE indicators carried into the final dataset, assembled group by group
# so that each category can be edited independently.
final_selected_indicators = (
    # Total & sovereign-level debt
    ['DEBT_TOTAL_2023', 'DEBT_TOTAL_DEBT_2024', 'DEBT_TOTAL_DEF_SOVEREIGNS_2024']
    # Multilateral creditors
    + ['DEBT_IMF_2024', 'DEBT_PARIS_CLUB_2024', 'DEBT_IDA_2024', 'DEBT_IBRD_2024', 'DEBT_IADB_2024']
    # Bilateral (e.g., China)
    + ['DEBT_CHINA_2024', 'DEBT_CHINA_DEF_SOVEREIGNS_2024']
    # Private / other creditors
    + ['DEBT_PRIVATE_CREDITORS_2024', 'DEBT_OTHER_OFFICIAL_CREDITORS_2024']
    # Instruments
    + ['DEBT_FC_BANK_LOANS_2024', 'DEBT_FC_BONDS_2024', 'DEBT_LC_DEBT_2024']
)



In [None]:
# Build the wide default table (one row per country, one column per
# indicator) for the final indicator list. Previously this cell read
# `default_wide_final` before any visible cell assigned it, so it failed on
# a fresh kernel.
# NOTE(review): reconstructed by mirroring the `default_wide` pivot with
# `final_selected_indicators`; confirm this matches the original construction.
final_default_rows = default_long_filtered[
    default_long_filtered['indicator'].isin(final_selected_indicators)
]
default_wide_final = final_default_rows.pivot_table(
    index='country_cleaned',
    columns='indicator',
    values='value',
    aggfunc='first'
).reset_index()

# Intersect with actual columns to avoid KeyError; iterating the list (not a
# set) keeps the column order deterministic between runs.
actual_columns = set(default_wide_final.columns)
safe_subset = [col for col in final_selected_indicators if col in actual_columns]

# Drop rows where all selected indicators are missing. Skipped when none of
# the requested indicators exist, since an empty subset would leave dropna
# with no columns to inspect.
if safe_subset:
    default_wide_final = default_wide_final.dropna(how='all', subset=safe_subset)

# Preview
print("Cleaned Final Default Dataset:")
display(default_wide_final.sample(10, random_state=42))
print("Shape:", default_wide_final.shape)




✅ Cleaned Final Default Dataset:


indicator,country_cleaned,DEBT_CHINA_2024,DEBT_FC_BANK_LOANS_2024,DEBT_FC_BONDS_2024,DEBT_IADB_2024,DEBT_IBRD_2024,DEBT_IMF_2024,DEBT_LC_DEBT_2024,DEBT_OTHER_OFFICIAL_CREDITORS_2024,DEBT_PARIS_CLUB_2024,DEBT_PRIVATE_CREDITORS_2024,DEBT_TOTAL_2023
30,guatemala,3.91,,,,,,,0.0,,,0.0
0,albania,,,,,,,,1.23,,,1.23
22,egypt,,,,,,,,0.0,,6300.0,6300.0
31,honduras,6.14,,,,0.39,,,26.03,0.39,0.84,26.87
18,cuba,****,6597.79,,,,,,1750.0,10830.0,,19177.79
28,ghana,17.15,,13134.0,,,,15765.0,126.42,3454.0,71.31,214.88
10,botswana,8,,,,,,,0.0,,0.0,0.0
70,togo,12.63,,,,,,,28.4,0.1,,41.03
4,bangladesh,4.6,,,,,,,58.77,,,63.37
12,bulgaria,,,,,,,,0.95,,,0.95


Shape: (79, 12)


In [40]:
# Macro indicators to summarise per country
macro_columns = [
    'Current_account_balance', 'Debt_to_GDP', 'GDP_growth', 'GDP_per_capita',
    'Inflation', 'Political_stability', 'Unemployment',
]

# Collapse the yearly rows into one row per country (mean over the years kept)
wb_by_country = wb_df_filtered.groupby('country_cleaned')[macro_columns]
wb_agg = wb_by_country.agg('mean').reset_index()

# Preview
print("Aggregated World Bank Macro Indicators:")
display(wb_agg.sample(n=10, random_state=42))
print("Shape:", wb_agg.shape)


Aggregated World Bank Macro Indicators:


Unnamed: 0,country_cleaned,Current_account_balance,Debt_to_GDP,GDP_growth,GDP_per_capita,Inflation,Political_stability,Unemployment
55,iraq,7.981188,30.042303,3.707715,5489.955718,2.373009,-2.340675,12.745833
40,fiji,-9.0123,,2.723082,5280.49841,2.126775,0.581617,4.401417
19,brazil,-2.674484,77.09911,0.889982,9845.395578,5.994766,-0.359994,10.190417
31,cuba,,,0.48385,8071.553009,,0.523726,2.2375
98,portugal,0.207735,,1.271675,22577.601241,1.663553,0.910073,9.779667
56,ireland,1.180278,85.247092,7.175416,76855.669479,1.667009,0.932023,8.1255
69,lithuania,0.746673,44.68449,3.434806,19033.501975,3.976982,0.787437,8.390417
104,serbia,-5.063834,,2.309561,7828.339363,4.714663,-0.016602,14.205
81,morocco,-3.844168,,2.64093,3404.463501,2.043449,-0.386342,9.55875
26,china,1.693824,,6.335577,9717.357061,1.897938,-0.450484,4.63


Shape: (128, 8)


In [41]:
# Left joins keep every Moody's country, whether or not the other sources
# have data for it.

# Step 1: attach the BoC–BoE default indicators
df_merge_1 = sovereign_only.merge(default_wide_final, how='left', on='country_cleaned')

# Step 2: attach the aggregated World Bank macro indicators
master_df = df_merge_1.merge(wb_agg, how='left', on='country_cleaned')

# Inspect
print("Final Merged Dataset (Ready for Feature Engineering or Modeling):")
display(master_df.sample(n=10, random_state=42))
print("Shape:", master_df.shape)


Final Merged Dataset (Ready for Feature Engineering or Modeling):


Unnamed: 0,country_cleaned,rating,rating_action_date,rating_year,DEBT_CHINA_2024,DEBT_FC_BANK_LOANS_2024,DEBT_FC_BONDS_2024,DEBT_IADB_2024,DEBT_IBRD_2024,DEBT_IMF_2024,...,DEBT_PARIS_CLUB_2024,DEBT_PRIVATE_CREDITORS_2024,DEBT_TOTAL_2023,Current_account_balance,Debt_to_GDP,GDP_growth,GDP_per_capita,Inflation,Political_stability,Unemployment
127,united arab emirates,Aa2,2012-06-15,2012,,,,,,,...,,,,,1.845685,3.089786,46004.69657,1.49333,0.699607,2.444917
66,kenya,B3,2023-05-12,2023,7.92,,,,,,...,209.0,****,64.29,-5.99708,,4.575126,1724.764931,6.636023,-1.142256,4.0965
104,rwanda,B2,2016-08-12,2016,6.0,,,,,,...,,,3.85,-10.98684,,6.677833,795.9244,7.208886,-0.012479,12.32325
19,botswana,A3,2021-04-23,2021,8.0,,,,,,...,,0,0.0,0.449018,17.619861,3.516551,7103.861668,4.907036,1.038241,20.2905
42,fiji,B1,2021-04-20,2021,6.14,,,,,,...,,0.21,0.34,-9.0123,,2.723082,5280.49841,2.126775,0.581617,4.401417
60,israel,A1,2012-06-15,2012,,,,,,,...,,,,3.405031,,3.71033,42844.60659,1.167607,-1.037469,4.720583
12,belarus,C,2023-06-02,2023,218.5,,3445.0,,116.0,,...,9743.0,12.23,12.23,-2.077436,32.444707,0.530315,6933.89819,14.395302,-0.169249,4.9225
109,slovenia,A3,2020-10-02,2020,,,,,,,...,,,,4.494768,,2.218872,25200.497194,2.243869,0.854751,6.589167
100,poland,A2,2012-06-15,2012,,,,,,,...,,,,-1.189538,,3.329578,15611.844303,3.630959,0.651122,5.571417
101,portugal,A3,2023-11-17,2023,,,,,,,...,,,52712.0,0.207735,,1.271675,22577.601241,1.663553,0.910073,9.779667


Shape: (134, 22)


In [42]:
# Convert every measurement column to numeric; placeholders such as '****'
# and any other non-numeric text become NaN via errors='coerce'.
#
# Identifier columns are excluded by name rather than by position. The
# previous blanket pd.to_numeric(..., errors='ignore') pass over all columns
# also hit the datetime column `rating_action_date`, which to_numeric turns
# into integer nanoseconds, and errors='ignore' is deprecated in pandas.
id_columns = ['country_cleaned', 'rating', 'rating_action_date', 'rating_year']
measure_columns = [col for col in master_df.columns if col not in id_columns]

for col in measure_columns:
    master_df[col] = pd.to_numeric(master_df[col], errors='coerce')


In [43]:
# Create ordinal mapping (from Moody's scale)
rating_scale = {
    'Aaa': 1, 'Aa1': 2, 'Aa2': 3, 'Aa3': 4,
    'A1': 5, 'A2': 6, 'A3': 7,
    'Baa1': 8, 'Baa2': 9, 'Baa3': 10,
    'Ba1': 11, 'Ba2': 12, 'Ba3': 13,
    'B1': 14, 'B2': 15, 'B3': 16,
    'Caa1': 17, 'Caa2': 18, 'Caa3': 19,
    'Ca': 20, 'C': 21
}

master_df['rating_encoded'] = master_df['rating'].map(rating_scale)


In [44]:
# Share of missing values per column, most incomplete columns first
missing_percent = master_df.isnull().mean().sort_values(ascending=False)

print("Missing Value Report (Top 15):")
display(missing_percent.iloc[:15])


Missing Value Report (Top 15):


DEBT_IADB_2024                        0.992537
DEBT_IMF_2024                         0.992537
DEBT_IBRD_2024                        0.977612
DEBT_LC_DEBT_2024                     0.932836
DEBT_FC_BANK_LOANS_2024               0.932836
DEBT_PARIS_CLUB_2024                  0.843284
DEBT_FC_BONDS_2024                    0.835821
DEBT_PRIVATE_CREDITORS_2024           0.731343
DEBT_CHINA_2024                       0.686567
Debt_to_GDP                           0.514925
DEBT_OTHER_OFFICIAL_CREDITORS_2024    0.440299
DEBT_TOTAL_2023                       0.410448
Inflation                             0.082090
Unemployment                          0.074627
Current_account_balance               0.067164
dtype: float64

In [45]:
# Remove columns whose missing share (from the missing-value report)
# exceeds the threshold
threshold = 0.95
sparse_cols = [col for col, share in missing_percent.items() if share > threshold]
master_df = master_df.drop(columns=sparse_cols)

print("Dropped sparse columns:", sparse_cols)


Dropped sparse columns: ['DEBT_IADB_2024', 'DEBT_IMF_2024', 'DEBT_IBRD_2024']


In [46]:
# Debt indicators: a missing value is replaced with 0
debt_cols = [name for name in master_df.columns if name.startswith('DEBT_')]
master_df[debt_cols] = master_df[debt_cols].fillna(0)

# Macroeconomic indicators: a missing value is replaced with the column mean
macro_cols = ['Debt_to_GDP', 'Inflation', 'Unemployment', 'Current_account_balance']
present_macro_cols = [name for name in macro_cols if name in master_df.columns]
for name in present_macro_cols:
    column_mean = master_df[name].mean()
    master_df[name] = master_df[name].fillna(column_mean)


In [None]:
# Keep only rows that have an encoded target (rating)
has_target = master_df['rating_encoded'].notna()
master_df = master_df[has_target]

print("Final dataset ready for modeling:")
print("Shape:", master_df.shape)


✅ Final dataset ready for modeling:
Shape: (132, 20)


In [48]:
# Persist the modelling table (without the index column)
final_dataset_path = "../data/processed/final_dataset.csv"
master_df.to_csv(final_dataset_path, index=False)
print("Final dataset saved to data/processed/final_dataset.csv")


Final dataset saved to data/processed/final_dataset.csv
