In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgboost

In [3]:
df = pd.read_csv('gdp_population_with_lags.csv')

In [4]:
df.head()

Unnamed: 0,Country Name,Country Code,Year,GDP,Population,GDP_lag_1,GDP_lag_2,GDP_lag_3,GDP_growth_1yr,Population_growth_1yr,GDP_per_capita
0,Albania,ALB,1983,1881413000.0,2843960.0,1861163000.0,1808177000.0,1578102000.0,0.01088,,661.546782
1,Albania,ALB,1984,1857338000.0,2904429.0,1881413000.0,1861163000.0,1808177000.0,-0.012796,60469.0,639.48473
2,Albania,ALB,1985,1897050000.0,2964762.0,1857338000.0,1881413000.0,1861163000.0,0.021381,60333.0,639.865904
3,Albania,ALB,1986,2097326000.0,3022635.0,1897050000.0,1857338000.0,1881413000.0,0.105572,57873.0,693.873475
4,Albania,ALB,1987,2080796000.0,3083605.0,2097326000.0,1897050000.0,1857338000.0,-0.007881,60970.0,674.793383


In [5]:
df_encoded = pd.get_dummies(df, columns=["Country Code"], drop_first=True)


In [6]:
features = [col for col in df_encoded.columns 
            if col not in ["GDP", "Country Name", "Year"]]

target = "GDP"

X = df_encoded[features]
y = df_encoded[target]


In [7]:
train = df_encoded[df_encoded["Year"] <= 2014]
test  = df_encoded[df_encoded["Year"] > 2014]

X_train = train[features]
y_train = train[target]

X_test = test[features]
y_test = test[target]


In [8]:
from xgboost import XGBRegressor

model = XGBRegressor(
    n_estimators=8000,
    learning_rate=0.03,
    max_depth=7,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42
)



In [9]:
#evals_result = {}

model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_metric=["mae", "rmse"],
    verbose=False,
    early_stopping_rounds=50,
    evals_result=evals_result
)

NameError: name 'evals_result' is not defined

In [10]:
model.fit(X_train, y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.9
,device,
,early_stopping_rounds,
,enable_categorical,False


In [11]:
from sklearn.metrics import mean_absolute_error, r2_score

preds = model.predict(X_test)
print("MAE:", mean_absolute_error(y_test, preds))
print("R2:", r2_score(y_test, preds))


MAE: 36717093419.13971
R2: 0.9820344164133309


In [12]:
future_years = range(2019, 2031)
results = []

# get last known values per country
last_rows = df_encoded.groupby("Country Name").tail(1)

for year in future_years:
    new_rows = []

    for _, row in last_rows.iterrows():
        new_row = row.copy()
        new_row["Year"] = year
        
        # Update lag features from model prediction
        new_row["GDP_lag_2"] = new_row["GDP_lag_1"]
        
        # Predict GDP
        X_predict = new_row[features].values.reshape(1, -1)
        predicted_gdp = model.predict(X_predict)[0]
        
        new_row["GDP_lag_1"] = predicted_gdp
        new_row["GDP_pct_change_1"] = (
            predicted_gdp - new_row["GDP_lag_2"]
        ) / new_row["GDP_lag_2"]
        
        new_rows.append(new_row)
        results.append({"Country Name": row["Country Name"], 
                        "Year": year,
                        "Predicted_GDP": predicted_gdp})
    
    # update last_rows for next iteration
    last_rows = pd.DataFrame(new_rows)


In [13]:
future_df = pd.DataFrame(results)


In [14]:
top10_gdp_2030 = (
    future_df[future_df["Year"] == 2022]
    .sort_values("Predicted_GDP", ascending=False)
    .head(10)
)


In [15]:
print(top10_gdp_2030)

       Country Name  Year  Predicted_GDP
643   United States  2022   1.714719e+13
517           China  2022   1.037809e+13
563           Japan  2022   6.047224e+12
541         Germany  2022   4.609628e+12
642  United Kingdom  2022   3.091814e+12
537          France  2022   3.080986e+12
555           India  2022   2.527166e+12
561           Italy  2022   2.272247e+12
505          Brazil  2022   2.114589e+12
567     Korea, Rep.  2022   2.034442e+12
