In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
gdp = pd.read_csv('gdp_long_1980_2018.csv')

In [4]:
pop = pd.read_csv('population_long_1980_2024.csv')

In [5]:
print(gdp.head())
print(pop.head())


  Country Name Country Code  Year           GDP
0      Albania          ALB  1980  1.578102e+09
1      Albania          ALB  1981  1.808177e+09
2      Albania          ALB  1982  1.861163e+09
3      Albania          ALB  1983  1.881413e+09
4      Albania          ALB  1984  1.857338e+09
  Country Name Country Code  Year  Population
0      Albania          ALB  1980   2671997.0
1      Albania          ALB  1981   2726056.0
2      Albania          ALB  1982   2784278.0
3      Albania          ALB  1983   2843960.0
4      Albania          ALB  1984   2904429.0


In [6]:
df = pd.merge(gdp, pop, on=["Country Name", "Year"], how="inner")


In [7]:
df.head()

Unnamed: 0,Country Name,Country Code_x,Year,GDP,Country Code_y,Population
0,Albania,ALB,1980,1578102000.0,ALB,2671997.0
1,Albania,ALB,1981,1808177000.0,ALB,2726056.0
2,Albania,ALB,1982,1861163000.0,ALB,2784278.0
3,Albania,ALB,1983,1881413000.0,ALB,2843960.0
4,Albania,ALB,1984,1857338000.0,ALB,2904429.0


In [8]:
df.to_csv("merged_gdp_population_1980_2018.csv", index=False)

In [9]:
df = df.sort_values(by=["Country Name", "Year"]).reset_index(drop=True)

In [10]:
# GDP lag features
df["GDP_lag_1"] = df.groupby("Country Name")["GDP"].shift(1)
df["GDP_lag_2"] = df.groupby("Country Name")["GDP"].shift(2)
df["GDP_lag_3"] = df.groupby("Country Name")["GDP"].shift(3)

In [11]:
# GDP per capita
df["GDP_per_capita"] = df["GDP"] / df["Population"]

In [12]:
# Population growth rate
df["Population_growth"] = df.groupby("Country Name")["Population"].pct_change()

In [13]:
df = df.dropna().reset_index(drop=True)

In [14]:
df.head()

Unnamed: 0,Country Name,Country Code_x,Year,GDP,Country Code_y,Population,GDP_lag_1,GDP_lag_2,GDP_lag_3,GDP_per_capita,Population_growth
0,Albania,ALB,1983,1881413000.0,ALB,2843960.0,1861163000.0,1808177000.0,1578102000.0,661.546782,0.021435
1,Albania,ALB,1984,1857338000.0,ALB,2904429.0,1881413000.0,1861163000.0,1808177000.0,639.48473,0.021262
2,Albania,ALB,1985,1897050000.0,ALB,2964762.0,1857338000.0,1881413000.0,1861163000.0,639.865904,0.020773
3,Albania,ALB,1986,2097326000.0,ALB,3022635.0,1897050000.0,1857338000.0,1881413000.0,693.873475,0.01952
4,Albania,ALB,1987,2080796000.0,ALB,3083605.0,2097326000.0,1897050000.0,1857338000.0,674.793383,0.020171


Standarizing Columns

In [15]:
# Step 1 — Rename columns
df = df.rename(columns={
    "Country Code_x": "Country Code",
    "Country Code_y": "Country Code_pop"  # temporary only
})

In [16]:
# Step 2 — Drop the duplicate population code column
df = df.drop(columns=["Country Code_pop"])

In [17]:
# Step 3 — Create GDP growth
df["GDP_growth_1yr"] = (df["GDP"] - df["GDP_lag_1"]) / df["GDP_lag_1"]

In [18]:
# Step 4 — Create population growth
df["Population_growth_1yr"] = (df["Population"] - df.groupby("Country Name")["Population"].shift(1))


In [19]:
# Step 5 — GDP per capita
df["GDP_per_capita"] = df["GDP"] / df["Population"]

In [20]:
# Step 6 — Reorder columns neatly
df = df[
    [
        "Country Name",
        "Country Code",
        "Year",
        "GDP",
        "Population",
        "GDP_lag_1",
        "GDP_lag_2",
        "GDP_lag_3",
        "GDP_growth_1yr",
        "Population_growth_1yr",
        "GDP_per_capita",
    ]
]

In [21]:
df.head()

Unnamed: 0,Country Name,Country Code,Year,GDP,Population,GDP_lag_1,GDP_lag_2,GDP_lag_3,GDP_growth_1yr,Population_growth_1yr,GDP_per_capita
0,Albania,ALB,1983,1881413000.0,2843960.0,1861163000.0,1808177000.0,1578102000.0,0.01088,,661.546782
1,Albania,ALB,1984,1857338000.0,2904429.0,1881413000.0,1861163000.0,1808177000.0,-0.012796,60469.0,639.48473
2,Albania,ALB,1985,1897050000.0,2964762.0,1857338000.0,1881413000.0,1861163000.0,0.021381,60333.0,639.865904
3,Albania,ALB,1986,2097326000.0,3022635.0,1897050000.0,1857338000.0,1881413000.0,0.105572,57873.0,693.873475
4,Albania,ALB,1987,2080796000.0,3083605.0,2097326000.0,1897050000.0,1857338000.0,-0.007881,60970.0,674.793383


In [22]:
df.to_csv("gdp_population_with_lags.csv", index=False)