In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from ipywidgets import interact, Dropdown
from IPython.display import display


In [30]:
df_co2 = pd.read_csv('data/co2.csv')
df_co2['year'] = df_co2['year'].astype(int)
country_options = sorted(df_co2['country_code'].unique())
print(f'Loaded {len(df_co2)} rows for {len(country_options)} countries.')


Loaded 9466 rows for 206 countries.


In [31]:
def plot_hw_forecast(country_code):
    country_data = df_co2[df_co2['country_code'] == country_code].sort_values('year')
    if country_data.empty:
        raise ValueError(f'No data found for {country_code}')


    forecast_steps = 6
    forecast_years = np.arange(2020, 2020 + forecast_steps)

    
    train = country_data[(country_data['year'] >= 1970) & (country_data['year'] <= 2019)]
    if len(train) < 8:
        return pd.Series(np.nan, index=forecast_years)
    ts = train['co2'].values
    model = ExponentialSmoothing(ts, trend='add', seasonal=None)
    fitted = model.fit(optimized=True)

    
    forecast = fitted.forecast(steps=forecast_steps)
    forecast = np.asarray(forecast)

    # Forecast years should start at 2020
    

    # Optional: force first forecast point to equal last observed point
    forecast[0] = ts[-1]

    return pd.Series(forecast, index=forecast_years)

predictions = {
    c: plot_hw_forecast(c)
    for c in df_co2['country_code'].unique()
}

predictions = pd.DataFrame(predictions)


In [32]:
preds = (
    predictions
    .T
    .stack()
    .rename("co2")
    .reset_index()
    .rename(columns={"level_1": "year", "level_0": "country_code"})
)


In [33]:
df_co2['population'] = df_co2['co2'] / df_co2['co2_per_capita']

In [34]:
df_co2 = pd.concat([df_co2, preds])

In [35]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

def fill_population_trend(df):
    out = df.copy()
    out = out.sort_values(["country_code", "year"])

    def fill_group(g):
        g = g.copy()
        years = g["year"].values

        for i, row in g[g["population"].isna()].iterrows():
            yr = row["year"]

            # previous five years with known population
            prev = g[(g["year"] < yr) & (~g["population"].isna())].tail(5)

            if len(prev) < 2:
                # not enough data to estimate a trend
                continue

            X = prev["year"].values.reshape(-1, 1)
            y = prev["population"].values

            model = LinearRegression().fit(X, y)
            g.at[i, "population"] = model.predict([[yr]])[0]

        return g

    return out.groupby("country_code").apply(fill_group).reset_index(drop=True)


df_co2 = fill_population_trend(df_co2)


  return out.groupby("country_code").apply(fill_group).reset_index(drop=True)


In [40]:
df_co2.loc[df_co2['year']>=2020, 'co2_per_capita'] = df_co2.loc[df_co2['year']>=2020, 'co2'] / df_co2.loc[df_co2['year']>=2020, 'population']
df_co2.tail(20)

Unnamed: 0,country_code,year,co2,co2_per_capita,population,co2_per_capita_imp
10670,ZWE,2006,9829.999924,0.00079,12437730.0,0.00079
10671,ZWE,2007,9760.000229,0.000778,12538240.0,0.000778
10672,ZWE,2008,7599.999905,0.000601,12638750.0,0.000601
10673,ZWE,2009,7750.0,0.000608,12739260.0,0.000608
10674,ZWE,2010,9600.000381,0.000748,12839770.0,0.000748
10675,ZWE,2011,11409.999847,0.000871,13102800.0,0.000871
10676,ZWE,2012,12010.000229,0.000899,13365840.0,0.000899
10677,ZWE,2013,12279.999733,0.000901,13628870.0,0.000901
10678,ZWE,2014,12079.999924,0.00087,13891900.0,0.00087
10679,ZWE,2015,12430.000305,0.000878,14154940.0,0.000878


In [None]:
df_co2.drop(columns=['co2_per_capita_imp']).set_index('country_code').to_csv('data_explore/co2.csv')