In [28]:
import pandas as pd


In [29]:
co2 = pd.read_csv('data/co2.csv')
aq = pd.read_csv('data/air_quality.csv')
pollution = pd.read_csv('data/pollution.csv')
country_map = pd.read_csv('data/country_map.csv')


In [30]:
pollution['pollutant'].value_counts()

pollutant
PM2.5               116
PM10                 71
NO2                  55
SO2                  55
O3                   52
CO                   49
TEMPERATURE          28
PM1                  17
RELATIVEHUMIDITY     16
UM003                16
NO                    5
NOX                   5
BC                    2
Name: count, dtype: int64

In [31]:
# Only using top pollutants
pollution = pollution[pollution['pollutant'].isin(['PM2.5', 'PM10', 'NO2', 'SO2', 'O3', 'CO'])]

In [32]:
co2.set_index(['country_code'], inplace=True)
co2 = co2[~co2.index.get_level_values('country_code').duplicated(keep='last')]


In [33]:
co2.head()

Unnamed: 0_level_0,year,value,co2_per_capita
country_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ABW,2016,883.747,0.008439
AFG,2019,6079.999924,0.00016
AGO,2019,25209.999084,0.000779
ALB,2019,4829.999924,0.001683
AND,2019,500.0,0.006535


In [34]:
aq.set_index(['country_code'], inplace=True)

In [35]:
co2['aq'] = aq['avg']

In [36]:
co2.head()

Unnamed: 0_level_0,year,value,co2_per_capita,aq
country_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ABW,2016,883.747,0.008439,
AFG,2019,6079.999924,0.00016,19.0
AGO,2019,25209.999084,0.000779,14.4
ALB,2019,4829.999924,0.001683,14.0
AND,2019,500.0,0.006535,18.666667


In [37]:
# Suppose your DataFrame has columns: 'country_code', 'year', 'pollutant', 'unit', 'value'

pollution_pivot = pollution.pivot_table(
    index=['country_code'],   # Rows: one per country per year
    columns='pollutant',              # Columns: one per pollutant
    values='value'                    # Fill values from the 'value' column
).reset_index()

# Optional: flatten the column MultiIndex if needed
pollution_pivot.columns.name = None


In [38]:
pollution_pivot.head()

Unnamed: 0,country_code,CO,NO2,O3,PM10,PM2.5,SO2
0,AFG,,,,,-431.5,
1,AND,100.0,4.7,68.333333,11.0,7.9,0.2
2,ARE,306.666667,9.185556,48.202308,247.722222,-204.2,9.108889
3,ARG,590.0,12.5,26.0,10.4,6.148884,5.71
4,ARM,,,,,10.4,


In [39]:
pollution_pivot.set_index(['country_code'], inplace=True)

In [40]:
for pollutant in pollution_pivot.columns:
    co2[pollutant] = pollution_pivot[pollutant]

In [41]:
co2 = co2.merge(country_map, on='country_code', how='left')
co2.drop(columns=['country_name', 'region', 'sub_region'], inplace=True)

In [42]:
co2.head()

Unnamed: 0,country_code,year,value,co2_per_capita,aq,CO,NO2,O3,PM10,PM2.5,SO2,region_code,sub_region_code
0,ABW,2016,883.747,0.008439,,,,,,,,19.0,29.0
1,AFG,2019,6079.999924,0.00016,19.0,,,,,-431.5,,142.0,34.0
2,AGO,2019,25209.999084,0.000779,14.4,,,,,,,2.0,17.0
3,ALB,2019,4829.999924,0.001683,14.0,,,,,,,150.0,39.0
4,AND,2019,500.0,0.006535,18.666667,100.0,4.7,68.333333,11.0,7.9,0.2,150.0,39.0


In [43]:
co2.drop(columns=['year', 'value'], inplace=True)

In [44]:
co2.isna().sum() / len(co2)

country_code       0.000000
co2_per_capita     0.000000
aq                 0.359223
CO                 0.762136
NO2                0.733010
O3                 0.747573
PM10               0.665049
PM2.5              0.456311
SO2                0.733010
region_code        0.000000
sub_region_code    0.000000
dtype: float64

## Imputation Dependant (Test)

In [63]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression

def validate_and_iterative_impute(df, group_col="region_code", method="median", mask_fraction=0.4, random_state=42):
    np.random.seed(random_state)
    df = df.copy()

    # Only numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns

    # Order columns by missingness (ascending)
    missing_order = sorted(numeric_cols, key=lambda c: df[c].isna().mean())

    results = {}

    for target_col in missing_order:
        if df[target_col].isna().sum() == 0:
            continue

        # Mask known values to evaluate error
        known_idx = df[df[target_col].notna()].index
        mask_size = int(len(known_idx) * mask_fraction)
        if mask_size == 0:
            continue

        mask_idx = np.random.choice(known_idx, size=mask_size, replace=False)
        true_values = df.loc[mask_idx, target_col].copy()
        df.loc[mask_idx, target_col] = np.nan

        # --- Region-wise: only impute if region has ≥60% non-missing in this column
        valid_regions = []
        for region, subset in df.groupby(group_col):
            total = len(subset)
            non_missing = subset[target_col].notna().sum()
            if total > 0 and non_missing / total >= 0.4:
                valid_regions.append(region)

        # Apply imputation only to valid regions
        if method == "median":
            for region in valid_regions:
                region_idx = df[df[group_col] == region].index
                median_val = df.loc[region_idx, target_col].median()
                df.loc[region_idx, target_col] = df.loc[region_idx, target_col].fillna(median_val)

        elif method == "knn":
            for region in valid_regions:
                region_idx = df[df[group_col] == region].index
                imputer = KNNImputer(n_neighbors=5)
                df.loc[region_idx, [target_col]] = imputer.fit_transform(df.loc[region_idx, [target_col]])

        elif method == "lr":
            df_train = df[df[target_col].notna()]
            df_test = df[df[target_col].isna()]
            if not df_test.empty:
                X_train = pd.get_dummies(df_train[[group_col]], drop_first=True)
                y_train = df_train[target_col]
                X_test = pd.get_dummies(df_test[[group_col]], drop_first=True)
                X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

                model = LinearRegression()
                model.fit(X_train, y_train)
                df.loc[df[target_col].isna(), target_col] = model.predict(X_test)

        # --- Evaluate Scaled MAE ---
        preds = df.loc[mask_idx, target_col]

        # Skip if any imputed values are still NaN
        valid = preds.notna() & true_values.notna()
        if valid.sum() == 0:
            results[target_col] = None
            continue

        mae = mean_absolute_error(true_values[valid], preds[valid])
        std = true_values[valid].std()

        results[target_col] = mae / std if std else None


    return results

results = {}
df = co2.copy()

for method in ["median", "knn", "lr"]:
    scaled_mae = validate_and_iterative_impute(df, group_col="region_code", method=method)
    results[method] = scaled_mae

results_df = pd.DataFrame(results).T
results_df


Unnamed: 0,aq,PM2.5,PM10,NO2,SO2,O3,CO
median,0.652071,0.322914,0.421448,0.785512,0.681292,0.78152,0.693047
knn,0.695149,0.330328,0.432797,2.519489,0.827117,0.821637,0.759018
lr,0.7474,0.598494,0.423233,1.580919,0.987251,0.633249,12.179137


## Imputation Dependant (Calculation)

In [64]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression

def impute(df, group_col="region_code",  random_state=42):
    np.random.seed(random_state)
    df = df.copy()

    # Only numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns

    # Order columns by missingness (ascending)
    missing_order = sorted(numeric_cols, key=lambda c: df[c].isna().mean())

    for target_col in missing_order:
        if df[target_col].isna().sum() == 0:
            continue

        # --- Region-wise: only impute if region has ≥60% non-missing in this column
        valid_regions = []
        for region, subset in df.groupby(group_col):
            total = len(subset)
            non_missing = subset[target_col].notna().sum()
            if total > 0 and non_missing / total >= 0.4:
                valid_regions.append(region)

        for region in valid_regions:
            region_idx = df[df[group_col] == region].index
            median_val = df.loc[region_idx, target_col].median()
            df.loc[region_idx, target_col] = df.loc[region_idx, target_col].fillna(median_val)

    return df

results = {}
df = co2.copy()

scaled_mae = impute(df, group_col="region_code")
df = results[method] = scaled_mae
df.head()


Unnamed: 0,country_code,co2_per_capita,aq,CO,NO2,O3,PM10,PM2.5,SO2,region_code,sub_region_code
0,ABW,0.008439,,,,,,,,19.0,29.0
1,AFG,0.00016,19.0,,,,,-431.5,,142.0,34.0
2,AGO,0.000779,14.4,,,,,,,2.0,17.0
3,ALB,0.001683,14.0,318.120273,14.909902,52.005742,18.167531,13.4765,3.363485,150.0,39.0
4,AND,0.006535,18.666667,100.0,4.7,68.333333,11.0,7.9,0.2,150.0,39.0


In [66]:
aq_imputed = df.loc[:,['country_code', 'aq', 'PM2.5', 'PM10']]
aq_imputed.set_index('country_code', inplace=True)
aq_imputed.to_csv('data/aq_imputed.csv')


In [67]:
aq_imputed.isna().sum()

aq        39
PM2.5     74
PM10     130
dtype: int64