In [1]:
import pandas as pd


In [2]:
co2 = pd.read_csv('data_explore/co2.csv')
aq = pd.read_csv('data_explore/air_quality.csv')
pollution = pd.read_csv('data_explore/pollution.csv')
country_map = pd.read_csv('data_explore/country_map.csv')


In [3]:
pollution['pollutant'].value_counts()

pollutant
PM2.5               116
PM10                 71
NO2                  55
SO2                  55
O3                   52
CO                   49
TEMPERATURE          28
PM1                  17
RELATIVEHUMIDITY     16
UM003                16
NO                    5
NOX                   5
BC                    2
Name: count, dtype: int64

In [4]:
# Only using top pollutants
pollution = pollution[pollution['pollutant'].isin(['PM2.5', 'PM10', 'NO2', 'SO2', 'O3', 'CO'])]

In [5]:
co2.set_index(['country_code'], inplace=True)
co2 = co2[~co2.index.get_level_values('country_code').duplicated(keep='last')]


In [6]:
co2.head()

Unnamed: 0_level_0,year,co2,co2_per_capita
country_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ABW,2016,883.747,0.008439
AFG,2019,6079.999924,0.00016
AGO,2019,25209.999084,0.000779
ALB,2019,4829.999924,0.001683
AND,2019,500.0,0.006535


In [7]:
aq.set_index(['country_code'], inplace=True)

In [8]:
co2['aq'] = aq['aq']

In [9]:
co2.head()

Unnamed: 0_level_0,year,co2,co2_per_capita,aq
country_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ABW,2016,883.747,0.008439,
AFG,2019,6079.999924,0.00016,19.0
AGO,2019,25209.999084,0.000779,14.4
ALB,2019,4829.999924,0.001683,14.0
AND,2019,500.0,0.006535,18.666667


In [10]:
# Suppose your DataFrame has columns: 'country_code', 'year', 'pollutant', 'unit', 'value'

pollution_pivot = pollution.pivot_table(
    index=['country_code'],   # Rows: one per country per year
    columns='pollutant',              # Columns: one per pollutant
    values='value'                    # Fill values from the 'value' column
).reset_index()

# Optional: flatten the column MultiIndex if needed
pollution_pivot.columns.name = None


In [11]:
pollution_pivot.head()

Unnamed: 0,country_code,CO,NO2,O3,PM10,PM2.5,SO2
0,AFG,,,,,-431.5,
1,AND,100.0,4.7,68.333333,11.0,7.9,0.2
2,ARE,306.666667,9.185556,48.202308,247.722222,-204.2,9.108889
3,ARG,590.0,12.5,26.0,10.4,6.148884,5.71
4,ARM,,,,,10.4,


In [12]:
pollution_pivot.set_index(['country_code'], inplace=True)

In [13]:
for pollutant in pollution_pivot.columns:
    co2[pollutant] = pollution_pivot[pollutant]
    co2.loc[co2[pollutant] < 0, pollutant] = None

    

In [14]:
co2 = co2.merge(country_map, on='country_code', how='left')
co2.drop(columns=['country_name', 'region', 'sub_region'], inplace=True)

In [15]:
co2.head()

Unnamed: 0,country_code,year,co2,co2_per_capita,aq,CO,NO2,O3,PM10,PM2.5,SO2,region_code,sub_region_code
0,ABW,2016,883.747,0.008439,,,,,,,,19.0,29.0
1,AFG,2019,6079.999924,0.00016,19.0,,,,,,,142.0,34.0
2,AGO,2019,25209.999084,0.000779,14.4,,,,,,,2.0,17.0
3,ALB,2019,4829.999924,0.001683,14.0,,,,,,,150.0,39.0
4,AND,2019,500.0,0.006535,18.666667,100.0,4.7,68.333333,11.0,7.9,0.2,150.0,39.0


In [16]:
co2.drop(columns=['year', 'co2'], inplace=True)

In [17]:
co2.isna().sum() / len(co2)

country_code       0.000000
co2_per_capita     0.000000
aq                 0.359223
CO                 0.766990
NO2                0.742718
O3                 0.757282
PM10               0.684466
PM2.5              0.553398
SO2                0.747573
region_code        0.000000
sub_region_code    0.000000
dtype: float64

## Imputation Dependant (Calculation)

In [18]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression

def impute(df, group_col="region_code",  random_state=42):
    np.random.seed(random_state)
    df = df.copy()

    # Only numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns

    # Order columns by missingness (ascending)
    missing_order = sorted(numeric_cols, key=lambda c: df[c].isna().mean())

    for target_col in missing_order:
        if df[target_col].isna().sum() == 0:
            continue

        # --- Region-wise: only impute if region has â‰¥60% non-missing in this column
        valid_regions = []
        for region, subset in df.groupby(group_col):
            total = len(subset)
            non_missing = subset[target_col].notna().sum()
            if total > 0 and non_missing / total >= 0.4:
                valid_regions.append(region)

        for region in valid_regions:
            region_idx = df[df[group_col] == region].index
            median_val = df.loc[region_idx, target_col].median()
            df.loc[region_idx, target_col] = df.loc[region_idx, target_col].fillna(median_val)

    return df

df = co2.copy()

df = impute(df, group_col="region_code")
df.head()


Unnamed: 0,country_code,co2_per_capita,aq,CO,NO2,O3,PM10,PM2.5,SO2,region_code,sub_region_code
0,ABW,0.008439,,,,,,,,19.0,29.0
1,AFG,0.00016,19.0,,,,,24.5,,142.0,34.0
2,AGO,0.000779,14.4,,,,,,,2.0,17.0
3,ALB,0.001683,14.0,319.097689,15.335486,52.565964,20.231759,13.97136,3.545794,150.0,39.0
4,AND,0.006535,18.666667,100.0,4.7,68.333333,11.0,7.9,0.2,150.0,39.0


In [19]:
aq_imputed = df.loc[:,['country_code', 'aq', 'PM2.5', 'PM10']]
aq_imputed.set_index('country_code', inplace=True)
aq_imputed.to_csv('data/aq_imputed.csv')


In [20]:
aq_imputed.isna().sum()

aq        39
PM2.5     88
PM10     132
dtype: int64