In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm

# Preprocessing

# Read the raw measurements, parsing "date" as datetimes, and put the rows
# in chronological order.
df = pd.read_csv("pollution.csv", parse_dates=["date"]).sort_values("date")

# Pollutant columns that every later step operates on.
pollutants = ["PM2.5", "PM10", "O3", "NO2", "CO", "SO2"]

# Fill gaps by linear interpolation between neighbouring rows.
# NOTE(review): the frame is sorted by date only; if several rows share a
# date (the later (date, district) grouping suggests they can), this
# interpolates across those rows in positional order — confirm that is
# intended.
df[pollutants] = df.loc[:, pollutants].interpolate(method="linear")

# Outlier removal
# Rule: a value is removed when BOTH its previous and its next observation
# lie outside the [mean - std, mean + std] band of that pollutant.
for col in pollutants:
    mean = df[col].mean()
    std = df[col].std()

    lower = mean - std
    upper = mean + std

    prev_vals = df[col].shift(1)
    next_vals = df[col].shift(-1)

    # True where the neighbour exists and lies inside the band.
    mask_prev = prev_vals.between(lower, upper)
    mask_next = next_vals.between(lower, upper)

    # shift() yields NaN at the first/last row and between() is False for
    # NaN, so a missing neighbour must not be counted as "outside the band";
    # otherwise the boundary rows would be flagged merely for being at the
    # edge of the series.
    prev_outside = prev_vals.notna() & ~mask_prev
    next_outside = next_vals.notna() & ~mask_next

    # Both neighbours must be outside the band (the previous
    # `~(mask_prev & mask_next)` flagged a value when EITHER neighbour was).
    outlier_mask = prev_outside & next_outside
    df.loc[outlier_mask, col] = np.nan

# Interpolate again after outlier removal.
# limit_direction="both" also fills gaps at the very start of a column,
# which the default forward-only fill would leave as NaN.
df[pollutants] = df[pollutants].interpolate(
    method="linear", limit_direction="both"
)

# Discard the location columns; any that are absent from the frame are
# silently skipped.
drop_cols = ["latitude", "longitude", "city", "country"]
df = df.drop(columns=drop_cols, errors="ignore")

# Collapse to one row per (date, district) holding the mean of each
# pollutant, then turn the group keys back into ordinary columns.
district_means = df.groupby(["date", "district"])[pollutants].mean()
df = district_means.reset_index()

# Normalize pollutants
# Fit the scaler on the leading 80% of rows only — the same rows the 80/20
# train/test split at the end of this script assigns to the training set —
# so that the min/max of the held-out rows do not leak into the scaling.
# All rows are then transformed with those training statistics; held-out
# values may therefore fall slightly outside [0, 1].
scaler = MinMaxScaler()
n_fit = int(len(df) * 0.8) or len(df)  # never fit on an empty slice
scaler.fit(df[pollutants].iloc[:n_fit])
df[pollutants] = scaler.transform(df[pollutants])

# Time-series decomposition
# Maps each pollutant column name to its seasonal_decompose result.
decompositions = {}

for col in pollutants:
    series = df[col]

    # Choose additive or multiplicative based on variance heuristic.
    # A multiplicative model is only valid for strictly positive data:
    # statsmodels raises ValueError when the series contains zero or
    # negative values, and min-max scaling maps the smallest fitted value
    # to exactly 0. Fall back to the additive model in that case instead of
    # crashing.
    if series.std() > 0.2 and (series > 0).all():
        model_type = "multiplicative"
    else:
        model_type = "additive"

    # NOTE(review): period=365 is counted in rows; df holds one row per
    # (date, district), so this equals one year only if there is a single
    # district with daily data — confirm.
    decomposition = sm.tsa.seasonal_decompose(
        series, model=model_type, period=365, extrapolate_trend="freq"
    )
    decompositions[col] = decomposition

# Positional 80/20 split: the leading 80% of rows form the training set and
# the remaining rows the test set.
split_idx = int(0.8 * len(df))
train, test = df.iloc[:split_idx], df.iloc[split_idx:]

# Report the size of each partition.
for label, part in (("Train shape:", train), ("Test shape:", test)):
    print(label, part.shape)
