# SkyGuard: AQI Prediction Model Training

## 1. Data Loading

In [1]:
import pandas as pd
import numpy as np
import joblib
import os

df = pd.read_csv('../data/city_day.csv')
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(['City', 'Date'])
df.head()

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-01,,,0.92,18.22,17.15,,0.92,27.64,133.36,0.0,0.02,0.0,,
1,Ahmedabad,2015-01-02,,,0.97,15.69,16.46,,0.97,24.55,34.06,3.68,5.5,3.77,,
2,Ahmedabad,2015-01-03,,,17.4,19.3,29.7,,17.4,29.07,30.7,6.8,16.4,2.25,,
3,Ahmedabad,2015-01-04,,,1.7,18.48,17.97,,1.7,18.59,36.08,4.43,10.14,1.0,,
4,Ahmedabad,2015-01-05,,,22.1,21.42,37.76,,22.1,39.33,39.31,7.01,18.89,2.78,,


## 2. Feature Engineering

In [2]:
from sklearn.preprocessing import LabelEncoder

# Create lag features for AQI
df['AQI_lag1'] = df.groupby('City')['AQI'].shift(1)
df['AQI_lag2'] = df.groupby('City')['AQI'].shift(2)
df['AQI_lag3'] = df.groupby('City')['AQI'].shift(3)

# Create rolling mean features
df['AQI_rolling_mean_3'] = df.groupby('City')['AQI'].rolling(window=3, min_periods=1).mean().reset_index(0, drop=True)
df['AQI_rolling_mean_7'] = df.groupby('City')['AQI'].rolling(window=7, min_periods=1).mean().reset_index(0, drop=True)

# Extract date features
df['day_of_year'] = df['Date'].dt.dayofyear
df['month'] = df['Date'].dt.month
df['day_of_week'] = df['Date'].dt.dayofweek

# Encode city
le = LabelEncoder()
df['city_encoded'] = le.fit_transform(df['City'])

# Select features for training
feature_columns = [
    'city_encoded', 'day_of_year', 'month', 'day_of_week',
    'AQI_lag1', 'AQI_lag2', 'AQI_lag3',
    'AQI_rolling_mean_3', 'AQI_rolling_mean_7'
]

# Add pollutant features if available
pollutant_columns = ['PM2.5', 'PM10', 'NO2', 'CO', 'SO2', 'O3']
for col in pollutant_columns:
    if col in df.columns:
        feature_columns.append(col)

# Remove rows with missing target values
df = df.dropna(subset=['AQI'])

# Remove rows with too many missing features
df = df.dropna(subset=feature_columns)

X = df[feature_columns]
y = df['AQI']

joblib.dump(le, '../models/city_encoder.joblib')

X.head()

Unnamed: 0,city_encoded,day_of_year,month,day_of_week,AQI_lag1,AQI_lag2,AQI_lag3,AQI_rolling_mean_3,AQI_rolling_mean_7,PM2.5,PM10,NO2,CO,SO2,O3
1598,0,138,5,5,356.0,330.0,281.0,348.333333,331.5,34.11,138.31,75.23,13.27,88.66,42.22
1599,0,139,5,6,359.0,356.0,330.0,420.666667,374.6,33.69,111.73,68.9,34.56,80.9,36.95
1600,0,140,5,0,547.0,359.0,356.0,573.0,447.666667,42.31,118.65,81.84,17.47,89.57,46.68
1601,0,141,5,1,813.0,547.0,359.0,560.333333,429.571429,24.6,103.88,81.24,11.03,80.74,46.65
1602,0,142,5,2,321.0,813.0,547.0,468.0,428.0,27.93,103.3,76.75,11.44,86.48,54.34


## 3. Model Training

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train RandomForest model
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train_scaled, y_train)

## 4. Evaluation

In [4]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Make predictions
y_pred = model.predict(X_test_scaled)

# Evaluate model
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f'Model Performance:')
print(f'MAE: {mae:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'R²: {r2:.3f}')

Model Performance:
MAE: 7.55
RMSE: 20.84
R²: 0.963


## 5. Save Model and Scaler

In [5]:
# Create models directory if it doesn't exist
os.makedirs('../models', exist_ok=True)

# Save model and scaler
joblib.dump(model, '../models/aqi_model.joblib')
joblib.dump(scaler, '../models/scaler.joblib')

# Save feature names
joblib.dump(X_train.columns.tolist(), '../models/feature_names.joblib')

print('Model and scaler saved successfully!')

Model and scaler saved successfully!
