In [24]:
import pandas as pd
import numpy as np

# Read the processed dataset with the 'Date' column parsed as datetimes,
# then index the frame by that column and order the rows by index value.
df = (
    pd.read_csv('../data/processed_city_data.csv', parse_dates=['Date'])
    .set_index('Date')
    .sort_index()
)

# 1. Datetime-based features
# Expose each calendar component of the date index as its own column.
# pandas numbers weekdays from 0 (Monday) to 6 (Sunday).
df = df.assign(
    hour=df.index.hour,
    day=df.index.day,
    month=df.index.month,
    weekday=df.index.weekday,
    year=df.index.year,
)

# 2. Season Encoding
def get_season(month):
    """Return the season name for a calendar month number.

    Args:
        month: Month number, where 1 is January and 12 is December.

    Returns:
        'winter' for 12, 1 and 2; 'spring' for 3-5; 'summer' for 6-8;
        'fall' for every other value, including values outside 1-12.
    """
    season_months = (
        ('winter', (12, 1, 2)),
        ('spring', (3, 4, 5)),
        ('summer', (6, 7, 8)),
    )
    for season, months in season_months:
        if month in months:
            return season
    # Anything not matched above (September-November, or an out-of-range
    # value) falls through to 'fall'.
    return 'fall'

# Label every row with the season name that get_season returns for its month.
# The labels are stored as a categorical with all four seasons declared up
# front, so the one-hot step always emits the same three indicator columns
# (season_spring, season_summer, season_winter; 'fall' is the dropped
# baseline) even when the data covers only part of a year. With plain string
# labels, get_dummies only creates columns for the seasons actually present,
# so the output schema would change with the date range of the input.
df['season'] = pd.Categorical(
    df['month'].apply(get_season),
    categories=['fall', 'spring', 'summer', 'winter'],
)
df = pd.get_dummies(df, columns=['season'], drop_first=True)

# 3. Rolling statistics
# For each pollutant column, add the mean and standard deviation over a
# trailing window of up to 7 rows (the current row included). min_periods=1
# lets the window produce a value from a single observation; the standard
# deviation of one value is NaN, so each *_roll_std_7 column starts with NaN.
# NOTE(review): the window counts rows, not calendar time - confirm the
# sampling interval of the data matches the intended 7-step horizon.
for pollutant in ('pm2_5', 'pm10', 'no2', 'so2', 'co', 'o3'):
    trailing = df[pollutant].rolling(window=7, min_periods=1)
    df[f'{pollutant}_roll_mean_7'] = trailing.mean()
    df[f'{pollutant}_roll_std_7'] = trailing.std()

# 4. Lag features
# For each pollutant column, add the values observed 1, 2 and 3 rows earlier.
# shift() moves by row position, so the first k rows of a lag-k column are NaN.
for pollutant in ('pm2_5', 'pm10', 'no2', 'so2', 'co', 'o3'):
    for steps_back in (1, 2, 3):
        df[f'{pollutant}_lag{steps_back}'] = df[pollutant].shift(steps_back)

# 5. AQI bucket encoding (Ordinal)
# Bucket labels are listed in order from 'Good' to 'Severe'; each label's
# position in the tuple is its ordinal code (Good = 0 ... Severe = 5).
aqi_mapping = {
    label: rank
    for rank, label in enumerate(
        ('Good', 'Satisfactory', 'Moderate', 'Poor', 'Very Poor', 'Severe')
    )
}
# Labels that are not keys of the mapping (and missing values) become NaN.
df['aqi_bucket_encoded'] = df['aqi_bucket'].map(aqi_mapping)

# Discard every row that has a missing value in any column. This removes the
# leading rows left incomplete by the rolling/lag features, and also any row
# with a NaN elsewhere (for example an AQI bucket label that was not mapped).
df = df.dropna()

# Write the engineered frame, including its date index, to CSV and report it.
df.to_csv('../data/feature_engineered_data.csv')
print("Phase 1 complete: Saved to '../data/feature_engineered_data.csv'")


Phase 1 complete: Saved to '../data/feature_engineered_data.csv'
