In [4]:
# Air Quality Index (AQI) Prediction for Indian Metropolitan Cities

# --- Import Libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

import joblib

from statsmodels.tsa.arima.model import ARIMA
import warnings
warnings.filterwarnings('ignore')

# --- Load Dataset ---
# Colab-only step: files.upload() opens a browser file picker. The code below
# treats its return value as a mapping of uploaded filename -> raw file bytes.
from google.colab import files
import io
import zipfile

uploaded = files.upload()

# A cancelled/empty upload would otherwise surface as an opaque IndexError below.
if not uploaded:
    raise RuntimeError(
        "No file was uploaded. Please upload city_day.csv or a .zip archive containing it."
    )

# Only the first uploaded file is used.
uploaded_file_key = list(uploaded.keys())[0]
uploaded_bytes = uploaded[uploaded_file_key]

if zipfile.is_zipfile(io.BytesIO(uploaded_bytes)):
    # Archive upload: unpack into the working directory, then read the CSV from disk.
    with zipfile.ZipFile(io.BytesIO(uploaded_bytes), 'r') as zip_ref:
        if 'city_day.csv' not in zip_ref.namelist():
            raise FileNotFoundError(
                f"'{uploaded_file_key}' does not contain city_day.csv at its top level."
            )
        zip_ref.extractall('.')
    df = pd.read_csv('city_day.csv')
else:
    # Plain (non-zip) upload: parse the uploaded bytes directly as CSV.
    df = pd.read_csv(io.BytesIO(uploaded_bytes))

# --- Initial Preprocessing ---
# Columns taken from the dataset: City, six pollutant readings, and the AQI target.
# TEMP, HUMIDITY and WIND_SPEED are NOT selected from the file; they are
# synthesized further down as placeholder columns.
df = df[['City', 'PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'O3', 'AQI']]
# Keep only rows where every selected column is present.
df = df.dropna()

# Filter for Metro Cities
metro_cities = ['Delhi', 'Mumbai', 'Bengaluru', 'Chennai', 'Hyderabad', 'Kolkata']
# .copy() gives an independent frame, so the column assignments below do not
# write into a filtered view of the previous frame.
df = df[df['City'].isin(metro_cities)].copy()

# Add dummy weather features TEMP, HUMIDITY, WIND_SPEED.
# These are uniform random draws, not measurements. A fixed seed keeps the
# columns (and every metric computed from them) identical across re-runs.
weather_rng = np.random.default_rng(42)
df['TEMP'] = weather_rng.uniform(15, 40, df.shape[0])
df['HUMIDITY'] = weather_rng.uniform(20, 90, df.shape[0])
df['WIND_SPEED'] = weather_rng.uniform(0.5, 5.0, df.shape[0])

# --- Start User Session ---
# Interactive loop: for each city the user names, show EDA plots, train four
# regressors on that city's rows, report R2/MSE, save the best model, and print
# a 10-step ARIMA forecast of AQI with health advice.

EXIT_COMMAND = 'close the aqi prediction session'

# Need at least 2 held-out rows for R2 to be defined with test_size=0.2, and
# some history for ARIMA(5,1,0); below this the city is skipped with a message.
MIN_RECORDS_PER_CITY = 10

# AQI bands as (inclusive upper bound, status, advice); anything above the last
# bound falls through to AQI_HAZARDOUS.
AQI_BANDS = [
    (50, "Good",
     "Enjoy outdoor activities freely. Air is clean."),
    (100, "Moderate",
     "Safe for most. Sensitive individuals should reduce prolonged outdoor exertion."),
    (150, "Unhealthy for Sensitive Groups",
     "Children, elderly, and heart patients should limit outdoor exposure."),
    (200, "Unhealthy",
     "Everyone should reduce prolonged outdoor exertion. Use masks if needed."),
    (300, "Very Unhealthy",
     "Avoid outdoor activities. Sensitive groups must stay indoors."),
]
AQI_HAZARDOUS = (
    "Hazardous",
    "Stay indoors with air purifiers. Outdoor activity is extremely dangerous.",
)


def _aqi_status_and_advice(aqi):
    """Return the (status, advice) pair for a single AQI value.

    The first band whose upper bound is >= ``aqi`` wins; values above every
    bound (and values that compare False to all bounds) map to "Hazardous".
    """
    for upper_bound, status, advice in AQI_BANDS:
        if aqi <= upper_bound:
            return status, advice
    return AQI_HAZARDOUS


# Case-insensitive lookup so "delhi" resolves to "Delhi", matching how the exit
# command is already compared case-insensitively.
city_lookup = {name.lower(): name for name in metro_cities}

# Model inputs: six pollutant columns plus the three synthesized weather columns.
features = ['PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'O3', 'TEMP', 'HUMIDITY', 'WIND_SPEED']

print("Welcome to the Air Quality Prediction System for Indian Metropolitan Cities!")
print("Available Cities:", metro_cities)
print('To stop, type: "close the aqi prediction session"\n')

while True:
    user_entry = input("Enter a city name to predict AQI or type 'close the aqi prediction session' to exit: ").strip()

    if user_entry.lower() == EXIT_COMMAND:
        print("\n✅ AQI prediction Session closed. Thank you for exploring the air quality!")
        break

    city_to_predict = city_lookup.get(user_entry.lower())
    if city_to_predict is None:
        print("❌ Invalid city name. Please choose from the available list.\n")
        continue

    df_city = df[df['City'] == city_to_predict]

    print(f"\n🔵 Selected City: {city_to_predict}")
    print(f"🔹 Number of Records Available: {len(df_city)}")

    # A city can be left with no (or almost no) complete rows after dropna();
    # skip it instead of crashing inside the scaler / split / ARIMA fit.
    if len(df_city) < MIN_RECORDS_PER_CITY:
        print(f"⚠️ Only {len(df_city)} complete records for {city_to_predict}; "
              f"at least {MIN_RECORDS_PER_CITY} are needed to train and forecast.\n")
        continue

    # --- Train-Test Split ---
    # Split BEFORE scaling so the held-out rows never influence the scaler.
    y = df_city['AQI']
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        df_city[features], y, test_size=0.2, random_state=42
    )

    # --- Feature Scaling ---
    # Fit mean/std on the training rows only, then apply the same transform to
    # the test rows (fitting on all rows leaks test statistics into training).
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # --- Exploratory Data Analysis (EDA) ---
    print("\n--- 📊 Feature Distributions ---")
    print("""
    - PM2.5 (Fine Particulate Matter): Dangerous inhalable particles.
    - PM10 (Coarse Particulate Matter): Causes respiratory diseases.
    - NO2 (Nitrogen Dioxide): Emissions from vehicles and factories.
    - SO2 (Sulphur Dioxide): Results from burning fossil fuels.
    - CO (Carbon Monoxide): Harmful gas due to incomplete burning.
    - O3 (Ozone): Smog-forming gas.
    - TEMP (Temperature): Influences chemical reactions in air.
    - HUMIDITY: Affects dispersion of pollutants.
    - WIND_SPEED: Carries pollutants to different areas.
    - AQI (Air Quality Index): A measure of how polluted the air is.
    """)
    # Histograms
    df_city[features + ['AQI']].hist(figsize=(16,14))
    plt.suptitle(f'Feature Distributions - {city_to_predict}')
    plt.show()

    # Correlation Matrix
    plt.figure(figsize=(14,12))
    sns.heatmap(df_city[features + ['AQI']].corr(), annot=True, cmap='coolwarm')
    plt.title(f'Correlation Matrix - {city_to_predict}')
    plt.show()

    # --- Model Training and Evaluation ---
    # Insertion order here drives the order of the summary and the MSE chart.
    candidate_models = {
        'Linear Regression': LinearRegression(),
        'Decision Tree': DecisionTreeRegressor(random_state=42),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
        'SVR': SVR(),
    }

    results = {}      # model name -> R2 on the held-out rows
    predictions = {}  # model name -> predicted AQI for the held-out rows
    for model_name, estimator in candidate_models.items():
        estimator.fit(X_train, y_train)
        predictions[model_name] = estimator.predict(X_test)
        results[model_name] = r2_score(y_test, predictions[model_name])

    # --- Model Performance Summary ---
    print("\n--- 🏆 Model Performance Summary (R2 Score) ---")
    print("""
    What is R2 Score?
    - It explains how much of the actual AQI variance is captured by our model.
    - A score of 1.0 means perfect prediction; closer to 1 is better!
    """)
    for model, score in results.items():
        print(f"{model}: R2 Score = {score:.4f}")

    # Highlight Best Model
    best_model = max(results, key=results.get)
    print(f"\n⭐ Best Performing Model for {city_to_predict}: {best_model}")

    # Save Best Model
    # Persist whichever model won (previously nothing was written unless Random
    # Forest was best). The scaler is saved alongside it because the model was
    # trained on scaled inputs and needs the same transform at prediction time.
    joblib.dump(candidate_models[best_model], f'best_aqi_model_{city_to_predict}.pkl')
    joblib.dump(scaler, f'aqi_scaler_{city_to_predict}.pkl')
    print(f"💾 {best_model} model saved successfully.\n")

    # --- Plotting Loss Curves (MSE) ---
    # Built from the same dict as `results`, so bars and labels cannot drift apart.
    models = list(results.keys())
    errors = [mean_squared_error(y_test, predictions[name]) for name in models]

    plt.figure(figsize=(8,6))
    plt.bar(models, errors, color='skyblue')
    plt.title('Model Loss (Mean Squared Error)')
    plt.xlabel('Model')
    plt.ylabel('MSE')
    plt.show()

    # --- Time Series Forecasting (Optional - ARIMA) ---
    print("\n--- 📈 ARIMA Forecast for Future AQI ---")
    print("""
    ARIMA Model:
    - It uses historical air quality data to predict future AQI values.
    - Helps people and authorities to take early action.
    """)

    # Build the AQI series without mutating df_city.
    if 'Date' in df_city.columns:
        # Real dates: rows removed by dropna() leave gaps, so re-grid to a daily
        # frequency and interpolate; the forecast index is then made of dates.
        # Assumes one row per date per city — TODO confirm (asfreq needs unique dates).
        ts = (
            df_city.assign(Date=pd.to_datetime(df_city['Date']))
            .set_index('Date')['AQI']
            .sort_index()
            .asfreq('D')
            .interpolate()
        )
    else:
        # No Date column is available (it is not among the columns selected
        # during preprocessing), so rows are laid on a synthetic daily index.
        ts = df_city['AQI'].copy()
        ts.index = pd.date_range(start='1/1/2015', periods=len(ts), freq='D')
        print("ℹ️ No 'Date' column available: forecast dates below are placeholders "
              "counted from 2015-01-01, not real calendar dates.")

    model_arima = ARIMA(ts, order=(5,1,0))
    model_arima_fit = model_arima.fit()
    forecast = model_arima_fit.forecast(steps=10)

    print("\n🔮 Forecasted AQI for Next 10 Days:")
    print("""
    - Each date below shows the predicted Air Quality Index (AQI) value.
    - AQI measures air pollution level:
        • 0–50: Good (Healthy air)
        • 51–100: Moderate (Acceptable but some pollutants present)
        • 101–150: Unhealthy for sensitive groups (e.g., kids, elderly)
        • 151–200: Unhealthy (Everyone may feel health effects)
        • 201–300: Very Unhealthy (Health alert)
        • 301+: Hazardous (Emergency condition)
    """)

    print(forecast)

    # --- Precautions Based on Forecast ---
    print("\n🛡️ Precautions Based on Predicted AQI Levels:")
    for date, aqi in forecast.items():
        date_str = date.strftime("%Y-%m-%d")
        status, advice = _aqi_status_and_advice(aqi)
        print(f"{date_str}: AQI = {aqi:.2f} ({status}) ➡️ Advice: {advice}")


    # --- Final Sustainability Note ---
    print("\n--- 🌍 How Air Quality Prediction Helps ---")
    print(f"""
    Predicting AQI in {city_to_predict} helps:
    - Alerting people about bad air days.
    - Planning outdoor activities safely.
    - Helping government and environment bodies act in advance.
    - Raising awareness about pollution control and health safety.

    Machine Learning based forecasting empowers us to build cleaner, healthier cities! 🌱
    """)

# Session Ends when user types 'close the AQI prediction session'


Saving city_day.csv.zip to city_day.csv (2).zip
Welcome to the Air Quality Prediction System for Indian Metropolitan Cities!
Available Cities: ['Delhi', 'Mumbai', 'Bengaluru', 'Chennai', 'Hyderabad', 'Kolkata']
To stop, type: "close the aqi prediction session"

Enter a city name to predict AQI or type 'close the aqi prediction session' to exit: close the aqi prediction session

✅ AQI prediction Session closed. Thank you for exploring the air quality!
