# Air Quality Prediction Model

Train a machine learning model to predict the Air Quality Index (AQI) from synthetic sensor data.

**Model**: Random Forest Regressor
**Features**: 10 sensor readings
**Target**: Air Quality Index (0-300)

## 1. Import Required Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
import os

# Display settings
# The 'seaborn-v0_8-*' style names were introduced in Matplotlib 3.6.
plt.style.use('seaborn-v0_8-darkgrid')
# Seed NumPy's global random state so later np.random draws are repeatable.
np.random.seed(42)

print('✓ Libraries imported successfully')

## 2. Create Synthetic Air Quality Dataset

In [None]:
# Generate synthetic air quality data.
# All draws use NumPy's global random state, so the values depend on whatever
# seed (if any) was set before this code runs.
n_samples = 1000

# Create features: 10 sensor readings, each an independent uniform draw.
# The columns are sampled in the order listed, which fixes how the random
# stream is consumed.
data = {
    'sensor_1': np.random.uniform(10, 50, n_samples),  # PM2.5
    'sensor_2': np.random.uniform(20, 100, n_samples),  # PM10
    'sensor_3': np.random.uniform(0, 50, n_samples),   # NO2
    'sensor_4': np.random.uniform(0, 40, n_samples),   # SO2
    'sensor_5': np.random.uniform(0, 30, n_samples),   # CO
    'sensor_6': np.random.uniform(0, 100, n_samples),  # O3
    'sensor_7': np.random.uniform(30, 80, n_samples),  # Humidity
    'sensor_8': np.random.uniform(0, 40, n_samples),   # Temperature
    'sensor_9': np.random.uniform(900, 1050, n_samples),  # Pressure
    'sensor_10': np.random.uniform(0, 20, n_samples),  # Wind Speed
}

# Create AQI based on sensor values (simplified calculation).
# Only sensors 1-4, 7 and 8 enter the formula; sensors 5, 6, 9 and 10 have no
# effect on the target. Given the sampling ranges above, the noise-free value
# lies between 4 and 189.
aqi = (data['sensor_1'] * 1.2 +
       data['sensor_2'] * 0.8 +
       data['sensor_3'] * 0.6 +
       data['sensor_4'] * 0.5 -
       data['sensor_7'] * 0.3 +
       data['sensor_8'] * 0.2)

# Add Gaussian measurement noise (mean 0, standard deviation 5) first, then
# clip, so the final target is guaranteed to stay inside the 0-300 AQI range.
aqi = np.clip(aqi + np.random.normal(0, 5, n_samples), 0, 300)

data['AQI'] = aqi

# Create DataFrame: 10 sensor columns plus the AQI target column
df = pd.DataFrame(data)

print(f'Dataset shape: {df.shape}')
print('\nFirst 5 rows:')
print(df.head())
print('\nDataset info:')
print(df.describe())

## 3. Data Preparation

In [None]:
# Report the number of missing entries in each column
print('Missing values:')
print(df.isna().sum())

# The AQI column is the regression target; every other column is an input
y = df['AQI']
X = df.drop(columns='AQI')

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f'\nTraining set size: {len(X_train)}')
print(f'Test set size: {len(X_test)}')
print(f'\nFeatures: {list(X_train.columns)}')

## 4. Train Random Forest Model

In [None]:
# Create and train the Random Forest Regressor on the training split
model = RandomForestRegressor(
    n_estimators=100,      # number of trees in the forest
    max_depth=15,          # upper bound on the depth of each tree
    min_samples_split=5,   # samples required before an internal node may split
    min_samples_leaf=2,    # samples required at every leaf
    random_state=42,       # fixed seed so repeated fits give the same forest
    n_jobs=-1              # fit the trees in parallel on all processors
)

print('Training model...')
model.fit(X_train, y_train)
print('✓ Model training completed')

## 5. Evaluate Model Performance

In [None]:
# Make predictions on both splits so training and test error can be compared
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Calculate metrics: RMSE (square root of the mean squared error), MAE and R²
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
train_mae = mean_absolute_error(y_train, y_pred_train)
test_mae = mean_absolute_error(y_test, y_pred_test)
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)

# Print one report section per split, framed by divider lines
divider = '=' * 50
print(divider)
print('MODEL EVALUATION METRICS')
print(divider)
for split_name, rmse, mae, r2 in (
    ('Training Set', train_rmse, train_mae, train_r2),
    ('Test Set', test_rmse, test_mae, test_r2),
):
    print(f'\n{split_name}:')
    print(f'  RMSE: {rmse:.4f}')
    print(f'  MAE:  {mae:.4f}')
    print(f'  R²:   {r2:.4f}')
print(divider)

## 6. Feature Importance Analysis

In [None]:
# Pair each feature name with the importance score reported by the fitted
# model, then order the table from most to least important
importance_table = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': model.feature_importances_,
})
feature_importance = importance_table.sort_values(
    by='Importance',
    ascending=False,
)

print('\nFeature Importance:')
print(feature_importance)

# Draw the scores as a horizontal bar chart, one bar per feature
plt.figure(figsize=(10, 6))
plt.barh(
    feature_importance['Feature'],
    feature_importance['Importance'],
    color='steelblue',
)
plt.title('Feature Importance - Air Quality Model')
plt.ylabel('Feature')
plt.xlabel('Importance Score')
plt.tight_layout()
plt.show()

## 7. Save Trained Model

In [None]:
# Create models directory if it doesn't exist.
# The path is relative, so it resolves against the current working directory.
os.makedirs('../models', exist_ok=True)

# Save the fitted model
model_path = '../models/air_quality_model.pkl'
joblib.dump(model, model_path)

# Save feature names (in training column order) for later use
features_path = '../models/air_quality_features.pkl'
joblib.dump(X_train.columns.tolist(), features_path)

print(f'✓ Model saved to {model_path}')
print(f'✓ Features saved to {features_path}')
# os.path.getsize returns bytes; divide by 1024 to report kilobytes
print(f'\nModel file size: {os.path.getsize(model_path) / 1024:.2f} KB')

## 8. Test Predictions with Sample Data

In [None]:
# Create a single-row sample input for testing. The column names match the
# sensor columns used when building the training data.
sample_data = pd.DataFrame({
    'sensor_1': [35],   # PM2.5
    'sensor_2': [60],   # PM10
    'sensor_3': [20],   # NO2
    'sensor_4': [15],   # SO2
    'sensor_5': [10],   # CO
    'sensor_6': [50],   # O3
    'sensor_7': [55],   # Humidity
    'sensor_8': [25],   # Temperature
    'sensor_9': [1013], # Pressure
    'sensor_10': [5],   # Wind Speed
})

# Make prediction; predict() returns one value per row, so take the first
prediction = model.predict(sample_data)[0]

print('Sample Input Data:')
print(sample_data)
print(f'\nPredicted AQI: {prediction:.2f}')

# Interpret AQI: map the predicted value onto a status band using inclusive
# upper bounds of 50, 100, 150 and 200; anything above 200 falls in the
# last band.
if prediction <= 50:
    quality = '🟢 Good'
elif prediction <= 100:
    quality = '🟡 Moderate'
elif prediction <= 150:
    quality = '🟠 Unhealthy for Sensitive Groups'
elif prediction <= 200:
    quality = '🔴 Unhealthy'
else:
    quality = '⚫ Very Unhealthy'

print(f'Air Quality Status: {quality}')