In [None]:
# --- 1. Installation and Imports ---
!pip install xgboost scikit-learn pandas
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from google.colab import files
import io
import os

# File names shared by the later cells of this notebook
INPUT_FILE_NAME = 'airnow_cleaned.csv'                 # CSV expected from the upload step
MODEL_FILE_NAME = 'xgboost_air_quality_model.joblib'   # where the trained model is saved

# Start-up banner: headline followed by a 50-character rule on its own line
print("Starting Air Quality Prediction Model Training...", "-" * 50, sep="\n")



Starting Air Quality Prediction Model Training...
--------------------------------------------------


In [None]:
# --- 2. Upload Data (Run this cell and upload 'airnow_cleaned.csv') ---
print(f"Please upload the '{INPUT_FILE_NAME}' file now...")
uploaded = files.upload()

# Fail fast when the expected file is absent. Merely printing a message would
# leave `df` undefined (or stale from an earlier run) and push the failure into
# a later cell, far from its cause.
# NOTE(review): the key may differ from the expected name if the file was
# renamed during upload; the names actually received are included in the error.
if INPUT_FILE_NAME not in uploaded:
    raise FileNotFoundError(
        f"Error: Could not find {INPUT_FILE_NAME} in the uploaded files. "
        f"Please check the file name. Received: {sorted(uploaded)}"
    )

# Read the data from the uploaded file: the upload maps file name -> raw bytes,
# which are decoded as UTF-8 text and parsed as CSV.
data_io = io.StringIO(uploaded[INPUT_FILE_NAME].decode('utf-8'))
df = pd.read_csv(data_io)

print(f"\nSuccessfully loaded {len(df)} rows from '{INPUT_FILE_NAME}'.")



Please upload the 'airnow_cleaned.csv' file now...


Saving airnow_cleaned.csv to airnow_cleaned.csv

Successfully loaded 2716 rows from 'airnow_cleaned.csv'.


In [None]:
def preprocess_and_split_data(df, test_size=0.2, random_state=42):
    """Select the feature/target columns from ``df`` and split them into train and test sets.

    Features are the spatial columns ``Latitude`` and ``Longitude`` (the
    original author notes that all rows share the same date, so no temporal
    feature is used). Targets are ``AQI`` plus the six pollutant columns.

    Args:
        df: DataFrame that contains every feature and target column.
        test_size: Fraction (or absolute count) of rows held out for testing;
            passed straight to ``train_test_split``. Defaults to 0.2.
        random_state: Seed passed to ``train_test_split`` so the split is
            repeatable. Defaults to 42.

    Returns:
        Tuple ``(X_train, X_test, Y_train, Y_test)``.

    Raises:
        KeyError: If any required feature or target column is missing from ``df``.
    """
    # --- 3. Data Preprocessing and Feature Selection ---

    # Define features (X) and targets (Y)
    FEATURES = ['Latitude', 'Longitude']
    TARGETS = ['AQI', 'PM2.5', 'PM10', 'O3', 'NO2', 'CO', 'SO2']

    # Report every absent column in one clear message instead of relying on the
    # less specific error raised by the column selection below.
    missing = [col for col in FEATURES + TARGETS if col not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    X = df[FEATURES]
    Y = df[TARGETS]

    # Split data into training and testing sets (80% train, 20% test by default)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )

    print(f"Training set size: {len(X_train)} samples")
    print(f"Test set size: {len(X_test)} samples")

    return X_train, X_test, Y_train, Y_test

In [None]:
# Run the preprocessing/splitting function on the uploaded DataFrame `df` and
# keep the four resulting partitions as notebook-level variables for the
# training and evaluation cells that follow.
X_train, X_test, Y_train, Y_test = preprocess_and_split_data(df)

Training set size: 2172 samples
Test set size: 544 samples


In [None]:
# --- 4. Model Definition and Training ---

# Base learner: a single-output XGBoost regressor. No `booster` argument is
# passed, so the library's default booster is used. A larger n_estimators may
# improve the fit at the cost of training time.
base_regressor = xgb.XGBRegressor(
    n_estimators=150,               # boosting rounds
    learning_rate=0.05,             # shrinkage applied at each boosting step
    objective='reg:squarederror',   # squared-error regression loss
    n_jobs=-1,                      # use every available core
    random_state=42,
)

# MultiOutputRegressor fits one independent copy of the base learner per
# target column, i.e. one model for each of the 7 targets (AQI + 6 pollutants).
multioutput_model = MultiOutputRegressor(estimator=base_regressor)

print("\nTraining the Multi-Output XGBoost model...")
multioutput_model.fit(X_train, Y_train)
print("Training complete.")


Training the Multi-Output XGBoost model...
Training complete.


In [None]:
# --- 5. Model Evaluation ---
# Take the target names from the held-out frame itself rather than repeating a
# hard-coded list, so the printed labels can never drift from the columns the
# model was actually trained on.
TARGETS = list(Y_test.columns)

# Make predictions on the test set; the result has one column per target, in
# the same order as the columns of Y_test.
Y_pred = multioutput_model.predict(X_test)

# Evaluate performance for each target
print("\nModel Evaluation (Test Set):")
for i, target in enumerate(TARGETS):
    y_true = Y_test.iloc[:, i]
    y_hat = Y_pred[:, i]

    # R-squared (coefficient of determination); a value below zero means the
    # model does worse than always predicting the mean of the test targets.
    r2 = r2_score(y_true, y_hat)
    # Root Mean Squared Error (RMSE)
    rmse = np.sqrt(mean_squared_error(y_true, y_hat))
    # Mean Absolute Error (MAE)
    mae = mean_absolute_error(y_true, y_hat)

    print(f"  {target:<5} -> R²: {r2:.4f}, RMSE: {rmse:.2f}, MAE: {mae:.2f}")


Model Evaluation (Test Set):
  AQI   -> R²: -0.0338, RMSE: 46.12, MAE: 33.69
  PM2.5 -> R²: 0.0033, RMSE: 3.81, MAE: 1.71
  PM10  -> R²: -0.3398, RMSE: 7.34, MAE: 2.58
  O3    -> R²: 0.2021, RMSE: 7.63, MAE: 5.05
  NO2   -> R²: 0.0443, RMSE: 2.70, MAE: 0.88
  CO    -> R²: -0.4315, RMSE: 0.06, MAE: 0.02
  SO2   -> R²: -0.1775, RMSE: 0.25, MAE: 0.07


In [None]:
# --- 6. Save and Download the Model ---
# Serialize the fitted multi-output model with joblib to MODEL_FILE_NAME
# (a relative path, so it is written to the current working directory).
joblib.dump(multioutput_model, MODEL_FILE_NAME)
print(f"\nModel successfully saved as '{MODEL_FILE_NAME}'.")

# Ask Colab to send the saved file to the user's browser. The message printed
# afterwards only reports that the download was requested.
files.download(MODEL_FILE_NAME)
print(f"'{MODEL_FILE_NAME}' download initiated.")


Model successfully saved as 'xgboost_air_quality_model.joblib'.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

'xgboost_air_quality_model.joblib' download initiated.


In [None]:
import joblib
import pandas as pd
from google.colab import files
import io

In [None]:
# Upload the model file
print("Upload the 'xgboost_air_quality_model.joblib' file to load the model.")
# Bind the result to a name: a bare `files.upload()` as the last expression
# would echo the returned value as the cell's output.
uploaded_model_files = files.upload()

# Fail here, with a clear message, rather than in the later load cell.
if 'xgboost_air_quality_model.joblib' not in uploaded_model_files:
    raise FileNotFoundError(
        "Expected 'xgboost_air_quality_model.joblib' among the uploaded files; "
        f"received: {sorted(uploaded_model_files)}"
    )

In [None]:
# Load the model from the file into `loaded_model`.
# SECURITY NOTE: joblib files are pickle-based, and unpickling can execute
# arbitrary code. Only load model files that come from a trusted source.
loaded_model = joblib.load('xgboost_air_quality_model.joblib')

In [None]:
# Example input for inference: one row per location, with the same two feature
# columns (and column order) used for training.
# NOTE(review): a latitude of 144.0 lies outside the valid [-90, 90] range;
# confirm the intended coordinates (possibly a typo or swapped values).
new_locations = pd.DataFrame(
    [[144.0, 110.77]],
    columns=['Latitude', 'Longitude'],
)

In [None]:
# Make predictions for the rows of `new_locations` with the reloaded model.
# The result is later wrapped in a DataFrame with seven named target columns.
predictions = loaded_model.predict(new_locations)

In [None]:
# Display results in a clear format
# Reuse the input frame's index for the prediction frame. The coordinate
# columns assigned below are aligned by index label, so without a shared index
# a `new_locations` frame with a non-default index would yield NaN coordinates.
prediction_df = pd.DataFrame(
    predictions,
    columns=['AQI', 'PM2.5', 'PM10', 'O3', 'NO2', 'CO', 'SO2'],
    index=new_locations.index,
)
prediction_df['Latitude'] = new_locations['Latitude']
prediction_df['Longitude'] = new_locations['Longitude']

# Print with the coordinates first, followed by the predicted targets.
print("\nPredictions for new locations:")
print(prediction_df[['Latitude', 'Longitude', 'AQI', 'PM2.5', 'PM10', 'O3', 'NO2', 'CO', 'SO2']])


Predictions for new locations:
   Latitude  Longitude        AQI     PM2.5       PM10        O3       NO2  \
0     144.0     110.77  34.548126  6.874144  17.014456  23.61998  5.104879   

         CO       SO2  
0  0.200611  0.200131  
