In [1]:
import requests
import pandas as pd
from datetime import datetime
import time
import os


AQICN_API_KEY = "3ab568459364d9a295405394a113823df31a9a82"
OPENWEATHER_API_KEY = "c78b17200559431652d643ad3e0259a9"

CITY = "Karachi"

# Fetch AQICN data
def fetch_aqicn_data(city):
    url = f"https://api.waqi.info/feed/{city}/?token={AQICN_API_KEY}"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        if data.get("status") == "ok":
            return {
                "timestamp": datetime.now(),
                "aqi": data["data"]["aqi"],
                "dominant_pollutant": data["data"]["dominentpol"]
            }
    return None

# Fetch OpenWeather data
def fetch_openweather_data(city):
    url = f"https://api.openweathermap.org/data/2.5/weather?q={city}&appid={OPENWEATHER_API_KEY}&units=metric"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        return {
            "timestamp": datetime.now(),
            "temperature": data["main"]["temp"],
            "humidity": data["main"]["humidity"]
        }
    return None

# Fetch combined data
def fetch_combined_data(city):
    aqi_data = fetch_aqicn_data(city)
    weather_data = fetch_openweather_data(city)
    if aqi_data and weather_data:
        return {**aqi_data, **weather_data}
    return None

# Generate time-based features
def add_time_features(df):
    df['hour'] = df['timestamp'].dt.hour
    df['day'] = df['timestamp'].dt.day
    df['month'] = df['timestamp'].dt.month
    df['day_of_week'] = df['timestamp'].dt.dayofweek
    df['season'] = df['month'] % 12 // 3 + 1  # 1: Winter, 2: Spring, etc.
    return df

# Generate derived features
def add_derived_features(df):
    df['aqi_change_rate'] = df['aqi'].diff() / df['timestamp'].diff().dt.total_seconds() * 3600  # AQI/hour
    df['temp_change_rate'] = df['temperature'].diff() / df['timestamp'].diff().dt.total_seconds() * 3600  # °C/hour
    df['humidity_change_rate'] = df['humidity'].diff() / df['timestamp'].diff().dt.total_seconds() * 3600  # %/hour
    return df

# Define target (e.g., next AQI)
def add_targets(df):
    df['target_aqi'] = df['aqi'].shift(-1)  # Next AQI value
    return df

# Main function to fetch, process, and generate features
def main():
    raw_data = []
    # Collect 10 samples with a delay to simulate a time series
    for _ in range(10):
        data = fetch_combined_data(CITY)
        if data:
            raw_data.append(data)
        time.sleep(60)  # Wait 60 seconds between API calls

    # Convert raw data to DataFrame
    df = pd.DataFrame(raw_data)
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    # Compute features and targets
    df = add_time_features(df)
    df = add_derived_features(df)
    df = add_targets(df)

    # Drop rows with NaN (from diff or shift operations)
    df = df.dropna()

    # Save the processed DataFrame to a CSV file
    output_file = "processed_aqi_weather_data.csv"
    file_exists = os.path.isfile(output_file)
    df.to_csv(output_file, mode='a', header=not file_exists, index=False)
    print(f"Processed data appended to {output_file}")

# Run the main function
if __name__ == "__main__":
    main()


Processed data appended to processed_aqi_weather_data.csv


In [2]:
import requests
import pandas as pd
from datetime import datetime
import time
import hopsworks
from hsfs.feature_group import FeatureGroup

# Replace with your API keys
AQICN_API_KEY = "3ab568459364d9a295405394a113823df31a9a82"
OPENWEATHER_API_KEY = "c78b17200559431652d643ad3e0259a9"

# Example location
CITY = "Karachi"

# Fetch AQICN data
def fetch_aqicn_data(city):
    url = f"https://api.waqi.info/feed/{city}/?token={AQICN_API_KEY}"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        if data.get("status") == "ok":
            return {
                "timestamp": datetime.now(),
                "aqi": data["data"]["aqi"],
                "dominant_pollutant": data["data"]["dominentpol"]
            }
    return None

# Fetch OpenWeather data
def fetch_openweather_data(city):
    url = f"https://api.openweathermap.org/data/2.5/weather?q={city}&appid={OPENWEATHER_API_KEY}&units=metric"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        return {
            "timestamp": datetime.now(),
            "temperature": data["main"]["temp"],
            "humidity": data["main"]["humidity"]
        }
    return None

# Fetch combined data
def fetch_combined_data(city):
    aqi_data = fetch_aqicn_data(city)
    weather_data = fetch_openweather_data(city)
    if aqi_data and weather_data:
        return {**aqi_data, **weather_data}
    return None

# Generate time-based features
def add_time_features(df):
    df['hour'] = df['timestamp'].dt.hour
    df['day'] = df['timestamp'].dt.day
    df['month'] = df['timestamp'].dt.month
    df['day_of_week'] = df['timestamp'].dt.dayofweek
    df['season'] = df['month'] % 12 // 3 + 1  # 1: Winter, 2: Spring, etc.
    return df

# Generate derived features
def add_derived_features(df):
    df['aqi_change_rate'] = df['aqi'].diff() / df['timestamp'].diff().dt.total_seconds() * 3600  # AQI/hour
    df['temp_change_rate'] = df['temperature'].diff() / df['timestamp'].diff().dt.total_seconds() * 3600  # °C/hour
    df['humidity_change_rate'] = df['humidity'].diff() / df['timestamp'].diff().dt.total_seconds() * 3600  # %/hour
    return df

# Define target (e.g., next AQI)
def add_targets(df):
    df['target_aqi'] = df['aqi'].shift(-1)  # Next AQI value
    return df

# Main function to fetch, process, and save data to Feature Store
def main():
    raw_data = []

    # Collect 10 samples with a delay to simulate a time series
    for _ in range(10):
        data = fetch_combined_data(CITY)
        if data:
            raw_data.append(data)
        time.sleep(60)  # Wait 60 seconds between API calls

    # Convert raw data to DataFrame
    df = pd.DataFrame(raw_data)
    df['timestamp'] = pd.to_datetime(df['timestamp'])  # Keep as datetime for feature generation

    # Compute features and targets
    df = add_time_features(df)
    df = add_derived_features(df)
    df = add_targets(df)

    # Convert timestamp to string for compatibility with Hopsworks
    df['timestamp'] = df['timestamp'].astype(str)

    # Drop rows with NaN (from diff or shift operations)
    df = df.dropna()

    # Connect to Hopsworks
    project = hopsworks.login(api_key_value="NzKsbygmeyP444mJ.xIpih1OWQwPuer5J6bFSo6wDCiTrPLsiMdk8l1ErXL77cuxMEkQ9tM0z2ambpVqy")
    fs = project.get_feature_store()

    # Define the feature group
    feature_group = fs.get_or_create_feature_group(
        name="aqi_weather_features",
        version=1,
        description="AQI and weather data with derived and target features",
        primary_key=["timestamp"],
        online_enabled=True
    )

    # Save the DataFrame to the Feature Store
    feature_group.insert(df)
    print("Features successfully inserted into Hopsworks Feature Store.")

if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm


2025-01-20 22:29:44,189 INFO: Initializing external client
2025-01-20 22:29:44,189 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-20 22:30:01,020 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1200282



ploading Dataframe: 100.00% |█████████████████████████████████| Rows 8/8 | Elapsed Time: 00:02 | Remaining Time: 00:00

Launching job: aqi_weather_features_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1200282/jobs/named/aqi_weather_features_1_offline_fg_materialization/executions
Features successfully inserted into Hopsworks Feature Store.


In [5]:

import requests
import pandas as pd
from datetime import datetime, timedelta
import time
import hopsworks
from hsfs.feature_group import FeatureGroup

# Replace with your API keys
AQICN_API_KEY = "3ab568459364d9a295405394a113823df31a9a82"
OPENWEATHER_API_KEY = "c78b17200559431652d643ad3e0259a9"

# Example location
CITY = "Karachi"

# Fetch AQICN data
def fetch_aqicn_data(city):
    url = f"https://api.waqi.info/feed/{city}/?token={AQICN_API_KEY}"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        if data.get("status") == "ok":
            return {
                "timestamp": datetime.now(),  # Temporary placeholder; update later
                "aqi": data["data"]["aqi"],
                "dominant_pollutant": data["data"]["dominentpol"]
            }
    return None

# Fetch OpenWeather data
def fetch_openweather_data(city):
    url = f"https://api.openweathermap.org/data/2.5/weather?q={city}&appid={OPENWEATHER_API_KEY}&units=metric"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        return {
            "timestamp": datetime.now(),  # Temporary placeholder; update later
            "temperature": data["main"]["temp"],
            "humidity": data["main"]["humidity"]
        }
    return None

# Fetch combined data for a specific date
def fetch_combined_data(city, date):
    aqi_data = fetch_aqicn_data(city)
    weather_data = fetch_openweather_data(city)
    
    if aqi_data:
        print("AQI Data:", aqi_data)  # Debugging output for AQI data
    else:
        print("Failed to fetch AQI data for date:", date)

    if weather_data:
        print("Weather Data:", weather_data)  # Debugging output for weather data
    else:
        print("Failed to fetch weather data for date:", date)

    if aqi_data and weather_data:
        combined_data = {**aqi_data, **weather_data}
        combined_data['timestamp'] = date  # Use the date passed to the function
        return combined_data
    return None

# Generate time-based features
def add_time_features(df):
    df['hour'] = df['timestamp'].dt.hour
    df['day'] = df['timestamp'].dt.day
    df['month'] = df['timestamp'].dt.month
    df['day_of_week'] = df['timestamp'].dt.dayofweek
    df['season'] = df['month'] % 12 // 3 + 1  # 1: Winter, 2: Spring, etc.
    return df

# Generate derived features
def add_derived_features(df):
    df['aqi_change_rate'] = df['aqi'].diff() / df['timestamp'].diff().dt.total_seconds() * 3600  # AQI/hour
    df['temp_change_rate'] = df['temperature'].diff() / df['timestamp'].diff().dt.total_seconds() * 3600  # °C/hour
    df['humidity_change_rate'] = df['humidity'].diff() / df['timestamp'].diff().dt.total_seconds() * 3600  # %/hour
    return df

# Define target (e.g., next AQI)
def add_targets(df):
    df['target_aqi'] = df['aqi'].shift(-1)  # Next AQI value
    return df

# Main function to fetch, process, and save data to Feature Store
def main():
    raw_data = []
    start_date = datetime.now() - timedelta(days=30)  # Fetch data for the last 30 days

    # Loop over each day in the range
    for i in range(30):
        date = start_date + timedelta(days=i)
        data = fetch_combined_data(CITY, date)
        if data:
            raw_data.append(data)
        else:
            print(f"No data collected for {date}")  # Indicate if no data was collected for a date

        time.sleep(60)  # Wait 60 seconds between API calls to avoid hitting rate limits

    # Debugging output for raw data
    print("Raw data collected:", raw_data)  # Check if raw_data is populated

    # Convert raw data to DataFrame
    df = pd.DataFrame(raw_data)
    if df.empty:
        print("No data available to process.")
        return

    df['timestamp'] = pd.to_datetime(df['timestamp'])  # Keep as datetime for feature generation

    # Compute features and targets
    df = add_time_features(df)
    df = add_derived_features(df)
    df = add_targets(df)

    # Convert timestamp to string for compatibility with Hopsworks
    df['timestamp'] = df['timestamp'].astype(str)

    # Drop rows with NaN (from diff or shift operations)
    df = df.dropna()

    # Connect to Hopsworks
    project = hopsworks.login(api_key_value="NzKsbygmeyP444mJ.xIpih1OWQwPuer5J6bFSo6wDCiTrPLsiMdk8l1ErXL77cuxMEkQ9tM0z2ambpVqy")
    fs = project.get_feature_store()

    # Define the feature group
    feature_group = fs.get_or_create_feature_group(
        name="aqi_weather_features",
        version=1,
        description="AQI and weather data with derived and target features",
        primary_key=["timestamp"],
        online_enabled=True
    )

    # Save the DataFrame to the Feature Store
    feature_group.insert(df)
    print("Features successfully inserted into Hopsworks Feature Store.")

if __name__ == "__main__":
    main()


AQI Data: {'timestamp': datetime.datetime(2025, 1, 20, 23, 0, 57, 431889), 'aqi': 174, 'dominant_pollutant': 'pm25'}
Weather Data: {'timestamp': datetime.datetime(2025, 1, 20, 23, 0, 57, 854419), 'temperature': 15.9, 'humidity': 39}
AQI Data: {'timestamp': datetime.datetime(2025, 1, 20, 23, 1, 59, 241118), 'aqi': 174, 'dominant_pollutant': 'pm25'}
Weather Data: {'timestamp': datetime.datetime(2025, 1, 20, 23, 2, 0, 161690), 'temperature': 15.9, 'humidity': 39}
AQI Data: {'timestamp': datetime.datetime(2025, 1, 20, 23, 3, 0, 771651), 'aqi': 174, 'dominant_pollutant': 'pm25'}
Weather Data: {'timestamp': datetime.datetime(2025, 1, 20, 23, 3, 1, 277155), 'temperature': 15.9, 'humidity': 39}
AQI Data: {'timestamp': datetime.datetime(2025, 1, 20, 23, 4, 1, 933461), 'aqi': 174, 'dominant_pollutant': 'pm25'}
Weather Data: {'timestamp': datetime.datetime(2025, 1, 20, 23, 4, 2, 322731), 'temperature': 15.9, 'humidity': 39}
AQI Data: {'timestamp': datetime.datetime(2025, 1, 20, 23, 5, 2, 997455),


ploading Dataframe: 100.00% |███████████████████████████████| Rows 28/28 | Elapsed Time: 00:03 | Remaining Time: 00:00

Launching job: aqi_weather_features_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1200282/jobs/named/aqi_weather_features_1_offline_fg_materialization/executions
Features successfully inserted into Hopsworks Feature Store.


In [10]:
import hsfs
print(hsfs.__version__)


4.1.5


In [2]:
# create a simple feature view
feature_view = fs.create_feature_view(
    name='aqiview',
    query=query
)

# create a feature view with transformation and label
feature_view = fs.create_feature_view(
    name='aqiview',
    query=query,
labels = 
    ["timestamp"],
    ["aqi"],
    ["dominant_pollutant"],
    ["temperature"],
    ["humidity"],
    ["hour"],
    ["day"],
    ["month"],
    ["day_of_week"],
    ["season"],
    ["aqi_change_rate"],
    ["temp_change_rate"],
    ["humidity_change_rate"],
    ["target_aqi"]
]
    transformation_functions={
        "amount": fs.get_transformation_function(name="standard_scaler", version=1)
    }
)


SyntaxError: closing parenthesis ']' does not match opening parenthesis '(' on line 8 (2193214477.py, line 26)

In [3]:
import hopsworks

def create_feature_views():
    # Step 1: Connect to Hopsworks
    project = hopsworks.login(api_key_value="NzKsbygmeyP444mJ.xIpih1OWQwPuer5J6bFSo6wDCiTrPLsiMdk8l1ErXL77cuxMEkQ9tM0z2ambpVqy")
    fs = project.get_feature_store()

    # Step 2: Retrieve the feature group
    feature_group = fs.get_feature_group("aqi_weather_features", version=1)

    # Step 3: Define the query
    query = feature_group.select_all()

    # Step 4: Create a simple feature view
    feature_view_simple = fs.create_feature_view(
        name='aqiview_simple',
        query=query
    )
    print("Simple Feature View created successfully.")

 

if __name__ == "__main__":
    create_feature_views()


2025-01-24 20:41:03,358 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-01-24 20:41:03,534 INFO: Initializing external client
2025-01-24 20:41:03,534 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-24 20:41:05,878 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1200282
Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1200282/fs/1189960/fv/aqiview_simple/version/1
Simple Feature View created successfully.


FeatureStoreException: Please use the hopsworks_udf decorator when defining transformation functions.

In [2]:
import hopsworks
import hsfs

# Test connection
project = hopsworks.login()
fs = project.get_feature_store()
print("Successfully connected to Hopsworks!")


Copy your Api Key (first register/login): https://c.app.hopsworks.ai/account/api/generated



Paste it here:  ········


2025-01-24 20:40:45,413 INFO: Initializing external client
2025-01-24 20:40:45,413 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-24 20:40:49,101 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1200282
Successfully connected to Hopsworks!


In [12]:
import hopsworks

def read_simple_feature_view():
    # Step 1: Connect to Hopsworks
    project = hopsworks.login(api_key_value="NzKsbygmeyP444mJ.xIpih1OWQwPuer5J6bFSo6wDCiTrPLsiMdk8l1ErXL77cuxMEkQ9tM0z2ambpVqy")
    fs = project.get_feature_store()

    # Step 2: Retrieve the simple feature view
    feature_view_simple = fs.get_feature_view(name="aqiview_simple", version=1)

    # Step 3: Read the data from the feature view
    df = feature_view_simple.get_batch_data()

    # Step 4: Print or process the data
    print("Data from Simple Feature View:")
    print(df.head())  # Display the first few rows of the DataFrame

if __name__ == "__main__":
    read_simple_feature_view()


2025-01-24 21:04:59,393 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-01-24 21:04:59,412 INFO: Initializing external client
2025-01-24 21:04:59,413 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-24 21:05:01,972 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1200282
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.34s) 
Data from Simple Feature View:
                    timestamp  aqi dominant_pollutant  temperature  humidity  \
0  2025-01-20 22:27:42.717600  174               pm25         17.9        34   
1  2025-01-20 22:25:40.121891  174               pm25         17.9        34   
2  2025-01-20 22:22:35.157212  174               pm25         17.9        34   
3  2025-01-20 22:21:33.861252  174               pm25         17.9        34   
4  2025-01-20 22:20:31.747064  174               pm25         17.9        34   

   hour  day  month  day_of_week

In [16]:
import hopsworks
from hsfs import connection
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib

def train_pipeline():
    # Step 1: Connect to Hopsworks and fetch data
    project = hopsworks.login(api_key_value="NzKsbygmeyP444mJ.xIpih1OWQwPuer5J6bFSo6wDCiTrPLsiMdk8l1ErXL77cuxMEkQ9tM0z2ambpVqy")
    fs = project.get_feature_store()

    # Load the feature group
    feature_group = fs.get_feature_group("aqi_weather_features", version=1)

    # Fetch the data as a DataFrame using fetch()
   # Fetch the data as a DataFrame using read()
    df = feature_group.read()

    # Step 2: Prepare the Data
    # Drop the target and timestamp columns
    X = df.drop(columns=['target_aqi', 'timestamp'])
    y = df['target_aqi']

    # Handle non-numeric columns
    X_numeric = X.select_dtypes(include=['number'])  # Select only numeric columns

    # Fill missing values with the mean
    X_numeric = X_numeric.fillna(X_numeric.mean())

    # Step 3: Split the Data
    X_train, X_test, y_train, y_test = train_test_split(X_numeric, y, test_size=0.2, random_state=42)

    # Step 4: Select and Train a Model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Step 5: Evaluate the Model
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f'Mean Squared Error: {mse}')

    # Step 6: Save the Model
    joblib.dump(model, 'aqi_prediction_model.pkl')
    print("Model saved as 'aqi_prediction_model.pkl'")

if __name__ == "__main__":
    train_pipeline()


2025-01-24 21:11:03,391 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-01-24 21:11:03,405 INFO: Initializing external client
2025-01-24 21:11:03,406 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-24 21:11:06,094 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1200282
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.77s) 
Mean Squared Error: 3.884537500000017
Model saved as 'aqi_prediction_model.pkl'


In [None]:
import hopsworks
from hsml.model_schema import ModelSchema

# Login to Hopsworks
project = hopsworks.login(api_key_value="qm82d9bpdrGwcAj0.q4qnhHm9W6nIcrSeMC5BKWuFW89XQ3N1u1oSlTZit8OOKRoDwTIDHhQoSllqZjjE")

# Access the Model Registry
mr = project.get_model_registry()

# Save your trained model locally if not already done
import joblib
joblib.dump(model, "aqi_prediction_model.pkl")

# Define metadata for the model
input_schema = {"features": list(X.columns)}  # Replace with your feature names
output_schema = {"target": "target_aqi"}     # Replace with your target variable name
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

# Register the model in the registry
hopsworks_model = mr.tensorflow.create_model(
    name="aqi_prediction_model",
    description="A model for predicting air quality index (AQI).",
    model_schema=model_schema
)

# Save the model files to the registry
hopsworks_model.save("./")  # Specify the directory containing the model file

print("Model registered successfully!")


In [None]:
import hopsworks
import hsml  # Import the hsml library
from hsml.model_schema import ModelSchema  # Import ModelSchema from hsml

# Log in to your Hopsworks project
project = hopsworks.login(api_key_value="qm82d9bpdrGwcAj0.q4qnhHm9W6nIcrSeMC5BKWuFW89XQ3N1u1oSlTZit8OOKRoDwTIDHhQoSllqZjjE")

# Set the Hopsworks connection for hsml - this is done automatically upon login
# No need to manually call hsml.connection.connection().set_instance(project)

# Access the Model Registry directly through the project object
mr = project.get_model_registry()

# Now you can use 'mr' to interact with the Model Registry

input_schema = {
    "features": [
        "timestamp",
        "aqi",
        "dominant_pollutant",
        "temperature",
        "humidity",
        "hour",
        "day",
        "month",
        "day_of_week",
        "season",
        "aqi_change_rate",
        "temp_change_rate",
        "humidity_change_rate"
    ]
}  # Replace with your feature names

output_schema = {"target": "target_aqi"}  # Replace with your target variable name
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

# Register the model in the registry
hopsworks_model = mr.sklearn.create_model(
    name="aqi_prediction_model",
    description="A model for predicting air quality index (AQI).",
    model_schema=model_schema
)

# Save the model files to the registry
hopsworks_model.save(".")  # Save the model to the registry
print("Model registered successfully!")


In [None]:

import hopsworks

# Log in to your Hopsworks project
project = hopsworks.login(api_key_value="qm82d9bpdrGwcAj0.q4qnhHm9W6nIcrSeMC5BKWuFW89XQ3N1u1oSlTZit8OOKRoDwTIDHhQoSllqZjjE")

# Get the model registry
model_registry = project.get_model_registry()

# Retrieve the model from the registry
model_name = "aqi_prediction_model"  # Change to your model name
model_version = 1  # Update the version if necessary
model = model_registry.get_model(model_name, version=model_version)

# Step 1: Download the latest model
model.download()


# For demonstration, we'll just print that the model is deployed
print("Model deployed successfully!")



In [None]:
import pandas as pd
import requests
from datetime import datetime, timedelta
import requests_cache
from retrying import retry

def fetch_aqi_data():
    today = datetime.utcnow()
    two_years_ago = today - timedelta(days=2 * 365)
    current_unix_time = int(today.timestamp())
    unix_start = int(two_years_ago.timestamp())

    url = f"http://api.openweathermap.org/data/2.5/air_pollution/history?lat=24.8546842&lon=67.0207055&start={unix_start}&end={current_unix_time}&appid=c78b17200559431652d643ad3e0259a9"
    response = requests.get(url)
    raw = response.json()
    aqi_df = pd.json_normalize(raw["list"])
    aqi_df['dt'] = pd.to_datetime(aqi_df['dt'], unit='s')
    aqi_df.set_index('dt', inplace=True)
    aqi_df.index = aqi_df.index.tz_localize(None)
    return aqi_df

def fetch_weather_data():
    cache_session = requests_cache.CachedSession('.cache', expire_after=3600)
    start_date = datetime.utcnow() - timedelta(days=2 * 365)
    end_date = datetime.utcnow() - timedelta(days=1)

    url = "https://archive-api.open-meteo.com/v1/archive"
    params = {
        "latitude": 24.8546842,
        "longitude": 67.0207055,
        "hourly": ["temperature_2m", "relative_humidity_2m", "wind_speed_10m"],
        "start_date": start_date.strftime("%Y-%m-%d"),
        "end_date": end_date.strftime("%Y-%m-%d"),
    }
    response = requests.get(url, params=params)
    weather_data = response.json()
    weather_df = pd.DataFrame(weather_data["hourly"])
    weather_df["time"] = pd.to_datetime(weather_df["time"])
    weather_df.set_index("time", inplace=True)
    return weather_df

# Combine AQI and weather data
aqi_data = fetch_aqi_data()
weather_data = fetch_weather_data()
combined_data = aqi_data.join(weather_data, how="inner")
combined_data.reset_index(inplace=True)

# Add additional features
combined_data.index = pd.to_datetime(combined_data.index)

# Extract time-related features
combined_data["hour"] = combined_data.index.hour
combined_data["day"] = combined_data.index.day
combined_data["month"] = combined_data.index.month
combined_data["day_of_week"] = combined_data.index.dayofweek
combined_data["season"] = combined_data["month"].apply(lambda x: (x % 12 + 3) // 3)
combined_data["aqi_change_rate"] = combined_data["main.aqi"].diff()

# Calculate temperature and humidity change rates
combined_data["temp_change_rate"] = combined_data["temperature_2m"].diff()
combined_data["humidity_change_rate"] = combined_data["relative_humidity_2m"].diff()

# Define features and target
X = combined_data[[  # Ensure all required columns exist
    "hour", "day", "month", "day_of_week", "season",
    "main.aqi", "temperature_2m", "relative_humidity_2m",
    "aqi_change_rate", "temp_change_rate", "humidity_change_rate"
]]
y = combined_data["main.aqi"].shift(-1).fillna(method="ffill")  # Predict next AQI

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import joblib

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save the trained model locally
joblib.dump(model, "aqi_model.pkl")

import hopsworks
from hsml.model_schema import ModelSchema

project = hopsworks.login(api_key_value="qm82d9bpdrGwcAj0.q4qnhHm9W6nIcrSeMC5BKWuFW89XQ3N1u1oSlTZit8OOKRoDwTIDHhQoSllqZjjE")
mr = project.get_model_registry()

# Define input and output schemas
input_schema = {
    "features": [
        "hour", "day", "month", "day_of_week", "season",
        "main.aqi", "temperature_2m", "relative_humidity_2m",
        "aqi_change_rate", "temp_change_rate", "humidity_change_rate"
    ]
}
output_schema = {"target": "Predicted AQI"}
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

# Register and save the model
hopsworks_model = mr.sklearn.create_model(
    name="aqi_prediction_model",
    description="A model for predicting AQI based on historical data.",
    model_schema=model_schema
)
hopsworks_model.save("aqi_model.pkl")
print("Model registered successfully!")


In [None]:
import pandas as pd
import joblib

# Load the trained model
model = joblib.load("aqi_model.pkl")

# Create the DataFrame from the provided data
data = {
    "timestamp": [
        "51:28.2", "52:28.9", "53:29.6", "54:30.2", "55:31.0",
        "56:31.7", "57:32.4", "58:33.2", "43:27.4", "44:28.2",
        "45:28.8", "46:29.5", "47:30.1", "48:30.9", "49:31.6",
        "50:32.4"
    ],
    "aqi": [124] * 16,
    "dominant_pollutant": ["pm25"] * 16,
    "temperature": [24.9, 24.9, 24.9, 24.9, 24.9, 24.9, 24.9, 24.9, 20.9, 19.9, 20.9, 19.9, 20.9, 20.9, 19.9, 19.9],
    "humidity": [14, 14, 14, 14, 14, 14, 14, 14, 16, 17, 16, 17, 16, 16, 17, 17],
    "target_aqi": [124] * 16
}

# Create the DataFrame
df = pd.DataFrame(data)

# Adjust the timestamp format from MM:SS.s to 00:MM:SS.s
df['timestamp'] = df['timestamp'].apply(lambda x: '00:' + x)

# Convert the timestamp to timedelta
df['timestamp'] = pd.to_timedelta(df['timestamp'])

# Set the timestamp as the index
df.set_index('timestamp', inplace=True)

# Create a combined DataFrame to extract additional features
combined_data = df.copy()

# Extract time-related features from the index
combined_data["hour"] = combined_data.index.components['hours']
combined_data["day"] = combined_data.index.components['days']  # Use days as index is Timedelta
combined_data["month"] = 1  # Placeholder value for month (adjust as necessary)
combined_data["day_of_week"] = 4  # Placeholder value

# Calculate the season based on the month (1-4 for the seasons)
combined_data["season"] = (combined_data["month"] % 12 + 3) // 3

# Calculate AQI change rate
combined_data["aqi_change_rate"] = combined_data["aqi"].diff().fillna(0)  # Fill NaN with 0

# Create missing features with placeholder values or calculations
combined_data["humidity_change_rate"] = combined_data["humidity"].diff().fillna(0)
combined_data["temp_change_rate"] = combined_data["temperature"].diff().fillna(0)

# Prepare X_test - ensure it matches the trained model's feature names
X_test = combined_data.drop(columns=["target_aqi"])

# Add the required features (make sure to match exactly what the model expects)
X_test["month"] = combined_data["month"]

# Rename the columns to match the trained model's expectations
X_test.rename(columns={
    "aqi": "main.aqi",
    "temperature": "temperature_2m",
    "humidity": "relative_humidity_2m",
    "hour": "hour",
    "day": "day",
    "day_of_week": "day_of_week",
    "season": "season",
    "aqi_change_rate": "aqi_change_rate",
    "humidity_change_rate": "humidity_change_rate",
    "temp_change_rate": "temp_change_rate"
}, inplace=True)

# Ensure the order of columns matches the trained model's expectations
X_test = X_test[model.feature_names_in_]  # Use the feature names from the model directly

# Make predictions
predictions = model.predict(X_test)

# Compare predictions with actual target values
actual_aqi = combined_data["target_aqi"].values
comparison_df = pd.DataFrame({"Actual AQI": actual_aqi, "Predicted AQI": predictions})

print("Comparison of Actual and Predicted AQI:")
print(comparison_df)


In [None]:
# Combine AQI and weather data
aqi_data = fetch_aqi_data()
weather_data = fetch_weather_data()
combined_data = aqi_data.join(weather_data, how="inner")

# Debugging: Check the columns of combined_data
print("Columns in combined_data:", combined_data.columns)

# Reset the index to ensure 'dt' is accessible
combined_data.reset_index(inplace=True)

# Check the DataFrame after reset
print("DataFrame after reset index:\n", combined_data.head())

# Rename the 'index' column to 'dt' for clarity
combined_data.rename(columns={'index': 'dt'}, inplace=True)

# Filter for the last 3 days of data
three_days_ago = datetime.utcnow() - timedelta(days=3)
combined_data = combined_data[combined_data['dt'] >= three_days_ago]

# Check the filtered DataFrame
print("Filtered DataFrame:\n", combined_data)

# Set the index back to 'dt' for further processing
combined_data.set_index('dt', inplace=True)

# Add additional features
combined_data["hour"] = combined_data.index.hour
combined_data["day"] = combined_data.index.day
combined_data["month"] = combined_data.index.month
combined_data["day_of_week"] = combined_data.index.dayofweek
combined_data["season"] = combined_data["month"].apply(lambda x: (x % 12 + 3) // 3)
combined_data["aqi_change_rate"] = combined_data["main.aqi"].diff()

# Calculate temperature and humidity change rates
combined_data["temp_change_rate"] = combined_data["temperature_2m"].diff()
combined_data["humidity_change_rate"] = combined_data["relative_humidity_2m"].diff()

# Define features and target
X = combined_data[[
    "hour", "day", "month", "day_of_week", "season",
    "main.aqi", "temperature_2m", "relative_humidity_2m",
    "aqi_change_rate", "temp_change_rate", "humidity_change_rate"
]]
y = combined_data["main.aqi"].shift(-1).fillna(method="ffill")  # Predict next AQI


In [None]:
import pandas as pd
import joblib
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the trained model
model = joblib.load("aqi_model.pkl")

# Load the training data (you might need to adjust the path)
combined_data = pd.read_csv('training_data.csv', index_col=0, parse_dates=True)

# Prepare features and target
X = combined_data[[  # Ensure all required columns exist
    "hour", "day", "month", "day_of_week", "season",
    "main.aqi", "temperature_2m", "relative_humidity_2m",
    "aqi_change_rate", "temp_change_rate", "humidity_change_rate"
]]
y_true = combined_data["main.aqi"].shift(-1).ffill()  # Actual AQI (shifted and filled)

# Make predictions
y_pred = model.predict(X)

# Remove the last entry from y_true to match lengths
y_true = y_true[:-1]
y_pred = y_pred[:-1]

# Evaluate performance metrics
mae = mean_absolute_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_true, y_pred)

# Print the evaluation metrics
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared: {r2:.2f}")

# Visualize predictions vs actual values
plt.figure(figsize=(12, 6))
plt.plot(y_true.index, y_true, label='Actual AQI', color='blue')
plt.plot(y_true.index, y_pred, label='Predicted AQI', color='orange', linestyle='--')
plt.title('Actual vs Predicted AQI')
plt.xlabel('Date')
plt.ylabel('AQI')
plt.legend()
plt.show()


In [None]:
import pandas as pd
import joblib
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.dates as mdates

# Load the trained model
model = joblib.load("aqi_model.pkl")

# Load the training data (adjust the path as needed)
combined_data = pd.read_csv('training_data.csv', index_col='index', parse_dates=True)

# Filter to get only the last 3 days of data
latest_data = combined_data.last('3D')

# Prepare features and target
X = latest_data[[  # Ensure all required columns exist
    "hour", "day", "month", "day_of_week", "season",
    "main.aqi", "temperature_2m", "relative_humidity_2m",
    "aqi_change_rate", "temp_change_rate", "humidity_change_rate"
]]

y_true = latest_data["main.aqi"].shift(-1).ffill()  # Actual AQI (shifted and filled)

# Make predictions
y_pred = model.predict(X)

# Remove the last entry from y_true and y_pred to match lengths
y_true = y_true[:-1]
y_pred = y_pred[:-1]

# Evaluate performance metrics
mae = mean_absolute_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_true, y_pred)

# Print the evaluation metrics
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared: {r2:.2f}")

# Visualize predictions vs actual values
plt.figure(figsize=(12, 6))
plt.plot(latest_data.index[:-1], y_true, label='Actual AQI', color='blue')  # Use index for actual values
plt.plot(latest_data.index[:-1], y_pred, label='Predicted AQI', color='orange', linestyle='--')  # Use index for predicted values
plt.title('Actual vs Predicted AQI (Last 3 Days)')
plt.xlabel('Date')
plt.ylabel('AQI')

# Set date format on the x-axis
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))  # Format to show only the date

# Optional: Set locator for the x-axis to show every nth date (adjust as needed)
plt.gca().xaxis.set_major_locator(mdates.DayLocator(interval=1))  # Show every day

plt.xticks(rotation=45)  # Rotate x-axis ticks for better visibility
plt.legend()
plt.tight_layout()  # Adjust layout to make room for rotated x-axis labels
plt.show()
