In [None]:
# Import libraries here
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import pytz
from IPython.display import VimeoVideo
from pymongo import MongoClient
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import wqet_grader


In [None]:
# Open a connection to the MongoDB server running on this machine, port 27017.
client = MongoClient("localhost", 27017)

# Handle to the "air-quality" database; print the collections it contains.
db = client.get_database("air-quality")
print(db.list_collection_names())

# Handle to the collection that stores the Dar es Salaam readings.
dar = db.get_collection("dar-es-salaam")

In [None]:
# Every distinct value stored under `metadata.site` in the collection.
sites = dar.distinct(key="metadata.site")
# Bare expression so the notebook displays the list.
sites

In [None]:
# Single pipeline stage: group documents by site and count how many fall in
# each group.
count_by_site = {"$group": {"_id": "$metadata.site", "count": {"$count": {}}}}

result = dar.aggregate([count_by_site])

# Drain the cursor into a list so the counts can be displayed and reused.
readings_per_site = [doc for doc in result]
readings_per_site


In [None]:
def wrangle(collection):
    """Build an hourly PM2.5 ("P2") series for site 11 from a MongoDB collection.

    Parameters
    ----------
    collection
        Object exposing ``find(filter, projection=...)`` (e.g. a pymongo
        collection) whose documents carry ``timestamp`` and ``P2`` fields.

    Returns
    -------
    pandas.Series
        The ``P2`` readings, indexed by timestamp in the
        "Africa/Dar_es_Salaam" timezone, averaged per hour, with empty hours
        forward-filled from the previous hour.

    Notes
    -----
    Assumes the stored timestamps are timezone-naive and expressed in UTC
    (they are localized to UTC before conversion) — TODO confirm against the
    data source.
    """
    # Only P2 documents from site 11; keep just the two fields we need.
    results = collection.find(
        {"metadata.site": 11, "metadata.measurement": "P2"},
        projection={"P2": 1, "timestamp": 1, "_id": 0},
    )

    df = pd.DataFrame(results).set_index("timestamp")

    # Interpret the naive timestamps as UTC, then express them in local time.
    df.index = df.index.tz_localize("UTC").tz_convert("Africa/Dar_es_Salaam")

    # Remove outliers: keep only readings strictly below 100.
    df = df[df["P2"] < 100]

    # Hourly mean; hours without readings inherit the previous hour's value.
    # "1h" and .ffill() replace the deprecated "1H" alias and
    # fillna(method="ffill"), with identical results.
    y = df["P2"].resample("1h").mean().ffill()

    return y



In [None]:
# Build the hourly PM2.5 series here so this cell and the later cells that
# read `y` work on a fresh kernel (Restart & Run All).
y = wrangle(dar)

fig, ax = plt.subplots(figsize=(15, 6))

# Draw the series on the axes; without this the saved image is an empty frame.
y.plot(ax=ax)

# Labels are set after plotting so they override the defaults pandas applies.
ax.set_xlabel("Date")
ax.set_ylabel("PM2.5 Level")
ax.set_title("Dar es Salaam PM2.5 Levels")
# Don't delete the code below 👇
plt.savefig("images/3-5-5.png", dpi=150)


In [None]:
# Blank 15x6 figure holding a single Axes.
fig = plt.figure(figsize=(15, 6))
ax = fig.add_subplot(1, 1, 1)

# NOTE(review): nothing is drawn on `ax` in this cell, so the saved image is
# an empty set of axes — confirm whether a plotting call is missing.
# Don't delete the code below 👇
fig.savefig("images/3-5-6.png", dpi=150)


In [None]:
# Blank 15x6 figure holding a single Axes.
fig = plt.figure(figsize=(15, 6))
ax = fig.add_subplot(1, 1, 1)

# NOTE(review): nothing is drawn on `ax` in this cell, so the saved image is
# an empty set of axes — confirm whether a plotting call is missing.
# Don't delete the code below 👇
fig.savefig("images/3-5-7.png", dpi=150)


In [None]:
# Blank 15x6 figure holding a single Axes.
fig = plt.figure(figsize=(15, 6))
ax = fig.add_subplot(1, 1, 1)

# NOTE(review): nothing is drawn on `ax` in this cell, so the saved image is
# an empty set of axes — confirm whether a plotting call is missing.
# Don't delete the code below 👇
fig.savefig("images/3-5-8.png", dpi=150)


In [None]:
# Chronological split: the first 90% of observations are used for training,
# the remaining tail for testing.
# NOTE(review): `y` must already exist in the kernel — presumably the output
# of wrangle(dar); confirm it is assigned before this cell runs.
cutoff_test = int(len(y) * 0.90)
y_train, y_test = y.iloc[:cutoff_test], y.iloc[cutoff_test:]

print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

In [None]:
# Naive baseline: predict the training mean for every training observation
# and measure how far off that is on average.
y_train_mean = y_train.mean()
y_pred_baseline = [y_train_mean for _ in range(len(y_train))]
mae_baseline = mean_absolute_error(y_true=y_train, y_pred=y_pred_baseline)

print(f"Mean P2 Reading: {y_train_mean}")
print(f"Baseline MAE: {mae_baseline}")

In [None]:
from statsmodels.tsa.ar_model import AutoReg
from sklearn.metrics import mean_absolute_error
import pandas as pd

# Candidate lag orders for the autoregressive model: 1 through 30.
p_params = range(1, 31)

# Training MAE for each candidate, in the same order as `p_params`.
maes = []
for p in p_params:
    # Fit an AR(p) model on the training series.
    model = AutoReg(y_train, lags=p).fit()

    # In-sample predictions, with missing entries dropped so that only
    # actual predictions remain.
    y_pred = model.predict().dropna()

    # Score against the training values from position p onward, which is
    # what lines up with the remaining predictions.
    mae = mean_absolute_error(y_true=y_train.iloc[p:], y_pred=y_pred)
    maes.append(mae)

# One MAE per lag order, indexed by the lag itself.
mae_series = pd.Series(data=maes, index=p_params, name="mae")
print(mae_series.head())


In [None]:
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.arima.model import ARIMA

# Lag order chosen by hand from the MAE comparison over candidate lags.
# NOTE(review): hard-coded; confirm it still matches the minimum of
# `mae_series` if the data or the candidate range changes.
best_p = 28

# Refit the autoregressive model on the training series with that lag order.
best_model = AutoReg(y_train, lags=best_p).fit()

# Training residuals of the fitted model, labelled for display.
y_train_resid = best_model.resid.rename("residuals")

print(y_train_resid.head())


In [None]:
# Training residuals of the selected model, labelled "residuals".
y_train_resid = best_model.resid.rename("residuals")

# Show the final 1500 residuals.
print(y_train_resid.tail(1500))


In [None]:
# Blank 15x6 figure holding a single Axes.
fig = plt.figure(figsize=(15, 6))
ax = fig.add_subplot(1, 1, 1)

# NOTE(review): nothing is drawn on `ax` in this cell, so the saved image is
# an empty set of axes — confirm whether a plotting call is missing.
# Don't delete the code below 👇
fig.savefig("images/3-5-15.png", dpi=150)


In [None]:
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.arima.model import ARIMA

# Walk-forward validation over the test set: at every step, refit the AR model
# on everything observed so far, forecast one step ahead, then reveal the true
# value for that step and add it to the history.
#
# Forecasts are gathered in a list and joined once at the end. `Series.append`
# no longer exists in pandas 2.x, and `pd.Series()` without a dtype is
# deprecated, so `pd.concat` is used instead.
wfv_forecasts = []
history = y_train.copy()
for i in range(len(y_test)):
    model = AutoReg(history, lags=best_p).fit()
    next_pred = model.forecast()      # next value after end of history
    wfv_forecasts.append(next_pred)
    # Extend the history with the actual observation at the forecast timestamp.
    history = pd.concat([history, y_test.loc[next_pred.index]])

# An empty test set yields an empty float series rather than a concat error.
if wfv_forecasts:
    y_pred_wfv = pd.concat(wfv_forecasts)
else:
    y_pred_wfv = pd.Series(dtype="float64")

y_pred_wfv.name = "prediction"
y_pred_wfv.index.name = "timestamp"
y_pred_wfv.head()

In [None]:
import plotly.express as px
import pandas as pd

# Side-by-side frame of the observed test values and the walk-forward
# predictions, aligned on their shared index.
df_pred_test = pd.DataFrame({"y_test": y_test, "y_pred_wfv": y_pred_wfv})

# Line chart of both columns, with the title and axis labels applied in the
# same chained expression.
fig = px.line(df_pred_test, labels={"value": "PM2.5"}).update_layout(
    title="Dar es Salaam, WFV Predictions",
    xaxis_title="Date",
    yaxis_title="PM2.5 Level",
)

# Don't delete the code below 👇
fig.write_image("images/3-5-18.png", scale=1, height=500, width=700)

fig.show()