# Beijing Air Quality Forecasting Starter Notebook

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import tensorflow as tf

In [None]:
# Read the training and test splits from the local data/ directory.
# Both paths are relative to the notebook's working directory; edit them
# if your copies of train.csv / test.csv live somewhere else.
train = pd.read_csv(filepath_or_buffer='data/train.csv')
test = pd.read_csv(filepath_or_buffer='data/test.csv')



# Explore the training data

In this section, explore your dataset with appropriate statistics and visualisations to understand your data better. Ensure that you explain the output of every code cell and what it entails.

In [None]:
# Inspecting the first few rows of the dataset to understand its structure.
# print() emits a plain-text heading; train.head() is the cell's last
# expression, so the notebook renders the leading rows as a table.
print("Training Data Overview:")
train.head()

In [None]:
# Display the column labels of the training frame to see which fields are available.
train.columns

In [None]:
# Parse the 'datetime' column and promote it to the index of both frames.
# The guard keeps the cell safe to re-run: after the first run 'datetime' is
# already the index, so accessing frame['datetime'] again would raise KeyError.
for frame in (train, test):
    if 'datetime' in frame.columns:
        # Ensure 'datetime' column is in datetime format
        frame['datetime'] = pd.to_datetime(frame['datetime'])
        # Set the 'datetime' column as the index for better time-series handling
        frame.set_index('datetime', inplace=True)
    elif frame.index.name != 'datetime':
        # Neither a column nor the index: fail loudly instead of skipping silently.
        raise KeyError("'datetime' is neither a column nor the index of the frame")


# Handle missing values


- Check the dataset for missing values and decide how to handle them.
- In this example, missing values are filled with the mean. You can experiment with other strategies.

In [None]:
# Fill missing entries with the column means of the frame they belong to.
train.fillna(value=train.mean(), inplace=True)
test.fillna(value=test.mean(), inplace=True)

n_lags = 3
test_has_target = 'pm2.5' in test.columns

# Add lagged copies of the target (1..n_lags steps back) as extra features.
# NOTE(review): when the test frame has no 'pm2.5' column its lag features are
# a constant 0, whereas the training lags hold real values — confirm that this
# train/test mismatch is intended.
for lag in range(1, n_lags + 1):
    lag_col = f'pm2.5_lag{lag}'
    train[lag_col] = train['pm2.5'].shift(lag)
    test[lag_col] = test['pm2.5'].shift(lag) if test_has_target else 0

# shift() leaves NaN in the leading rows: the test frame gets them mean-filled,
# while the training frame simply drops every row that still contains a NaN.
test.fillna(value=test.mean(), inplace=True)
train = train.dropna(axis=0)




# Separate features and target

- Feel free to drop any non-essential columns that you think might not contribute to modeling.

In [None]:

# Chronological hold-out: the last 20% of the rows become the validation set.
val_size = int(len(train) * 0.2)
# Slice by an explicit split position. `train.iloc[:-val_size]` yields an empty
# frame when val_size == 0 (because -0 == 0), which would silently discard
# every training row on a very small dataset.
split_idx = len(train) - val_size
train_data = train.iloc[:split_idx]
val_data = train.iloc[split_idx:]

# 'pm2.5' is the target; 'No' is also excluded from the features
# (presumably a row counter — verify against the dataset description).
non_feature_cols = ['pm2.5', 'No']

X_train = train_data.drop(columns=non_feature_cols)
y_train = train_data['pm2.5']

X_val = val_data.drop(columns=non_feature_cols)
y_val = val_data['pm2.5']

# Reshape for LSTM: insert a length-1 "timesteps" axis -> (samples, 1, features)
X_train = np.expand_dims(X_train, axis=1)
X_val = np.expand_dims(X_val, axis=1)

In [None]:
# Reshape data for LSTM input
# LSTM models require data in the shape (samples, timesteps, features).
# The length-1 "timesteps" dimension was already added with np.expand_dims
# in the previous cell, so no further reshaping is needed here.

# Build model

Below is a simple LSTM model. Your task is to experiment with different parameters, such as the number of layers, units, activation functions, and optimizers, to get the best-performing model. Experiment with other optimizers (e.g., SGD) or hyperparameters to improve performance.

In [None]:
# define model: a single LSTM layer feeding a one-unit regression head.
# The (timesteps, features) input shape is read from the prepared training array.
model = Sequential([
    LSTM(32, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])),
    Dense(1)
])

# Compile the model
# RMSE is tracked with the built-in stateful metric instead of a lambda:
# a lambda is evaluated per batch and the batch values are averaged, which is
# not the RMSE over the whole epoch, and it shows up under an anonymous name
# in the training history. The built-in metric accumulates squared errors
# across batches and is reported as 'rmse' / 'val_rmse'.
model.compile(
    optimizer='adam',
    loss='mse',
    metrics=[tf.keras.metrics.RootMeanSquaredError(name='rmse')]
)

# Display the model architecture
model.summary()



In [None]:
# Fit the network on the training split and score the held-out split after
# every epoch. `epochs` and `batch_size` are the first knobs to tune.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_val, y_val),
)

In [None]:
# Calculate training loss (MSE of the fitted model on the training split)
train_predictions = model.predict(X_train)
train_loss = np.mean((y_train - train_predictions.flatten())**2)

# Calculate validation loss (RMSE)
val_predictions = model.predict(X_val)
val_rmse = np.sqrt(np.mean((y_val - val_predictions.flatten())**2))

# Persistence baseline: predict each value as the previous observed value.
# shift(1) leaves the first entry empty; it is back-filled from the next one.
# Series.bfill() is used because fillna(method='bfill') is deprecated in
# recent pandas releases.
y_val_pred_persistence = y_val.shift(1).bfill()
rmse_persistence = np.sqrt(np.mean((y_val - y_val_pred_persistence)**2))

# Plot training and validation loss per epoch
plt.figure(figsize=(8, 6))
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Loss During Training')
plt.xlabel('Epochs')
plt.ylabel('Loss (MSE)')
plt.legend()
plt.show()

# Summary numbers: the model should beat the persistence baseline to be useful.
print(f"Final Training Loss (MSE): {train_loss:.2f}")
print(f"Validation RMSE: {val_rmse:.2f}")
print(f"Persistence Baseline RMSE: {rmse_persistence:.2f}")


In [None]:
def create_sliding_window_test(test_df, lookback):
    """Build overlapping look-back windows over the feature columns of a frame.

    The columns 'No' and 'pm2.5' are removed when present (the same columns
    the training cell drops); every remaining column is treated as a feature.
    For each row position ``i >= lookback`` the window consists of the
    ``lookback`` rows immediately *before* row ``i`` (row ``i`` itself is not
    part of its window).

    Parameters
    ----------
    test_df : pandas.DataFrame
        Frame holding the feature columns.
    lookback : int
        Number of consecutive rows per window.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(test_df) - lookback, lookback, n_features)``.
        When the frame has no more than ``lookback`` rows the result is an
        empty array that still has three dimensions, so downstream shape
        checks keep working.
    """
    # errors='ignore' lets the same call serve frames with or without these columns.
    test_values = test_df.drop(columns=['No', 'pm2.5'], errors='ignore').values

    windows = [test_values[i - lookback:i] for i in range(lookback, len(test_df))]
    if not windows:
        # np.array([]) would be 1-D with shape (0,); keep the 3-D layout instead.
        return np.empty((0, lookback, test_values.shape[1]), dtype=test_values.dtype)
    return np.array(windows)


In [None]:
lookback = n_lags  # number of lag features created earlier; kept for later cells

# Work on a copy so the prepared `test` frame stays untouched.
test_for_prediction = test.copy()
# Make sure every lag feature used during training exists here as well.
for lag in range(1, n_lags + 1):
    lag_col = f'pm2.5_lag{lag}'
    if lag_col not in test_for_prediction.columns:
        test_for_prediction[lag_col] = 0

# Build the test input exactly like the training input: the same feature
# columns in the same order, with one length-1 "timesteps" axis. The model was
# built and trained on (samples, 1, features) arrays, so feeding it multi-step
# windows would not match what it learned, and would also leave the first
# `lookback` test rows without a prediction.
feature_cols = train.columns.drop(['pm2.5', 'No'])
X_test_seq = np.expand_dims(test_for_prediction[feature_cols].to_numpy(), axis=1)

# Make predictions
predictions = model.predict(X_test_seq)
predictions = np.nan_to_num(predictions)
predictions = np.round(predictions).astype(int)

# Prepare submission: one row per test timestamp.
# The hour is written without zero padding. It is formatted by hand because
# the '%-H' strftime code is platform-specific and fails on Windows.
test_times = pd.to_datetime(test_for_prediction.index)
row_ids = [
    f"{ts.strftime('%Y-%m-%d')} {ts.hour}:{ts.strftime('%M:%S')}"
    for ts in test_times
]
submission = pd.DataFrame({
    'row ID': row_ids,
    'pm2.5': predictions.flatten()
})
assert len(submission) == len(test), "every test row must receive a prediction"
submission = submission.sort_values(by='row ID')
submission.to_csv('data/subm_fixed.csv', index=False)
