# LSTM

In [1]:
import os
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras import Input
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed



In [2]:
# Load the CO2 series, parse the dates and put the rows in chronological order.
df = (
    pd.read_csv('data/interpolated_co2.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .sort_values('Date')
)

# Chronological hold-out.
# Train: everything up to 2024-12-31 | Test: 2025-01-01 to 2025-04-13 (both ends inclusive)
is_train = df['Date'] <= '2024-12-31'
is_test = df['Date'].between('2025-01-01', '2025-04-13')
train_data = df.loc[is_train]
test_data = df.loc[is_test]

# Scale only the CO2 column. The scaler is fitted on the training rows alone,
# so the min/max used for scaling come from the training period.
scaler = MinMaxScaler()
scaler.fit(train_data[['CO2']])
train_scaled = scaler.transform(train_data[['CO2']])

In [3]:
def create_sequences(data, seq_length=60):
    """Build sliding-window samples for next-step prediction.

    For every index ``i`` from ``seq_length`` to ``len(data) - 1`` the slice
    ``data[i - seq_length:i]`` becomes one input window and ``data[i]`` is
    its target.

    Args:
        data: Ordered, sliceable sequence of observations (e.g. a NumPy array).
        seq_length: Number of consecutive observations in each input window.

    Returns:
        Tuple ``(x, y)`` of NumPy arrays holding the stacked windows and their
        targets. Both are empty when ``len(data) <= seq_length``.
    """
    target_positions = range(seq_length, len(data))
    windows = [data[pos - seq_length:pos] for pos in target_positions]
    targets = [data[pos] for pos in target_positions]
    return np.array(windows), np.array(targets)

# Look-back window: each training sample is built from the 60 preceding scaled values.
seq_length = 60
x_train, y_train = create_sequences(train_scaled, seq_length=seq_length)
# Make the (samples, timesteps, 1) layout explicit before handing the windows to the LSTM.
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))

In [4]:
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and batch
# shuffling repeat across Restart & Run All.
# NOTE(review): bit-exact results on GPU may additionally need
# tf.config.experimental.enable_op_determinism() — confirm if that matters here.
set_random_seed(42)

model = Sequential([
    # Declare the input with an explicit Input object instead of passing
    # `input_shape` to the first layer, which Keras warns against in Sequential models.
    Input(shape=(x_train.shape[1], 1)),
    # First LSTM returns the full sequence so the second LSTM receives one vector per timestep.
    LSTM(units=50, return_sequences=True),
    LSTM(units=50),
    # Single output unit: the next (scaled) value.
    Dense(units=1),
])

model.compile(optimizer='adam', loss='mean_squared_error')

# Keep the History object (per-epoch losses) instead of letting its repr
# become the cell's output.
history = model.fit(x_train, y_train, epochs=10, batch_size=32)

Epoch 1/10


  super().__init__(**kwargs)


[1m576/576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - loss: 0.0122
Epoch 2/10
[1m576/576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - loss: 5.5904e-05
Epoch 3/10
[1m576/576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - loss: 5.4802e-05
Epoch 4/10
[1m576/576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - loss: 5.6175e-05
Epoch 5/10
[1m576/576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - loss: 5.9989e-05
Epoch 6/10
[1m576/576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - loss: 5.4358e-05
Epoch 7/10
[1m576/576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - loss: 7.4121e-05
Epoch 8/10
[1m576/576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - loss: 4.7184e-05
Epoch 9/10
[1m576/576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - loss: 5.1754e-05
Epoch 10/10
[1m576/576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

<keras.src.callbacks.history.History at 0x308c5cb50>

In [5]:
# Stack train and test CO2 so the first test rows can take their look-back
# window from the end of the training period; scale with the already-fitted scaler.
full_data = pd.concat([train_data[['CO2']], test_data[['CO2']]], axis=0)
full_scaled = scaler.transform(full_data)

# One window per test row: the seq_length scaled values immediately before it.
first_test_idx = len(train_scaled)
x_test = np.array([
    full_scaled[end - seq_length:end]
    for end in range(first_test_idx, len(full_scaled))
])
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))

# Predict in scaled space, then map back to the original CO2 units.
predictions_scaled = model.predict(x_test)
predictions = scaler.inverse_transform(predictions_scaled)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step


In [10]:
# Flatten the model output once so the metrics and the CSV all use the same 1-D array.
predicted_values = predictions.reshape(-1)

# Predictions are expected to line up one-to-one, in order, with the test rows.
# Check that explicitly: a negative slice such as values[-len(predictions):]
# silently returns the whole array when there are zero predictions and hides
# any other length mismatch.
actual_values = test_data['CO2'].values
pred_dates = test_data['Date'].values
assert len(predicted_values) == len(actual_values), (
    f'{len(predicted_values)} predictions for {len(actual_values)} test rows; '
    're-run the prediction cell before scoring'
)

# Calculate metrics
rmse = sqrt(mean_squared_error(actual_values, predicted_values))
mae = mean_absolute_error(actual_values, predicted_values)
# "Accuracy" here is 100 minus the mean absolute percentage error (MAPE);
# it is not a classification accuracy.
mape = np.mean(np.abs((actual_values - predicted_values) / actual_values)) * 100
accuracy = 100 - mape

# Save predictions in requested format
results = pd.DataFrame({
    'Date': pred_dates,
    'Actual': actual_values,
    'Predicted': predicted_values
})

os.makedirs('Predictions', exist_ok=True)
results.to_csv('Predictions/lstm_predictions.csv', index=False)

print(f'RMSE: {rmse:.4f}')
print(f'MAE: {mae:.4f}')
print(f'Accuracy: {accuracy:.2f}%')
print('Saved to Predictions/lstm_predictions.csv')

RMSE: 0.8750
MAE: 0.6739
Accuracy: 99.84%
Saved to Predictions/lstm_predictions.csv
