# Decision Trees on Mauna Loa CO2 data

## Import required libraries and packages

In [None]:
# 1) Standard library imports
import datetime

# 2) Third party library imports
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# 3) Local application/library imports

## Load input data

In [None]:
# Fetch OpenML dataset 41187 as a pandas-backed Bunch (requires network access
# on the first call).
co2 = fetch_openml(as_frame=True, data_id=41187, parser="pandas")
# Preview the first rows of the raw frame
co2.frame.head(n=5)

## Data preprocessing

In [None]:
# Combine the separate year / month / day columns into a single datetime column
# (added in place on the fetched frame).
co2.frame["date"] = pd.to_datetime(co2.frame.loc[:, ["year", "month", "day"]])
# Keep only the CO2 reading, indexed by its sampling date
co2_data = co2.frame.loc[:, ["date", "co2"]].set_index("date")
co2_data.head(n=5)

In [None]:
# (earliest, latest) sampling date present in the index
(co2_data.index.min(), co2_data.index.max())

### Visualize raw data

In [None]:
# Line plot of every raw sample against its sampling date
co2_data.plot.line()
plt.ylabel(ylabel="CO$_2$ concentration (ppm)")
# Bind the returned Text object so the notebook does not echo it
_ = plt.title(label="Raw air samples measurements from the Mauna Loa Observatory")

### Resample data to take monthly averages

In [None]:
# Average all samples that fall in the same calendar month ...
# NOTE(review): recent pandas releases deprecate the "M" alias in favour of
# "ME"; kept as-is for compatibility with older versions.
co2_data = co2_data.resample("M").mean()
# ... then drop every month whose average is NaN
co2_data = co2_data.dropna(axis=0, how="any")

co2_data.plot.line()
plt.ylabel(ylabel="Monthly average of CO$_2$ concentration (ppm)")
_ = plt.title(
    label="Monthly average of air samples measurements\nfrom the Mauna Loa Observatory"
)

### Split input features and target variable

In [None]:
# Single feature: the date expressed as a fractional year (year + month / 12),
# shaped as a column because scikit-learn expects 2-D inputs.
X = (co2_data.index.month / 12 + co2_data.index.year).to_numpy()[:, np.newaxis]
# Target: the monthly CO2 average
y = co2_data.loc[:, "co2"].to_numpy()

## Model Training

### Decision Tree

#### Predict Absolute CO2 levels

In [None]:
# A shallow and a deep tree fitted on the same data; ``fit`` returns the
# estimator itself, so construction and training can be chained.
regr_1 = tree.DecisionTreeRegressor(max_depth=2).fit(X, y)
regr_2 = tree.DecisionTreeRegressor(max_depth=11).fit(X, y)
# Echo the last fitted estimator as the cell output
regr_2

In [None]:
# Evaluation grid: 1,000 evenly spaced fractional years from 1958 up to the
# current month, as a single-column 2-D array.
today = datetime.datetime.now()
current_month = today.month / 12 + today.year
X_test = np.linspace(1958, current_month, 1_000)[:, np.newaxis]

In [None]:
# Predictions of the shallow (y_1) and deep (y_2) tree on the evaluation grid
y_1, y_2 = (model.predict(X_test) for model in (regr_1, regr_2))

In [None]:
# Observed monthly averages together with both fitted step functions
plt.figure()
plt.scatter(X, y, label="data", c="darkorange", edgecolor="black", s=20)
plt.plot(
    X_test,
    y_1,
    linewidth=2,
    color="cornflowerblue",
    label=f"max_depth={regr_1.max_depth}",
)
plt.plot(
    X_test,
    y_2,
    linewidth=2,
    color="yellowgreen",
    label=f"max_depth={regr_2.max_depth}",
)
plt.title("Decision Tree Regression")
plt.xlabel("data")
plt.ylabel("target")
plt.legend()
plt.show()

#### Predict CO2 level differences using Decision Trees

In [None]:
def preprocess_data(input_data, train_window):
    """Turn a series into scaled first differences and sliding-window pairs.

    Parameters
    ----------
    input_data : sequence of float
        Values of the series, in order.
    train_window : int
        Number of consecutive scaled differences that form one model input.

    Returns
    -------
    normalized_data : numpy.ndarray
        Standardized first differences, same length as ``input_data`` (a
        leading 0 difference is inserted before scaling).
    in_seq : list of numpy.ndarray
        Input windows, each of length ``train_window``.
    out_seq : list of numpy.ndarray
        One-element arrays holding the value that directly follows each window.
    scaler : StandardScaler
        The fitted scaler, needed later to undo the normalization.
    """
    # Take derivative; the prepended 0 keeps the result aligned with the input
    differences = np.concatenate([[0], np.diff(input_data)])

    # Scale values (the scaler works on a 2-D column, so reshape there and back)
    scaler = StandardScaler()
    normalized_data = scaler.fit_transform(differences.reshape(-1, 1)).reshape(-1)

    # Create input-output pairs. The window starting at i is labelled with the
    # element at i + train_window, so valid starts are 0 .. len - train_window - 1
    # and range() has to stop at len - train_window; stopping one earlier would
    # leave the final observation unused as a target.
    n_pairs = len(normalized_data) - train_window
    in_seq = [normalized_data[i:i + train_window] for i in range(n_pairs)]
    out_seq = [
        normalized_data[i + train_window:i + train_window + 1]
        for i in range(n_pairs)
    ]
    return normalized_data, in_seq, out_seq, scaler

# Number of consecutive scaled differences passed to the tree as one sample
train_window = 50
normalized_data, in_data, out_data, scaler = preprocess_data(
    co2_data["co2"].tolist(), train_window
)


In [None]:
# Three trees of increasing depth trained on the same window -> next-value pairs
regr_3, regr_4, regr_5 = (
    tree.DecisionTreeRegressor(max_depth=depth).fit(in_data, out_data)
    for depth in (2, 10, 25)
)
# Echo the last fitted estimator as the cell output
regr_5

In [None]:
def _roll_forecast(model, history, n_steps, window):
    """Extend ``history`` with ``n_steps`` recursive one-step predictions.

    Each prediction is made from the last ``window`` values of the series built
    so far (observations followed by earlier predictions) and then appended to
    it. Returns a new list; ``history`` itself is not modified.
    """
    series = list(history)
    for _ in range(n_steps):
        next_value = model.predict([series[-window:]])[0]
        series.append(next_value)
    return series


# Time axis: the months that actually have an observation, followed by every
# month after the last observation up to January 2023. Building it from the
# data index keeps each observed value on its own month even though the series
# does not start in January 1958 and months without data were dropped.
observed_months = co2_data.index.to_period("M")
future_months = pd.period_range(observed_months[-1] + 1, "2023", freq="M")
dates = observed_months.append(future_months).to_timestamp()

test_inputs = list(normalized_data)
fut_pred_num = len(dates) - len(test_inputs)  # Number of predictions to make.

# Observed scaled differences followed by each tree's recursive forecast
fut_pred_3 = _roll_forecast(regr_3, test_inputs, fut_pred_num, train_window)
fut_pred_4 = _roll_forecast(regr_4, test_inputs, fut_pred_num, train_window)
fut_pred_5 = _roll_forecast(regr_5, test_inputs, fut_pred_num, train_window)

In [None]:
# Recursive forecasts of the three trees against the observed series, all in
# scaled-difference space.
plt.figure()
plt.plot(dates, fut_pred_3, label=f"max_depth={regr_3.max_depth}", linewidth=2)
plt.plot(dates, fut_pred_4, label=f"max_depth={regr_4.max_depth}", linewidth=2)
plt.plot(dates, fut_pred_5, label=f"max_depth={regr_5.max_depth}", linewidth=2)
# The observations occupy only the leading part of `dates`; slice by the actual
# series length rather than a fixed count so the cell keeps working if the
# number of observed months changes.
plt.plot(
    dates[:len(normalized_data)],
    normalized_data,
    label="true value",
    linewidth=2,
)
plt.xlabel("data")
plt.ylabel("target")
plt.title("Decision Tree Regression on Derivative")
plt.legend()
plt.show()

#### Convert CO2 level differences to absolute CO2 levels

In [None]:
def postprocess_data(output_data, scaler, first_input):
    """Map scaled differences back to absolute levels.

    Parameters
    ----------
    output_data : sequence of float
        Scaled differences.
    scaler : object with ``inverse_transform``
        Scaler used to undo the normalization; it is called with a
        single-column 2-D array.
    first_input : float
        Offset added to the running sum of the unscaled differences.

    Returns
    -------
    numpy.ndarray
        1-D array: cumulative sum of the unscaled differences plus
        ``first_input``.
    """
    # Undo the scaling: reshape to a column for the scaler, then flatten again
    scaled_column = np.array(output_data).reshape(-1, 1)
    differences = scaler.inverse_transform(scaled_column).reshape(-1)

    # Integrate the differences and shift by the starting level
    return first_input + np.cumsum(differences)

# Convert each tree's scaled-difference series back to absolute CO2 levels,
# anchored at the first monthly average.
decoded_3, decoded_4, decoded_5 = (
    postprocess_data(predicted, scaler, co2_data["co2"].iloc[0])
    for predicted in (fut_pred_3, fut_pred_4, fut_pred_5)
)

In [None]:
# Reconstructed absolute CO2 levels for the three trees
plt.figure()
plt.plot(
    dates, decoded_3, linewidth=2, label=f"max_depth={regr_3.max_depth}"
)
plt.plot(
    dates, decoded_4, linewidth=2, label=f"max_depth={regr_4.max_depth}"
)
plt.plot(
    dates, decoded_5, linewidth=2, label=f"max_depth={regr_5.max_depth}"
)
plt.title("Decision Tree Regression")
plt.xlabel("data")
plt.ylabel("target")
plt.legend()
plt.show()