This notebook generates regression trees from our honey production and air quality combined dataset, which is read in in the second cell<br>
We use sklearn's implementation of regression trees, which is covered in detail in the report, as the inner-workings are not visible here<br>
Cross validation is performed and an MSE is output near the end.

In [1]:
import pandas as pd
from scipy.stats import norm
import numpy as np
import math
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeRegressor

In [2]:
data = pd.read_csv("data/completeFeatureVectors.csv")

X = data[['o3','co','so2','no2','pm25_frm', 'pressure', 'temperature', 'wind', 'year']].to_numpy()
# subtract 1998 from the year so that it starts at zero
X[:,8] = X[:,8]-1998

y = data[['yield_per_col']].to_numpy()

In [3]:
regressor = DecisionTreeRegressor(random_state=0)

k = 10

# the size of the testing set for each fold
chunk_size = X.shape[0] // k

# shuffle X and y together
Xy_shuffled = np.append(X, y, axis=1)
np.random.shuffle(Xy_shuffled)

sq_errors = []

# iterate through k folds
for i in range(k):

    # split out testing and training data
    X_k_test = Xy_shuffled[chunk_size*i:chunk_size*(i+1),:9]
    y_k_test = Xy_shuffled[chunk_size*i:chunk_size*(i+1),9]

    if i == 0:
        X_k_train = Xy_shuffled[chunk_size:,:9]
        y_k_train = Xy_shuffled[chunk_size:,9]
    elif i == k-1:
        X_k_train = Xy_shuffled[:chunk_size*i,:9]
        y_k_train = Xy_shuffled[:chunk_size*i,9]
    else:
        X_k_train = np.append(Xy_shuffled[:chunk_size*i,:9], Xy_shuffled[chunk_size*(i+1):,:9], axis=0)
        y_k_train = np.append(Xy_shuffled[:chunk_size*i,9], Xy_shuffled[chunk_size*(i+1):,9], axis=0)

    # train on training data to get trained decission tree
    regressor.fit(X_k_train, y_k_train)
    # test out the decision tree that we found on the testing chunk
    y_pred = regressor.predict(X_k_test)

    for i in range(X_k_test.shape[0]):
        sq_errors.append((y_pred[i] - y_k_test[i])**2)
    
mean_sq_error = np.mean(sq_errors)
print(mean_sq_error)

259.71666666666664
