## Setup


In [2]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.decomposition import PCA

In [3]:
import warnings
import logging

logging.getLogger("tensorflow").setLevel(logging.ERROR)
warnings.filterwarnings('ignore')

# %config IPCompleter.use_jedi = False
%matplotlib inline

np.set_printoptions(precision=2, suppress=True)
plt.style.use("ggplot")


In [4]:
SEED = 2020_2024

np.random.seed(SEED)


## Loading Data


In [5]:
X_data, X_sg, y_data, X_pca= np.load("combined_data.npz").values()


In [6]:
X_columns = [f"W{x}" for x in np.arange(350, 2501, 10)]
y_columns = ["PHH2O", "PHKCL", "CA", "MG", "NA", "K", "CLAY", "ORGC"]


In [7]:
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X_data, y_data, train_size=0.70)
X_test_orig, X_valid_orig, y_test_orig, y_valid_orig = train_test_split(X_test_orig, y_test_orig, train_size=0.50)


X_train_sg, X_test_sg, y_train_sg, y_test_sg = train_test_split(X_sg, y_data, train_size=0.70)
X_test_sg, X_valid_sg, y_test_sg, y_valid_sg = train_test_split(X_test_sg, y_test_sg, train_size=0.50)

X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y_data, train_size=0.70)
X_test_pca, X_valid_pca, y_test_pca, y_valid_pca = train_test_split(X_test_pca, y_test_pca, train_size=0.50)

## Models


In [8]:
def show_performance(y_true, y_pred):
    r2 = r2_score(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    print(f"Overall:\tR2={r2:.2f}\tMSE={mse:.2f}")

    for i in range(8):
        r2 = r2_score(y_true[:, i], y_pred[:, i])
        mse = mean_squared_error(y_true[:, i], y_pred[:, i])
        print(f"{y_columns[i]}:\tR2={r2:.2f}\tMSE={mse:.2f}")


### Random Forests


In [9]:
model = RandomForestRegressor()
model.fit(X_train_orig, y_train_orig)

y_pred = model.predict(X_test_orig)
show_performance(y_test_orig, y_pred)

Overall:	R2=0.43	MSE=39.71
PHH2O:	R2=0.49	MSE=1.05
PHKCL:	R2=0.43	MSE=1.00
CA:	R2=0.45	MSE=121.82
MG:	R2=0.56	MSE=13.88
NA:	R2=0.31	MSE=2.49
K:	R2=0.24	MSE=0.19
CLAY:	R2=0.67	MSE=172.60
ORGC:	R2=0.33	MSE=4.67


In [10]:
model = RandomForestRegressor()
model.fit(X_train_sg, y_train_sg)

y_pred = model.predict(X_test_sg)
show_performance(y_test_sg, y_pred)


Overall:	R2=0.60	MSE=30.57
PHH2O:	R2=0.73	MSE=0.55
PHKCL:	R2=0.70	MSE=0.51
CA:	R2=0.62	MSE=109.39
MG:	R2=0.69	MSE=7.12
NA:	R2=0.42	MSE=1.90
K:	R2=0.32	MSE=0.19
CLAY:	R2=0.76	MSE=123.93
ORGC:	R2=0.54	MSE=0.95


In [11]:
model = RandomForestRegressor()
model.fit(X_train_pca, y_train_pca)

y_pred = model.predict(X_test_pca)
show_performance(y_test_pca, y_pred)


Overall:	R2=0.54	MSE=28.66
PHH2O:	R2=0.61	MSE=0.77
PHKCL:	R2=0.58	MSE=0.72
CA:	R2=0.69	MSE=75.31
MG:	R2=0.51	MSE=16.36
NA:	R2=0.44	MSE=4.85
K:	R2=0.21	MSE=0.21
CLAY:	R2=0.74	MSE=129.74
ORGC:	R2=0.54	MSE=1.33


### PLSR


In [12]:
model = PLSRegression(n_components=70)
model.fit(X_train_orig, y_train_orig)

y_pred = model.predict(X_test_orig)
show_performance(y_test_orig, y_pred)


Overall:	R2=0.48	MSE=36.66
PHH2O:	R2=0.65	MSE=0.71
PHKCL:	R2=0.60	MSE=0.70
CA:	R2=0.52	MSE=107.30
MG:	R2=0.56	MSE=13.98
NA:	R2=0.27	MSE=2.66
K:	R2=0.22	MSE=0.20
CLAY:	R2=0.68	MSE=163.01
ORGC:	R2=0.32	MSE=4.70


In [13]:
model = PLSRegression(n_components=70)
model.fit(X_train_sg, y_train_sg)

y_pred = model.predict(X_test_sg)
show_performance(y_test_sg, y_pred)


Overall:	R2=0.46	MSE=41.35
PHH2O:	R2=0.67	MSE=0.67
PHKCL:	R2=0.62	MSE=0.66
CA:	R2=0.54	MSE=133.92
MG:	R2=0.52	MSE=11.24
NA:	R2=0.08	MSE=3.04
K:	R2=0.25	MSE=0.21
CLAY:	R2=0.65	MSE=179.73
ORGC:	R2=0.38	MSE=1.30


In [14]:
model = PLSRegression(n_components=20)
model.fit(X_train_pca, y_train_pca)

y_pred = model.predict(X_test_pca)
show_performance(y_test_pca, y_pred)


Overall:	R2=0.37	MSE=50.95
PHH2O:	R2=0.50	MSE=0.99
PHKCL:	R2=0.43	MSE=0.97
CA:	R2=0.52	MSE=117.11
MG:	R2=0.35	MSE=21.69
NA:	R2=0.11	MSE=7.77
K:	R2=0.14	MSE=0.23
CLAY:	R2=0.49	MSE=256.98
ORGC:	R2=0.37	MSE=1.83


### XGBoost


In [15]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [16]:
model = XGBRegressor()
model.fit(X_train_orig, y_train_orig)

y_pred = model.predict(X_test_orig)
show_performance(y_test_orig, y_pred)


: 

: 

In [None]:
model = XGBRegressor()
model.fit(X_train_sg, y_train_sg)

y_pred = model.predict(X_test_sg)
show_performance(y_test_sg, y_pred)

In [None]:
model = XGBRegressor()
model.fit(X_train_pca, y_train_pca)

y_pred = model.predict(X_test_pca)
show_performance(y_test_pca, y_pred)


Overall:	R2=0.59	MSE=27.77
PHH2O:	R2=0.69	MSE=0.62
PHKCL:	R2=0.64	MSE=0.63
CA:	R2=0.72	MSE=67.57
MG:	R2=0.49	MSE=16.91
NA:	R2=0.40	MSE=5.24
K:	R2=0.31	MSE=0.19
CLAY:	R2=0.74	MSE=130.11
ORGC:	R2=0.70	MSE=0.87
