In [1]:
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error
from nldg.new.archive.utils import gen_data_v3, max_mse, min_xplvar
from nldg.new.archive.rf import MaggingRF, RF4DG, MaggingRF_PB
from sklearn.ensemble import RandomForestRegressor
from adaXT.random_forest import RandomForest


In [2]:
# Generate the train/test frames (setting 2 for both) used by all model cells below.
dtr, dts = gen_data_v3(
    n_train=1000,
    n_test=500,
    train_setting=2,
    test_setting=2,
)

# Features are every column except 'E' and 'Y'; 'Y' is the response that the
# forests are fitted to, and 'E' holds the environment labels of the training rows.
Xtr = np.array(dtr.drop(columns=['E', 'Y']))
Xts = np.array(dts.drop(columns=['E', 'Y']))
Ytr = np.array(dtr['Y'])
Yts = np.array(dts['Y'])
Etr = np.array(dtr['E'])

# Hyperparameters shared by every forest fitted in this notebook.
min_samples_leaf, n_estimators, random_state = 10, 50, 42

## Inefficient, but easy to understand

### Maximin

In [3]:
# RF4DG forest with criterion='maximin'; the environment labels Etr are passed
# to fit together with the features and the response.
mrf = RF4DG(
    criterion='maximin',
    n_estimators=n_estimators,
    min_samples_leaf=min_samples_leaf,
    parallel=True,
    random_state=random_state,
)
mrf.fit(Xtr, Ytr, Etr)

# Test-set predictions first, then in-sample fitted values.
mpreds = mrf.predict(Xts)
mfitted = mrf.predict(Xtr)

100%|██████████| 50/50 [00:03<00:00, 13.87it/s]


In [4]:
# Training-set MSE and R^2 of the RF4DG maximin forest.
mean_squared_error(Ytr, mfitted), r2_score(Ytr, mfitted)

(40.16657491488834, 0.12867487134496736)

In [5]:
# Test-set MSE and R^2 of the RF4DG maximin forest.
mean_squared_error(Yts, mpreds), r2_score(Yts, mpreds)

(9.787894340420301, 0.38303258218237624)

In [6]:
# Per-environment training MSE of the maximin forest (listed in the verbose output);
# the value displayed for the cell is the largest of them.
max_mse(Ytr, mfitted, Etr, verbose=True)

Environment 0 MSE: 37.479799544632584
Environment 1 MSE: 39.27866772107278
Environment 2 MSE: 43.741257478959675


np.float64(43.741257478959675)

In [7]:
# Per-environment explained variance of the maximin forest on the training data
# (listed in the verbose output); the displayed value is the smallest of them.
min_xplvar(Ytr, mfitted, Etr, verbose=True)

Environment 0 explained variance: -1.049480802108583
Environment 1 explained variance: -3.274301869009143
Environment 2 explained variance: 28.133423192169154


np.float64(-3.274301869009143)

### Default

In [8]:
# Same RF4DG setup as the maximin forest, but with criterion='mse' as the
# default baseline. Etr is still passed to fit.
rf = RF4DG(
    criterion='mse',
    n_estimators=n_estimators,
    min_samples_leaf=min_samples_leaf,
    parallel=True,
    random_state=random_state,
)
rf.fit(Xtr, Ytr, Etr)

# Test-set predictions first, then in-sample fitted values.
preds = rf.predict(Xts)
fitted = rf.predict(Xtr)

100%|██████████| 50/50 [00:00<00:00, 80.33it/s]


In [9]:
# Training-set MSE and R^2 of the RF4DG forest fitted with criterion='mse'.
mean_squared_error(Ytr, fitted), r2_score(Ytr, fitted)

(25.11030796060338, 0.45528732881253553)

In [10]:
# Test-set MSE and R^2 of the RF4DG forest fitted with criterion='mse'.
mean_squared_error(Yts, preds), r2_score(Yts, preds)

(28.310778442036522, -0.7845337581607037)

In [11]:
# Per-environment training MSE of the criterion='mse' forest; the displayed value is the largest.
max_mse(Ytr, fitted, Etr, verbose=True)

Environment 0 MSE: 25.42306864859348
Environment 1 MSE: 25.76697543942067
Environment 2 MSE: 24.14087979379599


np.float64(25.76697543942067)

In [12]:
# Per-environment explained variance of the criterion='mse' forest on the training data;
# the displayed value is the smallest.
min_xplvar(Ytr, fitted, Etr, verbose=True)

Environment 0 explained variance: 11.007250093930523
Environment 1 explained variance: 10.237390412642966
Environment 2 explained variance: 47.73380087733284


np.float64(10.237390412642966)

### Magging - Forest

In [13]:
# Forest-level magging: one call fits on (Xtr, Ytr, Etr) and yields both the
# training fitted values and the predictions for Xts.
rfm = MaggingRF_PB(
    n_estimators=n_estimators,
    min_samples_leaf=min_samples_leaf,
    parallel=True,
    random_state=random_state,
)
wfitted, wpreds = rfm.fit_predict_magging(Xtr, Ytr, Etr, Xts)

# Keep the weights so they can be inspected in a later cell.
wmag = rfm.get_weights()

100%|██████████| 50/50 [00:00<00:00, 419.01it/s]
100%|██████████| 50/50 [00:00<00:00, 537.50it/s]
100%|██████████| 50/50 [00:00<00:00, 360.15it/s]


In [14]:
# Training-set MSE and R^2 of the MaggingRF_PB fitted values.
mean_squared_error(Ytr, wfitted), r2_score(Ytr, wfitted)

(37.77141298353625, 0.18063261935826447)

In [15]:
# Test-set MSE and R^2 of the MaggingRF_PB predictions.
mean_squared_error(Yts, wpreds), r2_score(Yts, wpreds)

(23.342130197163915, -0.47134136242510727)

In [16]:
# Per-environment training MSE of the MaggingRF_PB fit; the displayed value is the largest.
max_mse(Ytr, wfitted, Etr, verbose=True)

Environment 0 MSE: 25.580746973937888
Environment 1 MSE: 26.19074647489119
Environment 2 MSE: 61.542745501779685


np.float64(61.542745501779685)

In [17]:
# Weights returned by rfm.get_weights(). There are three entries here, matching the three
# environments listed by max_mse - presumably one weight per environment (TODO confirm).
wmag

array([0.46634621, 0.48050911, 0.05314468])

In [18]:
# Per-environment explained variance of the MaggingRF_PB fit on the training data;
# the displayed value is the smallest.
min_xplvar(Ytr, wfitted, Etr, verbose=True)

Environment 0 explained variance: 10.849571768586113
Environment 1 explained variance: 9.813619377172447
Environment 2 explained variance: 10.331935169349144


np.float64(9.813619377172447)

### Magging - Trees

In [19]:
# Tree-level magging. Unlike the other domain-aware fits above, no environment
# labels are passed to fit here.
rfm2 = MaggingRF(
    n_estimators=n_estimators,
    min_samples_leaf=min_samples_leaf,
    random_state=random_state,
)
rfm2.fit(Xtr, Ytr)

# predict_maximin takes the training features plus the points to predict and
# returns a pair; only its first element is kept (test set first, then train).
wpreds2, _ = rfm2.predict_maximin(Xtr, Xts)
wfitted2, _ = rfm2.predict_maximin(Xtr, Xtr)

In [20]:
# Training-set MSE and R^2 of the MaggingRF (tree-level magging) fitted values.
mean_squared_error(Ytr, wfitted2), r2_score(Ytr, wfitted2)

(29.370510828841766, 0.36287163690626756)

In [21]:
# Test-set MSE and R^2 of the MaggingRF (tree-level magging) predictions.
mean_squared_error(Yts, wpreds2), r2_score(Yts, wpreds2)

(25.597681149048274, -0.6135171356957061)

In [22]:
# Per-environment training MSE of the MaggingRF fit; the displayed value is the largest.
max_mse(Ytr, wfitted2, Etr, verbose=True)

Environment 0 MSE: 28.084572676792458
Environment 1 MSE: 25.95257406198103
Environment 2 MSE: 34.074385747751805


np.float64(34.074385747751805)

In [23]:
# Per-environment explained variance of the MaggingRF fit on the training data;
# the displayed value is the smallest.
min_xplvar(Ytr, wfitted2, Etr, verbose=True)

Environment 0 explained variance: 8.345746065731543
Environment 1 explained variance: 10.051791790082607
Environment 2 explained variance: 37.800294923377024


np.float64(8.345746065731543)

## Efficient solution: adaXT and scikit-learn backends

### Maximin

In [24]:
# adaXT random forest built with the "MaximinRegression" criterion name; the
# environment labels Etr are passed to fit as a third positional argument.
mrf_adaxt = RandomForest(
    "MaximinRegression",
    n_estimators=n_estimators,
    min_samples_leaf=min_samples_leaf,
    seed=random_state,
)
mrf_adaxt.fit(Xtr, Ytr, Etr)

# Test-set predictions first, then in-sample fitted values.
mpreds_adaxt = mrf_adaxt.predict(Xts)
mfitted_adaxt = mrf_adaxt.predict(Xtr)

In [25]:
# Training-set MSE and R^2 of the adaXT "MaximinRegression" forest.
mean_squared_error(Ytr, mfitted_adaxt), r2_score(Ytr, mfitted_adaxt)

(39.97986666745992, 0.1327250943987438)

In [26]:
# Test-set MSE and R^2 of the adaXT "MaximinRegression" forest.
mean_squared_error(Yts, mpreds_adaxt), r2_score(Yts, mpreds_adaxt)

(12.878348244194116, 0.1882297677484548)

In [27]:
# Per-environment training MSE of the adaXT maximin forest; the displayed value is the largest.
max_mse(Ytr, mfitted_adaxt, Etr, verbose=True)

Environment 0 MSE: 31.94937215571709
Environment 1 MSE: 34.856003040483685
Environment 2 MSE: 53.13422480617896


np.float64(53.13422480617896)

In [28]:
# Per-environment explained variance of the adaXT maximin forest on the training data;
# the displayed value is the smallest.
min_xplvar(Ytr, mfitted_adaxt, Etr, verbose=True)

Environment 0 explained variance: 4.480946586806912
Environment 1 explained variance: 1.1483628115799505
Environment 2 explained variance: 18.740455864949865


np.float64(1.1483628115799505)

### Default

In [29]:
# Default baseline for this section: scikit-learn's RandomForestRegressor with the
# shared hyperparameters, fitted on (Xtr, Ytr) only - no environment labels.
# The adaXT counterpart, RandomForest("Regression", ...) fitted on (Xtr, Ytr),
# is not run in this notebook.
rf_skl = RandomForestRegressor(
    n_estimators=n_estimators,
    min_samples_leaf=min_samples_leaf,
    random_state=random_state,
)
rf_skl.fit(Xtr, Ytr)

# Test-set predictions first, then in-sample fitted values.
preds_skl = rf_skl.predict(Xts)
fitted_skl = rf_skl.predict(Xtr)

In [30]:
# Training-set MSE and R^2 of the scikit-learn random forest baseline.
mean_squared_error(Ytr, fitted_skl), r2_score(Ytr, fitted_skl)

(27.648368385616326, 0.4002297135927988)

In [31]:
# Test-set MSE and R^2 of the scikit-learn random forest baseline.
mean_squared_error(Yts, preds_skl), r2_score(Yts, preds_skl)

(27.166602536243637, -0.7124120913777883)

In [32]:
# Per-environment training MSE of the scikit-learn baseline; the displayed value is the largest.
max_mse(Ytr, fitted_skl, Etr, verbose=True)

Environment 0 MSE: 27.866480350270987
Environment 1 MSE: 28.192826525990945
Environment 2 MSE: 26.885798280587043


np.float64(28.192826525990945)

In [33]:
# Per-environment explained variance of the scikit-learn baseline on the training data;
# the displayed value is the smallest.
min_xplvar(Ytr, fitted_skl, Etr, verbose=True)

Environment 0 explained variance: 8.563838392253015
Environment 1 explained variance: 7.81153932607269
Environment 2 explained variance: 44.98888239054179


np.float64(7.81153932607269)

### Magging - Forest

In [34]:
# Forest-level magging again, this time with backend="sklearn" selected and
# without the parallel flag used in the earlier MaggingRF_PB cell.
rfm_skl = MaggingRF_PB(
    n_estimators=n_estimators,
    min_samples_leaf=min_samples_leaf,
    backend="sklearn",
    random_state=random_state,
)
wfitted_skl, wpreds_skl = rfm_skl.fit_predict_magging(Xtr, Ytr, Etr, Xts)

# Keep the weights so they can be inspected in a later cell.
wmag_skl = rfm_skl.get_weights()

In [35]:
# Training-set MSE and R^2 of the MaggingRF_PB (backend="sklearn") fitted values.
mean_squared_error(Ytr, wfitted_skl), r2_score(Ytr, wfitted_skl)

(37.71578581602704, 0.18183932789613177)

In [36]:
# Test-set MSE and R^2 of the MaggingRF_PB (backend="sklearn") predictions.
mean_squared_error(Yts, wpreds_skl), r2_score(Yts, wpreds_skl)

(23.58922740380694, -0.486916819232291)

In [37]:
# Per-environment training MSE of the MaggingRF_PB (backend="sklearn") fit;
# the displayed value is the largest.
max_mse(Ytr, wfitted_skl, Etr, verbose=True)

Environment 0 MSE: 25.58734194345175
Environment 1 MSE: 26.54033729148976
Environment 2 MSE: 61.01967821313962


np.float64(61.01967821313962)

In [38]:
# Weights returned by rfm_skl.get_weights(). There are three entries here, matching the three
# environments listed by max_mse - presumably one weight per environment (TODO confirm).
wmag_skl

array([0.47020345, 0.4742219 , 0.05557465])

In [39]:
# Per-environment explained variance of the MaggingRF_PB (backend="sklearn") fit on the
# training data; the displayed value is the smallest.
min_xplvar(Ytr, wfitted_skl, Etr, verbose=True)

Environment 0 explained variance: 10.84297679907225
Environment 1 explained variance: 9.464028560573876
Environment 2 explained variance: 10.855002457989208


np.float64(9.464028560573876)

### Magging - Trees

In [40]:
# NOTE(review): this cell repeats the earlier "Magging - Trees" cell verbatim
# (same constructor arguments, same variable names) and reproduces the same
# numbers. Confirm whether a different backend or implementation was meant
# for this section.
rfm2 = MaggingRF(
    n_estimators=n_estimators,
    min_samples_leaf=min_samples_leaf,
    random_state=random_state,
)
rfm2.fit(Xtr, Ytr)

# predict_maximin returns a pair; only its first element is kept (test set first, then train).
wpreds2, _ = rfm2.predict_maximin(Xtr, Xts)
wfitted2, _ = rfm2.predict_maximin(Xtr, Xtr)

In [41]:
# Training-set MSE and R^2 of the MaggingRF fitted values from the cell above.
mean_squared_error(Ytr, wfitted2), r2_score(Ytr, wfitted2)

(29.370510828841766, 0.36287163690626756)

In [42]:
# Test-set MSE and R^2 of the MaggingRF predictions from the cell above.
mean_squared_error(Yts, wpreds2), r2_score(Yts, wpreds2)

(25.597681149048274, -0.6135171356957061)

In [43]:
# Per-environment training MSE of the MaggingRF fit; the displayed value is the largest.
max_mse(Ytr, wfitted2, Etr, verbose=True)

Environment 0 MSE: 28.084572676792458
Environment 1 MSE: 25.95257406198103
Environment 2 MSE: 34.074385747751805


np.float64(34.074385747751805)

In [44]:
# Per-environment explained variance of the MaggingRF fit on the training data;
# the displayed value is the smallest.
min_xplvar(Ytr, wfitted2, Etr, verbose=True)

Environment 0 explained variance: 8.345746065731543
Environment 1 explained variance: 10.051791790082607
Environment 2 explained variance: 37.800294923377024


np.float64(8.345746065731543)

## Variable importance

In [45]:
# Variable importance of a scikit-learn random forest on data generated with
# train_setting=1.
#
# The `_s1` suffix keeps this cell from rebinding `dtr`, `dts`, `Xtr`, `Ytr` and
# `rf`, which the evaluation cells earlier in the notebook pair with `Etr`.
# The forest hyperparameters (n_estimators, min_samples_leaf, random_state) are
# the ones set once in the data-setup cell, so they are not re-declared here.
dtr_s1, _dts_s1 = gen_data_v3(n_train=1000, n_test=500, train_setting=1)
Xtr_s1 = np.array(dtr_s1.drop(columns=['E', 'Y']))
Ytr_s1 = np.array(dtr_s1['Y'])

rf_s1 = RandomForestRegressor(
    n_estimators=n_estimators,
    min_samples_leaf=min_samples_leaf,
    random_state=random_state,
)
rf_s1.fit(Xtr_s1, Ytr_s1)

# One importance value per feature column, displayed as the cell output.
rf_s1.feature_importances_

array([0.95341065, 0.04658935])

In [46]:
# Variable importance of a scikit-learn random forest on data generated with
# train_setting=2.
# NOTE(review): this cell rebinds dtr, dts, Xtr, Ytr and rf, so the earlier
# evaluation cells (which pair Xtr/Ytr with Etr) should not be re-run after it
# without first re-running the data-setup cell.
dtr, dts = gen_data_v3(
    n_train=1000,
    n_test=500,
    train_setting=2,
)
Xtr = np.array(dtr.drop(columns=['E', 'Y']))
Ytr = np.array(dtr['Y'])

rf = RandomForestRegressor(
    n_estimators=n_estimators,
    min_samples_leaf=min_samples_leaf,
    random_state=random_state,
)
rf.fit(Xtr, Ytr)

# One importance value per feature column, displayed as the cell output.
rf.feature_importances_

array([0.57477797, 0.42522203])