In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.datasets import make_regression
from regerssion_random_forest import MyForestReg



In [2]:
from sklearn.datasets import load_diabetes

# Load the scikit-learn diabetes dataset as pandas objects, then pull out
# the feature table (X) and the target (y).
data = load_diabetes(as_frame=True)
X = data['data']
y = data['target']
X.head()  # preview the first rows of the feature table

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641


In [3]:
# Hold out a test split of the diabetes data with a fixed seed.
# NOTE(review): test_size=42 is an int, which scikit-learn interprets as an
# absolute number of test rows rather than a fraction — confirm this is
# intended and not a mix-up with random_state=42.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=42, random_state=42)
X_train.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
284,0.041708,0.05068,-0.022373,0.028758,-0.066239,-0.045155,-0.061809,-0.002592,0.002861,-0.054925
402,0.110727,0.05068,-0.033151,-0.022885,-0.004321,0.020293,-0.061809,0.07121,0.015568,0.044485
199,0.041708,-0.044642,-0.045007,0.034508,0.043837,-0.015719,0.037595,-0.014401,0.089897,0.007207
82,-0.016412,-0.044642,-0.035307,-0.026328,0.03283,0.017162,0.100183,-0.039493,-0.070209,-0.079778
77,-0.096328,-0.044642,-0.036385,-0.074527,-0.03872,-0.027618,0.015505,-0.039493,-0.074093,-0.001078


In [4]:
# Build a forest with default hyperparameters. oob_score='mae' presumably
# selects MAE as the out-of-bag metric reported by forest.oob_score_ —
# TODO confirm against MyForestReg.
forest = MyForestReg(oob_score='mae')
# Print the string form to see the effective hyperparameters.
print(forest)

MyForestReg class: n_estimators=10, max_features=0.5, max_samples=0.5, max_depth=5, min_samples_split=2, max_leaves=20, bins=16, random_state=42


In [5]:
# Train the default forest on the diabetes training split.
forest.fit(X_train, y_train)

In [6]:
# Inspect the score available after fit(); presumably the out-of-bag MAE
# requested via oob_score='mae' — TODO confirm.
forest.oob_score_

51.29621743729016

In [7]:
# Per-column dict keyed by feature name; presumably feature importances
# (unnormalised, judging by the magnitudes) — TODO confirm.
forest.fi

{'age': 894.4846295232716,
 'sex': 169.05238613817625,
 'bmi': 2575.057799909323,
 'bp': 1049.5534933985914,
 's1': 726.5638964385912,
 's2': 632.3183158083866,
 's3': 1003.086206989147,
 's4': 1603.6421751831786,
 's5': 5714.1099597350685,
 's6': 827.3019006881902}

In [8]:
# Predict on the held-out diabetes rows.
preds = forest.predict(X_test)

In [9]:
# Hold-out (MAE, RMSE) for the default forest; RMSE is the square root of
# the mean squared error.
mean_absolute_error(y_test, preds), mean_squared_error(y_test, preds)**0.5

(48.202557952565996, 57.84248521764967)

In [10]:
# Second configuration, passed positionally. Per the repr shown in the next
# cell the arguments map to: n_estimators=20, max_features=0.7,
# max_samples=0.6, max_depth=7, min_samples_split=5, max_leaves=20.
forest_2 = MyForestReg(20, 0.7, 0.6, 7, 5, 20)

In [11]:
# Display the repr to verify the hyperparameters that were set.
forest_2

MyForestReg class: n_estimators=20, max_features=0.7, max_samples=0.6, max_depth=7, min_samples_split=5, max_leaves=20, bins=16, random_state=42

In [12]:
# Train the second forest on the same diabetes training split.
forest_2.fit(X_train, y_train)

In [13]:
# Predict on the held-out diabetes rows with the second forest.
preds_2 = forest_2.predict(X_test)

In [14]:
# Hold-out (MAE, MSE) for the second forest.
# NOTE(review): the second value is the raw MSE, whereas the equivalent cells
# for forest and forest_3 take **0.5 to report RMSE — the tuples are not
# directly comparable as written.
mean_absolute_error(y_test, preds_2), mean_squared_error(y_test, preds_2)

(49.79453542944822, 3588.9551976272933)

In [15]:
# Per-column dict for the second forest; presumably feature importances —
# TODO confirm.
forest_2.fi

{'age': 1427.703690553057,
 'sex': 368.7154698254293,
 'bmi': 11757.102140280029,
 'bp': 1886.0583016796395,
 's1': 609.372877370896,
 's2': 760.7389237227535,
 's3': 1545.5272416526504,
 's4': 3788.457516632846,
 's5': 8793.721360664651,
 's6': 1090.7648771329816}

In [16]:
# Synthetic regression problem: 1000 rows, 14 features of which 10 are
# informative, generated with a fixed seed. This rebinds X, y and X_test,
# replacing the diabetes data used in the cells above.
X, y = make_regression(n_samples=1000, n_features=14, n_informative=10, random_state=42)
X = pd.DataFrame(X).round(2)
y = pd.Series(y)
X.columns = [f'col_{col}' for col in X.columns]
# Seed the draw so the 20 evaluation rows (and every metric computed from
# them) are identical on each Restart & Run All.
# NOTE(review): these rows are sampled from X itself, and forest_3 is later
# fit on all of X, so metrics on X_test are in-sample, not a true hold-out.
X_test = X.sample(20, random_state=42)

In [17]:
# Select the targets whose index labels match the sampled evaluation rows.
y_test = y.loc[X_test.index]

In [18]:
# 100-estimator forest with bins=10. oob_score='mse' presumably selects MSE
# as the out-of-bag metric — TODO confirm against MyForestReg.
forest_3 = MyForestReg(100, bins=10, oob_score='mse')

In [19]:
# Fit on the full synthetic dataset (this includes the rows in X_test).
forest_3.fit(X, y)

In [20]:
# Per-column dict for the synthetic-data forest; presumably feature
# importances — TODO confirm.
forest_3.fi

{'col_0': 30292.018027828217,
 'col_1': 51017.203352403725,
 'col_2': 45817.44585325928,
 'col_3': 50015.807793286425,
 'col_4': 3477.948429881678,
 'col_5': 3155.9561614431673,
 'col_6': 4195.756241373561,
 'col_7': 4117.163656933041,
 'col_8': 114501.76420977483,
 'col_9': 4207.623013004395,
 'col_10': 6350.6031482687085,
 'col_11': 29577.95413425552,
 'col_12': 4877.807078827364,
 'col_13': 90398.50869910014}

In [21]:
# Score available after fit(); presumably the out-of-bag MSE requested via
# oob_score='mse' — TODO confirm.
forest_3.oob_score_

10785.142475558932

In [22]:
# Predict on the 20 sampled rows (a subset of the data forest_3 was fit on).
preds_3 = forest_3.predict(X_test)

In [23]:
# (MAE, RMSE) on the sampled rows. Because X_test was drawn from the data
# forest_3 was fit on, this is an in-sample score.
mean_absolute_error(y_test, preds_3), mean_squared_error(y_test, preds_3)**0.5

(70.58335283561686, 89.45913904983644)

In [25]:
%%timeit 
# Timed body: construct a fresh single-process forest and fit it on the
# full synthetic dataset.
forest_3 = MyForestReg(100, bins=10, oob_score='mse')
forest_3.fit(X, y)

1min 51s ± 1.81 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [26]:
from regression_multiprocess_random_forest import MultiProcessForestReg

In [27]:
# Same arguments as forest_3, but using the multiprocess implementation so
# the results and timings can be compared directly.
forest_4 = MultiProcessForestReg(100, bins=10, oob_score='mse')

In [28]:
# Display the repr to verify the hyperparameters.
# NOTE(review): the printed text still reads "MyForestReg class" even though
# this is a MultiProcessForestReg instance.
forest_4

MyForestReg class: n_estimators=100, max_features=0.5, max_samples=0.5, max_depth=5, min_samples_split=2, max_leaves=20, bins=10, random_state=42

In [29]:
# Fit the multiprocess forest on the same full synthetic dataset.
forest_4.fit(X, y)

In [30]:
# Per-column dict for the multiprocess forest; the recorded values match
# forest_3.fi exactly.
forest_4.fi

{'col_0': 30292.018027828217,
 'col_1': 51017.203352403725,
 'col_2': 45817.44585325928,
 'col_3': 50015.807793286425,
 'col_4': 3477.948429881678,
 'col_5': 3155.9561614431673,
 'col_6': 4195.756241373561,
 'col_7': 4117.163656933041,
 'col_8': 114501.76420977483,
 'col_9': 4207.623013004395,
 'col_10': 6350.6031482687085,
 'col_11': 29577.95413425552,
 'col_12': 4877.807078827364,
 'col_13': 90398.50869910014}

In [31]:
# Score after fit(); the recorded value matches forest_3.oob_score_ exactly.
forest_4.oob_score_

10785.142475558932

In [33]:
%%timeit 
# Timed body: construct a fresh multiprocess forest and fit it on the full
# synthetic dataset, for comparison with the single-process timing.
forest_4 = MultiProcessForestReg(100, bins=10, oob_score='mse')
forest_4.fit(X, y)

1min 13s ± 6.41 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
