In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from datetime import datetime
from collections import Counter

import sklearn.ensemble
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
#from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score

# Utils

## Load housing data

In [15]:
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

# The housing file is whitespace separated and has no header row.
# `sep=r'\s+'` is used instead of `delim_whitespace=True`, which pandas deprecated in 2.2.
housing_df = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/lazyprogrammer/data/housing.data', header=None, sep=r'\s+')
housing_df.columns = [
    'crim', # numerical
    'zn', # numerical
    'nonretail', # numerical
    'river', # binary
    'nox', # numerical
    'rooms', # numerical
    'age', # numerical
    'dis', # numerical
    'rad', # numerical
    'tax', # numerical
    'ptratio', # numerical
    'b', # numerical
    'lstat', # numerical
    'medv', # numerical -- this is the target
  ]

# Report the total number of missing cells (the per-column maximum would
# under-report when several columns contain NAs).
housing_na_count = int(housing_df.isna().sum().sum())
if housing_na_count == 0:
  print('There is no NA values')
else:
  print(f'There are {housing_na_count} NAs' )

# Feature columns that get standardised; the binary 'river' column and the
# 'medv' target are intentionally not listed here.
HOUSING_NUMERICAL_COLS = [
  'crim', # numerical
  'zn', # numerical
  'nonretail', # numerical
  'nox', # numerical
  'rooms', # numerical
  'age', # numerical
  'dis', # numerical
  'rad', # numerical
  'tax', # numerical
  'ptratio', # numerical
  'b', # numerical
  'lstat', # numerical
]

housing_df.head()

Mounted at /content/gdrive/
There is no NA values


Unnamed: 0,crim,zn,nonretail,river,nox,rooms,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [16]:
def get_housing_data(test_size=0.3):
  """Build train/test arrays from the module-level `housing_df`.

  All columns except the last are features and 'medv' is the target. Columns
  listed in HOUSING_NUMERICAL_COLS are standardised with a scaler fitted on
  the training part only; every other feature column is copied unchanged.

  Args:
    test_size: value passed to `train_test_split`; when it is not > 0 no
      split is made and the whole data set is treated as training data.

  Returns:
    (X_train, X_test, Y_train, Y_test) as numpy arrays; X_test and Y_test
    are None when no split was made.
  """
  class HousingDataTransformer:
    """Per-column transformer: StandardScaler for numerical columns, pass-through otherwise."""

    def __init__(self, numerical_columns):
      self.numerical_columns = numerical_columns

    def fit(self, X):
      """Fit one scaler per numerical column; non-numerical columns get a None placeholder."""
      self.features_dim = X.shape[1]
      self.transformers = []
      for position in range(self.features_dim):
        if X.columns[position] not in self.numerical_columns:
          self.transformers.append(None)
          continue
        column_scaler = StandardScaler()
        column_scaler.fit(X.iloc[:, position].values.reshape(-1, 1))
        self.transformers.append(column_scaler)

    def transform(self, X):
      """Return a (len(X), features_dim) float array with the fitted scalers applied."""
      output = np.zeros((len(X), self.features_dim))
      for position, column_scaler in enumerate(self.transformers):
        column = X.iloc[:, position]
        if X.columns[position] in self.numerical_columns:
          output[:, position] = column_scaler.transform(column.values.reshape(-1, 1)).flatten()
        else:
          output[:, position] = column
      return output

    def fit_transform(self, X):
      self.fit(X)
      return self.transform(X)

  df = housing_df.copy()
  features, target = df.iloc[:, :-1], df['medv']
  transformer = HousingDataTransformer(HOUSING_NUMERICAL_COLS)

  if test_size > 0.0:
    X_train, X_test, Y_train, Y_test = train_test_split(features, target, test_size=test_size, random_state=42)
    # Fit on the training rows first, then reuse the fitted scalers for the test rows.
    X_train_transformed = transformer.fit_transform(X_train)
    return X_train_transformed, transformer.transform(X_test), Y_train.values, Y_test.values

  return transformer.fit_transform(features), None, target.values, None

## Load mushroom data

In [17]:
mushroom_df = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/lazyprogrammer/data/agaricus-lepiota.data', header=None)

# Quick sanity check for missing values (largest per-column NA count).
if mushroom_df.isna().sum().max() != 0:
  print(f'There are {mushroom_df.isna().sum().max()} NAs' )
else:
  print('There is no NA values')

# Column 0 holds the label; columns 1..22 (inclusive) are categorical features.
MUSHROOM_NUMERICAL_COLS = ()
MUSHROOM_CATEGORICAL_COLS = np.arange(1, 23)

mushroom_df.head()

There is no NA values


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22
0,p,x,s,n,t,p,f,c,n,k,e,e,s,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,e,c,s,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,e,c,s,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,e,e,s,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,t,e,s,s,w,w,p,w,o,e,n,a,g


In [18]:
from future.utils import iteritems

class MushroomDataTransformer:
  """Turns a DataFrame into a dense feature matrix.

  Numerical columns are standardised (one StandardScaler each) and categorical
  columns are one-hot encoded (one LabelEncoder each). Every categorical column
  always reserves a slot for the special value 'missing'.
  """

  def __init__(self, numerical_columns, categorical_columns):
    self.numerical_columns = numerical_columns
    self.categorical_columns = categorical_columns

  def fit(self, df):
    """Fit all scalers/encoders on `df` and compute the output width `self.D`."""
    self.labelEncoders = {}
    self.scalers = {}
    for col in self.numerical_columns:
      scaler = StandardScaler()
      # A pandas Series has no reshape(); go through the ndarray, as transform() does.
      scaler.fit(df[col].values.reshape(-1, 1))
      self.scalers[col] = scaler

    for col in self.categorical_columns:
      encoder = LabelEncoder()
      # in case the train set does not have 'missing' value but test set does
      values = df[col].tolist()
      values.append('missing')
      encoder.fit(values)
      self.labelEncoders[col] = encoder

    # Output width: one column per numerical feature plus one per category.
    self.D = len(self.numerical_columns) + sum(
      len(encoder.classes_) for encoder in self.labelEncoders.values())
    print("dimensionality:", self.D)

  def transform(self, df):
    """Return an (N, self.D) array: scaled numerical columns first, then one-hot blocks."""
    N, _ = df.shape
    X = np.zeros((N, self.D))
    i = 0
    for col, scaler in self.scalers.items():
      X[:,i] = scaler.transform(df[col].values.reshape(-1, 1)).flatten()
      i += 1

    for col, encoder in self.labelEncoders.items():
      K = len(encoder.classes_)
      # `i` is the offset of this column's one-hot block inside X.
      X[np.arange(N), encoder.transform(df[col]) + i] = 1
      i += K
    return X

  def fit_transform(self, df):
    """Fit on `df` and return its transformed matrix."""
    self.fit(df)
    return self.transform(df)


In [19]:
def replace_missing(df, numerical_columns, categorical_columns, special_missing_category='missing'):
  """Fill missing values of `df` in place.

  Numerical columns get the median of their non-null values; categorical
  columns get `special_missing_category`. The name of every categorical
  column that contained nulls is printed. Nothing is returned.
  """
  for col in numerical_columns:
    null_mask = df[col].isnull()
    if null_mask.any():
      # median of the observed (non-null) values only
      df.loc[null_mask, col] = np.median(df[col][~null_mask])

  for col in categorical_columns:
    null_mask = df[col].isnull()
    if null_mask.any():
      print(col)
      df.loc[null_mask, col] = special_missing_category


def get_mushroom_data(test_size=0.3):
  """Build train/test arrays from the module-level `mushroom_df`.

  Column 0 is converted to the binary target (e = edible = 0, anything else,
  i.e. p = poisonous, = 1). The features are the one-hot encoded columns in
  MUSHROOM_CATEGORICAL_COLS, with the encoder fitted on the training part only.

  Args:
    test_size: value passed to `train_test_split`; when it is not > 0 no
      split is made and the whole data set is treated as training data.

  Returns:
    (X_train, X_test, Y_train, Y_test) as numpy arrays; X_test and Y_test
    are None when no split was made.
  """
  df = mushroom_df.copy()
  # replace label column: e/p --> 0/1, e = edible = 0, p = poisonous = 1
  # (vectorised; gives the same int64 column as the former row-wise apply)
  df[0] = (df[0] != 'e').astype('int64')

  replace_missing(df, MUSHROOM_NUMERICAL_COLS, MUSHROOM_CATEGORICAL_COLS)
  transformer = MushroomDataTransformer(MUSHROOM_NUMERICAL_COLS, MUSHROOM_CATEGORICAL_COLS)

  # X keeps the label column, but the transformer only reads the configured
  # numerical/categorical columns, so the label does not end up in the features.
  X = df
  Y = df[0]
  if test_size > 0.0:
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=42)
  else:
    X_train, X_test, Y_train, Y_test = X, None, Y, None

  X_train_transformed = transformer.fit_transform(X_train)
  if X_test is not None:
    return X_train_transformed, transformer.transform(X_test), Y_train.values, Y_test.values
  else:
    return X_train_transformed, None, Y_train.values, None

## Util functions

In [20]:
from sklearn.model_selection import KFold

def my_cross_val_score(estimator, X, Y, cv, shuffle=False, random_state=None):
  """K-fold cross validation that returns the estimator's own `score` per fold.

  Args:
    estimator: object with `fit(X, Y)` and `score(X, Y)`; it is re-fitted on
      every fold, so it is left fitted on the last training split.
    X, Y: arrays indexable with integer index arrays.
    cv: a KFold instance, or an int number of splits.
    shuffle, random_state: used only when `cv` is an int.

  Returns:
    numpy array with one score per fold.

  Raises:
    TypeError: if `cv` is neither an int nor a KFold.
  """
  if isinstance(cv, KFold):
    kf = cv
  elif isinstance(cv, int):
    # KFold rejects a random_state when shuffle is False, so only pass it
    # along when it can actually have an effect.
    kf = KFold(n_splits=cv, random_state=random_state if shuffle else None, shuffle=shuffle)
  else:
    raise TypeError(f'cv param can be int or KFold but was {type(cv)} and had value of {cv}')

  kf_scores = []
  for train_index, test_index in kf.split(X):
    estimator.fit(X[train_index], Y[train_index])
    # Score once per fold (the score used to be computed twice and discarded once).
    kf_scores.append(estimator.score(X[test_index], Y[test_index]))
  return np.array(kf_scores)

# Random Forest model class

## Decision tree class from Supervised Machine Learning course

In [21]:
def binary_entropy(y):
    """Entropy in bits of a binary (0/1) label array.

    Returns 0 when all labels are equal (including an empty array), which
    also avoids evaluating log2(0).
    """
    total = len(y)
    positives = (y == 1).sum()
    if positives in (0, total):
        return 0
    p1 = float(positives) / total
    p0 = 1 - p1
    return -p0 * np.log2(p0) - p1 * np.log2(p1)


class BinaryTreeNode:
    """A node of a binary decision tree that recursively grows its own subtree.

    The class is abstract: subclasses implement `_calc_prediction`,
    `_calc_node_cost` and `_make_child_node`. After `fit` a node is either a
    leaf (`prediction` set, no split attributes) or an inner node
    (`split_column_idx`, `split_value` and both children set, `prediction`
    left as None). Rows with `X[:, split_column_idx] < split_value` go to the
    left child, all other rows to the right child.
    """

    def __init__(self, depth, max_depth=None, max_bucket_size=None, trace_logs=True):
        """
        Args:
            depth: depth of this node.
            max_depth: nodes at this depth are never split; None = no limit.
            max_bucket_size: nodes holding this many targets or fewer are
                never split; None = no limit.
            trace_logs: print the chosen split of every node while fitting.

        Raises:
            Exception: if `depth` exceeds `max_depth`.
        """
        self.depth = depth
        self.max_depth = max_depth
        self.max_bucket_size = max_bucket_size
        self.trace_logs = trace_logs
        if self.max_depth is not None and self.max_depth < self.depth:
            raise Exception(f'depth > max_depth:{depth > max_depth}, depth={depth}, max_depth={max_depth}')
        self.split_column_idx = None
        self.split_value = None
        self.left_child = None
        self.right_child = None
        self.prediction = None
        self.information_gain = None

    def fit(self, X, Y):
        """Grow the subtree rooted at this node; does nothing if already fitted."""
        if self._is_fitted():
            return
        if not self._can_split(Y):
            self.prediction = self._calc_prediction(Y)
            return

        split_column, self.split_value, self.information_gain = self._find_best_split(X, Y)
        if self.trace_logs:
            print(f'fit (depth:{self.depth}) - (best_split_col, best_split_value, max_ig):{(split_column, self.split_value, self.information_gain)}')
        if split_column is None:
            # No candidate with positive information gain: this node becomes a leaf.
            self.prediction = self._calc_prediction(Y)
            return

        self.split_column_idx = int(split_column)
        left_split_mask = self._get_left_split_mask(X[:, self.split_column_idx], self.split_value)
        X_left, X_right = X[left_split_mask], X[~left_split_mask]
        Y_left, Y_right = Y[left_split_mask], Y[~left_split_mask]
        self.left_child = self._make_child_node(self.depth + 1, self.max_depth, self.max_bucket_size,
                                                trace_logs=self.trace_logs)
        self.left_child.fit(X_left, Y_left)
        self.right_child = self._make_child_node(self.depth + 1, self.max_depth, self.max_bucket_size,
                                                 trace_logs=self.trace_logs)
        self.right_child.fit(X_right, Y_right)

    def predict(self, X):
        """Return a float array with one prediction per row of X."""
        if self._is_leaf():
            # Always return an array, so that a tree consisting of a single
            # leaf behaves like any other tree for callers that index the result.
            return np.full(len(X), self.prediction, dtype=float)

        result = np.zeros(len(X))
        left_split_mask = self._get_left_split_mask(X[:, self.split_column_idx], self.split_value)
        result[left_split_mask] = self.left_child.predict(X[left_split_mask])
        result[~left_split_mask] = self.right_child.predict(X[~left_split_mask])
        return result

    def get_importance(self):
        """Return rows of (column index, summed information gain) for this subtree.

        A leaf contributes the placeholder row (0, 0).

        Raises:
            Exception: if the node has not been fitted.
        """
        if not self._is_fitted():
            raise Exception(f'Node on level {self.depth} is not fitted yet')
        if self._is_leaf():  # no split no gain
            return np.array([(0, 0)])
        left_importance = self.left_child.get_importance()
        right_importance = self.right_child.get_importance()
        return self._calc_node_level_total_importance(left_importance, right_importance)

    def _make_child_node(self, depth, max_depth=None, max_bucket_size=None, trace_logs=True):
        """Factory hook: return a new node of the concrete subclass."""
        raise NotImplementedError()

    # Former (misspelled) name of the abstract hook above, kept so that
    # existing references to it keep working.
    self_make_child_node = _make_child_node

    def _calc_prediction(self, y):
        """Hook: value predicted by a leaf that holds targets `y`."""
        raise NotImplementedError()

    def _calc_node_cost(self, y):
        """Hook: impurity of the targets `y`."""
        raise NotImplementedError()

    def _calc_information_gain(self, y, split_mask):
        """Cost of `y` minus the size-weighted mean cost of the two parts; 0 if a part is empty."""
        y0 = y[split_mask]
        y1 = y[~split_mask]
        N_0 = len(y0)
        N_1 = len(y1)
        N = N_0 + N_1
        if N_0 == 0 or N_1 == 0:
            return 0
        return self._calc_node_cost(y) - (N_0 * self._calc_node_cost(y0) + N_1 * self._calc_node_cost(y1)) / N

    def _calc_node_level_total_importance(self, left_child_importance, right_child_importance):
        """Merge both children's importances with this node's own split, summing per column."""
        importances = np.concatenate(
            (left_child_importance, right_child_importance, np.array([[self.split_column_idx, self.information_gain]])))
        importances_df = pd.DataFrame(importances, columns=['col_idx', 'information_gain'])
        return importances_df.groupby('col_idx', as_index=False).sum().values

    def _is_fitted(self):
        """True for a leaf or a fully split node, False for a fresh node.

        Raises:
            Exception: if the prediction and split attributes contradict each other.
        """
        split_parts = (self.split_column_idx, self.split_value, self.left_child, self.right_child)
        has_no_split_details = all(part is None for part in split_parts)
        has_split_details = all(part is not None for part in split_parts)
        if self.prediction is None and has_no_split_details:  # neither a leaf nor split
            return False
        if self.prediction is not None and has_no_split_details:  # is a leaf
            return True
        if self.prediction is None and has_split_details:  # is split
            return True
        raise Exception('There are conflicting values in self.prediction and other attributes related to node split')

    def _is_leaf(self):
        """True when the node is fitted and holds a prediction."""
        return self._is_fitted() and self.prediction is not None

    def _can_split(self, y):
        """Decide whether this node may be split.

        A split is allowed only when all of the following hold:
        1. the labels vary (more than one target, more than one distinct value),
        2. depth is below max_depth (when max_depth is set),
        3. there are more targets than max_bucket_size (when it is set).
        """
        if len(y) == 1 or len(set(y)) == 1:
            return False
        allowed = True
        if self.max_depth is not None:
            allowed = allowed and self.depth < self.max_depth
        if self.max_bucket_size is not None:
            allowed = allowed and len(y) > self.max_bucket_size
        return allowed

    def _find_best_split(self, x, y):
        """Return [column, split value, information gain] of the best split, or (None, None, None)."""
        splits = self._get_split_candidates(x, y)
        if len(splits) == 0:
            return (None, None, None)
        return splits[np.argmax(splits[:, 2])]

    def _get_split_candidates(self, x, y):
        """List every split with positive information gain as rows [column, split value, gain].

        For each non-constant column the rows are sorted by the column value;
        candidate thresholds are the midpoints between neighbouring sorted
        values whose targets differ.
        """
        splits = []
        for i in range(x.shape[1]):
            x_col = x[:, i]
            if len(set(x_col)) == 1:
                # a constant column cannot separate anything
                continue
            sort_idx = np.argsort(x_col)
            x_col_sorted = x_col[sort_idx]
            y_sorted = y[sort_idx]
            for s_idx in self._get_steps(y_sorted):
                split_point = (x_col_sorted[s_idx] + x_col_sorted[s_idx + 1]) / 2.0
                left_split_mask = self._get_left_split_mask(x_col, split_point)
                ig = self._calc_information_gain(y, left_split_mask)
                if ig > 0.0:
                    splits.append([i, split_point, ig])
        return np.array(splits)

    def _get_left_split_mask(self, x, split_by_value):
        """Boolean mask of the values that go to the left child (strictly below the threshold)."""
        return x < split_by_value

    def _get_steps(self, y):
        """Indices i where y[i] differs from y[i + 1]."""
        return np.nonzero(y[:-1] != y[1:])[0]


class BinaryTreeClassifierNode(BinaryTreeNode):
    """Tree node for binary (0/1) classification; impurity is the binary entropy."""

    def __init__(self, depth, max_depth=None, max_bucket_size=None, trace_logs=True):
        super().__init__(depth, max_depth, max_bucket_size, trace_logs)
        self.prediction_features = None

    def _calc_prediction(self, y):
        """Leaf label: the single label of a pure node, otherwise the rounded mean label."""
        is_pure = len(y) == 1 or len(set(y)) == 1
        return y[0] if is_pure else int(np.round(y.mean()))

    def _calc_node_cost(self, y):
        """Impurity of `y` measured by `binary_entropy`."""
        return binary_entropy(y)

    def _make_child_node(self, depth, max_depth=None, max_bucket_size=None, trace_logs=True):
        """Create a child node of the same class."""
        return BinaryTreeClassifierNode(depth, max_depth=max_depth, max_bucket_size=max_bucket_size,
                                        trace_logs=trace_logs)


class BinaryTreeRegressorNode(BinaryTreeNode):
    """Tree node for regression; impurity is the variance of the targets."""

    def __init__(self, depth, max_depth=None, max_bucket_size=None, trace_logs=True):
        super().__init__(depth, max_depth, max_bucket_size, trace_logs)
        self.prediction_features = None

    def _calc_prediction(self, y):
        """Leaf value: the common target of a pure node, otherwise the mean target.

        Raises:
            Exception: if the computed value is NaN (e.g. `y` is empty).
        """
        if len(y) == 1 or len(set(y)) == 1:
            # all targets are identical: use the value itself, not a recomputed mean
            result = y[0]
        else:
            result = y.mean()
        # Debug output only when tracing is enabled; it used to be printed for every leaf.
        if self.trace_logs:
            print (f'_calc_prediction - Calculated leaf prediction based on {len(y)} targets (y:{y}). Prediction={result}')
        if np.isnan(result):
            raise Exception(f'_calc_prediction calculated nan for the leaf, result={result}, len(y)={len(y)}')
        return result

    def _calc_node_cost(self, y):
        """Impurity of `y`: its variance."""
        return np.var(y)

    def _make_child_node(self, depth, max_depth=None, max_bucket_size=None, trace_logs=True):
        """Create a child node of the same class."""
        return BinaryTreeRegressorNode(depth, max_depth, max_bucket_size, trace_logs)


class BinaryTreeBase():
    """Common wrapper around a tree's root node, which subclasses store in `self.head` during `fit`."""

    def __init__(self, max_depth=10, max_bucket_size=10, trace_logs=True):
        self.max_depth, self.max_bucket_size = max_depth, max_bucket_size
        self.trace_logs = trace_logs

    def predict(self, X):
        """Delegate prediction to the root node."""
        root = self.head
        return root.predict(X)

    def score(self, X, Y):
        """Fraction of predictions that are exactly equal to Y."""
        matches = self.predict(X) == Y
        return np.mean(matches)

    def get_importance(self):
        """Delegate feature importance to the root node."""
        root = self.head
        return root.get_importance()


class BinaryTreeClassifier(BinaryTreeBase):
    """Decision tree for binary classification built from BinaryTreeClassifierNode."""

    def __init__(self, max_depth=10, max_bucket_size=10, trace_logs=True):
        super().__init__(max_depth=max_depth, max_bucket_size=max_bucket_size, trace_logs=trace_logs)

    def fit(self, X, Y):
        """Create a fresh root node (depth 1) and grow the tree on X, Y."""
        root = BinaryTreeClassifierNode(1, self.max_depth, self.max_bucket_size, trace_logs=self.trace_logs)
        self.head = root
        root.fit(X, Y)


class BinaryTreeRegressor(BinaryTreeBase):
    """Decision tree for regression built from BinaryTreeRegressorNode."""

    def __init__(self, max_depth=10, max_bucket_size=10, trace_logs=True):
        super().__init__(max_depth, max_bucket_size, trace_logs)

    def fit(self, X, Y):
        """Create a fresh root node (depth 1) and grow the tree on X, Y."""
        self.head = BinaryTreeRegressorNode(1, self.max_depth, self.max_bucket_size, trace_logs=self.trace_logs)
        self.head.fit(X, Y)

    def score(self, X, Y):
        """R^2 of the predictions for X against the true targets Y.

        Overrides the inherited score, which counts exact equality and is
        therefore not meaningful for continuous targets.
        """
        return r2_score(Y, self.predict(X))


In [22]:

class BaggingBase():
  """Bootstrap aggregation: fits `n_models` models, each on a sample drawn with replacement."""

  def __init__(self, model_factory_fun, n_models, sample_size=None):
    """
    Args:
      model_factory_fun: zero-argument callable returning a new, unfitted model.
      n_models: number of models to fit.
      sample_size: rows per bootstrap sample; None = as many rows as Y has.
    """
    self.model_factory_fun = model_factory_fun
    self.n_models = n_models
    self.sample_size = sample_size

  def fit(self, X, Y):
    """Fit every model on its own bootstrap sample of (X, Y); progress is printed per model."""
    n_rows = len(Y)
    draw_size = n_rows if self.sample_size is None else self.sample_size
    self.models = []
    for model_no in range(1, self.n_models + 1):
      print(f'Fitting {model_no}-th model out of {self.n_models}')
      picked = np.random.choice(n_rows, size=draw_size, replace=True)
      X_boot, Y_boot = X[picked], Y[picked]

      member = self.model_factory_fun()
      member.fit(X_boot, Y_boot)
      self.models.append(member)

  def _predictions(self, X):
    """Stack the predictions of all fitted models, one row per model."""
    return np.array([member.predict(X) for member in self.models])


class BaggingClassifier(BaggingBase):
  """Bagging ensemble that predicts by majority vote of its models."""

  def __init__(self, model_factory_fun, n_models, sample_size=None):
    super().__init__(model_factory_fun, n_models, sample_size)

  def predict(self, X):
    """Return, for every row of X, the label predicted by most models.

    Raises:
      Exception: if a single model's prediction is not one-dimensional.
    """
    votes = super()._predictions(X)
    if len(votes[0].shape) > 1:
      raise Exception(
        f'Only non sparse prediction output is supported. Shape of single model prediction is:{votes[0].shape}')

    winners = np.zeros(len(X))
    for sample_no in range(len(X)):
      # column `sample_no` holds every model's vote for that sample
      winners[sample_no] = self._most_frequent(votes[:, sample_no])
    return winners

  def score(self, X, Y):
    """Accuracy: fraction of predictions equal to Y."""
    return (self.predict(X) == Y).mean()

  def _most_frequent(self, array1d):
    """Most common value of a 1-d array."""
    (value, _count), = Counter(array1d).most_common(1)
    return value


class BaggingRegressor(BaggingBase):
  """Bagging ensemble that predicts the mean of its models' predictions."""

  def __init__(self, model_factory_fun, n_models, sample_size=None):
    super().__init__(model_factory_fun, n_models, sample_size)

  def predict(self, X):
    """Average the predictions of all models for every row of X."""
    predictions = super()._predictions(X)
    return predictions.mean(axis=0)

  def score(self, X, Y):
    """R^2 of the ensemble predictions for X against the true targets Y."""
    Y_hat = self.predict(X)
    # r2_score expects (y_true, y_pred); R^2 is not symmetric, so the order matters.
    return r2_score(Y, Y_hat)

In [23]:
class RandomForestBinaryTreeNode(BinaryTreeNode):
  """Tree node that searches for its split in a random subset of `n_features` columns."""

  def __init__(self, depth, max_depth=None, max_bucket_size=None, n_features=None, trace_logs=True):
    self.n_features = n_features
    super().__init__(depth, max_depth, max_bucket_size, trace_logs)
    self.split_features_subset = None

  def _find_best_split(self, x, y):
    """Best split among randomly chosen columns (all columns when n_features is None).

    Returns [column index in x, split value, information gain], or
    (None, None, None) when the chosen columns offer no split.

    Raises:
      Exception: if n_features exceeds the number of columns of x.
    """
    if self.n_features is None:
      return super()._find_best_split(x, y)

    total_features = x.shape[1]
    if self.n_features > total_features:
      raise Exception(f'n_features={self.n_features} can not be bigger than number of features in X={total_features}')

    self.split_features_subset = np.sort(np.random.choice(total_features, size=self.n_features, replace=False))
    local_col, split_point, gain = super()._find_best_split(x[:, self.split_features_subset], y)
    if local_col is None:
      return (None, None, None)
    # Translate the column position inside the subset back to the position in x.
    return [self.split_features_subset[int(local_col)], split_point, gain]


class RandomForestTreeClassifierNode(RandomForestBinaryTreeNode):
  """Random-forest tree node for binary (0/1) classification."""

  def __init__(self, depth, max_depth=None, max_bucket_size=None, n_features=None, trace_logs=True):
    super().__init__(depth, max_depth, max_bucket_size, n_features, trace_logs)

  def _calc_prediction(self, y):
    """Leaf label: the mean label rounded to an int.

    Raises:
      ValueError: if the mean is NaN (e.g. `y` is empty).
    """
    mean_label = y.mean()
    # Check for NaN before converting: int(nan) would raise first and hide this message.
    if np.isnan(mean_label):
      raise ValueError(f'_calc_prediction calculated nan for the leaf, result={mean_label}, len(y)={len(y)}')
    return int(np.round(mean_label))

  def _calc_node_cost(self, y):
    """Impurity of `y` measured by `binary_entropy`."""
    return binary_entropy(y)

  def _make_child_node(self, depth, max_depth=None, max_bucket_size=None, trace_logs=True):
    """Create a child node of the same class that inherits this node's n_features."""
    return RandomForestTreeClassifierNode(depth, max_depth, max_bucket_size, self.n_features, trace_logs)


class RandomForestTreeRegressorNode(RandomForestBinaryTreeNode):
  """Random-forest tree node for regression; impurity is the variance of the targets."""

  def __init__(self, depth, max_depth=None, max_bucket_size=None, n_features=None, trace_logs=True):
    # n_features is stored by the parent constructor.
    super().__init__(depth, max_depth, max_bucket_size, n_features, trace_logs)

  def _calc_prediction(self, y):
    """Leaf value: the common target of a pure node, otherwise the mean target.

    Raises:
      Exception: if the computed value is NaN (e.g. `y` is empty).
    """
    if len(y) == 1 or len(set(y)) == 1:
      # all targets are identical: use the value itself, not a recomputed mean
      result = y[0]
    else:
      result = y.mean()
    if np.isnan(result):
      raise Exception(f'_calc_prediction calculated nan for the leaf, result={result}, len(y)={len(y)}')
    return result

  def _calc_node_cost(self, y):
    """Variance of `y`; 0 when there are fewer than two targets."""
    if y is None or len(y) <= 1:
      return 0
    return np.var(y)

  def _make_child_node(self, depth, max_depth=None, max_bucket_size=None, trace_logs=True):
    """Create a child node of the same class that inherits this node's n_features."""
    return RandomForestTreeRegressorNode(depth, max_depth, max_bucket_size, self.n_features, trace_logs)


class BinaryTreeForRandomForestClassifier(BinaryTreeBase):
  """Classification tree whose nodes consider `n_features` random columns per split."""

  def __init__(self, max_depth=10, max_bucket_size=10, n_features=None, trace_logs=True):
    self.n_features = n_features
    super().__init__(max_depth=max_depth, max_bucket_size=max_bucket_size, trace_logs=trace_logs)

  def fit(self, X, Y):
    """Create a fresh root node (depth 1) and grow the tree on X, Y."""
    root = RandomForestTreeClassifierNode(1, self.max_depth, self.max_bucket_size, self.n_features,
                                          trace_logs=self.trace_logs)
    self.head = root
    root.fit(X, Y)


class BinaryTreeForRandomForestRegressor(BinaryTreeBase):
  """Regression tree whose nodes consider `n_features` random columns per split."""

  def __init__(self, max_depth=10, max_bucket_size=10, n_features=None, trace_logs=True):
    self.n_features = n_features
    super().__init__(max_depth, max_bucket_size, trace_logs)

  def fit(self, X, Y):
    """Create a fresh root node (depth 1) and grow the tree on X, Y."""
    self.head = RandomForestTreeRegressorNode(1, self.max_depth, self.max_bucket_size, self.n_features, trace_logs=self.trace_logs)
    self.head.fit(X, Y)

  def score(self, X, Y):
    """R^2 of the predictions for X against the true targets Y.

    Overrides the inherited score, which counts exact equality and is
    therefore not meaningful for continuous targets.
    """
    return r2_score(Y, self.predict(X))


class RandomForestClassifier():
  """Random forest for binary classification: bagged trees with random feature subsets."""

  def __init__(self, n_models, sample_size=None, n_features=None, max_depth=None, max_bucket_size=None, trace_logs=True):
    """
    Args:
      n_models: number of trees.
      sample_size: rows per bootstrap sample (forwarded to BaggingClassifier).
      n_features: columns considered per split (forwarded to every tree).
      max_depth, max_bucket_size, trace_logs: forwarded to every tree.
    """
    self.n_models, self.sample_size = n_models, sample_size
    self.n_features = n_features
    self.max_depth, self.max_bucket_size = max_depth, max_bucket_size
    self.trace_logs = trace_logs

  def fit(self, X, Y):
    """Build a new bagged ensemble of trees and fit it on X, Y."""
    def make_tree():
      return BinaryTreeForRandomForestClassifier(self.max_depth, self.max_bucket_size, self.n_features, self.trace_logs)

    self.bagged_tree = BaggingClassifier(make_tree, self.n_models, self.sample_size)
    self.bagged_tree.fit(X, Y)

  def predict(self, X):
    """Majority-vote predictions of the ensemble."""
    return self.bagged_tree.predict(X)

  def score(self, X, Y):
    """Score computed by the underlying BaggingClassifier."""
    return self.bagged_tree.score(X, Y)


class RandomForestRegressor():
  """Random forest for regression: bagged trees with random feature subsets."""

  def __init__(self, n_models, sample_size=None, n_features=None, max_depth=None, max_bucket_size=None, trace_logs=True):
    """
    Args:
      n_models: number of trees.
      sample_size: rows per bootstrap sample (forwarded to BaggingRegressor).
      n_features: columns considered per split (forwarded to every tree).
      max_depth, max_bucket_size, trace_logs: forwarded to every tree.
    """
    self.n_models, self.sample_size = n_models, sample_size
    self.n_features = n_features
    self.max_depth, self.max_bucket_size = max_depth, max_bucket_size
    self.trace_logs = trace_logs

  def fit(self, X, Y):
    """Build a new bagged ensemble of trees and fit it on X, Y."""
    def make_tree():
      return BinaryTreeForRandomForestRegressor(self.max_depth, self.max_bucket_size, self.n_features, self.trace_logs)

    self.bagged_tree = BaggingRegressor(make_tree, self.n_models, self.sample_size)
    self.bagged_tree.fit(X, Y)

  def predict(self, X):
    """Averaged predictions of the ensemble."""
    return self.bagged_tree.predict(X)

  def score(self, X, Y):
    """Score computed by the underlying BaggingRegressor."""
    return self.bagged_tree.score(X, Y)

# Test Random Forest model

## Simple sanity test

In [24]:
def _test():
  """Smoke-test RandomForestRegressor on a small hand-made dataset.

  Fits a 100-model forest (n_features=1, tracing off) on 32 rows of three
  integer features with binary targets, then prints the score obtained on
  the same rows shifted by 0.5.
  """
  # 32 samples x 3 integer features; includes a few extreme outliers.
  features = np.array([
    [1, 4, -999],
    [1, 2, 8],
    [19, 2, 7],
    [16, 3, 1],
    [34, 12, 19],
    [2, 1111, 55],
    [3, 55, 34],
    [9, 9, 27788],
    [657456456456, 16, 77],
    [1, 2, 8],
    [19, 2, 1111],
    [16, 3, 1],
    [34, 12, 19],
    [2, 12, 55],
    [3, 55, 34],
    [9, 9, 5645],
    [1, 16, -1239],
    [1, 2, 8],
    [19, 2, 8],
    [16, 3, 1],
    [34, 12, 19],
    [2, 1111, 55],
    [3, 55, 34],
    [9, 9, 27788],
    [1, 16, -1239],
    [1, 2, 8],
    [19, 2, 1111],
    [16, 3, 1],
    [34, 12, 19],
    [2, 1111, 55],
    [3, 55, 34],
    [9, 9, 27788],
  ])
  # Binary targets: the same 8-value pattern repeated four times (32 values).
  targets = np.array([0, 1, 1, 1, 0, 1, 0, 1] * 4)

  forest = RandomForestRegressor(n_models=100, n_features=1, trace_logs=False)
  forest.fit(features, targets)

  # Score on the training rows shifted by 0.5 rather than on the exact rows.
  shifted_features = features + .5
  print("test my score forest:", forest.score(shifted_features, targets))

_test()

Fitting 1-th model out of 100
Fitting 2-th model out of 100
Fitting 3-th model out of 100
Fitting 4-th model out of 100
Fitting 5-th model out of 100
Fitting 6-th model out of 100
Fitting 7-th model out of 100
Fitting 8-th model out of 100
Fitting 9-th model out of 100
Fitting 10-th model out of 100
Fitting 11-th model out of 100
Fitting 12-th model out of 100
Fitting 13-th model out of 100
Fitting 14-th model out of 100
Fitting 15-th model out of 100
Fitting 16-th model out of 100
Fitting 17-th model out of 100
Fitting 18-th model out of 100
Fitting 19-th model out of 100
Fitting 20-th model out of 100
Fitting 21-th model out of 100
Fitting 22-th model out of 100
Fitting 23-th model out of 100
Fitting 24-th model out of 100
Fitting 25-th model out of 100
Fitting 26-th model out of 100
Fitting 27-th model out of 100
Fitting 28-th model out of 100
Fitting 29-th model out of 100
Fitting 30-th model out of 100
Fitting 31-th model out of 100
Fitting 32-th model out of 100
Fitting 33-th mod

## Use Random Forest Regressor on the housing data

In [25]:
# Compare the hand-rolled random forest regressor against a linear baseline,
# a single sklearn decision tree and sklearn's random forest on the housing data.
ESTIMATORS = 100
X_train, X_test, Y_train, Y_test = get_housing_data(test_size=0.3)

baseline = LinearRegression()
single_tree = DecisionTreeRegressor()
sklearn_rf = sklearn.ensemble.RandomForestRegressor(n_estimators=ESTIMATORS)
my_rf_regressor = RandomForestRegressor(n_models=ESTIMATORS, n_features=5, trace_logs=False)

# Cross-validate BEFORE the final fit. If my_cross_val_score refits the
# estimator it is given in place (NOTE(review): confirm in its definition),
# running it after the final fit would leave each model trained on a CV fold
# only, and the train/test scores below would not describe the full-train fit.
single_tree_scores = my_cross_val_score(single_tree, X_train, Y_train, cv=5, shuffle=True, random_state=123)
baseline_scores = my_cross_val_score(baseline, X_train, Y_train, cv=5, shuffle=True, random_state=123)
sklearn_rf_scores = my_cross_val_score(sklearn_rf, X_train, Y_train, cv=5, shuffle=True, random_state=123)
my_rf_regressor_scores = my_cross_val_score(my_rf_regressor, X_train, Y_train, cv=5, shuffle=True, random_state=123)

# Final fit on the whole training split; these fits are the ones scored below.
single_tree.fit(X_train, Y_train)
baseline.fit(X_train, Y_train)
sklearn_rf.fit(X_train, Y_train)
my_rf_regressor.fit(X_train, Y_train)

print("test score single tree:", single_tree.score(X_test, Y_test))
print("test score baseline:", baseline.score(X_test, Y_test))
print("test sklearn score forest:", sklearn_rf.score(X_test, Y_test))
print("test my score forest:", my_rf_regressor.score(X_test, Y_test))


print("train score single tree:", single_tree.score(X_train, Y_train))
print("train score baseline:", baseline.score(X_train, Y_train))
print("train sklearn score forest:", sklearn_rf.score(X_train, Y_train))
print("train my score forest:", my_rf_regressor.score(X_train, Y_train))

print("CV single tree:", single_tree_scores.mean())
print("CV baseline:", baseline_scores.mean())
print("CV sklearn forest:", sklearn_rf_scores.mean())
# This line reports the hand-rolled forest, so it must not reuse the sklearn label.
print("CV my forest:", my_rf_regressor_scores.mean())

Fitting 1-th model out of 100
Fitting 2-th model out of 100
Fitting 3-th model out of 100
Fitting 4-th model out of 100
Fitting 5-th model out of 100
Fitting 6-th model out of 100
Fitting 7-th model out of 100
Fitting 8-th model out of 100
Fitting 9-th model out of 100
Fitting 10-th model out of 100
Fitting 11-th model out of 100
Fitting 12-th model out of 100
Fitting 13-th model out of 100
Fitting 14-th model out of 100
Fitting 15-th model out of 100
Fitting 16-th model out of 100
Fitting 17-th model out of 100
Fitting 18-th model out of 100
Fitting 19-th model out of 100
Fitting 20-th model out of 100
Fitting 21-th model out of 100
Fitting 22-th model out of 100
Fitting 23-th model out of 100
Fitting 24-th model out of 100
Fitting 25-th model out of 100
Fitting 26-th model out of 100
Fitting 27-th model out of 100
Fitting 28-th model out of 100
Fitting 29-th model out of 100
Fitting 30-th model out of 100
Fitting 31-th model out of 100
Fitting 32-th model out of 100
Fitting 33-th mod

## Use Random Forest Classifier on the mushroom data

In [26]:
# Compare the hand-rolled random forest classifier against logistic regression,
# a single sklearn decision tree and sklearn's random forest on the mushroom data.
ESTIMATORS = 10
X_train, X_test, Y_train, Y_test = get_mushroom_data(test_size=0.3)

baseline = LogisticRegression()
single_tree = DecisionTreeClassifier()
sklearn_rf = sklearn.ensemble.RandomForestClassifier(n_estimators=ESTIMATORS)
# This is a classification benchmark, so use the classifier variant of the
# hand-rolled forest; the regressor variant would not be scored on the same
# footing as the sklearn classifiers it is compared with.
# n_features is set to floor(sqrt(number of input columns)).
my_rf_classifier = RandomForestClassifier(n_models=ESTIMATORS, n_features=int(np.sqrt(X_train.shape[1])), trace_logs=False)

# Cross-validate BEFORE the final fit. If my_cross_val_score refits the
# estimator it is given in place (NOTE(review): confirm in its definition),
# running it after the final fit would leave each model trained on a CV fold
# only, and the train/test scores below would not describe the full-train fit.
single_tree_scores = my_cross_val_score(single_tree, X_train, Y_train, cv=5, shuffle=True, random_state=123)
baseline_scores = my_cross_val_score(baseline, X_train, Y_train, cv=5, shuffle=True, random_state=123)
sklearn_rf_scores = my_cross_val_score(sklearn_rf, X_train, Y_train, cv=5, shuffle=True, random_state=123)
my_rf_classifier_scores = my_cross_val_score(my_rf_classifier, X_train, Y_train, cv=5, shuffle=True, random_state=123)

# Final fit on the whole training split; these fits are the ones scored below.
single_tree.fit(X_train, Y_train)
baseline.fit(X_train, Y_train)
sklearn_rf.fit(X_train, Y_train)
my_rf_classifier.fit(X_train, Y_train)

print("test score single tree:", single_tree.score(X_test, Y_test))
print("test score baseline:", baseline.score(X_test, Y_test))
print("test sklearn score forest:", sklearn_rf.score(X_test, Y_test))
print("test my score forest:", my_rf_classifier.score(X_test, Y_test))


print("train score single tree:", single_tree.score(X_train, Y_train))
print("train score baseline:", baseline.score(X_train, Y_train))
print("train sklearn score forest:", sklearn_rf.score(X_train, Y_train))
print("train my score forest:", my_rf_classifier.score(X_train, Y_train))

print("CV single tree:", single_tree_scores.mean())
print("CV baseline:", baseline_scores.mean())
print("CV sklearn forest:", sklearn_rf_scores.mean())
# This line reports the hand-rolled forest, so it must not reuse the sklearn label.
print("CV my forest:", my_rf_classifier_scores.mean())

dimensionality: 139
Fitting 1-th model out of 10
Fitting 2-th model out of 10
Fitting 3-th model out of 10
Fitting 4-th model out of 10
Fitting 5-th model out of 10
Fitting 6-th model out of 10
Fitting 7-th model out of 10
Fitting 8-th model out of 10
Fitting 9-th model out of 10
Fitting 10-th model out of 10
Fitting 1-th model out of 10
Fitting 2-th model out of 10
Fitting 3-th model out of 10
Fitting 4-th model out of 10
Fitting 5-th model out of 10
Fitting 6-th model out of 10
Fitting 7-th model out of 10
Fitting 8-th model out of 10
Fitting 9-th model out of 10
Fitting 10-th model out of 10
Fitting 1-th model out of 10
Fitting 2-th model out of 10
Fitting 3-th model out of 10
Fitting 4-th model out of 10
Fitting 5-th model out of 10
Fitting 6-th model out of 10
Fitting 7-th model out of 10
Fitting 8-th model out of 10
Fitting 9-th model out of 10
Fitting 10-th model out of 10
Fitting 1-th model out of 10
Fitting 2-th model out of 10
Fitting 3-th model out of 10
Fitting 4-th model o