In [1]:
import numpy as np
import pandas as pd
from math import sqrt
from sklearn import metrics
from sklearn.metrics import mean_squared_error

# Load a Kaggle dataset as an example

Data: [House Prices: Advanced Regression Techniques](https://www.kaggle.com/c/house-prices-advanced-regression-techniques)

In [2]:
import requests

# Raw CSV locations of the train/test files used in this notebook.
url1 = "https://raw.githubusercontent.com/cockles98/MachineLearning_from_Scratch/master/data/house_prices/train.csv"
url2 = "https://raw.githubusercontent.com/cockles98/MachineLearning_from_Scratch/master/data/house_prices/test.csv"

# The timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() stops an HTTP error page from being written to disk
# as if it were a CSV file.
response1 = requests.get(url1, timeout=30)
response1.raise_for_status()
response2 = requests.get(url2, timeout=30)
response2.raise_for_status()

# Save the raw bytes next to the notebook so later cells can read them.
with open("house_prices_train.csv", "wb") as file:
    file.write(response1.content)

with open("house_prices_test.csv", "wb") as file:
    file.write(response2.content)

In [3]:
# Read the two CSV files saved by the download cell into DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('house_prices_train.csv', 'house_prices_test.csv')
)

# Decision Tree

In [4]:
# Three predictor columns and the target column taken from the training frame.
feature_cols = ['OverallQual', 'GrLivArea', 'GarageCars']
X = df_train[feature_cols]
y = df_train['SalePrice']

In [5]:
# Cost function
def rmse(h, y):
    """Return the root mean squared error between ``h`` and ``y``.

    Both arguments are forwarded unchanged to ``mean_squared_error`` and the
    square root of its result is returned.
    """
    mse = mean_squared_error(h, y)
    return sqrt(mse)

In [6]:
class Node:
    """A regression-tree node that grows its own subtree on construction.

    Each node covers the rows ``idxs`` of the shared training data, predicts
    the mean target of those rows, and splits itself on the
    (feature, threshold) pair with the lowest size-weighted standard
    deviation of the targets on the two sides. Children are built
    recursively until no split leaves at least ``min_leaf`` rows per side.

    Parameters
    ----------
    x : pandas.DataFrame or 2-D numpy.ndarray
        Full training feature matrix, shared by every node of the tree.
    y : pandas.Series or 1-D numpy.ndarray
        Full training target vector, shared by every node of the tree.
    idxs : numpy.ndarray of int
        Positional row numbers of ``x`` / ``y`` that belong to this node.
    min_leaf : int, default 5
        Minimum number of rows that each side of a split must keep.

    Attributes set on every node: ``val`` (mean target) and ``score``
    (best split score, ``inf`` for a leaf). Attributes set only on
    non-leaf nodes: ``var_idx``, ``split``, ``lhs`` and ``rhs``.
    """

    def __init__(self, x, y, idxs, min_leaf=5):
        self.x = x
        self.y = y
        self.idxs = idxs
        self.min_leaf = min_leaf
        self.row_count = len(idxs)
        self.col_count = x.shape[1]
        self.val = np.mean(self._targets())
        # Stays at infinity when no admissible split exists (see is_leaf).
        self.score = float('inf')
        self.find_varsplit()

    def _features(self):
        """Return the full feature matrix as a NumPy array."""
        return self.x.values if hasattr(self.x, 'values') else np.asarray(self.x)

    def _targets(self):
        """Return this node's targets, selected by position.

        ``idxs`` holds row positions, so a pandas Series must be indexed with
        ``.iloc``; plain ``y[idxs]`` would look the integers up as index
        labels and pick the wrong rows (or raise) for a non-default index.
        """
        if hasattr(self.y, 'iloc'):
            return self.y.iloc[self.idxs]
        return np.asarray(self.y)[self.idxs]

    def find_varsplit(self):
        """Search every feature for the best split and grow both children."""
        for c in range(self.col_count):
            self.find_better_split(c)
        if self.is_leaf:
            return
        x = self.split_col
        # Positions are relative to this node's rows; map them back through
        # self.idxs to get positions in the full training data.
        lhs = np.nonzero(x <= self.split)[0]
        rhs = np.nonzero(x > self.split)[0]
        self.lhs = Node(self.x, self.y, self.idxs[lhs], self.min_leaf)
        self.rhs = Node(self.x, self.y, self.idxs[rhs], self.min_leaf)

    def find_better_split(self, var_idx):
        """Update the best split if feature ``var_idx`` offers a lower score."""
        x = self._features()[self.idxs, var_idx]

        # Rows that share a value produce the same partition and therefore
        # the same score, so only the first occurrence of each distinct value
        # is scored. Visiting them in row order keeps tie-breaking identical
        # to scanning every row.
        _, first_rows = np.unique(x, return_index=True)
        for r in np.sort(first_rows):
            lhs = x <= x[r]
            rhs = x > x[r]
            if rhs.sum() < self.min_leaf or lhs.sum() < self.min_leaf:
                continue

            curr_score = self.find_score(lhs, rhs)
            if curr_score < self.score:
                self.var_idx = var_idx
                self.score = curr_score
                self.split = x[r]

    def find_score(self, lhs, rhs):
        """Return the size-weighted sum of target standard deviations.

        ``lhs`` and ``rhs`` are boolean masks over this node's rows. The
        ``.std()`` used is that of the target container (pandas Series or
        NumPy array), exactly as it was passed in.
        """
        y = self._targets()
        lhs_std = y[lhs].std()
        rhs_std = y[rhs].std()
        return lhs_std * lhs.sum() + rhs_std * rhs.sum()

    @property
    def split_col(self):
        """Values of the chosen split feature for this node's rows."""
        return self._features()[self.idxs, self.var_idx]

    @property
    def is_leaf(self):
        """True when no admissible split was found for this node."""
        return self.score == float('inf')

    def predict(self, x):
        """Predict one value per row of ``x`` (2-D array or DataFrame)."""
        rows = x.values if hasattr(x, 'values') else x
        return np.array([self.predict_row(xi) for xi in rows])

    def predict_row(self, xi):
        """Walk a single feature row down the tree to a leaf's mean target."""
        if self.is_leaf:
            return self.val
        node = self.lhs if xi[self.var_idx] <= self.split else self.rhs
        return node.predict_row(xi)

In [7]:
class DecisionTreeRegressor:
    """Thin estimator-style wrapper around a tree of ``Node`` objects."""

    def fit(self, X, y, min_leaf = 5):
        """Build the tree from every row of ``X`` / ``y`` and return ``self``.

        ``min_leaf`` is forwarded to ``Node`` unchanged.
        """
        all_rows = np.arange(len(y))
        self.dtree = Node(X, y, all_rows, min_leaf)
        return self

    def predict(self, X):
        """Return the fitted tree's predictions for the rows of ``X``.

        ``X`` must expose ``.values``; that array is what the root node
        receives.
        """
        feature_rows = X.values
        return self.dtree.predict(feature_rows)

In [8]:
# Fit the hand-written tree, then predict on the same rows it was trained on.
regressor = DecisionTreeRegressor()
regressor.fit(X, y)
preds = regressor.predict(X)

In [9]:
# R2 score (coefficient of determination) of the predictions against the targets
metrics.r2_score(y_true=y, y_pred=preds)

0.8504381072711565

In [10]:
# Root mean squared error of the predictions against the targets
rmse(h=preds, y=y)

30712.460628635836

In [11]:
# Per-column standard deviation of the test dataframe. Values that cannot be
# parsed as numbers are coerced to NaN first (errors='coerce').
# NOTE(review): to put the RMSE above in context, the spread of
# df_train['SalePrice'] is probably the more relevant figure — confirm intent.
test_numeric = df_test.apply(pd.to_numeric, errors='coerce')
std = test_numeric.std()
std

Id                421.321334
MSSubClass         42.746880
MSZoning                 NaN
LotFrontage        22.376841
LotArea          4955.517327
                    ...     
MiscVal           630.806978
MoSold              2.722432
YrSold              1.301740
SaleType                 NaN
SaleCondition            NaN
Length: 80, dtype: float64

# Using scikit-learn 

In [12]:
from sklearn.ensemble import RandomForestRegressor

# One estimator, depth capped at 2, no bootstrap resampling, fixed seed.
rf_params = dict(n_estimators=1, max_depth=2, bootstrap=False, random_state=42)
reg = RandomForestRegressor(**rf_params)
reg.fit(X, y)

In [13]:
# Predict with the scikit-learn model on the training rows and show its R2 score.
preds = reg.predict(X)
metrics.r2_score(y_true=y, y_pred=preds)

0.6336246655552089

In [14]:
# R2 score (coefficient of determination) of the predictions against the targets
metrics.r2_score(y_true=y, y_pred=preds)

0.6336246655552089

In [15]:
# Root mean squared error of the predictions against the targets
rmse(h=preds, y=y)

48069.23940764968