In [1]:
import numpy as np
import pandas as pd



In [3]:
raw_data=pd.read_csv("03_realestate_dataset.csv",delimiter=";",decimal=",")
raw_data.columns=['X0','X1','X2','X3','X4','X5','X6','Y']
raw_data=raw_data.drop(['X1'], axis=1)
raw_data.head(5)
#raw_data.dtypes

#isinstance(data, pd.DataFrame)
#raw_data.shape
#list(data.columns)

Unnamed: 0,X0,X2,X3,X4,X5,X6,Y
0,1,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,19.5,306.5947,9,24.98034,121.53951,42.2
2,3,13.3,561.9845,5,24.98746,121.54391,47.3
3,4,13.3,561.9845,5,24.98746,121.54391,54.8
4,5,5.0,390.5684,5,24.97937,121.54245,43.1


In [4]:
class Node():
    def __init__(self, feature_index=None, threshold_value=None, left_child=None, right_child=None, variance_reduction=None, value=None):
        self.feature_index=feature_index
        self.threshold_value=threshold_value
        self.left_child=left_child
        self.right_child=right_child
        self.variance_reduction=variance_reduction
        self.value=value

In [5]:



class RegressionTree():
    def __init__(self,min_samples_in_split=2,max_tree_depth=2):  # for the stopping criteria to avoid overfitting problem
        self.root=None 
        self.min_samples_in_split=min_samples_in_split
        self.max_tree_depth=max_tree_depth
        
        
        
    def variance_reduction(self,parent_node,left_child,right_child):
        weight_left_child=len(left_child)/len(parent_node) # is used for scaling
        weight_right_child=len(right_child)/len(parent_node)
        variance_reduction=np.var(parent_node) - (weight_left_child*np.var(left_child) + weight_right_child*np.var(right_child))  
        return variance_reduction
            
    
#define the tree building method
 
    def build_tree(self,dataset,current_depth=0):
        X,Y = dataset[:,:-1], dataset[:,-1]  #seperate the variables from the target variable
        num_samples,num_features=np.shape(X)
        best_split_pool={}
        if num_samples>=self.min_samples_in_split and current_depth<=self.max_tree_depth:
            best_split_pool=self.find_best_split(dataset,num_samples,num_features)
            if best_split_pool["variance_reduction"]>0: 
                left_subtree=self.build_tree(best_split_pool["left_data"],current_depth+1)
                right_subtree=self.build_tree(best_split_pool["right_data"],current_depth+1)
                
                #create the decision node
                return Node(best_split_pool["feature_index"],best_split_pool["threshold_value"], left_subtree,right_subtree,best_split_pool["variance_reduction"])

        
        leaf_value=self.leaf_node_value(Y) #  we use the calculate_leaf_value function to calculate leaf node value
        return Node(value=leaf_value)   
    
    def split(self, dataset,feature_index, threshold_value):
        left_data= np.array([row for row in dataset if row[feature_index]<=threshold_value])
        right_data= np.array([row for row in dataset if row[feature_index]>threshold_value])
        return left_data, right_data
    
    
    def find_best_split(self,dataset,num_samples,num_features):
        best_split_pool={} 
        max_variance_reduction=-float("inf")
        
        for feature_index in range(num_features): #loop through all features and all possible threshold values for that feature
            feature_values=dataset[:,feature_index]
            possible_thresholds=np.unique(feature_values)  
            
            for threshold_value in possible_thresholds:
                left_data, right_data=self.split(dataset, feature_index,threshold_value)
                if len(left_data)>0 and len(right_data)>0: 
                    Y,left_Y,right_Y=dataset[:,-1], left_data[:,-1],right_data[:,-1] 
                    curr_variance_reduction=self.variance_reduction(Y,left_Y,right_Y)  #to calculate the IG or reduction in impurity, use variance reduction
                    
                    if curr_variance_reduction>max_variance_reduction: # if this IG is greater than the max IG then update best split 
                        best_split_pool["feature_index"]=feature_index
                        best_split_pool["threshold_value"]=threshold_value
                        best_split_pool["left_data"]=left_data
                        best_split_pool["right_data"]=right_data
                        best_split_pool["variance_reduction"]=curr_variance_reduction
                        max_variance_reduction=curr_variance_reduction
                    
        return best_split_pool
    
    
    def leaf_node_value(self,Y):
        leaf_val=np.mean(Y)
        return leaf_val
    

    #to print the decision tree     
    
    def print_tree(self, tree=None, indent=" "):
        if not tree:
            tree=self.root
            
        
        if tree.value is not None:
            print(tree.value)
            
        else:
            print("X_"+str(tree.feature_index), "<=", tree.threshold_value, "?", tree.variance_reduction)
            print("%sleft:" % (indent), end="")
            self.print_tree(tree.left_child, indent + indent)
            print("%sright:" % (indent), end="")
            self.print_tree(tree.right_child, indent +  indent)
            
    
    def fit(self, X,Y):
        
        dataset=np.concatenate((X,Y), axis=1)
        self.root= self.build_tree(dataset)  #build the tree to train the model
        
    
    def make_prediction(self,x,tree): # take a single data point and find the corresponding y value
        # start with root node, if it meets the conditions, then it goes to left child else moves to the right node. repeat 
        # till you reach to the leaf node
        
        if tree.value!=None: 
            return tree.value
        feature_val=x[tree.feature_index]
        if feature_val<= tree.threshold_value:
            return self.make_prediction(x,tree.left_child)
        else:
            return self.make_prediction(x,tree.right_child)
        
        
        
    def predict(self,X):
        predictions=[self.make_prediction(x,self.root) for x in X]
        return predictions  #an array of target values
            
        
        

In [7]:

# Training and test split

X=raw_data.iloc[:,:-1].values
Y= raw_data.iloc[:,-1].values.reshape(-1,1)

from sklearn.model_selection import train_test_split
X_train, X_test,Y_train,Y_test=train_test_split(X,Y, test_size=0.20, random_state=41)


X_train.shape
X_test.shape



(83, 6)

In [8]:
# now fit the model

regressor= RegressionTree(min_samples_in_split=3,max_tree_depth=3)
regressor.fit(X_train,Y_train)
regressor.print_tree()

X_2 <= 967.4 ? 82.36533475987822
 left:X_1 <= 8.5 ? 20.92490340212737
  left:X_2 <= 383.8624 ? 8.604444444444454
    left:X_5 <= 121.54102 ? 12.12001851851852
        left:49.98666666666668
        right:57.17777777777778
    right:X_2 <= 577.9615 ? 8.712
        left:46.67
        right:38.75
  right:X_2 <= 330.0854 ? 12.728022081608842
    left:X_5 <= 121.54026 ? 22.64988587104476
        left:39.07058823529412
        right:49.266666666666666
    right:X_4 <= 24.96398 ? 9.182353578336553
        left:29.581818181818186
        right:39.47659574468085
 right:X_5 <= 121.51046 ? 16.877603697493974
  left:X_1 <= 18.0 ? 2.0310443333100174
    left:X_0 <= 330.0 ? 2.5986170703575544
        left:17.05333333333333
        right:12.05
    right:X_2 <= 4136.271 ? 4.55013888888889
        left:22.375
        right:17.85
  right:X_4 <= 24.9832 ? 13.405732981548788
    left:X_1 <= 21.7 ? 8.801822909685551
        left:26.82857142857143
        right:18.2
    right:X_3 <= 0.0 ? 40.833333333333336

In [9]:
# Test the model by calculating MSE
Y_pred=regressor.predict(X_test)
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(Y_test,Y_pred))


10.877171469733579