In [44]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn import tree


def fetch_data(path):
    # fetch dataset 
    X = pd.read_csv(path, delimiter=',')
    X['waterbody'] = X['waterbody'].fillna('No')
    y = X['price']
    X = X.drop(columns="price")
    return X, y

def test_tree(X: pd.DataFrame, y, model):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    return mean_squared_error(pred, y_test)





# Posebej obravnavamo kategorične atribute: naprimer vejitev waterbody = River in waterbody != River
# Zvezne atribute razdelimo v območja recimo 100 intervalov in za vsako mejo:
# izračunamo povprečje točk na levi strani in povprečje točk na desni strani.
# Seštejemo te dve povprečji in si shranimo to napako za to mejo.
# To naredimo za vsak atribut za vsako mejo in najdemo minimum vseh.
# Potem mamo prvo vozlišče in nato dodajamo. Sami si izberemo kdaj nehamo dodajati


In [40]:
class Node:
    def __init__(self, attribute=None, threshold=None, value=None, left=None, right=None, is_categorical=False):
        self.attribute = attribute  # Index of attribute to split on
        self.threshold = threshold  # Threshold value for the feature
        self.value = value  # Prediction value for the node (for leaf nodes)
        self.left = left  # Left subtree
        self.right = right  # Right subtree
        self.is_categorical = is_categorical



class RegressionTree:
    def __init__(self, max_depth=None, min_samples_split=10):
        self.max_depth = max_depth  # Maximum depth of the tree
        self.min_samples_split = min_samples_split  # Minimum samples required to split a node

    def fit(self, X: pd.DataFrame, y):
        self.root = self._fit(X, y, depth=0)
        
    def _fit(self, X: pd.DataFrame, y, depth):
        # Check termination conditions
        if depth == self.max_depth or len(y) < self.min_samples_split or len(set(y)) == 1:
            return Node(value=np.mean(y))

        num_attributes = len(X.columns)
        min_mse = float('inf')
        best_attribute = None
        best_attribute_ix = 0
        best_threshold = None
        categorical = False

        # Iterate over all features and thresholds to find the best split
        for ix, attr in enumerate(X.columns):
            thresholds = None
            # If current attribute is categoric, iterate through all the distinct values
            if X[attr].dtype == 'object':
                thresholds = list(X[attr].unique())
            else:                    
                thresholds = np.linspace(X[attr].min(), X[attr].max(), 100)
            for threshold in thresholds:
                left_mask = None
                if X[attr].dtype == 'object':
                    left_mask = X[attr] == threshold
                else:
                    left_mask = X[attr] <= threshold
                right_mask = ~left_mask
                if len(y[left_mask]) > 0 and len(y[right_mask]) > 0:
                    mse = self._calculate_mse(y[left_mask]) + self._calculate_mse(y[right_mask])
                    if mse < min_mse:
                        min_mse = mse
                        best_attribute = attr
                        best_attribute_ix = ix
                        best_threshold = threshold

        if best_attribute is not None:
            left_mask = None
            if X[best_attribute].dtype == 'object':
                left_mask = X[best_attribute] == best_threshold
                categorical = True
            else:
                left_mask = X[best_attribute] <= best_threshold
            right_mask = ~left_mask
            left_subtree = self._fit(X[left_mask], y[left_mask], depth + 1)
            right_subtree = self._fit(X[right_mask], y[right_mask], depth + 1)
            return Node(attribute=best_attribute_ix, threshold=best_threshold, left=left_subtree, right=right_subtree, is_categorical=categorical)
        else:
            return Node(value=y.mean())

    def predict(self, X):
        return np.array([self._predict(x, self.root) for x in X.to_numpy()])

    def _predict(self, x, node: Node):
        if node.value is not None:
            return node.value
        #print(node.attribute)
        if node.is_categorical:
            if x[node.attribute] == node.threshold:
                return self._predict(x, node.left)
            else:
                return self._predict(x, node.right)
        elif x[node.attribute] <= node.threshold:
            return self._predict(x, node.left)
        else:
            return self._predict(x, node.right)

    def _calculate_mse(self, y):
        return np.mean((y - y.mean())**2)

In [53]:
def k_fold_cross_validation(X, y, k=5, min_samples_split=10):
    kf = KFold(n_splits=k)

    encoded_data = pd.get_dummies(X[['airport', 'waterbody', 'bus_ter']], drop_first=True)
    my_mse_scores = []
    sk_mse_scores = []

    for ix, (train_index, test_index) in enumerate(kf.split(X)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        X_train_sk, X_test_sk = encoded_data.iloc[train_index], encoded_data.iloc[test_index]

        # Train the model
        print(f"Training our model {ix+1}/{k}")
        my_model = RegressionTree(min_samples_split=min_samples_split)
        my_model.fit(X_train, y_train)
        print(f"Our model predictions")
        # Make predictions on the test set
        y_pred = my_model.predict(X_test)
        # Calculate Mean Squared Error for this fold
        mse = mean_squared_error(y_test, y_pred)
        my_mse_scores.append(mse)

        print(f"Training sk model")
        sk_model = tree.DecisionTreeRegressor(min_samples_split=min_samples_split)
        sk_model.fit(X_train_sk, y_train)
        print(f"SK model predictions")
        y_pred = sk_model.predict(X_test_sk)
        mse = mean_squared_error(y_test, y_pred)
        sk_mse_scores.append(mse)


    # Calculate the overall mean of the Mean Squared Errors
    my_mse = np.mean(my_mse_scores)
    sk_mse = np.mean(sk_mse_scores)
    return my_mse, sk_mse

X, y = fetch_data("House_Price.csv")
my_mse, sk_mse = k_fold_cross_validation(X, y, k=5, min_samples_split=10)
print(f"My error: {my_mse}")
print(f"SK error: {sk_mse}")

Training our model 1/5
Our model predictions
Training sk model
SK model predictions
Training our model 2/5
Our model predictions
Training sk model
SK model predictions
Training our model 3/5
Our model predictions
Training sk model
SK model predictions
Training our model 4/5
Our model predictions
Training sk model
SK model predictions
Training our model 5/5
Our model predictions
Training sk model
SK model predictions
My error: 55.28019572919011
SK error: 92.82847331533853
