In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression


def fetch_data(path):
    """Load a comma-separated dataset and split it into features and target.

    Missing entries in the ``waterbody`` column are replaced with the string
    ``'No'`` before the split.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        A ``(X, y)`` tuple: ``X`` is the frame without the ``price`` column,
        ``y`` is the ``price`` column.
    """
    frame = pd.read_csv(path, sep=',')
    frame['waterbody'] = frame['waterbody'].fillna('No')
    return frame.drop(columns="price"), frame['price']

# Load the housing data, preview one feature column, the dtype of the cleaned
# 'waterbody' column and the target, then make a random train/test split.
X, y = fetch_data("House_Price.csv")
print(X['airport'], X['waterbody'].dtype, y, sep="\n")
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Categorical attributes are handled separately: for example, a split on waterbody == River versus waterbody != River.
# Continuous attributes are divided into ranges, say 100 intervals, and for each boundary
# we compute the mean of the points on the left side and the mean of the points on the right side.
# The errors around these two means are added together and the sum is stored as the error for that boundary.
# We do this for every attribute and every boundary and take the minimum over all of them.
# That gives the first node; further nodes are added the same way, and we choose ourselves when to stop adding.


0      YES
1       NO
2       NO
3      YES
4       NO
      ... 
501     NO
502    YES
503     NO
504    YES
505    YES
Name: airport, Length: 506, dtype: object
object
0      24.0
1      21.6
2      34.7
3      33.4
4      36.2
       ... 
501    22.4
502    20.6
503    23.9
504    22.0
505    19.0
Name: price, Length: 506, dtype: float64


In [None]:
class Node:
    """One vertex of a binary decision tree.

    A node either describes a split (``feature_index`` and ``threshold``
    together with its ``left`` and ``right`` children) or holds a prediction
    in ``value``. Every field defaults to ``None``.
    """

    def __init__(self, feature_index=None, threshold=None, value=None, left=None, right=None):
        # Split rule: which feature is tested and against which threshold.
        self.feature_index, self.threshold = feature_index, threshold
        # Prediction stored at the node (used for leaf nodes).
        self.value = value
        # Child subtrees.
        self.left, self.right = left, right


class RegressionTree:
    """Binary regression tree grown by greedy, exhaustive split search.

    Every node tests ``feature <= threshold``; leaves predict the mean target
    of the training rows that reached them. Features are compared with ``<=``,
    so they are expected to be orderable (numeric) values.
    """

    def __init__(self, max_depth=None, min_samples_split=10):
        self.max_depth = max_depth  # Maximum depth of the tree (None = unlimited)
        self.min_samples_split = min_samples_split  # Minimum samples required to split a node

    def fit(self, X, y):
        """Grow the tree from training data and store it in ``self.root``.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of targets, one per row of ``X``.
        """
        # Coerce once so DataFrames, Series and nested lists all support the
        # positional ``X[:, j]`` / boolean-mask indexing used while growing.
        self.root = self._fit(np.asarray(X), np.asarray(y), depth=0)

    def _fit(self, X, y, depth):
        """Recursively build the subtree for the rows in ``X`` / ``y``."""
        # Stop when the depth limit is hit, the node is too small to split,
        # or the targets are constant (nothing left to explain).
        if depth == self.max_depth or len(y) < self.min_samples_split or np.unique(y).size <= 1:
            return Node(value=np.mean(y))

        best_feature_index, best_threshold = self._best_split(X, y)
        if best_feature_index is None:
            # No threshold separates the rows into two non-empty groups.
            return Node(value=np.mean(y))

        left_mask = X[:, best_feature_index] <= best_threshold
        right_mask = ~left_mask
        left_subtree = self._fit(X[left_mask], y[left_mask], depth + 1)
        right_subtree = self._fit(X[right_mask], y[right_mask], depth + 1)
        return Node(feature_index=best_feature_index, threshold=best_threshold,
                    left=left_subtree, right=right_subtree)

    def _best_split(self, X, y):
        """Return ``(feature_index, threshold)`` of the lowest-cost split.

        Returns ``(None, None)`` when no candidate leaves both children
        non-empty.
        """
        best_cost = float('inf')
        best_feature_index = None
        best_threshold = None
        n_samples = len(y)

        for feature_index in range(X.shape[1]):
            column = X[:, feature_index]
            for threshold in np.unique(column):
                left_mask = column <= threshold
                n_left = int(np.count_nonzero(left_mask))
                n_right = n_samples - n_left
                if n_left == 0 or n_right == 0:
                    continue
                # Weight each child's MSE by its size (i.e. use the total
                # squared error). An unweighted sum rewards splitting off a
                # single sample, whose MSE is trivially zero.
                cost = (n_left * self._calculate_mse(y[left_mask])
                        + n_right * self._calculate_mse(y[~left_mask]))
                if cost < best_cost:
                    best_cost = cost
                    best_feature_index = feature_index
                    best_threshold = threshold

        return best_feature_index, best_threshold

    def predict(self, X):
        """Return a NumPy array with one prediction per row of ``X``."""
        # Iterating a DataFrame yields column labels, not rows, so convert
        # to an array first.
        rows = np.asarray(X)
        return np.array([self._predict(x, self.root) for x in rows])

    def _predict(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x`` and return its value."""
        if node.value is not None:
            return node.value
        if x[node.feature_index] <= node.threshold:
            return self._predict(x, node.left)
        else:
            return self._predict(x, node.right)

    def _calculate_mse(self, y):
        """Mean squared deviation of ``y`` from its own mean."""
        return np.mean((y - np.mean(y))**2)