In [12]:
class DecisionTreeRegressor:
    def __init__(self, max_depth=None, min_samples_split=2, random_feature_fraction=0.5):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.random_feature_fraction = random_feature_fraction
        self.tree = None
        self.feature_importances = None # Opsional

    def fit(self, X, y):
        self.feature_importances = np.zeros(X.shape[1])
        self.tree = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        num_samples, num_features = X.shape
        unique_targets = np.unique(y)

        if len(unique_targets) == 1:
            return unique_targets[0]
        if num_samples < self.min_samples_split:
            return np.mean(y) if len(y) > 0 else 0
        if self.max_depth is not None and depth >= self.max_depth:
            return np.mean(y) if len(y) > 0 else 0

        feature_indices = np.random.choice(num_features, int(self.random_feature_fraction * num_features), replace=False)

        best_split = self._best_split(X, y, feature_indices)
        if best_split is None:
            return np.mean(y) if len(y) > 0 else 0

        left_mask, right_mask = self._split_data(X[:, best_split['feature']], best_split['value'])

        left_target, right_target = y[left_mask], y[right_mask]
        mse_before = self._calculate_mse(y, y)
        mse_after = self._calculate_mse(left_target, right_target)
        reduction_in_mse = mse_before - mse_after

        self.feature_importances[best_split['feature']] += reduction_in_mse

        left_tree = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right_tree = self._build_tree(X[right_mask], y[right_mask], depth + 1)

        return {
            'feature': best_split['feature'],
            'value': best_split['value'],
            'left': left_tree,
            'right': right_tree
        }

    def _best_split(self, X, y, feature_indices):
        best_mse = float('inf')
        best_split = None

        for feature in feature_indices:    
            values = np.unique(X[:, feature])    
            for value in values:        
                left_mask, right_mask = self._split_data(X[:, feature], value)
                        
                if len(left_mask) == 0 or len(right_mask) == 0:
                    continue
        
                mse = self._calculate_mse(y[left_mask], y[right_mask])        
                if mse < best_mse:
                    best_mse = mse
                    best_split = {'feature': feature, 'value': value}
        
        return best_split

    def _split_data(self, feature_column, value):
        left_mask = feature_column <= value
        right_mask = ~left_mask
        return left_mask, right_mask

    def _calculate_mse(self, left_target, right_target):
        if len(left_target) == 0 or len(right_target) == 0:
            return float('inf')
        left_mse = np.mean((left_target - np.mean(left_target)) ** 2) if len(left_target) > 0 else 0
        right_mse = np.mean((right_target - np.mean(right_target)) ** 2) if len(right_target) > 0 else 0
        return (len(left_target) * left_mse + len(right_target) * right_mse) / (len(left_target) + len(right_target))

    def predict(self, X):
        return np.array([self._predict_single(x, self.tree) for x in X])

    def _predict_single(self, x, tree):
        if isinstance(tree, dict):
            if x[tree['feature']] <= tree['value']:
                return self._predict_single(x, tree['left'])
            else:
                return self._predict_single(x, tree['right'])
        else:
            return tree

class RandomForestRegressor:
    def __init__(self, n_estimators=100, max_depth=None, min_samples_split=2, random_feature_fraction=0.5):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.random_feature_fraction = random_feature_fraction
        self.trees = []
        self.feature_importances_ = None # Opsional

    def fit(self, X, y):
        self.trees = []
        feature_importances = np.zeros(X.shape[1])
        for _ in range(self.n_estimators):
    
            bootstrap_indices = np.random.choice(len(X), len(X), replace=True)
            X_bootstrap = X[bootstrap_indices]
            y_bootstrap = y[bootstrap_indices]

            tree = DecisionTreeRegressor(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                random_feature_fraction=self.random_feature_fraction
            )
            tree.fit(X_bootstrap, y_bootstrap)
            self.trees.append(tree)

            feature_importances += tree.feature_importances
        self.feature_importances_ = feature_importances / self.n_estimators

    def predict(self, X):
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        return np.mean(tree_preds, axis=0)

In [20]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

data = pd.read_csv("Realtor20k-Sample.csv")

print("Data sample:")
print(data.head())

X = data[['bed', 'bath', 'acre_lot', 'house_size', 'state_number']].values
y = data['price'].values

def train_test_split(X, y, test_size=0.2):

    split_idx = int(len(X) * (1 - test_size))
    X_train, X_test = X[:split_idx], X[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print("\nTraining set size:", X_train.shape[0])
print("Test set size:", X_test.shape[0])

rf = RandomForestRegressor(n_estimators=100, max_depth=5, random_feature_fraction=0.4)
rf.fit(X_train, y_train)

y_test_pred = rf.predict(X_test)
y_test_pred_rounded = np.round(y_test_pred, 2)

comparison_test = pd.DataFrame({
    'True Price': y_test,
    'Predicted Price': y_test_pred_rounded
})

print("\nSample comparison (True vs Predicted on Test Set):")
print(comparison_test.head(10))

r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)

print("\nModel Evaluation Metrics on Test Set:")
print(f"R² Score: {r2_test:.4f}")
print(f"Mean Squared Error (MSE): {mse_test:.4f}")
print(f"Mean Absolute Error (MAE): {mae_test:.4f}")

print("Feature Importances:", rf.feature_importances_)

Data sample:
      price  bed  bath  acre_lot  house_size  state_number
0  480000.0  3.0   3.0      0.08      1648.0             1
1  932995.0  2.0   3.0      0.15      2045.0             1
2  525000.0  5.0   3.0      0.14      2616.0             1
3  315000.0  2.0   2.0      0.21      1633.0             1
4  650000.0  4.0   3.0      0.21      2577.0             1

Training set size: 16000
Test set size: 4000

Sample comparison (True vs Predicted on Test Set):
   True Price  Predicted Price
0    147000.0        231632.70
1    639000.0        276075.32
2    829900.0        848995.11
3    549900.0        487780.71
4    785000.0        303893.97
5   1495000.0        852630.52
6    710000.0        696790.83
7    535000.0        450191.83
8    585000.0        537146.01
9   2100000.0       1424114.90

Model Evaluation Metrics on Test Set:
R² Score: 0.4218
Mean Squared Error (MSE): 239696292373.2661
Mean Absolute Error (MAE): 244041.2767
Feature Importances: [3.43174822e+13 5.14357021e+13 7.7

In [22]:
import joblib
joblib.dump(rf, 'RandomForest_Model-20K_Data.joblib')

print("Model saved!")

Model saved!


In [18]:
loaded_rf = joblib.load('RandomForest_Model-20K.joblib')

print("Model loaded successfully")

Model loaded successfully
