In [1]:
import numpy as np

class DecisionTreeRegressor:
    """Regression tree grown by greedy reduction of the weighted child MSE.

    At every node a random subset of at most ``random_feature`` columns is
    drawn, every distinct value of those columns is tried as a ``<=``
    threshold, and the split with the lowest size-weighted child MSE is kept.

    The fitted tree is stored in ``self.tree`` as nested dicts with the keys
    ``'feature'``, ``'value'``, ``'left'`` and ``'right'``; a leaf is a plain
    number (the mean, or the single distinct value, of the node's targets).

    Parameters
    ----------
    max_depth : int or None
        Maximum depth of the tree; ``None`` means no depth limit.
    min_samples_split : int
        Nodes with fewer samples than this become leaves.
    random_feature : int
        Number of candidate features drawn (without replacement) per node.
        Capped at the number of columns in ``X``.
    random_state : int or None
        Seed for the feature sub-sampling.  ``None`` (the default) keeps using
        NumPy's global generator, so ``np.random.seed`` still controls it.
    """

    def __init__(self, max_depth=None, min_samples_split=2, random_feature=5, random_state=None):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.random_feature = random_feature
        self.random_state = random_state
        self.tree = None
        self.feature_importances = None  # Optional
        # Private generator; stays None when the global NumPy generator is used.
        # (A RandomState instance is picklable, the np.random module is not.)
        self._rng = None

    def fit(self, X, y):
        """Grow the tree on ``X`` (n_samples, n_features) and ``y`` (n_samples,).

        Inputs are coerced with ``np.asarray`` so lists and DataFrames work.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not 2-dimensional or ``X`` and ``y`` differ in length.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got shape {X.shape}")
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of samples, got {len(X)} and {len(y)}"
            )

        # getattr keeps instances unpickled from before `random_state` existed working.
        seed = getattr(self, 'random_state', None)
        self._rng = None if seed is None else np.random.RandomState(seed)

        self.feature_importances = np.zeros(X.shape[1])
        self.tree = self._build_tree(X, y)
        return self

    @staticmethod
    def _leaf_value(y):
        """Prediction stored in a leaf: the mean target, or 0 for an empty node."""
        return np.mean(y) if len(y) > 0 else 0

    def _build_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``X``, ``y`` at ``depth``."""
        num_samples, num_features = X.shape
        unique_targets = np.unique(y)

        # Pure node: nothing left to split.
        if len(unique_targets) == 1:
            return unique_targets[0]
        # Stopping rules: too few samples or depth limit reached.
        if num_samples < self.min_samples_split:
            return self._leaf_value(y)
        if self.max_depth is not None and depth >= self.max_depth:
            return self._leaf_value(y)

        # Ensure that random_feature is within bounds
        num_random_features = min(self.random_feature, num_features)

        # Randomly select `num_random_features` features
        rng = getattr(self, '_rng', None)
        if rng is None:
            rng = np.random
        feature_indices = rng.choice(num_features, num_random_features, replace=False)

        best_split = self._best_split(X, y, feature_indices)
        if best_split is None:
            # Every candidate feature is constant on this node.
            return self._leaf_value(y)

        feature, threshold = best_split['feature'], best_split['value']
        left_mask, right_mask = self._split_data(X[:, feature], threshold)
        left_target, right_target = y[left_mask], y[right_mask]

        # _calculate_mse(y, y) weights the node's MSE with itself, i.e. it is
        # simply the MSE of the unsplit node.
        mse_before = self._calculate_mse(y, y)
        mse_after = self._calculate_mse(left_target, right_target)

        # Raw MSE reduction of this split; it is neither scaled by the node
        # size nor normalised across features.
        self.feature_importances[feature] += mse_before - mse_after

        left_tree = self._build_tree(X[left_mask], left_target, depth + 1)
        right_tree = self._build_tree(X[right_mask], right_target, depth + 1)

        return {
            'feature': feature,
            'value': threshold,
            'left': left_tree,
            'right': right_tree
        }

    def _best_split(self, X, y, feature_indices):
        """Return ``{'feature', 'value'}`` of the lowest-MSE split, or ``None``.

        ``None`` is returned when no candidate threshold separates the samples
        into two non-empty groups.
        """
        best_mse = float('inf')
        best_split = None

        for feature in feature_indices:
            column = X[:, feature]
            for value in np.unique(column):
                left_mask, right_mask = self._split_data(column, value)

                # The masks always have one entry per sample, so emptiness of a
                # side must be tested on their contents, not on their length.
                if not left_mask.any() or not right_mask.any():
                    continue

                mse = self._calculate_mse(y[left_mask], y[right_mask])
                if mse < best_mse:
                    best_mse = mse
                    best_split = {'feature': feature, 'value': value}

        return best_split

    def _split_data(self, feature_column, value):
        """Boolean masks for ``feature_column <= value`` and its complement."""
        left_mask = feature_column <= value
        right_mask = ~left_mask
        return left_mask, right_mask

    def _calculate_mse(self, left_target, right_target):
        """Size-weighted mean of the two groups' MSE around their own means.

        Returns ``inf`` when either group is empty so such a split never wins.
        """
        n_left, n_right = len(left_target), len(right_target)
        if n_left == 0 or n_right == 0:
            return float('inf')
        left_mse = np.mean((left_target - np.mean(left_target)) ** 2)
        right_mse = np.mean((right_target - np.mean(right_target)) ** 2)
        return (n_left * left_mse + n_right * right_mse) / (n_left + n_right)

    def predict(self, X):
        """Predict one value per row of ``X``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.tree is None:
            raise RuntimeError("DecisionTreeRegressor must be fitted before calling predict")
        X = np.asarray(X)
        return np.array([self._predict_single(x, self.tree) for x in X])

    def _predict_single(self, x, tree):
        """Walk ``tree`` for a single sample ``x`` and return the leaf value."""
        node = tree
        # Iterative descent: no recursion-limit concerns on very deep trees.
        while isinstance(node, dict):
            if x[node['feature']] <= node['value']:
                node = node['left']
            else:
                node = node['right']
        return node

class RandomForestRegressor:
    """Bagged ensemble of ``DecisionTreeRegressor`` trees.

    Each tree is fitted on a bootstrap sample (rows drawn with replacement,
    same size as the training set) and predictions are the mean over trees.

    Parameters
    ----------
    n_estimators : int
        Number of trees; must be at least 1.
    max_depth, min_samples_split, random_feature
        Forwarded unchanged to every ``DecisionTreeRegressor``.
    random_state : int or None
        Seed that makes ``fit`` reproducible.  ``None`` (the default) draws
        from NumPy's global generator in whatever state it currently is.

    Attributes
    ----------
    trees : list
        The fitted trees (empty until ``fit`` is called).
    feature_importances_ : ndarray or None
        Mean of the trees' ``feature_importances`` arrays.
    """

    def __init__(self, n_estimators=100, max_depth=None, min_samples_split=2, random_feature=5,
                 random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.random_feature = random_feature
        self.random_state = random_state
        self.trees = []
        self.feature_importances_ = None  # Optional

    def fit(self, X, y):
        """Fit ``n_estimators`` trees on bootstrap samples of ``X``, ``y``.

        Inputs are coerced with ``np.asarray`` so lists and DataFrames work.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``n_estimators < 1``, ``X`` is not 2-dimensional, ``X`` is
            empty, or ``X`` and ``y`` differ in length.
        """
        if self.n_estimators < 1:
            raise ValueError(f"n_estimators must be at least 1, got {self.n_estimators}")

        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got shape {X.shape}")
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of samples, got {len(X)} and {len(y)}"
            )
        n_samples = len(X)
        if n_samples == 0:
            raise ValueError("Cannot fit a forest on an empty data set")

        # Both the bootstrap draw below and the trees' feature sub-sampling use
        # NumPy's global generator.  To make a seeded fit reproducible without
        # leaking the seed to the rest of the program, seed the global
        # generator for the duration of the fit and restore it afterwards.
        seed = getattr(self, 'random_state', None)
        saved_state = None
        if seed is not None:
            saved_state = np.random.get_state()
            np.random.seed(seed)

        trees = []
        importance_sum = np.zeros(X.shape[1])
        try:
            for _ in range(self.n_estimators):
                bootstrap_indices = np.random.choice(n_samples, n_samples, replace=True)

                tree = DecisionTreeRegressor(
                    max_depth=self.max_depth,
                    min_samples_split=self.min_samples_split,
                    random_feature=self.random_feature
                )
                tree.fit(X[bootstrap_indices], y[bootstrap_indices])
                trees.append(tree)

                importance_sum += tree.feature_importances
        finally:
            if saved_state is not None:
                np.random.set_state(saved_state)

        # Publish the result only once every tree has been fitted.
        self.trees = trees
        self.feature_importances_ = importance_sum / self.n_estimators
        return self

    def predict(self, X):
        """Return the per-row mean of all trees' predictions.

        Raises
        ------
        RuntimeError
            If called before ``fit`` (there are no trees to average).
        """
        if not self.trees:
            raise RuntimeError("RandomForestRegressor must be fitted before calling predict")
        X = np.asarray(X)
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        return np.mean(tree_preds, axis=0)
    
def TrainTest_Split(X, y, test_size=0.4, random_state=None):
    """Shuffle ``X`` and ``y`` together and split them into train and test parts.

    Parameters
    ----------
    X, y : array-like
        Samples and targets with the same length along the first axis.
        Both are coerced with ``np.asarray``.
    test_size : float
        Fraction of the rows that goes to the test part, in ``[0, 1]``.
    random_state : int or None
        When given, NumPy's global generator is seeded with it before
        shuffling (this also affects later ``np.random`` calls).

    Returns
    -------
    X_train, X_test, y_train, y_test : ndarray

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``test_size`` is outside ``[0, 1]``.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = len(X)
    if n_samples != len(y):
        raise ValueError(
            f"X and y must have the same number of samples, got {n_samples} and {len(y)}"
        )
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")

    if random_state is not None:
        np.random.seed(random_state)

    indices = np.random.permutation(n_samples)

    # `1 - test_size` is rarely exact in binary floating point (1 - 0.9 is
    # 0.09999999999999998), so a product that should be a whole number can
    # land just below it and be truncated one row short.  Rounding away that
    # noise before truncating keeps the intended floor semantics.
    split_idx = int(round(n_samples * (1 - test_size), 9))

    train_idx, test_idx = indices[:split_idx], indices[split_idx:]
    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]

In [14]:
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

# Single seed shared by the split and the forest so a fresh run of this cell
# reproduces the same model and metrics.
SEED = 42

# Numeric / pre-encoded columns used as model inputs, and the column to predict.
FEATURE_COLUMNS = ['brokered_by', 'bed', 'bath', 'acre_lot', 'street', 'zip_code', 'house_size', 'city_encoded', 'state_encoded']
TARGET_COLUMN = 'price'

data = pd.read_csv("Realtor50K.csv")

print("Data sample:")
print(data.head())

X = data[FEATURE_COLUMNS].values
y = data[TARGET_COLUMN].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

print("\nTraining set size:", X_train.shape[0])
print("Test set size:", X_test.shape[0])

# The hand-written forest draws its bootstrap rows and per-node feature
# subsets from NumPy's global generator, so seed it right before fitting.
np.random.seed(SEED)
rf = RandomForestRegressor(n_estimators=20, max_depth=10, random_feature=3)
rf.fit(X_train, y_train)

y_test_pred = rf.predict(X_test)
# Prices are shown to the cent in the comparison table.
y_test_pred_rounded = np.round(y_test_pred, 2)

Data sample:
   brokered_by    status      price  bed  bath  acre_lot     street  \
0      92147.0  for_sale   110000.0  7.0   3.0      0.09  1842706.0   
1      94933.0  for_sale   950000.0  5.0   4.0      0.99  1260473.0   
2     103341.0  for_sale  6899000.0  4.0   6.0      0.83    17467.0   
3      21163.0  for_sale   525000.0  3.0   3.0      0.45  1813270.0   
4      67455.0  for_sale   289900.0  3.0   2.0      0.36  1698080.0   

           city           state  zip_code  house_size prev_sold_date  \
0        Dorado     Puerto Rico     949.0      1192.0     2019-06-28   
1  Saint Thomas  Virgin Islands     802.0      5000.0     2013-10-11   
2  Saint Thomas  Virgin Islands     802.0      4600.0     2018-04-05   
3        Agawam   Massachusetts    1001.0      2314.0     2014-06-25   
4        Agawam   Massachusetts    1001.0      1276.0     2012-10-12   

   city_encoded  state_encoded  
0             0              0  
1             1              1  
2             1             

In [15]:
# Put the true prices next to the rounded forest predictions for a quick look.
comparison_test = pd.DataFrame(
    {
        'True Price': y_test,
        'Predicted Price': y_test_pred_rounded,
    }
)

print("\nSample comparison (True vs Predicted on Test Set):")
print(comparison_test.head(10))

# Score the unrounded predictions with each metric, in the order R², MSE, MAE.
r2_test, mse_test, mae_test = [
    metric(y_test, y_test_pred)
    for metric in (r2_score, mean_squared_error, mean_absolute_error)
]

print("\nModel Evaluation Metrics on Test Set:")
print(f"R² Score: {r2_test:.4f}")
print(f"Mean Squared Error (MSE): {mse_test:.4f}")
print(f"Mean Absolute Error (MAE): {mae_test:.4f}")

# Averaged over the trees; one entry per input column, in feature order.
print("\nFeature Importances:", rf.feature_importances_)


Sample comparison (True vs Predicted on Test Set):
   True Price  Predicted Price
0    889000.0        802107.03
1    363000.0        353569.85
2    629900.0        714790.59
3   1325000.0       1590487.44
4    299000.0        290850.18
5    139999.0        276144.21
6    349900.0        606312.51
7    679900.0        454093.95
8    599000.0        462517.74
9    164900.0        183858.62

Model Evaluation Metrics on Test Set:
R² Score: 0.5659
Mean Squared Error (MSE): 694077106458.5076
Mean Absolute Error (MAE): 258081.5723

Feature Importances: [2.13378555e+14 1.09593331e+14 1.11061593e+14 2.53378438e+14
 2.14692850e+14 2.41202550e+14 3.50684328e+14 2.14162447e+14
 5.47714302e+13]


In [16]:
import joblib
# One place for the file name so the dump and the load cannot drift apart.
MODEL_PATH = 'RandomForest_Model.joblib'

joblib.dump(rf, MODEL_PATH)

print("Model saved!")

# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files that you created yourself or otherwise trust.
loaded_rf = joblib.load(MODEL_PATH)

# Round-trip check: the restored forest must predict exactly like the original
# before we report success.
sanity_rows = np.asarray(X_test)[:5]
assert np.allclose(loaded_rf.predict(sanity_rows), rf.predict(sanity_rows)), \
    "Reloaded model does not reproduce the original predictions"

print("Model loaded successfully")

Model saved!
Model loaded successfully


In [None]:
# Alternative implementations

# scikit-learn: train/test split of the feature matrix and target
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# NumPy: manual implementations of the R² score, MSE, and MAE
import numpy as np
def r2_score_manual(true_values, predicted_values):
    """Coefficient of determination, ``1 - SS_res / SS_tot``.

    Inputs are converted to float arrays first so integer (in particular
    unsigned or narrow) dtypes cannot wrap around in the subtractions.

    Degenerate cases:
    - empty input returns ``nan``;
    - constant ``true_values`` (``SS_tot == 0``) returns ``1.0`` for a perfect
      prediction and ``0.0`` otherwise, instead of dividing by zero.
    """
    true_values = np.asarray(true_values, dtype=float)
    predicted_values = np.asarray(predicted_values, dtype=float)

    if true_values.size == 0:
        return float('nan')

    ss_residual = np.sum((true_values - predicted_values) ** 2)
    ss_total = np.sum((true_values - np.mean(true_values)) ** 2)

    if ss_total == 0:
        # The ratio is undefined when the targets have no variance.
        return 1.0 if ss_residual == 0 else 0.0

    return 1 - (ss_residual / ss_total)

def mse_manual(true_values, predicted_values):
    """Mean squared error between ``true_values`` and ``predicted_values``.

    Inputs are converted to float arrays first: with integer dtypes the
    subtraction and the square are done in that integer type and can wrap
    around (e.g. ``uint8``), which silently yields a wrong error.
    """
    true_values = np.asarray(true_values, dtype=float)
    predicted_values = np.asarray(predicted_values, dtype=float)
    residuals = true_values - predicted_values
    return np.mean(residuals ** 2)

def mae_manual(true_values, predicted_values):
    """Mean absolute error between ``true_values`` and ``predicted_values``.

    Inputs are converted to float arrays first: with unsigned integer dtypes
    a negative difference wraps around (``0 - 1`` becomes ``255`` for
    ``uint8``) and the absolute value can no longer repair it.
    """
    true_values = np.asarray(true_values, dtype=float)
    predicted_values = np.asarray(predicted_values, dtype=float)
    residuals = true_values - predicted_values
    return np.mean(np.abs(residuals))

# Library scikit-learn: Categorical Encoding & Machine Learning Algorithm
# The scikit-learn forest is imported under an alias: this notebook defines
# its own RandomForestRegressor class, and importing the library class under
# the same name would silently replace it, so re-running the earlier training
# cell (which passes `random_feature=`) would fail.
from sklearn.ensemble import RandomForestRegressor as SklearnRandomForestRegressor
from sklearn.preprocessing import LabelEncoder
df = pd.read_csv("Realtor50K.csv")

# Integer-encode the text columns; the fitted encoders are kept so the codes
# can be mapped back to the original labels later.
label_encoders = {}
for column in ['state', 'city']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Everything except the target and the two dropped columns is used as a feature.
X = df.drop(columns=['price', 'status', 'prev_sold_date'])
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = SklearnRandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)