In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_excel("houses_for_rent_madrid.xlsx")

In [3]:
data.dtypes

Id                int64
District         object
Address          object
Number           object
Area             object
Rent              int64
Bedrooms        float64
Sq.Mt             int64
Floor           float64
Outer           float64
Elevator        float64
Penthouse         int64
Cottage           int64
Duplex            int64
Semidetached      int64
dtype: object

In [4]:
# Data preparation: Remove Number, Address, Id, drop rows with missing values,
# convert, get one-hot (dummy) encoding for the categoricals.
# Split to 80%-20% train-test.
from sklearn.model_selection import train_test_split
data_prepared = data.drop(columns=["Number", "Address", "Id"])
data_prepared = data_prepared.dropna(axis=0)
data_prepared["District"] = data["District"].astype("category")
data_prepared["Area"] = data["Area"].astype("category")
data_prepared = pd.get_dummies(data_prepared)
data_train, data_test = train_test_split(data_prepared, train_size = 0.8)

In [5]:
# How many features do we have now?
data_train.shape

(1452, 170)

In [6]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

## Your job
Train a regressor for "Rent" using decision trees.  Start with no restriction on the tree growth, and then try to fiddle with the "max_depth" parameter
(to control the depth of the tree) and the "min_samples_split", which prevents splitting nodes that have too little training data in them.  Use a real number, for example, 0.05 means that the algorithm won't split nodes with less than 0.05*n training samples, where n is the size of the training set.

Either using a loop or manually, find the best choice of min_samples_split and max_depth on the test set (which is here used as a validation set only).

In [None]:
# Documentation for DecisionTreeRegressor:
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor
clf = DecisionTreeRegressor()

In [7]:

X = data_prepared.drop('Rent', axis=1)
y = data_prepared['Rent']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

best_score = float('-inf')
best_params = {'max_depth': None, 'min_samples_split': 2}

max_depths = [None, 5, 10, 15, 20]
min_samples_splits = [2, 0.05, 0.1, 0.2]

for max_depth in max_depths:
    for min_samples_split in min_samples_splits:
        model = DecisionTreeRegressor(max_depth=max_depth, min_samples_split=min_samples_split)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        score = r2_score(y_test, y_pred)

        if score > best_score:
            best_score = score
            best_params = {'max_depth': max_depth, 'min_samples_split': min_samples_split}

print(f"Best R^2 Score: {best_score}")
print(f"Best Parameters: {best_params}")

Best R^2 Score: 0.6238807728432401
Best Parameters: {'max_depth': 10, 'min_samples_split': 0.05}
