## 1. Data Collection

In [75]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [16]:
apartment_df = pd.read_csv('rent_apartments.csv')
apartment_df.head(10)

Unnamed: 0,address,area,constraction_year,rooms,bedrooms,bathrooms,balcony,storage,parking,furnished,garage,garden,energy,facilities,zip,neighborhood,rent
0,1071 HN Amsterdam (Cornelis Schuytbuurt),167.0,1870,3,2,2,yes,no,no,yes,no,Not present,D,Roof terrace,1071 HN,Cornelis Schuytbuurt,4500
1,1071 HK Amsterdam (Concertgebouwbuurt),150.0,1890,3,2,2,yes,no,yes,yes,no,Not present,A,"Cable TV, Internet connection, Fireplace, Bath...",1071 HK,Concertgebouwbuurt,3450
2,1071 HK Amsterdam (Concertgebouwbuurt),150.0,1890,3,2,2,yes,no,yes,yes,no,Not present,A,"Cable TV, Internet connection, Fireplace, Bath...",1071 HK,Concertgebouwbuurt,3450
3,1071 WV Amsterdam (Hondecoeterbuurt),90.0,1923,3,2,1,yes,no,no,yes,no,Not present,,"Shower, Toilet",1071 WV,Hondecoeterbuurt,2000
4,1071 WV Amsterdam (Hondecoeterbuurt),104.0,1923,3,2,1,no,no,no,no,no,Present (47 m²),D,"Shower, Bath, Toilet",1071 WV,Hondecoeterbuurt,3250
5,1071 WV Amsterdam (Hondecoeterbuurt),90.0,1923,3,2,1,yes,no,no,yes,no,Not present,,"Shower, Toilet",1071 WV,Hondecoeterbuurt,2000
6,1071 WV Amsterdam (Hondecoeterbuurt),104.0,1923,3,2,1,no,no,no,no,no,Present (47 m²),D,"Shower, Bath, Toilet",1071 WV,Hondecoeterbuurt,3250
7,1071 XL Amsterdam (Duivelseiland),67.0,1923,3,2,1,no,no,no,no,no,Not present,C,"Shower, Bath, Toilet",1071 XL,Duivelseiland,1950
8,1071 XL Amsterdam (Duivelseiland),67.0,1923,3,2,1,no,no,no,no,no,Not present,C,"Shower, Bath, Toilet",1071 XL,Duivelseiland,1850
9,1071 AC Amsterdam (P.C. Hooftbuurt),85.0,1900,2,1,1,no,no,yes,yes,no,Not present,,"Bath, Toilet",1071 AC,P.C. Hooftbuurt,2650


## 2. Data Preparation

In [17]:
apartment_df.dtypes

address               object
area                 float64
constraction_year      int64
rooms                  int64
bedrooms               int64
bathrooms              int64
balcony               object
storage               object
parking               object
furnished             object
garage                object
garden                object
energy                object
facilities            object
zip                   object
neighborhood          object
rent                   int64
dtype: object

In [18]:
cols_to_encode = ['balcony', 'storage', 'parking', 'furnished', 'garage']

encoded_df = pd.get_dummies(apartment_df, columns=cols_to_encode, drop_first=True)

In [19]:
import re

def handle_garden(x: pd.Series):
    if x == "Not present":
        return 0

    return int(re.findall(r'\d+', x)[0])

try:
    encoded_df.garden = encoded_df.garden.map(handle_garden)
except TypeError:
    print("Already converted to int")

In [20]:
# location is often one of the most important vectors for determining real estate prices
# because there are so many neighborhoods, we can simplify the names by using only the
# first word of the neighborhood
def handle_neighborhood(x: pd.Series):
    return x.split()[0]

encoded_df.neighborhood = encoded_df.neighborhood.map(handle_neighborhood)

# then we will get the dummies for the neighborhood
encoded_df = pd.get_dummies(encoded_df, columns=['neighborhood'], drop_first=True)

In [21]:
# because we encoded the rough location, we can drop uninterpretable columns related to location
# such as address and zip
encoded_df = encoded_df.drop(columns=['address', 'zip'])

In [None]:
# now we can encode the facilities in the apartment listing by extracting a
# normalized version of the tokens and binarizing the labels
def clean_facility(tokens: str):
    if type(tokens) == float:
        return []

    res = []

    for token in tokens:
        token = token.strip().lower()
        # turn spaces or hyphens into underscores
        token = re.sub(r"[\s\-]+", "_", token)
        res.append(token)

    return res

In [81]:
facility_list = encoded_df["facilities"].str.split(r",\s*")
encoded_df["facility_list"] = facility_list.apply(lambda x: clean_facility(x))

In [84]:
# lets get a cleaned list of possible facilities
facilities_set = set()

for s in encoded_df.facilities.values:
    try:
        s = s.lower()
    except AttributeError:
        continue
    facility_list = s.split(',')
    for facility in facility_list:
        facilities_set.add(facility.strip())

cleaned_facilities = []

for facility in facilities_set:

    cleaned_s = facility.replace(" ", "_")
    cleaned_s = cleaned_s.replace("-", "_")

    cleaned_facilities.append(cleaned_s)

In [85]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer(classes=cleaned_facilities)
fac_dummies = pd.DataFrame(
    mlb.fit_transform(encoded_df["facility_list"]), columns=mlb.classes_, index=encoded_df.index
)

# merge back into your df
encoded_df = pd.concat([encoded_df, fac_dummies], axis=1)
encoded_df["n_facilities"] = encoded_df["facility_list"].str.len()

In [86]:
# after this process we can drop the uninterpretable columns such as facility_list and facilities
encoded_df = encoded_df.drop(columns=["facility_list", "facilities"])

In [87]:
# the last variable we need to handle is the "energy" column
# ordered list from worst → best
ordered_grades = [
    "G",
    "F",
    "E",
    "D",
    "C",
    "B",
    "A",
    "A+",
    "A++",
    "A+++",
    "A++++",
]

# make an ordered dtype
cat_type = pd.api.types.CategoricalDtype(categories=ordered_grades, ordered=True)

# convert and pull out codes
encoded_df["energy_cat"] = encoded_df["energy"].astype(cat_type)
# codes will be 0 … 10; we'll shift to 1 … 11
encoded_df["energy_code"] = encoded_df["energy_cat"].cat.codes.replace(-1, np.nan) + 1

In [88]:
# energy grade missing indicator
encoded_df["energy_missing"] = encoded_df["energy_code"].isna().astype(int)

# 2) fill with median of observed codes
median_code = encoded_df["energy_code"].median()
encoded_df["energy_code"].fillna(median_code, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  encoded_df["energy_code"].fillna(median_code, inplace=True)


In [89]:
# lets prune unecessary columns
encoded_df = encoded_df.drop(columns=["energy", "energy_cat"])

## 3. Model Building

In [104]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.base import RegressorMixin

In [None]:
# problem: predict rental price of apartments
X_df = encoded_df.drop(columns=["rent"])
y_df = encoded_df.rent
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, train_size=0.8, test_size=0.2)

In [121]:
linreg = LinearRegression().fit(X_train, y_train)
ridgereg = Ridge(alpha=2).fit(X_train, y_train)
lassoreg = Lasso(alpha=1).fit(X_train, y_train)
rfreg = RandomForestRegressor().fit(X_train, y_train)

models = [linreg, ridgereg, lassoreg, rfreg]

In [122]:
def regression_scorer(model: RegressorMixin, X_test: pd.DataFrame, y_test: pd.DataFrame):
    y_pred = model.predict(X_test)
    return root_mean_squared_error(y_true=y_test, y_pred=y_pred)


In [123]:
for model in models:
    print(f"{model}: ", regression_scorer(model, X_test=X_test, y_test=y_test))

LinearRegression():  430.73913464611417
Ridge(alpha=2):  418.8963917069548
Lasso(alpha=1):  432.1042736326563
RandomForestRegressor():  376.65379731073614


In [125]:
# looks like the random forest model is the best, lets tune our hyper parameters now
from sklearn.model_selection import GridSearchCV

In [126]:
grid_space = {'n_estimators': [100, 200, 300], 'max_depth': [3, 6, 9, 12, 24, 48, 96]}

In [128]:
grid = GridSearchCV(RandomForestRegressor(), grid_space, cv=5, scoring="r2")
model_grid = grid.fit(X_train, y_train)

In [129]:
print(f"Best hyperparameters are {model_grid.best_params_}, score = {model_grid.best_score_}")

Best hyperparameters are {'max_depth': 96, 'n_estimators': 200}, score = 0.7818370670449909


## 4. Model Management

In [130]:
import pickle as pk

In [131]:
final_model = RandomForestRegressor(max_depth=96, n_estimators=200).fit(X_train, y_train)

In [133]:
pk.dump(final_model, open("models/rf_v1", "wb"))

In [134]:
rf_v1 = pk.load(open("models/rf_v1", "rb"))